1 /**********************************************************************
6 created at: Tue Oct 30 16:10:22 JST 2007
8 Copyright (C) 2007 Martin Duerst
10 **********************************************************************/
12 #include "ruby/internal/config.h"
17 #include "internal/array.h"
18 #include "internal/inits.h"
19 #include "internal/object.h"
20 #include "internal/string.h"
21 #include "internal/transcode.h"
22 #include "ruby/encoding.h"
24 #include "transcode_data.h"
/* Feature switch consulted by the #ifdef later in this run of declarations. */
27 #define ENABLE_ECONV_NEWLINE_OPTION 1
29 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
/* File-local VALUE slots named after conversion error classes; where they are
 * assigned is not visible in this part of the file. */
30 static VALUE rb_eUndefinedConversionError
;
31 static VALUE rb_eInvalidByteSequenceError
;
32 static VALUE rb_eConverterNotFoundError
;
/* The only non-static object in this run, so it has external linkage. */
34 VALUE rb_cEncodingConverter
;
/* Cached IDs. NOTE(review): by name they look like attribute names of the
 * error objects; confirm against the code that interns them. */
36 static ID id_destination_encoding
;
37 static ID id_destination_encoding_name
;
38 static ID id_error_bytes
;
39 static ID id_error_char
;
40 static ID id_incomplete_input
;
41 static ID id_readagain_bytes
;
42 static ID id_source_encoding
;
43 static ID id_source_encoding_name
;
/* Cached symbols. NOTE(review): by name these are option keys/values; their
 * users are outside this part of the file. */
45 static VALUE sym_invalid
, sym_undef
, sym_replace
, sym_fallback
;
46 static VALUE sym_xml
, sym_text
, sym_attr
;
47 static VALUE sym_universal_newline
;
48 static VALUE sym_crlf_newline
;
49 static VALUE sym_cr_newline
;
/* Extra newline symbols, declared only when ENABLE_ECONV_NEWLINE_OPTION is defined. */
50 #ifdef ENABLE_ECONV_NEWLINE_OPTION
51 static VALUE sym_newline
, sym_universal
, sym_crlf
, sym_cr
, sym_lf
;
53 static VALUE sym_partial_input
;
/* Symbols whose names mirror the econv_* result codes used later in this file
 * (econv_invalid_byte_sequence, econv_finished, ...). */
55 static VALUE sym_invalid_byte_sequence
;
56 static VALUE sym_undefined_conversion
;
57 static VALUE sym_destination_buffer_full
;
58 static VALUE sym_source_buffer_empty
;
59 static VALUE sym_finished
;
60 static VALUE sym_after_output
;
61 static VALUE sym_incomplete_input
;
/* Forward declaration of allocate_converted_string(). Only the leading
 * parameters are visible here: two encoding names, an input byte string with
 * its length, and a caller-supplied destination buffer with its size. The end
 * of the parameter list is not visible in this extract. */
63 static unsigned char *
64 allocate_converted_string(const char *sname
, const char *dname
,
65 const unsigned char *str
, size_t len
,
66 unsigned char *caller_dst_buf
, size_t caller_dst_bufsize
,
69 /* dynamic structure, one per conversion (similar to iconv_t) */
70 /* may carry conversion state (e.g. for iso-2022-jp) */
71 typedef struct rb_transcoding
{
/* The transcoder this instance runs; its max_input, max_output and state_size
 * decide between the embedded arrays and the pointers below. */
72 const rb_transcoder
*transcoder
;
/* Current lookup table; the engine starts each character at
 * tr->conv_tree_start and addresses the table through tr->word_array. */
77 unsigned int next_table
;
/* Most recently fetched input byte (kept here so a suspended call can resume). */
79 unsigned char next_byte
;
/* Progress counter used while emitting a multi-byte output string. */
80 unsigned int output_index
;
82 ssize_t recognized_len
; /* already interpreted */
83 ssize_t readagain_len
; /* not yet interpreted */
/* Read buffer: embedded array, or a pointer when max_input exceeds the array. */
85 unsigned char ary
[8]; /* max_input <= sizeof(ary) */
86 unsigned char *ptr
; /* length: max_input */
87 } readbuf
; /* recognized_len + readagain_len used */
/* Write buffer with the same array-or-pointer layout, sized by max_output
 * (referenced elsewhere in this file as tc->writebuf). */
92 unsigned char ary
[8]; /* max_output <= sizeof(ary) */
93 unsigned char *ptr
; /* length: max_output */
/* Embedded state storage: large enough for a double or a pointer, with a
 * double member to force alignment. */
96 union rb_transcoding_state_t
{ /* opaque data for stateful encoding */
98 char ary
[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
99 double dummy_for_alignment
;
/* Buffer/state accessors. Each macro tests whether the transcoder's
 * max_input / max_output / state_size fits the embedded storage and, if so,
 * selects the embedded array (or its size). The alternative operand of
 * TRANSCODING_READBUF, TRANSCODING_WRITEBUF and TRANSCODING_STATE is not
 * visible in this extract; the open/close routines below xmalloc/xfree the
 * corresponding .ptr members when the embedded storage is too small. */
102 #define TRANSCODING_READBUF(tc) \
103 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
104 (tc)->readbuf.ary : \
106 #define TRANSCODING_WRITEBUF(tc) \
107 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
108 (tc)->writebuf.ary : \
110 #define TRANSCODING_WRITEBUF_SIZE(tc) \
111 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
112 sizeof((tc)->writebuf.ary) : \
113 (size_t)(tc)->transcoder->max_output)
114 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
115 #define TRANSCODING_STATE(tc) \
116 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
/* Fields of one conversion stage as stored in ec->elems[] (the enclosing
 * struct header is not visible in this extract). */
/* Transcoding instance executed by this stage. */
121 struct rb_transcoding
*tc
;
/* Output buffer bounds [out_buf_start, out_buf_end) and, inside them, the
 * window of produced-but-not-yet-consumed bytes [out_data_start, out_data_end). */
122 unsigned char *out_buf_start
;
123 unsigned char *out_data_start
;
124 unsigned char *out_data_end
;
125 unsigned char *out_buf_end
;
/* Result of the most recent convert call for this stage. */
126 rb_econv_result_t last_result
;
/* Fields of the converter object `ec` used throughout this file (the
 * enclosing struct header and a few members are not visible in this extract). */
131 int started
; /* bool */
/* Names given when the converter was opened; stored as pointers, not copies. */
133 const char *source_encoding_name
;
134 const char *destination_encoding_name
;
/* Replacement string, its length and the name of its encoding. */
136 const unsigned char *replacement_str
;
137 size_t replacement_len
;
138 const char *replacement_enc
;
/* Converter-level input buffer; converted ahead of the caller's input when it
 * holds data (in_data_start != in_data_end). */
140 unsigned char *in_buf_start
;
141 unsigned char *in_data_start
;
142 unsigned char *in_data_end
;
143 unsigned char *in_buf_end
;
/* Array of conversion stages, allocated with num_allocated slots. */
144 rb_econv_elem_t
*elems
;
145 int replacement_allocated
; /* bool */
149 struct rb_transcoding
*last_tc
;
/* Members of ec->last_error, filled in by rb_econv_convert0() after each call. */
153 rb_econv_result_t result
;
154 struct rb_transcoding
*error_tc
;
155 const char *source_encoding
;
156 const char *destination_encoding
;
157 const unsigned char *error_bytes_start
;
158 size_t error_bytes_len
;
159 size_t readagain_len
;
162 /* The following fields are only for Encoding::Converter.
163 * rb_econv_open set them NULL. */
164 rb_encoding
*source_encoding
;
165 rb_encoding
*destination_encoding
;
169 * Dispatch data and logic
/* True when the source name is the empty string; dname is accepted but unused. */
172 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
/* Registry entry members visible here: the library to load (may be null) and
 * the transcoder once registered. The sname/dname members assigned in
 * make_transcoder_entry() are not visible in this extract. */
177 const char *lib
; /* null means no need to load a library */
178 const rb_transcoder
*transcoder
;
179 } transcoder_entry_t
;
/* Two-level registry: source name -> (destination name -> transcoder_entry_t *). */
181 static st_table
*transcoder_table
;
/* Return the registry entry for the (sname, dname) pair, creating it if it
 * does not exist yet. The name pointers are stored as given (not copied), and
 * a new entry starts with transcoder == NULL. */
183 static transcoder_entry_t
*
184 make_transcoder_entry(const char *sname
, const char *dname
)
/* First level: find the per-source table, creating it on first use. */
189 if (!st_lookup(transcoder_table
, (st_data_t
)sname
, &val
)) {
190 val
= (st_data_t
)st_init_strcasetable();
191 st_add_direct(transcoder_table
, (st_data_t
)sname
, val
);
193 table2
= (st_table
*)val
;
/* Second level: find the destination's entry, allocating it when missing. */
194 if (!st_lookup(table2
, (st_data_t
)dname
, &val
)) {
195 transcoder_entry_t
*entry
= ALLOC(transcoder_entry_t
);
196 entry
->sname
= sname
;
197 entry
->dname
= dname
;
199 entry
->transcoder
= NULL
;
200 val
= (st_data_t
)entry
;
201 st_add_direct(table2
, (st_data_t
)dname
, val
);
203 return (transcoder_entry_t
*)val
;
/* Look up the existing registry entry for (sname, dname) without creating
 * anything. The bodies of the two lookup-miss branches are not visible in this
 * extract (presumably they return NULL; confirm against the full source). */
206 static transcoder_entry_t
*
207 get_transcoder_entry(const char *sname
, const char *dname
)
212 if (!st_lookup(transcoder_table
, (st_data_t
)sname
, &val
)) {
215 table2
= (st_table
*)val
;
216 if (!st_lookup(table2
, (st_data_t
)dname
, &val
)) {
219 return (transcoder_entry_t
*)val
;
/* Register transcoder `tr` under its own src_encoding/dst_encoding names.
 * Raises ArgumentError if the entry for that pair already has a transcoder. */
223 rb_register_transcoder(const rb_transcoder
*tr
)
225 const char *const sname
= tr
->src_encoding
;
226 const char *const dname
= tr
->dst_encoding
;
228 transcoder_entry_t
*entry
;
230 entry
= make_transcoder_entry(sname
, dname
);
/* A pair may be registered only once. */
231 if (entry
->transcoder
) {
232 rb_raise(rb_eArgError
, "transcoder from %s to %s has been already registered",
236 entry
->transcoder
= tr
;
/* Obtain (or create) the registry entry for (sname, dname). What is done with
 * `lib` is not visible in this extract; presumably it is stored in entry->lib,
 * which load_transcoder_entry() reads later (confirm). */
240 declare_transcoder(const char *sname
, const char *dname
, const char *lib
)
242 transcoder_entry_t
*entry
;
244 entry
= make_transcoder_entry(sname
, dname
);
/* Path prefix prepended to a library name in load_transcoder_entry(). */
248 static const char transcoder_lib_prefix
[] = "enc/trans/";
/* Public wrapper around declare_transcoder(). Raises ArgumentError for an
 * invalid library name; the condition guarding the raise is not visible here
 * (the message suggests a null `lib`). */
251 rb_declare_transcoder(const char *enc1
, const char *enc2
, const char *lib
)
254 rb_raise(rb_eArgError
, "invalid library name - (null)");
256 declare_transcoder(enc1
, enc2
, lib
);
/* Encoding names compare equal ignoring case. */
259 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
/* Singly linked queue node for the path search; the `enc` member read by the
 * search code is not visible in this extract. */
261 typedef struct search_path_queue_tag
{
262 struct search_path_queue_tag
*next
;
264 } search_path_queue_t
;
/* Search state members: queue head, pointer to the slot where the next node is
 * appended, and the encoding whose outgoing edges are currently being expanded.
 * The `visited` table used by the search is not visible in this extract. */
268 search_path_queue_t
*queue
;
269 search_path_queue_t
**queue_last_ptr
;
270 const char *base_enc
;
/* st_foreach callback used by transcode_search_path(): `key` is a destination
 * encoding name reachable from bfs->base_enc, `arg` is the search state. */
274 transcode_search_path_i(st_data_t key
, st_data_t val
, st_data_t arg
)
276 const char *dname
= (const char *)key
;
277 search_path_bfs_t
*bfs
= (search_path_bfs_t
*)arg
;
278 search_path_queue_t
*q
;
/* Already-visited encodings are handled by this branch (body not visible). */
280 if (st_lookup(bfs
->visited
, (st_data_t
)dname
, &val
)) {
/* Otherwise append a new node at the tail of the queue ... */
284 q
= ALLOC(search_path_queue_t
);
287 *bfs
->queue_last_ptr
= q
;
288 bfs
->queue_last_ptr
= &q
->next
;
/* ... and record the encoding it was reached from. */
290 st_add_direct(bfs
->visited
, (st_data_t
)dname
, (st_data_t
)bfs
->base_enc
);
/* Search the transcoder registry for a chain of conversions from sname to
 * dname. `visited` maps each reached encoding to the encoding it was reached
 * from, so the chain can be walked backwards from dname. `callback` is invoked
 * once per step with (from, to, depth, arg). Returns the path length, or -1
 * when no path exists. Loop headers and several statements are not visible in
 * this extract. */
295 transcode_search_path(const char *sname
, const char *dname
,
296 void (*callback
)(const char *sname
, const char *dname
, int depth
, void *arg
),
299 search_path_bfs_t bfs
;
300 search_path_queue_t
*q
;
/* Identical source and destination are special-cased (body not visible). */
306 if (encoding_equal(sname
, dname
))
/* Seed the queue and the visited table with the source encoding. */
309 q
= ALLOC(search_path_queue_t
);
312 bfs
.queue_last_ptr
= &q
->next
;
315 bfs
.visited
= st_init_strcasetable();
316 st_add_direct(bfs
.visited
, (st_data_t
)sname
, (st_data_t
)NULL
);
322 bfs
.queue_last_ptr
= &bfs
.queue
;
/* Encodings with no outgoing transcoders take this branch (body not visible). */
324 if (!st_lookup(transcoder_table
, (st_data_t
)q
->enc
, &val
)) {
328 table2
= (st_table
*)val
;
/* Direct edge to the destination: record its predecessor. */
330 if (st_lookup(table2
, (st_data_t
)dname
, &val
)) {
331 st_add_direct(bfs
.visited
, (st_data_t
)dname
, (st_data_t
)q
->enc
);
/* Otherwise enqueue every unvisited neighbour of q->enc. */
337 bfs
.base_enc
= q
->enc
;
338 st_foreach(table2
, transcode_search_path_i
, (st_data_t
)&bfs
);
/* Walk predecessors back from dname; the second walk reports each step. */
353 const char *enc
= dname
;
357 st_lookup(bfs
.visited
, (st_data_t
)enc
, &val
);
361 enc
= (const char *)val
;
366 st_lookup(bfs
.visited
, (st_data_t
)enc
, &val
);
369 callback((const char *)val
, enc
, --depth
, arg
);
370 enc
= (const char *)val
;
374 st_free_table(bfs
.visited
);
376 return pathlen
; /* is -1 if not found */
/* Declared here rather than taken from a header. */
379 int rb_require_internal_silent(VALUE fname
);
/* Return the transcoder of `entry`, loading its library first if necessary.
 * The guard around the loading section and the final fallback return are not
 * visible in this extract. */
381 static const rb_transcoder
*
382 load_transcoder_entry(transcoder_entry_t
*entry
)
/* Fast path: already registered. */
384 if (entry
->transcoder
)
385 return entry
->transcoder
;
/* Build the feature name transcoder_lib_prefix + lib in a fresh Ruby string
 * (sizeof - 1 drops the prefix's terminating NUL). */
388 const char *const lib
= entry
->lib
;
389 const size_t len
= strlen(lib
);
390 const size_t total_len
= sizeof(transcoder_lib_prefix
) - 1 + len
;
391 const VALUE fn
= rb_str_new(0, total_len
);
392 char *const path
= RSTRING_PTR(fn
);
394 memcpy(path
, transcoder_lib_prefix
, sizeof(transcoder_lib_prefix
) - 1);
395 memcpy(path
+ sizeof(transcoder_lib_prefix
) - 1, lib
, len
);
396 rb_str_set_len(fn
, total_len
);
/* The return value is ignored; success is detected by re-checking
 * entry->transcoder below. */
398 rb_require_internal_silent(fn
);
401 if (entry
->transcoder
)
402 return entry
->transcoder
;
/* Choose the replacement character for encoding `encname`, reporting the
 * name of the encoding the replacement is expressed in via *repl_encname_ptr.
 * For UTF-8 the bytes EF BF BD are returned. The other branch reports
 * "US-ASCII"; its return value and the writes to *len_ret are not visible in
 * this extract. */
408 get_replacement_character(const char *encname
, size_t *len_ret
, const char **repl_encname_ptr
)
410 if (encoding_equal(encname
, "UTF-8")) {
412 *repl_encname_ptr
= "UTF-8";
413 return "\xEF\xBF\xBD";
417 *repl_encname_ptr
= "US-ASCII";
423 * Transcoding engine logic
/* Locate the start of the character currently being converted and store its
 * total length (bytes recognized in earlier calls plus bytes consumed in this
 * one) in *char_len_ptr. The return statement is not visible in this extract;
 * presumably `ptr` is returned. */
426 static const unsigned char *
427 transcode_char_start(rb_transcoding
*tc
,
428 const unsigned char *in_start
,
429 const unsigned char *inchar_start
,
430 const unsigned char *in_p
,
431 size_t *char_len_ptr
)
433 const unsigned char *ptr
;
/* Fewer than recognized_len bytes precede inchar_start in the current input,
 * so part of the character lives only in the read buffer: append the new bytes
 * there and use the buffer. */
434 if (inchar_start
- in_start
< tc
->recognized_len
) {
435 MEMCPY(TRANSCODING_READBUF(tc
) + tc
->recognized_len
,
436 inchar_start
, unsigned char, in_p
- inchar_start
);
437 ptr
= TRANSCODING_READBUF(tc
);
/* Other case (its `else` line is not visible): step back inside the input. */
440 ptr
= inchar_start
- tc
->recognized_len
;
442 *char_len_ptr
= tc
->recognized_len
+ (in_p
- inchar_start
);
/* Core conversion step, written so it can stop and be resumed. Every
 * SUSPEND(ret, N) site stores N in tc->resume_position, saves the bytes of the
 * character in progress into the read buffer and defines resume_labelN; the
 * switch on tc->resume_position at the top jumps back to that label on the
 * next call. The rest of the parameter list, parts of the macros (including
 * the statement that hands `ret` back) and most case labels of the action
 * switch are not visible in this extract. */
446 static rb_econv_result_t
447 transcode_restartable0(const unsigned char **in_pos
, unsigned char **out_pos
,
448 const unsigned char *in_stop
, unsigned char *out_stop
,
452 const rb_transcoder
*tr
= tc
->transcoder
;
453 int unitlen
= tr
->input_unit_length
;
454 ssize_t readagain_len
= 0;
456 const unsigned char *inchar_start
;
457 const unsigned char *in_p
;
459 unsigned char *out_p
;
/* in_p scans the input; inchar_start marks where this call began consuming
 * the character in progress. */
461 in_p
= inchar_start
= *in_pos
;
/* SUSPEND: record the resume point and move consumed-but-unfinished bytes
 * behind the already recognized ones; when readagain_len is set, that many
 * trailing bytes are reclassified from "recognized" to "read again". */
465 #define SUSPEND(ret, num) \
467 tc->resume_position = (num); \
468 if (0 < in_p - inchar_start) \
469 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
470 inchar_start, unsigned char, in_p - inchar_start); \
473 tc->recognized_len += in_p - inchar_start; \
474 if (readagain_len) { \
475 tc->recognized_len -= readagain_len; \
476 tc->readagain_len = readagain_len; \
479 resume_label ## num:; \
481 #define SUSPEND_OBUF(num) \
483 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
486 #define SUSPEND_AFTER_OUTPUT(num) \
487 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
488 SUSPEND(econv_after_output, num); \
491 #define next_table (tc->next_table)
492 #define next_info (tc->next_info)
493 #define next_byte (tc->next_byte)
494 #define writebuf_len (tc->writebuf_len)
495 #define writebuf_off (tc->writebuf_off)
/* Resume dispatch: one case per SUSPEND site number. */
497 switch (tc
->resume_position
) {
499 case 1: goto resume_label1
;
500 case 2: goto resume_label2
;
501 case 3: goto resume_label3
;
502 case 4: goto resume_label4
;
503 case 5: goto resume_label5
;
504 case 6: goto resume_label6
;
505 case 7: goto resume_label7
;
506 case 8: goto resume_label8
;
507 case 9: goto resume_label9
;
508 case 10: goto resume_label10
;
509 case 11: goto resume_label11
;
510 case 12: goto resume_label12
;
511 case 13: goto resume_label13
;
512 case 14: goto resume_label14
;
513 case 15: goto resume_label15
;
514 case 16: goto resume_label16
;
515 case 17: goto resume_label17
;
516 case 18: goto resume_label18
;
517 case 19: goto resume_label19
;
518 case 20: goto resume_label20
;
519 case 21: goto resume_label21
;
520 case 22: goto resume_label22
;
521 case 23: goto resume_label23
;
522 case 24: goto resume_label24
;
523 case 25: goto resume_label25
;
524 case 26: goto resume_label26
;
525 case 27: goto resume_label27
;
526 case 28: goto resume_label28
;
527 case 29: goto resume_label29
;
528 case 30: goto resume_label30
;
529 case 31: goto resume_label31
;
530 case 32: goto resume_label32
;
531 case 33: goto resume_label33
;
532 case 34: goto resume_label34
;
/* Start of a new character: nothing recognized yet, begin at the root table. */
537 tc
->recognized_len
= 0;
538 next_table
= tr
->conv_tree_start
;
540 SUSPEND_AFTER_OUTPUT(24);
/* Input exhausted at a character boundary. */
542 if (in_stop
<= in_p
) {
543 if (!(opt
& ECONV_PARTIAL_INPUT
))
545 SUSPEND(econv_source_buffer_empty
, 7);
/* Table accessors: BL_BASE[0]/[1] hold the accepted byte range, BL_OFFSET maps
 * a byte to an index into the table's info words. */
549 #define BYTE_ADDR(index) (tr->byte_array + (index))
550 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
551 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
552 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
553 #define BL_MIN_BYTE (BL_BASE[0])
554 #define BL_MAX_BYTE (BL_BASE[1])
555 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
556 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
/* Fetch one byte; bytes outside the table's range take the branch below
 * (its body is not visible). */
558 next_byte
= (unsigned char)*in_p
++;
560 if (next_byte
< BL_MIN_BYTE
|| BL_MAX_BYTE
< next_byte
)
563 next_info
= (VALUE
)BL_ACTION(next_byte
);
/* The low five bits of next_info select the action. */
566 switch (next_info
& 0x1F) {
569 const unsigned char *p
= inchar_start
;
572 TRANSCODING_WRITEBUF(tc
)[writebuf_off
++] = (unsigned char)*p
++;
574 writebuf_len
= writebuf_off
;
576 while (writebuf_off
< writebuf_len
) {
578 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
/* Low two bits clear: next_info designates a further lookup table, so read
 * one more input byte and continue with that table. */
582 case 0x00: case 0x04: case 0x08: case 0x0C:
583 case 0x10: case 0x14: case 0x18: case 0x1C:
584 SUSPEND_AFTER_OUTPUT(25);
585 while (in_p
>= in_stop
) {
586 if (!(opt
& ECONV_PARTIAL_INPUT
))
588 SUSPEND(econv_source_buffer_empty
, 5);
590 next_byte
= (unsigned char)*in_p
++;
591 next_table
= (unsigned int)next_info
;
593 case ZERObt
: /* drop input */
/* Fixed-length outputs: bytes are written one at a time, each preceded by
 * SUSPEND_OBUF so the call can stop whenever the output buffer is full. */
596 SUSPEND_OBUF(9); *out_p
++ = getBT1(next_info
);
599 SUSPEND_OBUF(10); *out_p
++ = getBT1(next_info
);
600 SUSPEND_OBUF(21); *out_p
++ = getBT2(next_info
);
603 SUSPEND_OBUF(11); *out_p
++ = getBT1(next_info
);
604 SUSPEND_OBUF(15); *out_p
++ = getBT2(next_info
);
605 SUSPEND_OBUF(16); *out_p
++ = getBT3(next_info
);
608 SUSPEND_OBUF(12); *out_p
++ = getBT0(next_info
);
609 SUSPEND_OBUF(17); *out_p
++ = getBT1(next_info
);
610 SUSPEND_OBUF(18); *out_p
++ = getBT2(next_info
);
611 SUSPEND_OBUF(19); *out_p
++ = getBT3(next_info
);
614 SUSPEND_OBUF(29); *out_p
++ = getGB4bt0(next_info
);
615 SUSPEND_OBUF(30); *out_p
++ = getGB4bt1(next_info
);
616 SUSPEND_OBUF(31); *out_p
++ = getGB4bt2(next_info
);
617 SUSPEND_OBUF(32); *out_p
++ = getGB4bt3(next_info
);
/* String output stored in byte_array: tc->output_index survives suspension. */
620 tc
->output_index
= 0;
621 while (tc
->output_index
< STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info
)))) {
622 SUSPEND_OBUF(28); *out_p
++ = BYTE_ADDR(STR1_BYTEINDEX(next_info
))[1+tc
->output_index
];
/* Function-based actions: func_ii and func_si compute a new next_info. */
627 next_info
= (VALUE
)(*tr
->func_ii
)(TRANSCODING_STATE(tc
), next_info
);
631 const unsigned char *char_start
;
633 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
634 next_info
= (VALUE
)(*tr
->func_si
)(TRANSCODING_STATE(tc
), char_start
, (size_t)char_len
);
/* func_io / func_so / func_sio produce output. With at least max_output bytes
 * free they write straight into the output; otherwise they write into the
 * write buffer, which is then drained byte by byte. */
639 if (tr
->max_output
<= out_stop
- out_p
)
640 out_p
+= tr
->func_io(TRANSCODING_STATE(tc
),
641 next_info
, out_p
, out_stop
- out_p
);
643 writebuf_len
= tr
->func_io(TRANSCODING_STATE(tc
),
645 TRANSCODING_WRITEBUF(tc
), TRANSCODING_WRITEBUF_SIZE(tc
));
647 while (writebuf_off
< writebuf_len
) {
649 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
655 const unsigned char *char_start
;
658 if (tr
->max_output
<= out_stop
- out_p
) {
659 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
660 out_p
+= tr
->func_so(TRANSCODING_STATE(tc
),
661 char_start
, (size_t)char_len
,
662 out_p
, out_stop
- out_p
);
665 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
666 writebuf_len
= tr
->func_so(TRANSCODING_STATE(tc
),
667 char_start
, (size_t)char_len
,
668 TRANSCODING_WRITEBUF(tc
), TRANSCODING_WRITEBUF_SIZE(tc
));
670 while (writebuf_off
< writebuf_len
) {
672 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
679 const unsigned char *char_start
;
682 if (tr
->max_output
<= out_stop
- out_p
) {
683 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
684 out_p
+= tr
->func_sio(TRANSCODING_STATE(tc
),
685 char_start
, (size_t)char_len
, next_info
,
686 out_p
, out_stop
- out_p
);
689 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
690 writebuf_len
= tr
->func_sio(TRANSCODING_STATE(tc
),
691 char_start
, (size_t)char_len
, next_info
,
692 TRANSCODING_WRITEBUF(tc
), TRANSCODING_WRITEBUF_SIZE(tc
));
694 while (writebuf_off
< writebuf_len
) {
696 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
/* Invalid-sequence handling, in units of input_unit_length: with at most one
 * unit seen, in_p is advanced to cover a whole unit; otherwise the bytes after
 * the last complete unit are marked to be read again. */
702 if (tc
->recognized_len
+ (in_p
- inchar_start
) <= unitlen
) {
703 if (tc
->recognized_len
+ (in_p
- inchar_start
) < unitlen
)
704 SUSPEND_AFTER_OUTPUT(26);
705 while ((opt
& ECONV_PARTIAL_INPUT
) && tc
->recognized_len
+ (in_stop
- inchar_start
) < unitlen
) {
707 SUSPEND(econv_source_buffer_empty
, 8);
709 if (tc
->recognized_len
+ (in_stop
- inchar_start
) <= unitlen
) {
713 in_p
= inchar_start
+ (unitlen
- tc
->recognized_len
);
717 ssize_t invalid_len
; /* including the last byte which causes invalid */
719 invalid_len
= tc
->recognized_len
+ (in_p
- inchar_start
);
720 discard_len
= ((invalid_len
- 1) / unitlen
) * unitlen
;
721 readagain_len
= invalid_len
- discard_len
;
727 rb_raise(rb_eRuntimeError
, "unknown transcoding instruction");
/* Error exits; each is also a resume point. */
732 SUSPEND(econv_invalid_byte_sequence
, 1);
736 SUSPEND(econv_incomplete_input
, 27);
740 SUSPEND(econv_undefined_conversion
, 2);
/* End of input: give the transcoder's finish_func a chance to emit trailing
 * bytes (direct or via the write buffer, as above), then report finished. */
745 if (tr
->finish_func
) {
747 if (tr
->max_output
<= out_stop
- out_p
) {
748 out_p
+= tr
->finish_func(TRANSCODING_STATE(tc
),
749 out_p
, out_stop
- out_p
);
752 writebuf_len
= tr
->finish_func(TRANSCODING_STATE(tc
),
753 TRANSCODING_WRITEBUF(tc
), TRANSCODING_WRITEBUF_SIZE(tc
));
755 while (writebuf_off
< writebuf_len
) {
757 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
762 SUSPEND(econv_finished
, 6);
/* Wrapper around transcode_restartable0() that first re-feeds bytes a
 * previous call marked as "read again". The rest of the parameter list
 * (tc, opt) is not visible in this extract. */
771 static rb_econv_result_t
772 transcode_restartable(const unsigned char **in_pos
, unsigned char **out_pos
,
773 const unsigned char *in_stop
, unsigned char *out_stop
,
777 if (tc
->readagain_len
) {
/* Copy the pending bytes to a stack buffer, because the engine writes into
 * the read buffer while it runs. */
778 unsigned char *readagain_buf
= ALLOCA_N(unsigned char, tc
->readagain_len
);
779 const unsigned char *readagain_pos
= readagain_buf
;
780 const unsigned char *readagain_stop
= readagain_buf
+ tc
->readagain_len
;
781 rb_econv_result_t res
;
783 MEMCPY(readagain_buf
, TRANSCODING_READBUF(tc
) + tc
->recognized_len
,
784 unsigned char, tc
->readagain_len
);
785 tc
->readagain_len
= 0;
/* ECONV_PARTIAL_INPUT is forced: these bytes are not the end of the input. */
786 res
= transcode_restartable0(&readagain_pos
, out_pos
, readagain_stop
, out_stop
, tc
, opt
|ECONV_PARTIAL_INPUT
);
/* Any result other than "source buffer empty" means the engine stopped early:
 * put the unconsumed bytes back behind whatever it queued itself. */
787 if (res
!= econv_source_buffer_empty
) {
788 MEMCPY(TRANSCODING_READBUF(tc
) + tc
->recognized_len
+ tc
->readagain_len
,
789 readagain_pos
, unsigned char, readagain_stop
- readagain_pos
);
790 tc
->readagain_len
+= readagain_stop
- readagain_pos
;
/* Continue with the caller's input. */
794 return transcode_restartable0(in_pos
, out_pos
, in_stop
, out_stop
, tc
, opt
);
/* Allocate and initialise a transcoding instance for transcoder `tr`.
 * Separate heap storage is allocated only for whichever of state, read buffer
 * and write buffer does not fit the embedded arrays. `flags` is not used in
 * the visible lines. */
797 static rb_transcoding
*
798 rb_transcoding_open_by_transcoder(const rb_transcoder
*tr
, int flags
)
802 tc
= ALLOC(rb_transcoding
);
/* Oversized state gets its own allocation before the init hook runs. */
805 if (TRANSCODING_STATE_EMBED_MAX
< tr
->state_size
)
806 tc
->state
.ptr
= xmalloc(tr
->state_size
);
807 if (tr
->state_init_func
) {
808 (tr
->state_init_func
)(TRANSCODING_STATE(tc
)); /* xxx: check return value */
/* resume_position 0 makes the first convert call start from the beginning. */
810 tc
->resume_position
= 0;
811 tc
->recognized_len
= 0;
812 tc
->readagain_len
= 0;
813 tc
->writebuf_len
= 0;
814 tc
->writebuf_off
= 0;
815 if ((int)sizeof(tc
->readbuf
.ary
) < tr
->max_input
) {
816 tc
->readbuf
.ptr
= xmalloc(tr
->max_input
);
818 if ((int)sizeof(tc
->writebuf
.ary
) < tr
->max_output
) {
819 tc
->writebuf
.ptr
= xmalloc(tr
->max_output
);
/* Thin wrapper: forwards to transcode_restartable(), reordering the arguments
 * from (input pair, output pair) to (positions, stops). The trailing
 * parameters and arguments are not visible in this extract. */
824 static rb_econv_result_t
825 rb_transcoding_convert(rb_transcoding
*tc
,
826 const unsigned char **input_ptr
, const unsigned char *input_stop
,
827 unsigned char **output_ptr
, unsigned char *output_stop
,
830 return transcode_restartable(
831 input_ptr
, output_ptr
,
832 input_stop
, output_stop
,
/* Release a transcoding instance: run the transcoder's state_fini_func if it
 * has one, then free exactly the buffers that
 * rb_transcoding_open_by_transcoder() allocated separately (same size tests).
 * Freeing of `tc` itself is not visible in this extract. */
837 rb_transcoding_close(rb_transcoding
*tc
)
839 const rb_transcoder
*tr
= tc
->transcoder
;
840 if (tr
->state_fini_func
) {
841 (tr
->state_fini_func
)(TRANSCODING_STATE(tc
)); /* check return value? */
843 if (TRANSCODING_STATE_EMBED_MAX
< tr
->state_size
)
844 xfree(tc
->state
.ptr
);
845 if ((int)sizeof(tc
->readbuf
.ary
) < tr
->max_input
)
846 xfree(tc
->readbuf
.ptr
);
847 if ((int)sizeof(tc
->writebuf
.ary
) < tr
->max_output
)
848 xfree(tc
->writebuf
.ptr
);
/* Compute the memory attributed to `tc`: the struct itself plus each
 * separately allocated buffer (state, read buffer, write buffer), using the
 * same size tests as the open/close routines. The return statement is not
 * visible in this extract. */
853 rb_transcoding_memsize(rb_transcoding
*tc
)
855 size_t size
= sizeof(rb_transcoding
);
856 const rb_transcoder
*tr
= tc
->transcoder
;
858 if (TRANSCODING_STATE_EMBED_MAX
< tr
->state_size
) {
859 size
+= tr
->state_size
;
861 if ((int)sizeof(tc
->readbuf
.ary
) < tr
->max_input
) {
862 size
+= tr
->max_input
;
864 if ((int)sizeof(tc
->writebuf
.ary
) < tr
->max_output
) {
865 size
+= tr
->max_output
;
/* Allocate a converter with room for `n_hint` stages and put every visible
 * field into its empty state. Any adjustment of n_hint, the initialisation of
 * a few members and the return are not visible in this extract. */
871 rb_econv_alloc(int n_hint
)
878 ec
= ALLOC(rb_econv_t
);
880 ec
->source_encoding_name
= NULL
;
881 ec
->destination_encoding_name
= NULL
;
/* No replacement string yet. */
883 ec
->replacement_str
= NULL
;
884 ec
->replacement_len
= 0;
885 ec
->replacement_enc
= NULL
;
886 ec
->replacement_allocated
= 0;
/* No converter-level input buffer yet. */
887 ec
->in_buf_start
= NULL
;
888 ec
->in_data_start
= NULL
;
889 ec
->in_data_end
= NULL
;
890 ec
->in_buf_end
= NULL
;
/* Stage array sized from the hint; rb_econv_add_transcoder_at() doubles it
 * when it fills up. */
891 ec
->num_allocated
= n_hint
;
893 ec
->elems
= ALLOC_N(rb_econv_elem_t
, ec
->num_allocated
);
894 ec
->num_finished
= 0;
/* Empty error record. */
896 ec
->last_error
.result
= econv_source_buffer_empty
;
897 ec
->last_error
.error_tc
= NULL
;
898 ec
->last_error
.source_encoding
= NULL
;
899 ec
->last_error
.destination_encoding
= NULL
;
900 ec
->last_error
.error_bytes_start
= NULL
;
901 ec
->last_error
.error_bytes_len
= 0;
902 ec
->last_error
.readagain_len
= 0;
903 ec
->source_encoding
= NULL
;
904 ec
->destination_encoding
= NULL
;
/* Insert a new stage running transcoder `tr` at index `i` of ec->elems,
 * shifting later stages up by one. The definition of `bufsize`, the update of
 * ec->num_trans, the body of the final loop and the return value are not
 * visible in this extract. */
909 rb_econv_add_transcoder_at(rb_econv_t
*ec
, const rb_transcoder
*tr
, int i
)
/* Grow the stage array by doubling when it is full. */
915 if (ec
->num_trans
== ec
->num_allocated
) {
916 n
= ec
->num_allocated
* 2;
917 REALLOC_N(ec
->elems
, rb_econv_elem_t
, n
);
918 ec
->num_allocated
= n
;
/* Output buffer for the new stage. */
921 p
= xmalloc(bufsize
);
923 MEMMOVE(ec
->elems
+i
+1, ec
->elems
+i
, rb_econv_elem_t
, ec
->num_trans
-i
);
/* The new stage starts with an empty output buffer and a neutral result. */
925 ec
->elems
[i
].tc
= rb_transcoding_open_by_transcoder(tr
, 0);
926 ec
->elems
[i
].out_buf_start
= p
;
927 ec
->elems
[i
].out_buf_end
= p
+ bufsize
;
928 ec
->elems
[i
].out_data_start
= p
;
929 ec
->elems
[i
].out_data_end
= p
;
930 ec
->elems
[i
].last_result
= econv_source_buffer_empty
;
/* For a non-decorator transcoder, scan stages from the end down to i looking
 * for a non-decorator stage (what is done with it is not visible). */
934 if (!DECORATOR_P(tr
->src_encoding
, tr
->dst_encoding
))
935 for (j
= ec
->num_trans
-1; i
<= j
; j
--) {
936 rb_transcoding
*tc
= ec
->elems
[j
].tc
;
937 const rb_transcoder
*tr2
= tc
->transcoder
;
938 if (!DECORATOR_P(tr2
->src_encoding
, tr2
->dst_encoding
)) {
/* Build a converter from `n` registry entries. Every entry is loaded once in
 * a first pass before anything is allocated (the failure handling of that pass
 * is not visible), then the converter is allocated and each transcoder is
 * appended at the end of the chain. */
948 rb_econv_open_by_transcoder_entries(int n
, transcoder_entry_t
**entries
)
/* Pass 1: load each entry's transcoder. */
953 for (i
= 0; i
< n
; i
++) {
954 const rb_transcoder
*tr
;
955 tr
= load_transcoder_entry(entries
[i
]);
960 ec
= rb_econv_alloc(n
);
/* Pass 2: append stages; ec->num_trans is the current end of the chain. */
962 for (i
= 0; i
< n
; i
++) {
963 const rb_transcoder
*tr
= load_transcoder_entry(entries
[i
]);
964 ret
= rb_econv_add_transcoder_at(ec
, tr
, ec
->num_trans
);
/* Argument block for trans_open_i(); the num_additional member it reads is
 * not visible in this extract. */
974 struct trans_open_t
{
975 transcoder_entry_t
**entries
;
/* Callback for transcode_search_path(): stores the registry entry for one
 * step of the path at index `depth`. */
980 trans_open_i(const char *sname
, const char *dname
, int depth
, void *arg
)
982 struct trans_open_t
*toarg
= arg
;
/* The array is allocated on the first invocation, sized from that call's
 * depth plus num_additional extra slots. */
984 if (!toarg
->entries
) {
985 toarg
->entries
= ALLOC_N(transcoder_entry_t
*, depth
+1+toarg
->num_additional
);
987 toarg
->entries
[depth
] = get_transcoder_entry(sname
, dname
);
/* Open a converter from sname to dname without decorators. The handling of
 * the both-names-empty case, the not-found case and the cleanup/return are not
 * visible in this extract. */
991 rb_econv_open0(const char *sname
, const char *dname
, int ecflags
)
993 transcoder_entry_t
**entries
= NULL
;
997 /* Just check if sname and dname are defined */
998 /* (This check is needed?) */
999 if (*sname
) rb_enc_find_index(sname
);
1000 if (*dname
) rb_enc_find_index(dname
);
1002 if (*sname
== '\0' && *dname
== '\0') {
/* Otherwise search the registry for a path; trans_open_i collects its entries. */
1008 struct trans_open_t toarg
;
1009 toarg
.entries
= NULL
;
1010 toarg
.num_additional
= 0;
1011 num_trans
= transcode_search_path(sname
, dname
, trans_open_i
, (void *)&toarg
);
1012 entries
= toarg
.entries
;
/* A negative length means no conversion path exists. */
1013 if (num_trans
< 0) {
1019 ec
= rb_econv_open_by_transcoder_entries(num_trans
, entries
);
/* The name pointers are stored as given, not copied. */
1024 ec
->flags
= ecflags
;
1025 ec
->source_encoding_name
= sname
;
1026 ec
->destination_encoding_name
= dname
;
/* Capacity of the decorator-name array that callers pass to decorator_names(). */
1031 #define MAX_ECFLAGS_DECORATORS 32
/* Translate decorator bits in `ecflags` into decorator names written to
 * decorators_ret[], returning how many were written. rb_econv_open() treats a
 * return of -1 as a failure; the statements producing -1 are not visible in
 * this extract. */
1034 decorator_names(int ecflags
, const char **decorators_ret
)
/* The newline-decorator bits are checked as a group first (case bodies not
 * visible). */
1038 switch (ecflags
& ECONV_NEWLINE_DECORATOR_MASK
) {
1039 case ECONV_UNIVERSAL_NEWLINE_DECORATOR
:
1040 case ECONV_CRLF_NEWLINE_DECORATOR
:
1041 case ECONV_CR_NEWLINE_DECORATOR
:
/* XML text and attribute-content escaping requested together (body not visible). */
1048 if ((ecflags
& ECONV_XML_TEXT_DECORATOR
) &&
1049 (ecflags
& ECONV_XML_ATTR_CONTENT_DECORATOR
))
/* Names are appended in this fixed order. */
1054 if (ecflags
& ECONV_XML_TEXT_DECORATOR
)
1055 decorators_ret
[num_decorators
++] = "xml_text_escape";
1056 if (ecflags
& ECONV_XML_ATTR_CONTENT_DECORATOR
)
1057 decorators_ret
[num_decorators
++] = "xml_attr_content_escape";
1058 if (ecflags
& ECONV_XML_ATTR_QUOTE_DECORATOR
)
1059 decorators_ret
[num_decorators
++] = "xml_attr_quote";
1061 if (ecflags
& ECONV_CRLF_NEWLINE_DECORATOR
)
1062 decorators_ret
[num_decorators
++] = "crlf_newline";
1063 if (ecflags
& ECONV_CR_NEWLINE_DECORATOR
)
1064 decorators_ret
[num_decorators
++] = "cr_newline";
1065 if (ecflags
& ECONV_UNIVERSAL_NEWLINE_DECORATOR
)
1066 decorators_ret
[num_decorators
++] = "universal_newline";
1068 return num_decorators
;
/* Open a converter from sname to dname and attach the decorators requested in
 * `ecflags`. The failure branches and the return are not visible in this
 * extract. */
1072 rb_econv_open(const char *sname
, const char *dname
, int ecflags
)
1076 const char *decorators
[MAX_ECFLAGS_DECORATORS
];
1079 num_decorators
= decorator_names(ecflags
, decorators
);
1080 if (num_decorators
== -1)
/* Only the error-handler bits are passed to the base open. */
1083 ec
= rb_econv_open0(sname
, dname
, ecflags
& ECONV_ERROR_HANDLER_MASK
);
/* Append each decorator at the end of the chain; -1 signals failure. */
1087 for (i
= 0; i
< num_decorators
; i
++)
1088 if (rb_econv_decorate_at_last(ec
, decorators
[i
]) == -1) {
/* Record the remaining (non-error-handler) flag bits on the converter. */
1093 ec
->flags
|= ecflags
& ~ECONV_ERROR_HANDLER_MASK
;
/* Run the stages of the chain once each, from index `start` to the last.
 * A stage other than the first reads from the previous stage's pending output.
 * The input source of the first stage, the output target of the last stage,
 * the remaining parameters, the switch header and the return values are not
 * visible in this extract; rb_trans_conv() uses the return value as the index
 * of a stage that needs reporting, or -1. */
1099 trans_sweep(rb_econv_t
*ec
,
1100 const unsigned char **input_ptr
, const unsigned char *input_stop
,
1101 unsigned char **output_ptr
, unsigned char *output_stop
,
1108 const unsigned char **ipp
, *is
, *iold
;
1109 unsigned char **opp
, *os
, *oold
;
1110 rb_econv_result_t res
;
1115 for (i
= start
; i
< ec
->num_trans
; i
++) {
1116 rb_econv_elem_t
*te
= &ec
->elems
[i
];
/* Input: the pending window of the previous stage's output buffer. */
1123 rb_econv_elem_t
*prev_te
= &ec
->elems
[i
-1];
1124 ipp
= (const unsigned char **)&prev_te
->out_data_start
;
1125 is
= prev_te
->out_data_end
;
1128 if (i
== ec
->num_trans
-1) {
/* Output into this stage's own buffer: first slide pending data to the front
 * so the free space is contiguous at the end. */
1133 if (te
->out_buf_start
!= te
->out_data_start
) {
1134 ssize_t len
= te
->out_data_end
- te
->out_data_start
;
1135 ssize_t off
= te
->out_data_start
- te
->out_buf_start
;
1136 MEMMOVE(te
->out_buf_start
, te
->out_data_start
, unsigned char, len
);
1137 te
->out_data_start
= te
->out_buf_start
;
1138 te
->out_data_end
-= off
;
1140 opp
= &te
->out_data_end
;
1141 os
= te
->out_buf_end
;
/* Stages whose predecessors have not all finished may still receive input. */
1145 if (ec
->num_finished
!= i
)
1146 f
|= ECONV_PARTIAL_INPUT
;
/* ECONV_AFTER_OUTPUT is honoured only for stage 0 and then cleared from
 * `flags`; the second assignment below strips it from the per-stage flags
 * (its guarding `else` is not visible). */
1147 if (i
== 0 && (flags
& ECONV_AFTER_OUTPUT
)) {
1149 flags
&= ~ECONV_AFTER_OUTPUT
;
1152 f
&= ~ECONV_AFTER_OUTPUT
;
1155 te
->last_result
= res
= rb_transcoding_convert(te
->tc
, ipp
, is
, opp
, os
, f
);
/* Detect whether the call consumed input or produced output. */
1156 if (iold
!= *ipp
|| oold
!= *opp
)
1160 case econv_invalid_byte_sequence
:
1161 case econv_incomplete_input
:
1162 case econv_undefined_conversion
:
1163 case econv_after_output
:
1166 case econv_destination_buffer_full
:
1167 case econv_source_buffer_empty
:
/* Stages finish in order; remember how many are done. */
1170 case econv_finished
:
1171 ec
->num_finished
= i
+1;
/* Drive the whole stage chain until some stage has a result that must be
 * reported to the caller. If result_position_ptr is non-NULL it receives the
 * index of the reporting stage, or -1 when nothing needs reporting and
 * econv_source_buffer_empty is returned. The `flags` parameter, several
 * conditions and the loop header of the sweep are not visible in this extract. */
1179 static rb_econv_result_t
1180 rb_trans_conv(rb_econv_t
*ec
,
1181 const unsigned char **input_ptr
, const unsigned char *input_stop
,
1182 unsigned char **output_ptr
, unsigned char *output_stop
,
1184 int *result_position_ptr
)
1187 int needreport_index
;
/* Dummy one-byte buffer substituted for a missing input or output range
 * (presumably when the caller passed NULL, as the recursive call below does
 * for the input; the guarding conditions are not visible). */
1190 unsigned char empty_buf
;
1191 unsigned char *empty_ptr
= &empty_buf
;
1194 input_ptr
= (const unsigned char **)&empty_ptr
;
1195 input_stop
= empty_ptr
;
1199 output_ptr
= &empty_ptr
;
1200 output_stop
= empty_ptr
;
/* A stale after_output on the first stage is cleared before scanning. */
1203 if (ec
->elems
[0].last_result
== econv_after_output
)
1204 ec
->elems
[0].last_result
= econv_source_buffer_empty
;
/* Scan from the last stage backwards for a result that is still unreported. */
1206 for (i
= ec
->num_trans
-1; 0 <= i
; i
--) {
1207 switch (ec
->elems
[i
].last_result
) {
1208 case econv_invalid_byte_sequence
:
1209 case econv_incomplete_input
:
1210 case econv_undefined_conversion
:
1211 case econv_after_output
:
1212 case econv_finished
:
1214 goto found_needreport
;
1216 case econv_destination_buffer_full
:
1217 case econv_source_buffer_empty
:
1221 rb_bug("unexpected transcode last result");
1225 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
/* Last stage's output was full and the caller wants after-output stops: first
 * drain with no new input and with ECONV_AFTER_OUTPUT cleared. */
1227 if (ec
->elems
[ec
->num_trans
-1].last_result
== econv_destination_buffer_full
&&
1228 (flags
& ECONV_AFTER_OUTPUT
)) {
1229 rb_econv_result_t res
;
1231 res
= rb_trans_conv(ec
, NULL
, NULL
, output_ptr
, output_stop
,
1232 (flags
& ~ECONV_AFTER_OUTPUT
)|ECONV_PARTIAL_INPUT
,
1233 result_position_ptr
);
1235 if (res
== econv_source_buffer_empty
)
1236 return econv_after_output
;
/* Sweep repeatedly, restarting just after the stage that needed reporting,
 * until no stage does or the last stage itself is the one. */
1245 needreport_index
= trans_sweep(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
, sweep_start
);
1246 sweep_start
= needreport_index
+ 1;
1247 } while (needreport_index
!= -1 && needreport_index
!= ec
->num_trans
-1);
/* Report the right-most stage whose result is not "source buffer empty";
 * error and after_output results are reset so they are reported only once. */
1249 for (i
= ec
->num_trans
-1; 0 <= i
; i
--) {
1250 if (ec
->elems
[i
].last_result
!= econv_source_buffer_empty
) {
1251 rb_econv_result_t res
= ec
->elems
[i
].last_result
;
1252 if (res
== econv_invalid_byte_sequence
||
1253 res
== econv_incomplete_input
||
1254 res
== econv_undefined_conversion
||
1255 res
== econv_after_output
) {
1256 ec
->elems
[i
].last_result
= econv_source_buffer_empty
;
1258 if (result_position_ptr
)
1259 *result_position_ptr
= i
;
1263 if (result_position_ptr
)
1264 *result_position_ptr
= -1;
1265 return econv_source_buffer_empty
;
/* One conversion call on converter `ec`: deliver pending data first, then
 * convert the caller's input, and finally record the outcome in
 * ec->last_error. The `flags` parameter, several branch keywords, the label
 * targets and the return are not visible in this extract. */
1268 static rb_econv_result_t
1269 rb_econv_convert0(rb_econv_t
*ec
,
1270 const unsigned char **input_ptr
, const unsigned char *input_stop
,
1271 unsigned char **output_ptr
, unsigned char *output_stop
,
1274 rb_econv_result_t res
;
1275 int result_position
;
/* Start from a cleared error record. */
1278 memset(&ec
->last_error
, 0, sizeof(ec
->last_error
));
/* No stages: behave as a plain byte copier. */
1280 if (ec
->num_trans
== 0) {
/* Data left in the converter's own input buffer goes out first; when the
 * output is too small only the part that fits is copied. */
1282 if (ec
->in_buf_start
&& ec
->in_data_start
!= ec
->in_data_end
) {
1283 if (output_stop
- *output_ptr
< ec
->in_data_end
- ec
->in_data_start
) {
1284 len
= output_stop
- *output_ptr
;
1285 memcpy(*output_ptr
, ec
->in_data_start
, len
);
1286 *output_ptr
= output_stop
;
1287 ec
->in_data_start
+= len
;
1288 res
= econv_destination_buffer_full
;
1291 len
= ec
->in_data_end
- ec
->in_data_start
;
1292 memcpy(*output_ptr
, ec
->in_data_start
, len
);
1294 ec
->in_data_start
= ec
->in_data_end
= ec
->in_buf_start
;
1295 if (flags
& ECONV_AFTER_OUTPUT
) {
1296 res
= econv_after_output
;
/* Copy as much of the caller's input as the output can hold. */
1300 if (output_stop
- *output_ptr
< input_stop
- *input_ptr
) {
1301 len
= output_stop
- *output_ptr
;
1304 len
= input_stop
- *input_ptr
;
/* With ECONV_AFTER_OUTPUT a single byte is copied and the call stops. */
1306 if (0 < len
&& (flags
& ECONV_AFTER_OUTPUT
)) {
1307 *(*output_ptr
)++ = *(*input_ptr
)++;
1308 res
= econv_after_output
;
1311 memcpy(*output_ptr
, *input_ptr
, len
);
1314 if (*input_ptr
!= input_stop
)
1315 res
= econv_destination_buffer_full
;
1316 else if (flags
& ECONV_PARTIAL_INPUT
)
1317 res
= econv_source_buffer_empty
;
1319 res
= econv_finished
;
/* With stages: first hand over output still pending in the last stage's
 * buffer, resetting that buffer once it has been fully delivered. */
1323 if (ec
->elems
[ec
->num_trans
-1].out_data_start
) {
1324 unsigned char *data_start
= ec
->elems
[ec
->num_trans
-1].out_data_start
;
1325 unsigned char *data_end
= ec
->elems
[ec
->num_trans
-1].out_data_end
;
1326 if (data_start
!= data_end
) {
1328 if (output_stop
- *output_ptr
< data_end
- data_start
) {
1329 len
= output_stop
- *output_ptr
;
1330 memcpy(*output_ptr
, data_start
, len
);
1331 *output_ptr
= output_stop
;
1332 ec
->elems
[ec
->num_trans
-1].out_data_start
+= len
;
1333 res
= econv_destination_buffer_full
;
1336 len
= data_end
- data_start
;
1337 memcpy(*output_ptr
, data_start
, len
);
1339 ec
->elems
[ec
->num_trans
-1].out_data_start
=
1340 ec
->elems
[ec
->num_trans
-1].out_data_end
=
1341 ec
->elems
[ec
->num_trans
-1].out_buf_start
;
/* Then convert data held in the converter's own input buffer, as partial
 * input and without after-output stops. */
1346 if (ec
->in_buf_start
&&
1347 ec
->in_data_start
!= ec
->in_data_end
) {
1348 res
= rb_trans_conv(ec
, (const unsigned char **)&ec
->in_data_start
, ec
->in_data_end
, output_ptr
, output_stop
,
1349 (flags
&~ECONV_AFTER_OUTPUT
)|ECONV_PARTIAL_INPUT
, &result_position
);
1350 if (res
!= econv_source_buffer_empty
)
/* After-output requested while input remains: run with an empty input range
 * and report after_output if nothing else happened (the first operand of this
 * condition is not visible). */
1355 (flags
& ECONV_AFTER_OUTPUT
) &&
1356 *input_ptr
!= input_stop
) {
1357 input_stop
= *input_ptr
;
1358 res
= rb_trans_conv(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
, &result_position
);
1359 if (res
== econv_source_buffer_empty
)
1360 res
= econv_after_output
;
/* Single stage, or caller-requested after-output: one direct call. */
1362 else if ((flags
& ECONV_AFTER_OUTPUT
) ||
1363 ec
->num_trans
== 1) {
1364 res
= rb_trans_conv(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
, &result_position
);
/* Otherwise after-output mode is switched on internally and the call is
 * repeated for as long as it only reports after_output. */
1367 flags
|= ECONV_AFTER_OUTPUT
;
1369 res
= rb_trans_conv(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
, &result_position
);
1370 } while (res
== econv_after_output
);
/* Record the outcome; for the three error results also capture which stage
 * failed, its encodings, and the offending bytes held in its read buffer. */
1374 ec
->last_error
.result
= res
;
1375 if (res
== econv_invalid_byte_sequence
||
1376 res
== econv_incomplete_input
||
1377 res
== econv_undefined_conversion
) {
1378 rb_transcoding
*error_tc
= ec
->elems
[result_position
].tc
;
1379 ec
->last_error
.error_tc
= error_tc
;
1380 ec
->last_error
.source_encoding
= error_tc
->transcoder
->src_encoding
;
1381 ec
->last_error
.destination_encoding
= error_tc
->transcoder
->dst_encoding
;
1382 ec
->last_error
.error_bytes_start
= TRANSCODING_READBUF(error_tc
);
1383 ec
->last_error
.error_bytes_len
= error_tc
->recognized_len
;
1384 ec
->last_error
.readagain_len
= error_tc
->readagain_len
;
1390 static int output_replacement_character(rb_econv_t
*ec
);
1393 output_hex_charref(rb_econv_t
*ec
)
1396 unsigned char utfbuf
[1024];
1397 const unsigned char *utf
;
1399 int utf_allocated
= 0;
1400 char charef_buf
[16];
1401 const unsigned char *p
;
1403 if (encoding_equal(ec
->last_error
.source_encoding
, "UTF-32BE")) {
1404 utf
= ec
->last_error
.error_bytes_start
;
1405 utf_len
= ec
->last_error
.error_bytes_len
;
1408 utf
= allocate_converted_string(ec
->last_error
.source_encoding
, "UTF-32BE",
1409 ec
->last_error
.error_bytes_start
, ec
->last_error
.error_bytes_len
,
1410 utfbuf
, sizeof(utfbuf
),
1414 if (utf
!= utfbuf
&& utf
!= ec
->last_error
.error_bytes_start
)
1418 if (utf_len
% 4 != 0)
1422 while (4 <= utf_len
) {
1428 snprintf(charef_buf
, sizeof(charef_buf
), "&#x%X;", u
);
1430 ret
= rb_econv_insert_output(ec
, (unsigned char *)charef_buf
, strlen(charef_buf
), "US-ASCII");
1449 rb_econv_convert(rb_econv_t
*ec
,
1450 const unsigned char **input_ptr
, const unsigned char *input_stop
,
1451 unsigned char **output_ptr
, unsigned char *output_stop
,
1454 rb_econv_result_t ret
;
1456 unsigned char empty_buf
;
1457 unsigned char *empty_ptr
= &empty_buf
;
1462 input_ptr
= (const unsigned char **)&empty_ptr
;
1463 input_stop
= empty_ptr
;
1467 output_ptr
= &empty_ptr
;
1468 output_stop
= empty_ptr
;
1472 ret
= rb_econv_convert0(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
);
1474 if (ret
== econv_invalid_byte_sequence
||
1475 ret
== econv_incomplete_input
) {
1476 /* deal with invalid byte sequence */
1477 /* todo: add more alternative behaviors */
1478 switch (ec
->flags
& ECONV_INVALID_MASK
) {
1479 case ECONV_INVALID_REPLACE
:
1480 if (output_replacement_character(ec
) == 0)
1485 if (ret
== econv_undefined_conversion
) {
1486 /* valid character in source encoding
1487 * but no related character(s) in destination encoding */
1488 /* todo: add more alternative behaviors */
1489 switch (ec
->flags
& ECONV_UNDEF_MASK
) {
1490 case ECONV_UNDEF_REPLACE
:
1491 if (output_replacement_character(ec
) == 0)
1495 case ECONV_UNDEF_HEX_CHARREF
:
1496 if (output_hex_charref(ec
) == 0)
1506 rb_econv_encoding_to_insert_output(rb_econv_t
*ec
)
1508 rb_transcoding
*tc
= ec
->last_tc
;
1509 const rb_transcoder
*tr
;
1514 tr
= tc
->transcoder
;
1516 if (tr
->asciicompat_type
== asciicompat_encoder
)
1517 return tr
->src_encoding
;
1518 return tr
->dst_encoding
;
1521 static unsigned char *
1522 allocate_converted_string(const char *sname
, const char *dname
,
1523 const unsigned char *str
, size_t len
,
1524 unsigned char *caller_dst_buf
, size_t caller_dst_bufsize
,
1525 size_t *dst_len_ptr
)
1527 unsigned char *dst_str
;
1532 rb_econv_result_t res
;
1534 const unsigned char *sp
;
1538 dst_bufsize
= caller_dst_bufsize
;
1544 ec
= rb_econv_open(sname
, dname
, 0);
1548 dst_str
= caller_dst_buf
;
1550 dst_str
= xmalloc(dst_bufsize
);
1553 dp
= dst_str
+dst_len
;
1554 res
= rb_econv_convert(ec
, &sp
, str
+len
, &dp
, dst_str
+dst_bufsize
, 0);
1555 dst_len
= dp
- dst_str
;
1556 while (res
== econv_destination_buffer_full
) {
1557 if (SIZE_MAX
/2 < dst_bufsize
) {
1561 if (dst_str
== caller_dst_buf
) {
1563 tmp
= xmalloc(dst_bufsize
);
1564 memcpy(tmp
, dst_str
, dst_bufsize
/2);
1568 dst_str
= xrealloc(dst_str
, dst_bufsize
);
1570 dp
= dst_str
+dst_len
;
1571 res
= rb_econv_convert(ec
, &sp
, str
+len
, &dp
, dst_str
+dst_bufsize
, 0);
1572 dst_len
= dp
- dst_str
;
1574 if (res
!= econv_finished
) {
1578 *dst_len_ptr
= dst_len
;
1582 if (dst_str
!= caller_dst_buf
)
/* Returns 0 on success and -1 on failure. */
1590 rb_econv_insert_output(rb_econv_t
*ec
,
1591 const unsigned char *str
, size_t len
, const char *str_encoding
)
1593 const char *insert_encoding
= rb_econv_encoding_to_insert_output(ec
);
1594 unsigned char insert_buf
[4096];
1595 const unsigned char *insert_str
= NULL
;
1598 int last_trans_index
;
1601 unsigned char **buf_start_p
;
1602 unsigned char **data_start_p
;
1603 unsigned char **data_end_p
;
1604 unsigned char **buf_end_p
;
1613 if (encoding_equal(insert_encoding
, str_encoding
)) {
1618 insert_str
= allocate_converted_string(str_encoding
, insert_encoding
,
1619 str
, len
, insert_buf
, sizeof(insert_buf
), &insert_len
);
1620 if (insert_str
== NULL
)
1626 last_trans_index
= ec
->num_trans
-1;
1627 if (ec
->num_trans
== 0) {
1629 buf_start_p
= &ec
->in_buf_start
;
1630 data_start_p
= &ec
->in_data_start
;
1631 data_end_p
= &ec
->in_data_end
;
1632 buf_end_p
= &ec
->in_buf_end
;
1634 else if (ec
->elems
[last_trans_index
].tc
->transcoder
->asciicompat_type
== asciicompat_encoder
) {
1635 tc
= ec
->elems
[last_trans_index
].tc
;
1636 need
+= tc
->readagain_len
;
1637 if (need
< insert_len
)
1639 if (last_trans_index
== 0) {
1640 buf_start_p
= &ec
->in_buf_start
;
1641 data_start_p
= &ec
->in_data_start
;
1642 data_end_p
= &ec
->in_data_end
;
1643 buf_end_p
= &ec
->in_buf_end
;
1646 rb_econv_elem_t
*ee
= &ec
->elems
[last_trans_index
-1];
1647 buf_start_p
= &ee
->out_buf_start
;
1648 data_start_p
= &ee
->out_data_start
;
1649 data_end_p
= &ee
->out_data_end
;
1650 buf_end_p
= &ee
->out_buf_end
;
1654 rb_econv_elem_t
*ee
= &ec
->elems
[last_trans_index
];
1655 buf_start_p
= &ee
->out_buf_start
;
1656 data_start_p
= &ee
->out_data_start
;
1657 data_end_p
= &ee
->out_data_end
;
1658 buf_end_p
= &ee
->out_buf_end
;
1659 tc
= ec
->elems
[last_trans_index
].tc
;
1662 if (*buf_start_p
== NULL
) {
1663 unsigned char *buf
= xmalloc(need
);
1665 *data_start_p
= buf
;
1667 *buf_end_p
= buf
+need
;
1669 else if ((size_t)(*buf_end_p
- *data_end_p
) < need
) {
1670 MEMMOVE(*buf_start_p
, *data_start_p
, unsigned char, *data_end_p
- *data_start_p
);
1671 *data_end_p
= *buf_start_p
+ (*data_end_p
- *data_start_p
);
1672 *data_start_p
= *buf_start_p
;
1673 if ((size_t)(*buf_end_p
- *data_end_p
) < need
) {
1675 size_t s
= (*data_end_p
- *buf_start_p
) + need
;
1678 buf
= xrealloc(*buf_start_p
, s
);
1679 *data_start_p
= buf
;
1680 *data_end_p
= buf
+ (*data_end_p
- *buf_start_p
);
1682 *buf_end_p
= buf
+ s
;
1686 memcpy(*data_end_p
, insert_str
, insert_len
);
1687 *data_end_p
+= insert_len
;
1688 if (tc
&& tc
->transcoder
->asciicompat_type
== asciicompat_encoder
) {
1689 memcpy(*data_end_p
, TRANSCODING_READBUF(tc
)+tc
->recognized_len
, tc
->readagain_len
);
1690 *data_end_p
+= tc
->readagain_len
;
1691 tc
->readagain_len
= 0;
1694 if (insert_str
!= str
&& insert_str
!= insert_buf
)
1695 xfree((void*)insert_str
);
1699 if (insert_str
!= str
&& insert_str
!= insert_buf
)
1700 xfree((void*)insert_str
);
1705 rb_econv_close(rb_econv_t
*ec
)
1709 if (ec
->replacement_allocated
) {
1710 xfree((void *)ec
->replacement_str
);
1712 for (i
= 0; i
< ec
->num_trans
; i
++) {
1713 rb_transcoding_close(ec
->elems
[i
].tc
);
1714 if (ec
->elems
[i
].out_buf_start
)
1715 xfree(ec
->elems
[i
].out_buf_start
);
1717 xfree(ec
->in_buf_start
);
1723 rb_econv_memsize(rb_econv_t
*ec
)
1725 size_t size
= sizeof(rb_econv_t
);
1728 if (ec
->replacement_allocated
) {
1729 size
+= ec
->replacement_len
;
1731 for (i
= 0; i
< ec
->num_trans
; i
++) {
1732 size
+= rb_transcoding_memsize(ec
->elems
[i
].tc
);
1734 if (ec
->elems
[i
].out_buf_start
) {
1735 size
+= ec
->elems
[i
].out_buf_end
- ec
->elems
[i
].out_buf_start
;
1738 size
+= ec
->in_buf_end
- ec
->in_buf_start
;
1739 size
+= sizeof(rb_econv_elem_t
) * ec
->num_allocated
;
1745 rb_econv_putbackable(rb_econv_t
*ec
)
1747 if (ec
->num_trans
== 0)
1749 #if SIZEOF_SIZE_T > SIZEOF_INT
1750 if (ec
->elems
[0].tc
->readagain_len
> INT_MAX
) return INT_MAX
;
1752 return (int)ec
->elems
[0].tc
->readagain_len
;
1756 rb_econv_putback(rb_econv_t
*ec
, unsigned char *p
, int n
)
1759 if (ec
->num_trans
== 0 || n
== 0)
1761 tc
= ec
->elems
[0].tc
;
1762 memcpy(p
, TRANSCODING_READBUF(tc
) + tc
->recognized_len
+ tc
->readagain_len
- n
, n
);
1763 tc
->readagain_len
-= n
;
/*
 * Argument/result carrier passed (cast to st_data_t) to
 * asciicompat_encoding_i while iterating a transcoder table.
 */
struct asciicompat_encoding_t {
    /* Result slot: set to the destination encoding name of a transcoder
     * whose asciicompat_type is asciicompat_decoder; the caller
     * initializes it to NULL before iterating. */
    const char *ascii_compat_name;
    /* Input: the encoding name the lookup was started with. */
    const char *ascii_incompat_name;
};
1772 asciicompat_encoding_i(st_data_t key
, st_data_t val
, st_data_t arg
)
1774 struct asciicompat_encoding_t
*data
= (struct asciicompat_encoding_t
*)arg
;
1775 transcoder_entry_t
*entry
= (transcoder_entry_t
*)val
;
1776 const rb_transcoder
*tr
;
1778 if (DECORATOR_P(entry
->sname
, entry
->dname
))
1780 tr
= load_transcoder_entry(entry
);
1781 if (tr
&& tr
->asciicompat_type
== asciicompat_decoder
) {
1782 data
->ascii_compat_name
= tr
->dst_encoding
;
1789 rb_econv_asciicompat_encoding(const char *ascii_incompat_name
)
1793 struct asciicompat_encoding_t data
;
1795 if (!st_lookup(transcoder_table
, (st_data_t
)ascii_incompat_name
, &v
))
1797 table2
= (st_table
*)v
;
1801 * There is at most one transcoder for
1802 * converting from ASCII incompatible encoding.
1804 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1806 if (table2
->num_entries
!= 1)
1809 data
.ascii_incompat_name
= ascii_incompat_name
;
1810 data
.ascii_compat_name
= NULL
;
1811 st_foreach(table2
, asciicompat_encoding_i
, (st_data_t
)&data
);
1812 return data
.ascii_compat_name
;
1816 rb_econv_append(rb_econv_t
*ec
, const char *ss
, long len
, VALUE dst
, int flags
)
1818 unsigned const char *sp
, *se
;
1819 unsigned char *ds
, *dp
, *de
;
1820 rb_econv_result_t res
;
1824 dst
= rb_str_buf_new(len
);
1825 if (ec
->destination_encoding
)
1826 rb_enc_associate(dst
, ec
->destination_encoding
);
1830 max_output
= ec
->last_tc
->transcoder
->max_output
;
1835 long dlen
= RSTRING_LEN(dst
);
1836 if (rb_str_capacity(dst
) - dlen
< (size_t)len
+ max_output
) {
1837 unsigned long new_capa
= (unsigned long)dlen
+ len
+ max_output
;
1838 if (LONG_MAX
< new_capa
)
1839 rb_raise(rb_eArgError
, "too long string");
1840 rb_str_resize(dst
, new_capa
);
1841 rb_str_set_len(dst
, dlen
);
1843 sp
= (const unsigned char *)ss
;
1845 ds
= (unsigned char *)RSTRING_PTR(dst
);
1846 de
= ds
+ rb_str_capacity(dst
);
1848 res
= rb_econv_convert(ec
, &sp
, se
, &dp
, de
, flags
);
1849 len
-= (const char *)sp
- ss
;
1850 ss
= (const char *)sp
;
1851 rb_str_set_len(dst
, dlen
+ (dp
- ds
));
1852 rb_econv_check_error(ec
);
1853 } while (res
== econv_destination_buffer_full
);
1859 rb_econv_substr_append(rb_econv_t
*ec
, VALUE src
, long off
, long len
, VALUE dst
, int flags
)
1861 src
= rb_str_new_frozen(src
);
1862 dst
= rb_econv_append(ec
, RSTRING_PTR(src
) + off
, len
, dst
, flags
);
1868 rb_econv_str_append(rb_econv_t
*ec
, VALUE src
, VALUE dst
, int flags
)
1870 return rb_econv_substr_append(ec
, src
, 0, RSTRING_LEN(src
), dst
, flags
);
1874 rb_econv_substr_convert(rb_econv_t
*ec
, VALUE src
, long byteoff
, long bytesize
, int flags
)
1876 return rb_econv_substr_append(ec
, src
, byteoff
, bytesize
, Qnil
, flags
);
1880 rb_econv_str_convert(rb_econv_t
*ec
, VALUE src
, int flags
)
1882 return rb_econv_substr_append(ec
, src
, 0, RSTRING_LEN(src
), Qnil
, flags
);
1886 rb_econv_add_converter(rb_econv_t
*ec
, const char *sname
, const char *dname
, int n
)
1888 transcoder_entry_t
*entry
;
1889 const rb_transcoder
*tr
;
1891 if (ec
->started
!= 0)
1894 entry
= get_transcoder_entry(sname
, dname
);
1898 tr
= load_transcoder_entry(entry
);
1901 return rb_econv_add_transcoder_at(ec
, tr
, n
);
1905 rb_econv_decorate_at(rb_econv_t
*ec
, const char *decorator_name
, int n
)
1907 return rb_econv_add_converter(ec
, "", decorator_name
, n
);
1911 rb_econv_decorate_at_first(rb_econv_t
*ec
, const char *decorator_name
)
1913 const rb_transcoder
*tr
;
1915 if (ec
->num_trans
== 0)
1916 return rb_econv_decorate_at(ec
, decorator_name
, 0);
1918 tr
= ec
->elems
[0].tc
->transcoder
;
1920 if (!DECORATOR_P(tr
->src_encoding
, tr
->dst_encoding
) &&
1921 tr
->asciicompat_type
== asciicompat_decoder
)
1922 return rb_econv_decorate_at(ec
, decorator_name
, 1);
1924 return rb_econv_decorate_at(ec
, decorator_name
, 0);
1928 rb_econv_decorate_at_last(rb_econv_t
*ec
, const char *decorator_name
)
1930 const rb_transcoder
*tr
;
1932 if (ec
->num_trans
== 0)
1933 return rb_econv_decorate_at(ec
, decorator_name
, 0);
1935 tr
= ec
->elems
[ec
->num_trans
-1].tc
->transcoder
;
1937 if (!DECORATOR_P(tr
->src_encoding
, tr
->dst_encoding
) &&
1938 tr
->asciicompat_type
== asciicompat_encoder
)
1939 return rb_econv_decorate_at(ec
, decorator_name
, ec
->num_trans
-1);
1941 return rb_econv_decorate_at(ec
, decorator_name
, ec
->num_trans
);
1945 rb_econv_binmode(rb_econv_t
*ec
)
1947 const char *dname
= 0;
1949 switch (ec
->flags
& ECONV_NEWLINE_DECORATOR_MASK
) {
1950 case ECONV_UNIVERSAL_NEWLINE_DECORATOR
:
1951 dname
= "universal_newline";
1953 case ECONV_CRLF_NEWLINE_DECORATOR
:
1954 dname
= "crlf_newline";
1956 case ECONV_CR_NEWLINE_DECORATOR
:
1957 dname
= "cr_newline";
1962 const rb_transcoder
*transcoder
= get_transcoder_entry("", dname
)->transcoder
;
1963 int num_trans
= ec
->num_trans
;
1966 for (i
=0; i
< num_trans
; i
++) {
1967 if (transcoder
== ec
->elems
[i
].tc
->transcoder
) {
1968 rb_transcoding_close(ec
->elems
[i
].tc
);
1969 xfree(ec
->elems
[i
].out_buf_start
);
1973 ec
->elems
[j
++] = ec
->elems
[i
];
1977 ec
->flags
&= ~ECONV_NEWLINE_DECORATOR_MASK
;
1981 econv_description(const char *sname
, const char *dname
, int ecflags
, VALUE mesg
)
1983 int has_description
= 0;
1986 mesg
= rb_str_new(NULL
, 0);
1988 if (*sname
!= '\0' || *dname
!= '\0') {
1990 rb_str_cat2(mesg
, dname
);
1991 else if (*dname
== '\0')
1992 rb_str_cat2(mesg
, sname
);
1994 rb_str_catf(mesg
, "%s to %s", sname
, dname
);
1995 has_description
= 1;
1998 if (ecflags
& (ECONV_NEWLINE_DECORATOR_MASK
|
1999 ECONV_XML_TEXT_DECORATOR
|
2000 ECONV_XML_ATTR_CONTENT_DECORATOR
|
2001 ECONV_XML_ATTR_QUOTE_DECORATOR
)) {
2002 const char *pre
= "";
2003 if (has_description
)
2004 rb_str_cat2(mesg
, " with ");
2005 if (ecflags
& ECONV_UNIVERSAL_NEWLINE_DECORATOR
) {
2006 rb_str_cat2(mesg
, pre
); pre
= ",";
2007 rb_str_cat2(mesg
, "universal_newline");
2009 if (ecflags
& ECONV_CRLF_NEWLINE_DECORATOR
) {
2010 rb_str_cat2(mesg
, pre
); pre
= ",";
2011 rb_str_cat2(mesg
, "crlf_newline");
2013 if (ecflags
& ECONV_CR_NEWLINE_DECORATOR
) {
2014 rb_str_cat2(mesg
, pre
); pre
= ",";
2015 rb_str_cat2(mesg
, "cr_newline");
2017 if (ecflags
& ECONV_XML_TEXT_DECORATOR
) {
2018 rb_str_cat2(mesg
, pre
); pre
= ",";
2019 rb_str_cat2(mesg
, "xml_text");
2021 if (ecflags
& ECONV_XML_ATTR_CONTENT_DECORATOR
) {
2022 rb_str_cat2(mesg
, pre
); pre
= ",";
2023 rb_str_cat2(mesg
, "xml_attr_content");
2025 if (ecflags
& ECONV_XML_ATTR_QUOTE_DECORATOR
) {
2026 rb_str_cat2(mesg
, pre
); pre
= ",";
2027 rb_str_cat2(mesg
, "xml_attr_quote");
2029 has_description
= 1;
2031 if (!has_description
) {
2032 rb_str_cat2(mesg
, "no-conversion");
2039 rb_econv_open_exc(const char *sname
, const char *dname
, int ecflags
)
2042 mesg
= rb_str_new_cstr("code converter not found (");
2043 econv_description(sname
, dname
, ecflags
, mesg
);
2044 rb_str_cat2(mesg
, ")");
2045 exc
= rb_exc_new3(rb_eConverterNotFoundError
, mesg
);
2050 make_econv_exception(rb_econv_t
*ec
)
2053 if (ec
->last_error
.result
== econv_invalid_byte_sequence
||
2054 ec
->last_error
.result
== econv_incomplete_input
) {
2055 const char *err
= (const char *)ec
->last_error
.error_bytes_start
;
2056 size_t error_len
= ec
->last_error
.error_bytes_len
;
2057 VALUE bytes
= rb_str_new(err
, error_len
);
2058 VALUE dumped
= rb_str_dump(bytes
);
2059 size_t readagain_len
= ec
->last_error
.readagain_len
;
2060 VALUE bytes2
= Qnil
;
2062 if (ec
->last_error
.result
== econv_incomplete_input
) {
2063 mesg
= rb_sprintf("incomplete %s on %s",
2064 StringValueCStr(dumped
),
2065 ec
->last_error
.source_encoding
);
2067 else if (readagain_len
) {
2068 bytes2
= rb_str_new(err
+error_len
, readagain_len
);
2069 dumped2
= rb_str_dump(bytes2
);
2070 mesg
= rb_sprintf("%s followed by %s on %s",
2071 StringValueCStr(dumped
),
2072 StringValueCStr(dumped2
),
2073 ec
->last_error
.source_encoding
);
2076 mesg
= rb_sprintf("%s on %s",
2077 StringValueCStr(dumped
),
2078 ec
->last_error
.source_encoding
);
2081 exc
= rb_exc_new3(rb_eInvalidByteSequenceError
, mesg
);
2082 rb_ivar_set(exc
, id_error_bytes
, bytes
);
2083 rb_ivar_set(exc
, id_readagain_bytes
, bytes2
);
2084 rb_ivar_set(exc
, id_incomplete_input
, RBOOL(ec
->last_error
.result
== econv_incomplete_input
));
2087 if (ec
->last_error
.result
== econv_undefined_conversion
) {
2088 VALUE bytes
= rb_str_new((const char *)ec
->last_error
.error_bytes_start
,
2089 ec
->last_error
.error_bytes_len
);
2090 VALUE dumped
= Qnil
;
2092 if (strcmp(ec
->last_error
.source_encoding
, "UTF-8") == 0) {
2093 rb_encoding
*utf8
= rb_utf8_encoding();
2094 const char *start
, *end
;
2096 start
= (const char *)ec
->last_error
.error_bytes_start
;
2097 end
= start
+ ec
->last_error
.error_bytes_len
;
2098 n
= rb_enc_precise_mbclen(start
, end
, utf8
);
2099 if (MBCLEN_CHARFOUND_P(n
) &&
2100 (size_t)MBCLEN_CHARFOUND_LEN(n
) == ec
->last_error
.error_bytes_len
) {
2101 unsigned int cc
= rb_enc_mbc_to_codepoint(start
, end
, utf8
);
2102 dumped
= rb_sprintf("U+%04X", cc
);
2106 dumped
= rb_str_dump(bytes
);
2107 if (strcmp(ec
->last_error
.source_encoding
,
2108 ec
->source_encoding_name
) == 0 &&
2109 strcmp(ec
->last_error
.destination_encoding
,
2110 ec
->destination_encoding_name
) == 0) {
2111 mesg
= rb_sprintf("%s from %s to %s",
2112 StringValueCStr(dumped
),
2113 ec
->last_error
.source_encoding
,
2114 ec
->last_error
.destination_encoding
);
2118 mesg
= rb_sprintf("%s to %s in conversion from %s",
2119 StringValueCStr(dumped
),
2120 ec
->last_error
.destination_encoding
,
2121 ec
->source_encoding_name
);
2122 for (i
= 0; i
< ec
->num_trans
; i
++) {
2123 const rb_transcoder
*tr
= ec
->elems
[i
].tc
->transcoder
;
2124 if (!DECORATOR_P(tr
->src_encoding
, tr
->dst_encoding
))
2125 rb_str_catf(mesg
, " to %s",
2126 ec
->elems
[i
].tc
->transcoder
->dst_encoding
);
2129 exc
= rb_exc_new3(rb_eUndefinedConversionError
, mesg
);
2130 idx
= rb_enc_find_index(ec
->last_error
.source_encoding
);
2132 rb_enc_associate_index(bytes
, idx
);
2133 rb_ivar_set(exc
, id_error_char
, bytes
);
2139 rb_ivar_set(exc
, id_source_encoding_name
, rb_str_new2(ec
->last_error
.source_encoding
));
2140 rb_ivar_set(exc
, id_destination_encoding_name
, rb_str_new2(ec
->last_error
.destination_encoding
));
2141 int idx
= rb_enc_find_index(ec
->last_error
.source_encoding
);
2143 rb_ivar_set(exc
, id_source_encoding
, rb_enc_from_encoding(rb_enc_from_index(idx
)));
2144 idx
= rb_enc_find_index(ec
->last_error
.destination_encoding
);
2146 rb_ivar_set(exc
, id_destination_encoding
, rb_enc_from_encoding(rb_enc_from_index(idx
)));
2153 unsigned char *(*resize_destination
)(VALUE
, size_t, size_t),
2155 unsigned char **out_start_ptr
,
2156 unsigned char **out_pos
,
2157 unsigned char **out_stop_ptr
)
2159 size_t len
= (*out_pos
- *out_start_ptr
);
2160 size_t new_len
= (len
+ max_output
) * 2;
2161 *out_start_ptr
= resize_destination(destination
, len
, new_len
);
2162 *out_pos
= *out_start_ptr
+ len
;
2163 *out_stop_ptr
= *out_start_ptr
+ new_len
;
2167 make_replacement(rb_econv_t
*ec
)
2170 const rb_transcoder
*tr
;
2171 const unsigned char *replacement
;
2172 const char *repl_enc
;
2173 const char *ins_enc
;
2176 if (ec
->replacement_str
)
2179 ins_enc
= rb_econv_encoding_to_insert_output(ec
);
2183 tr
= tc
->transcoder
;
2184 rb_enc_find(tr
->dst_encoding
);
2185 replacement
= (const unsigned char *)get_replacement_character(ins_enc
, &len
, &repl_enc
);
2188 replacement
= (unsigned char *)"?";
2193 ec
->replacement_str
= replacement
;
2194 ec
->replacement_len
= len
;
2195 ec
->replacement_enc
= repl_enc
;
2196 ec
->replacement_allocated
= 0;
2201 rb_econv_set_replacement(rb_econv_t
*ec
,
2202 const unsigned char *str
, size_t len
, const char *encname
)
2204 unsigned char *str2
;
2206 const char *encname2
;
2208 encname2
= rb_econv_encoding_to_insert_output(ec
);
2210 if (!*encname2
|| encoding_equal(encname
, encname2
)) {
2211 str2
= xmalloc(len
);
2212 MEMCPY(str2
, str
, unsigned char, len
); /* xxx: str may be invalid */
2217 str2
= allocate_converted_string(encname
, encname2
, str
, len
, NULL
, 0, &len2
);
2222 if (ec
->replacement_allocated
) {
2223 xfree((void *)ec
->replacement_str
);
2225 ec
->replacement_allocated
= 1;
2226 ec
->replacement_str
= str2
;
2227 ec
->replacement_len
= len2
;
2228 ec
->replacement_enc
= encname2
;
2233 output_replacement_character(rb_econv_t
*ec
)
2237 if (make_replacement(ec
) == -1)
2240 ret
= rb_econv_insert_output(ec
, ec
->replacement_str
, ec
->replacement_len
, ec
->replacement_enc
);
2248 #define hash_fallback rb_hash_aref
2251 proc_fallback(VALUE fallback
, VALUE c
)
2253 return rb_proc_call(fallback
, rb_ary_new4(1, &c
));
2257 method_fallback(VALUE fallback
, VALUE c
)
2259 return rb_method_call(1, &c
, fallback
);
2263 aref_fallback(VALUE fallback
, VALUE c
)
2265 return rb_funcallv_public(fallback
, idAREF
, 1, &c
);
2269 transcode_loop(const unsigned char **in_pos
, unsigned char **out_pos
,
2270 const unsigned char *in_stop
, unsigned char *out_stop
,
2272 unsigned char *(*resize_destination
)(VALUE
, size_t, size_t),
2273 const char *src_encoding
,
2274 const char *dst_encoding
,
2279 rb_transcoding
*last_tc
;
2280 rb_econv_result_t ret
;
2281 unsigned char *out_start
= *out_pos
;
2284 VALUE fallback
= Qnil
;
2285 VALUE (*fallback_func
)(VALUE
, VALUE
) = 0;
2287 ec
= rb_econv_open_opts(src_encoding
, dst_encoding
, ecflags
, ecopts
);
2289 rb_exc_raise(rb_econv_open_exc(src_encoding
, dst_encoding
, ecflags
));
2291 if (!NIL_P(ecopts
) && RB_TYPE_P(ecopts
, T_HASH
)) {
2292 fallback
= rb_hash_aref(ecopts
, sym_fallback
);
2293 if (RB_TYPE_P(fallback
, T_HASH
)) {
2294 fallback_func
= hash_fallback
;
2296 else if (rb_obj_is_proc(fallback
)) {
2297 fallback_func
= proc_fallback
;
2299 else if (rb_obj_is_method(fallback
)) {
2300 fallback_func
= method_fallback
;
2303 fallback_func
= aref_fallback
;
2306 last_tc
= ec
->last_tc
;
2307 max_output
= last_tc
? last_tc
->transcoder
->max_output
: 1;
2310 ret
= rb_econv_convert(ec
, in_pos
, in_stop
, out_pos
, out_stop
, 0);
2312 if (!NIL_P(fallback
) && ret
== econv_undefined_conversion
) {
2313 VALUE rep
= rb_enc_str_new(
2314 (const char *)ec
->last_error
.error_bytes_start
,
2315 ec
->last_error
.error_bytes_len
,
2316 rb_enc_find(ec
->last_error
.source_encoding
));
2317 rep
= (*fallback_func
)(fallback
, rep
);
2318 if (rep
!= Qundef
&& !NIL_P(rep
)) {
2320 ret
= rb_econv_insert_output(ec
, (const unsigned char *)RSTRING_PTR(rep
),
2321 RSTRING_LEN(rep
), rb_enc_name(rb_enc_get(rep
)));
2322 if ((int)ret
== -1) {
2323 rb_raise(rb_eArgError
, "too big fallback string");
2329 if (ret
== econv_invalid_byte_sequence
||
2330 ret
== econv_incomplete_input
||
2331 ret
== econv_undefined_conversion
) {
2332 exc
= make_econv_exception(ec
);
2337 if (ret
== econv_destination_buffer_full
) {
2338 more_output_buffer(destination
, resize_destination
, max_output
, &out_start
, out_pos
, &out_stop
);
2346 /* sample transcode_loop implementation in byte-by-byte stream style */
2348 transcode_loop(const unsigned char **in_pos
, unsigned char **out_pos
,
2349 const unsigned char *in_stop
, unsigned char *out_stop
,
2351 unsigned char *(*resize_destination
)(VALUE
, size_t, size_t),
2352 const char *src_encoding
,
2353 const char *dst_encoding
,
2358 rb_transcoding
*last_tc
;
2359 rb_econv_result_t ret
;
2360 unsigned char *out_start
= *out_pos
;
2361 const unsigned char *ptr
;
2365 ec
= rb_econv_open_opts(src_encoding
, dst_encoding
, ecflags
, ecopts
);
2367 rb_exc_raise(rb_econv_open_exc(src_encoding
, dst_encoding
, ecflags
));
2369 last_tc
= ec
->last_tc
;
2370 max_output
= last_tc
? last_tc
->transcoder
->max_output
: 1;
2372 ret
= econv_source_buffer_empty
;
2374 while (ret
!= econv_finished
) {
2375 unsigned char input_byte
;
2376 const unsigned char *p
= &input_byte
;
2378 if (ret
== econv_source_buffer_empty
) {
2379 if (ptr
< in_stop
) {
2381 ret
= rb_econv_convert(ec
, &p
, p
+1, out_pos
, out_stop
, ECONV_PARTIAL_INPUT
);
2384 ret
= rb_econv_convert(ec
, NULL
, NULL
, out_pos
, out_stop
, 0);
2388 ret
= rb_econv_convert(ec
, NULL
, NULL
, out_pos
, out_stop
, ECONV_PARTIAL_INPUT
);
2390 if (&input_byte
!= p
)
2391 ptr
+= p
- &input_byte
;
2393 case econv_invalid_byte_sequence
:
2394 case econv_incomplete_input
:
2395 case econv_undefined_conversion
:
2396 exc
= make_econv_exception(ec
);
2401 case econv_destination_buffer_full
:
2402 more_output_buffer(destination
, resize_destination
, max_output
, &out_start
, out_pos
, &out_stop
);
2405 case econv_source_buffer_empty
:
2408 case econv_finished
:
2420 * String-specific code
2423 static unsigned char *
2424 str_transcoding_resize(VALUE destination
, size_t len
, size_t new_len
)
2426 rb_str_resize(destination
, new_len
);
2427 return (unsigned char *)RSTRING_PTR(destination
);
2431 econv_opts(VALUE opt
, int ecflags
)
2434 int newlineflag
= 0;
2436 v
= rb_hash_aref(opt
, sym_invalid
);
2439 else if (v
==sym_replace
) {
2440 ecflags
|= ECONV_INVALID_REPLACE
;
2443 rb_raise(rb_eArgError
, "unknown value for invalid character option");
2446 v
= rb_hash_aref(opt
, sym_undef
);
2449 else if (v
==sym_replace
) {
2450 ecflags
|= ECONV_UNDEF_REPLACE
;
2453 rb_raise(rb_eArgError
, "unknown value for undefined character option");
2456 v
= rb_hash_aref(opt
, sym_replace
);
2457 if (!NIL_P(v
) && !(ecflags
& ECONV_INVALID_REPLACE
)) {
2458 ecflags
|= ECONV_UNDEF_REPLACE
;
2461 v
= rb_hash_aref(opt
, sym_xml
);
2464 ecflags
|= ECONV_XML_TEXT_DECORATOR
|ECONV_UNDEF_HEX_CHARREF
;
2466 else if (v
==sym_attr
) {
2467 ecflags
|= ECONV_XML_ATTR_CONTENT_DECORATOR
|ECONV_XML_ATTR_QUOTE_DECORATOR
|ECONV_UNDEF_HEX_CHARREF
;
2469 else if (SYMBOL_P(v
)) {
2470 rb_raise(rb_eArgError
, "unexpected value for xml option: %"PRIsVALUE
, rb_sym2str(v
));
2473 rb_raise(rb_eArgError
, "unexpected value for xml option");
2477 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2478 v
= rb_hash_aref(opt
, sym_newline
);
2481 ecflags
&= ~ECONV_NEWLINE_DECORATOR_MASK
;
2482 if (v
== sym_universal
) {
2483 ecflags
|= ECONV_UNIVERSAL_NEWLINE_DECORATOR
;
2485 else if (v
== sym_crlf
) {
2486 ecflags
|= ECONV_CRLF_NEWLINE_DECORATOR
;
2488 else if (v
== sym_cr
) {
2489 ecflags
|= ECONV_CR_NEWLINE_DECORATOR
;
2491 else if (v
== sym_lf
) {
2492 /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2494 else if (SYMBOL_P(v
)) {
2495 rb_raise(rb_eArgError
, "unexpected value for newline option: %"PRIsVALUE
,
2499 rb_raise(rb_eArgError
, "unexpected value for newline option");
2506 v
= rb_hash_aref(opt
, sym_universal_newline
);
2508 setflags
|= ECONV_UNIVERSAL_NEWLINE_DECORATOR
;
2509 newlineflag
|= !NIL_P(v
);
2511 v
= rb_hash_aref(opt
, sym_crlf_newline
);
2513 setflags
|= ECONV_CRLF_NEWLINE_DECORATOR
;
2514 newlineflag
|= !NIL_P(v
);
2516 v
= rb_hash_aref(opt
, sym_cr_newline
);
2518 setflags
|= ECONV_CR_NEWLINE_DECORATOR
;
2519 newlineflag
|= !NIL_P(v
);
2521 switch (newlineflag
) {
2523 ecflags
&= ~ECONV_NEWLINE_DECORATOR_MASK
;
2524 ecflags
|= setflags
;
2528 rb_warning(":newline option precedes other newline options");
2537 rb_econv_prepare_options(VALUE opthash
, VALUE
*opts
, int ecflags
)
2539 VALUE newhash
= Qnil
;
2542 if (NIL_P(opthash
)) {
2546 ecflags
= econv_opts(opthash
, ecflags
);
2548 v
= rb_hash_aref(opthash
, sym_replace
);
2551 if (rb_enc_str_coderange(v
) == ENC_CODERANGE_BROKEN
) {
2552 VALUE dumped
= rb_str_dump(v
);
2553 rb_raise(rb_eArgError
, "replacement string is broken: %s as %s",
2554 StringValueCStr(dumped
),
2555 rb_enc_name(rb_enc_get(v
)));
2557 v
= rb_str_new_frozen(v
);
2558 newhash
= rb_hash_new();
2559 rb_hash_aset(newhash
, sym_replace
, v
);
2562 v
= rb_hash_aref(opthash
, sym_fallback
);
2564 VALUE h
= rb_check_hash_type(v
);
2566 ? (rb_obj_is_proc(v
) || rb_obj_is_method(v
) || rb_respond_to(v
, idAREF
))
2569 newhash
= rb_hash_new();
2570 rb_hash_aset(newhash
, sym_fallback
, v
);
2574 if (!NIL_P(newhash
))
2575 rb_hash_freeze(newhash
);
2582 rb_econv_prepare_opts(VALUE opthash
, VALUE
*opts
)
2584 return rb_econv_prepare_options(opthash
, opts
, 0);
2588 rb_econv_open_opts(const char *source_encoding
, const char *destination_encoding
, int ecflags
, VALUE opthash
)
2593 if (NIL_P(opthash
)) {
2597 if (!RB_TYPE_P(opthash
, T_HASH
) || !OBJ_FROZEN(opthash
))
2598 rb_bug("rb_econv_open_opts called with invalid opthash");
2599 replacement
= rb_hash_aref(opthash
, sym_replace
);
2602 ec
= rb_econv_open(source_encoding
, destination_encoding
, ecflags
);
2606 if (!NIL_P(replacement
)) {
2608 rb_encoding
*enc
= rb_enc_get(replacement
);
2610 ret
= rb_econv_set_replacement(ec
,
2611 (const unsigned char *)RSTRING_PTR(replacement
),
2612 RSTRING_LEN(replacement
),
2623 enc_arg(VALUE
*arg
, const char **name_p
, rb_encoding
**enc_p
)
2630 if (((encidx
= rb_to_encoding_index(encval
= *arg
)) < 0) ||
2631 !(enc
= rb_enc_from_index(encidx
))) {
2634 n
= StringValueCStr(*arg
);
2637 n
= rb_enc_name(enc
);
2647 str_transcode_enc_args(VALUE str
, VALUE
*arg1
, VALUE
*arg2
,
2648 const char **sname_p
, rb_encoding
**senc_p
,
2649 const char **dname_p
, rb_encoding
**denc_p
)
2651 rb_encoding
*senc
, *denc
;
2652 const char *sname
, *dname
;
2653 int sencidx
, dencidx
;
2655 dencidx
= enc_arg(arg1
, &dname
, &denc
);
2658 sencidx
= rb_enc_get_index(str
);
2659 senc
= rb_enc_from_index(sencidx
);
2660 sname
= rb_enc_name(senc
);
2663 sencidx
= enc_arg(arg2
, &sname
, &senc
);
/*
 * str_transcode0: core of String#encode / String#encode!.
 *
 * argc/argv hold zero to two encoding arguments (enforced by
 * rb_check_arity), self points at the string VALUE, ecflags/ecopts are the
 * already-parsed conversion options.
 *
 * Visible return values: -1 on several early-exit paths (including
 * "NIL_P(arg2) ? -1 : dencidx"), otherwise an encoding index (dencidx).
 *
 * NOTE(review): the embedded line numbers skip throughout, so several
 * declarations (str, dest, arg1, arg2, slen, blen, dencidx, rep), the
 * enclosing conditions of some statements and the final return are not
 * shown in this listing.
 */
2674 str_transcode0(int argc
, VALUE
*argv
, VALUE
*self
, int ecflags
, VALUE ecopts
)
2680 unsigned char *buf
, *bp
, *sp
;
2681 const unsigned char *fromp
;
2682 rb_encoding
*senc
, *denc
;
2683 const char *sname
, *dname
;
2685 int explicitly_invalid_replace
= TRUE
;
2687 rb_check_arity(argc
, 0, 2);
/* Fallback destination: Encoding.default_internal, then str's own encoding
 * (the guarding conditions are not visible here). */
2690 arg1
= rb_enc_default_internal();
2692 if (!ecflags
) return -1;
2693 arg1
= rb_obj_encoding(str
);
/* No :invalid flag supplied: remember that replacement was not requested
 * explicitly, then turn on invalid/undef replacement. */
2695 if (!(ecflags
& ECONV_INVALID_MASK
)) {
2696 explicitly_invalid_replace
= FALSE
;
2698 ecflags
|= ECONV_INVALID_REPLACE
| ECONV_UNDEF_REPLACE
;
2703 arg2
= argc
<=1 ? Qnil
: argv
[1];
2704 dencidx
= str_transcode_enc_args(str
, &arg1
, &arg2
, &sname
, &senc
, &dname
, &denc
);
/* Shortcuts that apply only when no newline/XML decorator is requested. */
2706 if ((ecflags
& (ECONV_NEWLINE_DECORATOR_MASK
|
2707 ECONV_XML_TEXT_DECORATOR
|
2708 ECONV_XML_ATTR_CONTENT_DECORATOR
|
2709 ECONV_XML_ATTR_QUOTE_DECORATOR
)) == 0) {
/* Same source and destination encoding: scrub instead of transcoding when
 * invalid-byte replacement was explicitly requested. */
2710 if (senc
&& senc
== denc
) {
2711 if ((ecflags
& ECONV_INVALID_MASK
) && explicitly_invalid_replace
) {
2713 if (!NIL_P(ecopts
)) {
2714 rep
= rb_hash_aref(ecopts
, sym_replace
);
2716 dest
= rb_enc_str_scrub(senc
, str
, rep
);
2717 if (NIL_P(dest
)) dest
= str
;
2721 return NIL_P(arg2
) ? -1 : dencidx
;
2723 if (senc
&& denc
&& rb_enc_asciicompat(senc
) && rb_enc_asciicompat(denc
)) {
2724 if (rb_enc_str_coderange(str
) == ENC_CODERANGE_7BIT
) {
2728 if (encoding_equal(sname
, dname
)) {
2729 return NIL_P(arg2
) ? -1 : dencidx
;
/* Both encodings are ASCII-incompatible: str is first converted to UTF-8. */
2733 if (senc
&& denc
&& !rb_enc_asciicompat(senc
) && !rb_enc_asciicompat(denc
)) {
2734 rb_encoding
*utf8
= rb_utf8_encoding();
2735 str
= rb_str_conv_enc(str
, senc
, utf8
);
2739 if (encoding_equal(sname
, dname
)) {
/* General path: transcode into a temporary string sized to the source
 * length plus a 30-byte margin; str_transcoding_resize is handed to
 * transcode_loop for the destination buffer. */
2745 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
2746 slen
= RSTRING_LEN(str
);
2747 blen
= slen
+ 30; /* len + margin */
2748 dest
= rb_str_tmp_new(blen
);
2749 bp
= (unsigned char *)RSTRING_PTR(dest
);
2751 transcode_loop(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), dest
, str_transcoding_resize
, sname
, dname
, ecflags
, ecopts
);
/* The whole source must have been consumed. */
2752 if (fromp
!= sp
+slen
) {
2753 rb_raise(rb_eArgError
, "not fully converted, %"PRIdPTRDIFF
" bytes left", sp
+slen
-fromp
);
/* Re-read the buffer start before computing the produced length. */
2755 buf
= (unsigned char *)RSTRING_PTR(dest
);
2757 rb_str_set_len(dest
, bp
- buf
);
/* NOTE(review): presumably only reached when no destination encoding was
 * resolved earlier -- the guarding condition is not visible here. */
2761 dencidx
= rb_define_dummy_encoding(dname
);
/*
 * str_transcode: front end for str_transcode0().
 *
 * Scans up to two optional positional arguments plus a trailing keyword hash
 * ("02:"), turns the hash into ecflags/ecopts with rb_econv_prepare_opts()
 * and forwards everything to str_transcode0(), returning its result.
 *
 * NOTE(review): the declarations of opt/ecflags and any condition around the
 * rb_econv_prepare_opts() call are on lines this listing omits.
 */
2771 str_transcode(int argc
, VALUE
*argv
, VALUE
*self
)
2775 VALUE ecopts
= Qnil
;
2777 argc
= rb_scan_args(argc
, argv
, "02:", NULL
, NULL
, &opt
);
2779 ecflags
= rb_econv_prepare_opts(opt
, &ecopts
);
2781 return str_transcode0(argc
, argv
, self
, ecflags
, ecopts
);
/*
 * str_encode_associate: tag str with the encoding index encidx and record a
 * code range for it.
 *
 * For ASCII-compatible encodings the code range is obtained by scanning the
 * bytes with rb_str_coderange_scan_restartable(); the other visible
 * assignment sets it to ENC_CODERANGE_VALID.  The result is stored with
 * ENC_CODERANGE_SET().
 *
 * NOTE(review): the declaration of cr, the else keyword and the return
 * statement are on lines this listing omits.
 */
2785 str_encode_associate(VALUE str
, int encidx
)
2789 rb_enc_associate_index(str
, encidx
);
2791 /* a transcoded string is never broken. */
2792 if (rb_enc_asciicompat(rb_enc_from_index(encidx
))) {
2793 rb_str_coderange_scan_restartable(RSTRING_PTR(str
), RSTRING_END(str
), 0, &cr
);
2796 cr
= ENC_CODERANGE_VALID
;
2798 ENC_CODERANGE_SET(str
, cr
);
2804 * str.encode!(encoding, **options) -> str
2805 * str.encode!(dst_encoding, src_encoding, **options) -> str
2807 * The first form transcodes the contents of <i>str</i> from
2808 * str.encoding to +encoding+.
2809 * The second form transcodes the contents of <i>str</i> from
2810 * src_encoding to dst_encoding.
2811 * The +options+ keyword arguments give details for conversion. See String#encode
2813 * Returns the string even if no changes were made.
/*
 * str_encode_bang: implementation of String#encode!.
 *
 * Raises if str is frozen, then transcodes via str_transcode().  A negative
 * index means nothing had to be done and str is returned unchanged.  When the
 * transcoder handed back str itself, only the encoding is re-associated;
 * otherwise str takes over newstr's contents (rb_str_shared_replace) and is
 * finalized by str_encode_associate().
 *
 * NOTE(review): the declarations of newstr/encidx, the initialisation of
 * newstr and the return inside the "newstr == str" branch are not shown.
 */
2817 str_encode_bang(int argc
, VALUE
*argv
, VALUE str
)
2822 rb_check_frozen(str
);
2825 encidx
= str_transcode(argc
, argv
, &newstr
);
2827 if (encidx
< 0) return str
;
2828 if (newstr
== str
) {
2829 rb_enc_associate_index(str
, encidx
);
2832 rb_str_shared_replace(str
, newstr
);
2833 return str_encode_associate(str
, encidx
);
/* Forward declaration: encoded_dup() is used by str_encode() and
 * rb_str_encode() before its definition further down. */
2836 static VALUE
encoded_dup(VALUE newstr
, VALUE str
, int encidx
);
2840 * str.encode(encoding, **options) -> str
2841 * str.encode(dst_encoding, src_encoding, **options) -> str
2842 * str.encode(**options) -> str
2844 * The first form returns a copy of +str+ transcoded
2845 * to encoding +encoding+.
2846 * The second form returns a copy of +str+ transcoded
2847 * from src_encoding to dst_encoding.
2848 * The last form returns a copy of +str+ transcoded to
2849 * <tt>Encoding.default_internal</tt>.
2851 * By default, the first and second form raise
2852 * Encoding::UndefinedConversionError for characters that are
2853 * undefined in the destination encoding, and
2854 * Encoding::InvalidByteSequenceError for invalid byte sequences
2855 * in the source encoding. The last form by default does not raise
2856 * exceptions but uses replacement strings.
2858 * The +options+ keyword arguments give details for conversion.
2859 * The arguments are:
2862 * If the value is +:replace+, #encode replaces invalid byte sequences in
2863 * +str+ with the replacement character. The default is to raise the
2864 * Encoding::InvalidByteSequenceError exception
2866 * If the value is +:replace+, #encode replaces characters which are
2867 * undefined in the destination encoding with the replacement character.
2868 * The default is to raise the Encoding::UndefinedConversionError.
2870 * Sets the replacement string to the given value. The default replacement
2871 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2873 * Sets the replacement string by the given object for undefined
2874 * character. The object should be a Hash, a Proc, a Method, or an
2875 * object which has [] method.
2876 * Its key is an undefined character encoded in the source encoding
2877 * of the current transcoder. Its value can be in any encoding as long as it
2878 * can be converted into the destination encoding of the transcoder.
2880 * The value must be +:text+ or +:attr+.
2881 * If the value is +:text+ #encode replaces undefined characters with their
2882 * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2883 * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2884 * If the value is +:attr+, #encode also quotes the replacement result
2885 * (using '"'), and replaces '"' with "&quot;".
2887 * Replaces LF ("\n") with CR ("\r") if value is true.
2889 * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2890 * :universal_newline ::
2891 * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
/*
 * str_encode: implementation of String#encode.
 * Transcodes through str_transcode() and lets encoded_dup() build the
 * returned string from the result and the resulting encoding index.
 * NOTE(review): the declaration/initialisation of newstr is not shown.
 */
2895 str_encode(int argc
, VALUE
*argv
, VALUE str
)
2898 int encidx
= str_transcode(argc
, argv
, &newstr
);
2899 return encoded_dup(newstr
, str
, encidx
);
/*
 * rb_str_encode: C-level entry point that transcodes str with explicit
 * ecflags/ecopts by calling str_transcode0() directly, then returns the
 * result of encoded_dup().
 * NOTE(review): how argc/argv are built from `to`, and the initialisation of
 * newstr, are on lines this listing omits.
 */
2903 rb_str_encode(VALUE str
, VALUE to
, int ecflags
, VALUE ecopts
)
2908 int encidx
= str_transcode0(argc
, argv
, &newstr
, ecflags
, ecopts
);
2909 return encoded_dup(newstr
, str
, encidx
);
/*
 * encoded_dup: produce the string returned by the non-destructive encode
 * functions.
 *
 *  - encidx < 0: nothing was transcoded, return a plain duplicate of str.
 *  - newstr == str: duplicate str and associate the new encoding index.
 *  - otherwise: give newstr the class of str.
 * The visible tail returns str_encode_associate(newstr, encidx).
 *
 * NOTE(review): the return inside the "newstr == str" branch and the else
 * keyword are not shown in this listing.
 */
2913 encoded_dup(VALUE newstr
, VALUE str
, int encidx
)
2915 if (encidx
< 0) return rb_str_dup(str
);
2916 if (newstr
== str
) {
2917 newstr
= rb_str_dup(str
);
2918 rb_enc_associate_index(newstr
, encidx
);
2922 RBASIC_SET_CLASS(newstr
, rb_obj_class(str
));
2924 return str_encode_associate(newstr
, encidx
);
2928 * Document-class: Encoding::Converter
2930 * Encoding conversion class.
/*
 * econv_free: callback referenced from econv_data_type; receives the wrapped
 * pointer and treats it as an rb_econv_t.
 * NOTE(review): the statement that actually releases ec is not shown here.
 */
2933 econv_free(void *ptr
)
2935 rb_econv_t
*ec
= ptr
;
/* econv_memsize: size callback referenced from econv_data_type; reports
 * sizeof(rb_econv_t) regardless of ptr. */
2940 econv_memsize(const void *ptr
)
2942 return sizeof(rb_econv_t
);
/*
 * Typed-data descriptor for Encoding::Converter objects: wires in
 * econv_free and econv_memsize and sets RUBY_TYPED_FREE_IMMEDIATELY.
 * NOTE(review): the type-name string and closing brace are on lines this
 * listing omits.
 */
2945 static const rb_data_type_t econv_data_type
= {
2947 {0, econv_free
, econv_memsize
,},
2948 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
/* econv_s_allocate: allocator; wraps a NULL pointer in a typed-data object
 * of klass.  econv_init() later stores the converter through DATA_PTR. */
2952 econv_s_allocate(VALUE klass
)
2954 return TypedData_Wrap_Struct(klass
, &econv_data_type
, NULL
);
/*
 * make_dummy_encoding: define a dummy encoding called name and look up the
 * rb_encoding for the index that rb_define_dummy_encoding() returned.
 * NOTE(review): declarations and the return statement are not shown.
 */
2957 static rb_encoding
*
2958 make_dummy_encoding(const char *name
)
2962 idx
= rb_define_dummy_encoding(name
);
2963 enc
= rb_enc_from_index(idx
);
/*
 * make_encoding: look name up with rb_enc_find(); the other visible
 * assignment falls back to make_dummy_encoding(name).
 * NOTE(review): the condition guarding the fallback (presumably "lookup
 * failed") and the return statement are not shown -- confirm.
 */
2967 static rb_encoding
*
2968 make_encoding(const char *name
)
2971 enc
= rb_enc_find(name
);
2973 enc
= make_dummy_encoding(name
);
/* make_encobj: return the Encoding object (VALUE) for name, built from
 * make_encoding(name) via rb_enc_from_encoding(). */
2978 make_encobj(const char *name
)
2980 return rb_enc_from_encoding(make_encoding(name
));
2985 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2986 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2988 * Returns the corresponding ASCII compatible encoding.
2990 * Returns nil if the argument is an ASCII compatible encoding.
2992 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2993 * can represent exactly the same characters as the given ASCII incompatible encoding.
2994 * So, no undefined conversion error occurs when converting between the two encodings.
2996 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2997 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2998 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
/*
 * econv_s_asciicompat_encoding: implementation of
 * Encoding::Converter.asciicompat_encoding.
 *
 * Resolves arg with enc_arg(), asks rb_econv_asciicompat_encoding() for the
 * name of the matching ASCII-compatible encoding, and returns the Encoding
 * object for that name (make_encoding + rb_enc_from_encoding).
 *
 * NOTE(review): the statement executed when result_name is NULL is not
 * shown; the RDoc above says nil is returned in that case.
 */
3002 econv_s_asciicompat_encoding(VALUE klass
, VALUE arg
)
3004 const char *arg_name
, *result_name
;
3005 rb_encoding
*arg_enc
, *result_enc
;
3007 enc_arg(&arg
, &arg_name
, &arg_enc
);
3009 result_name
= rb_econv_asciicompat_encoding(arg_name
);
3011 if (result_name
== NULL
)
3014 result_enc
= make_encoding(result_name
);
3016 return rb_enc_from_encoding(result_enc
);
/*
 * econv_args: parse the (source, destination [, flags | opt-hash]) argument
 * list shared by Encoding::Converter.new and .search_convpath.
 *
 * Visible logic:
 *  - rb_scan_args with "21:" fills *snamev_p, *dnamev_p, flags_v and opt;
 *  - an integer flags_v is converted with NUM2INT(rb_to_int(...)); the
 *    rb_error_arity(argc + 1, 2, 3) call sits in the same branch (its guard
 *    is not shown);
 *  - otherwise a non-nil opt hash goes through rb_econv_prepare_opts();
 *  - each name is tried as an encoding (rb_to_encoding_index /
 *    rb_enc_from_index) and otherwise coerced with StringValue;
 *  - sname/dname come from rb_enc_name() when an encoding was found, else
 *    from StringValueCStr() on the argument;
 *  - ecflags is written through ecflags_p.
 *
 * NOTE(review): the trailing parameters of the signature (ecflags_p and,
 * judging by the callers, an ecopts out-parameter), several declarations and
 * the remaining out-parameter stores are on lines this listing omits.
 */
3020 econv_args(int argc
, VALUE
*argv
,
3021 VALUE
*snamev_p
, VALUE
*dnamev_p
,
3022 const char **sname_p
, const char **dname_p
,
3023 rb_encoding
**senc_p
, rb_encoding
**denc_p
,
3027 VALUE opt
, flags_v
, ecopts
;
3029 const char *sname
, *dname
;
3030 rb_encoding
*senc
, *denc
;
3033 argc
= rb_scan_args(argc
, argv
, "21:", snamev_p
, dnamev_p
, &flags_v
, &opt
);
3035 if (!NIL_P(flags_v
)) {
3037 rb_error_arity(argc
+ 1, 2, 3);
3039 ecflags
= NUM2INT(rb_to_int(flags_v
));
3042 else if (!NIL_P(opt
)) {
3043 ecflags
= rb_econv_prepare_opts(opt
, &ecopts
);
3051 sidx
= rb_to_encoding_index(*snamev_p
);
3053 senc
= rb_enc_from_index(sidx
);
3056 StringValue(*snamev_p
);
3060 didx
= rb_to_encoding_index(*dnamev_p
);
3062 denc
= rb_enc_from_index(didx
);
3065 StringValue(*dnamev_p
);
3068 sname
= senc
? rb_enc_name(senc
) : StringValueCStr(*snamev_p
);
3069 dname
= denc
? rb_enc_name(denc
) : StringValueCStr(*dnamev_p
);
3075 *ecflags_p
= ecflags
;
/*
 * decorate_convpath: insert the decorator names implied by ecflags into the
 * conversion-path array convpath.
 *
 * Visible logic:
 *  - decorator_names() fills decorators[] and returns their count (-1 is
 *    checked as a failure value);
 *  - the last element of convpath is inspected; when it is a
 *    [source, destination] pair whose transcoder is not itself a decorator
 *    and has asciicompat_type == asciicompat_encoder, the pair is stored at
 *    index len + num_decorators - 1, i.e. moved behind the decorators;
 *  - a second, identical rb_ary_store() appears on another branch whose
 *    condition is not shown;
 *  - finally each decorator name is stored as a new String starting at
 *    index n.
 *
 * NOTE(review): declarations, the adjustment of n, the handling of
 * num_decorators == -1 / a NULL transcoder and the return value are on lines
 * this listing omits.  The econv_s_search_convpath() caller compares the
 * result with -1.
 */
3080 decorate_convpath(VALUE convpath
, int ecflags
)
3083 const char *decorators
[MAX_ECFLAGS_DECORATORS
];
3087 num_decorators
= decorator_names(ecflags
, decorators
);
3088 if (num_decorators
== -1)
3091 len
= n
= RARRAY_LENINT(convpath
);
3093 VALUE pair
= RARRAY_AREF(convpath
, n
-1);
3094 if (RB_TYPE_P(pair
, T_ARRAY
)) {
3095 const char *sname
= rb_enc_name(rb_to_encoding(RARRAY_AREF(pair
, 0)));
3096 const char *dname
= rb_enc_name(rb_to_encoding(RARRAY_AREF(pair
, 1)));
3097 transcoder_entry_t
*entry
= get_transcoder_entry(sname
, dname
);
3098 const rb_transcoder
*tr
= load_transcoder_entry(entry
);
3101 if (!DECORATOR_P(tr
->src_encoding
, tr
->dst_encoding
) &&
3102 tr
->asciicompat_type
== asciicompat_encoder
) {
3104 rb_ary_store(convpath
, len
+ num_decorators
- 1, pair
);
3108 rb_ary_store(convpath
, len
+ num_decorators
- 1, pair
);
3112 for (i
= 0; i
< num_decorators
; i
++)
3113 rb_ary_store(convpath
, n
+ i
, rb_str_new_cstr(decorators
[i
]));
/*
 * search_convpath_i: callback passed to transcode_search_path(); records one
 * step of a conversion path.
 *
 * The result array (*ary_p) is created lazily on first use.  A decorator
 * step is stored as a String holding dname; an ordinary step is stored as a
 * two-element array of Encoding objects.  The element goes to index depth.
 *
 * NOTE(review): the declarations that derive ary_p from arg and declare v
 * are on lines this listing omits.
 */
3119 search_convpath_i(const char *sname
, const char *dname
, int depth
, void *arg
)
3124 if (NIL_P(*ary_p
)) {
3125 *ary_p
= rb_ary_new();
3128 if (DECORATOR_P(sname
, dname
)) {
3129 v
= rb_str_new_cstr(dname
);
3132 v
= rb_assoc_new(make_encobj(sname
), make_encobj(dname
));
3134 rb_ary_store(*ary_p
, depth
, v
);
3139 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3140 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3142 * Returns a conversion path.
3144 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3145 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3146 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3148 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3150 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3151 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3152 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3153 * # "universal_newline"]
3155 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3157 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3158 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3159 * # "universal_newline",
3160 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
/*
 * econv_s_search_convpath: implementation of
 * Encoding::Converter.search_convpath.
 *
 * Parses the arguments with econv_args(), collects the path with
 * transcode_search_path() + search_convpath_i, and then applies
 * decorate_convpath().  If no path was found, or decoration returns -1, an
 * exception object is built with rb_econv_open_exc().  RB_GC_GUARD keeps
 * snamev/dnamev alive because sname/dname may point into them.
 *
 * NOTE(review): the declarations of convpath/ecflags/ecopts, the statements
 * that raise exc and the final return are on lines this listing omits.
 */
3163 econv_s_search_convpath(int argc
, VALUE
*argv
, VALUE klass
)
3165 VALUE snamev
, dnamev
;
3166 const char *sname
, *dname
;
3167 rb_encoding
*senc
, *denc
;
3172 econv_args(argc
, argv
, &snamev
, &dnamev
, &sname
, &dname
, &senc
, &denc
, &ecflags
, &ecopts
);
3175 transcode_search_path(sname
, dname
, search_convpath_i
, &convpath
);
3177 if (NIL_P(convpath
)) {
3178 VALUE exc
= rb_econv_open_exc(sname
, dname
, ecflags
);
3179 RB_GC_GUARD(snamev
);
3180 RB_GC_GUARD(dnamev
);
3184 if (decorate_convpath(convpath
, ecflags
) == -1) {
3185 VALUE exc
= rb_econv_open_exc(sname
, dname
, ecflags
);
3186 RB_GC_GUARD(snamev
);
3187 RB_GC_GUARD(dnamev
);
3195 * Check the existence of a conversion path.
3196 * Returns non-zero if a conversion path was found, and 0 otherwise.
3197 * result: non-zero:path exists 0:no path
/*
 * rb_econv_has_convpath_p: search for a conversion path between the two
 * encoding names with search_convpath_i as the callback and return
 * RTEST(convpath), i.e. whether convpath ended up non-nil.
 * NOTE(review): the last argument of transcode_search_path() is on a line
 * this listing omits (presumably &convpath -- confirm).
 */
3200 rb_econv_has_convpath_p(const char* from_encoding
, const char* to_encoding
)
3202 VALUE convpath
= Qnil
;
3203 transcode_search_path(from_encoding
, to_encoding
, search_convpath_i
,
3205 return RTEST(convpath
);
/*
 * Argument bundle handed to rb_econv_init_by_convpath_i() through the
 * void *arg of transcode_search_path().
 * NOTE(review): the member list is not shown in this listing; the code
 * below accesses members named ec, index and ret.
 */
3208 struct rb_econv_init_by_convpath_t
{
/*
 * rb_econv_init_by_convpath_i: transcode_search_path() callback that adds
 * the sname -> dname converter to a->ec at position a->index via
 * rb_econv_add_converter().
 * NOTE(review): what is done with ret (the caller later checks arg.ret) and
 * any update of a->index are on lines this listing omits.
 */
3215 rb_econv_init_by_convpath_i(const char *sname
, const char *dname
, int depth
, void *arg
)
3217 struct rb_econv_init_by_convpath_t
*a
= (struct rb_econv_init_by_convpath_t
*)arg
;
3223 ret
= rb_econv_add_converter(a
->ec
, sname
, dname
, a
->index
);
/*
 * rb_econv_init_by_convpath: build a converter from an explicit conversion
 * path (the Encoding::Converter.new(convpath) form).
 *
 * Visible logic:
 *  - a converter sized for the path length is allocated and stored in
 *    DATA_PTR(self) right away;
 *  - each element is either a 2-element array (both ends resolved with
 *    enc_arg(); any other length raises ArgumentError) or a decorator name
 *    read with StringValueCStr();
 *  - decorator steps are appended with rb_econv_add_converter(); failure
 *    raises ArgumentError "decoration failed: ...";
 *  - ordinary steps are expanded through transcode_search_path() with
 *    rb_econv_init_by_convpath_i; failure raises ArgumentError
 *    "adding conversion failed: ...";
 *  - *sname_p / *dname_p are taken from the transcoders at index j and at
 *    the last index, and copied into ec->source_encoding_name /
 *    ec->destination_encoding_name.
 *
 * NOTE(review): many lines are omitted from this listing: declarations of
 * ec/i/ret/elt/pair, the assignment of sname for decorator elements, the
 * conditions around the error paths and around the *sname_p / *dname_p
 * stores, and the return statement.
 */
3230 rb_econv_init_by_convpath(VALUE self
, VALUE convpath
,
3231 const char **sname_p
, const char **dname_p
,
3232 rb_encoding
**senc_p
, rb_encoding
**denc_p
)
3238 rb_encoding
*senc
= 0, *denc
= 0;
3239 const char *sname
, *dname
;
3241 ec
= rb_econv_alloc(RARRAY_LENINT(convpath
));
3242 DATA_PTR(self
) = ec
;
3244 for (i
= 0; i
< RARRAY_LEN(convpath
); i
++) {
3245 VALUE snamev
, dnamev
;
3247 elt
= rb_ary_entry(convpath
, i
);
3248 if (!NIL_P(pair
= rb_check_array_type(elt
))) {
3249 if (RARRAY_LEN(pair
) != 2)
3250 rb_raise(rb_eArgError
, "not a 2-element array in convpath");
3251 snamev
= rb_ary_entry(pair
, 0);
3252 enc_arg(&snamev
, &sname
, &senc
);
3253 dnamev
= rb_ary_entry(pair
, 1);
3254 enc_arg(&dnamev
, &dname
, &denc
);
3258 dname
= StringValueCStr(elt
);
3260 if (DECORATOR_P(sname
, dname
)) {
3261 ret
= rb_econv_add_converter(ec
, sname
, dname
, ec
->num_trans
);
3263 VALUE msg
= rb_sprintf("decoration failed: %s", dname
);
3264 RB_GC_GUARD(snamev
);
3265 RB_GC_GUARD(dnamev
);
3266 rb_exc_raise(rb_exc_new_str(rb_eArgError
, msg
));
/* j remembers where this step's converters start in ec->elems. */
3270 int j
= ec
->num_trans
;
3271 struct rb_econv_init_by_convpath_t arg
;
3273 arg
.index
= ec
->num_trans
;
3275 ret
= transcode_search_path(sname
, dname
, rb_econv_init_by_convpath_i
, &arg
);
3276 if (ret
== -1 || arg
.ret
== -1) {
3277 VALUE msg
= rb_sprintf("adding conversion failed: %s to %s", sname
, dname
);
3278 RB_GC_GUARD(snamev
);
3279 RB_GC_GUARD(dnamev
);
3280 rb_exc_raise(rb_exc_new_str(rb_eArgError
, msg
));
3285 *sname_p
= ec
->elems
[j
].tc
->transcoder
->src_encoding
;
3288 *dname_p
= ec
->elems
[ec
->num_trans
-1].tc
->transcoder
->dst_encoding
;
3299 ec
->source_encoding_name
= *sname_p
;
3300 ec
->destination_encoding_name
= *dname_p
;
3307 * Encoding::Converter.new(source_encoding, destination_encoding)
3308 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3309 * Encoding::Converter.new(convpath)
3311 * possible options elements:
3313 * :invalid => nil # raise error on invalid byte sequence (default)
3314 * :invalid => :replace # replace invalid byte sequence
3315 * :undef => nil # raise error on undefined conversion (default)
3316 * :undef => :replace # replace undefined conversion
3317 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3318 * :newline => :universal # decorator for converting CRLF and CR to LF
3319 * :newline => :crlf # decorator for converting LF to CRLF
3320 * :newline => :cr # decorator for converting LF to CR
3321 * :universal_newline => true # decorator for converting CRLF and CR to LF
3322 * :crlf_newline => true # decorator for converting LF to CRLF
3323 * :cr_newline => true # decorator for converting LF to CR
3324 * :xml => :text # escape as XML CharData.
3325 * :xml => :attr # escape as XML AttValue
3327 * Encoding::Converter::INVALID_REPLACE
3328 * Encoding::Converter::UNDEF_REPLACE
3329 * Encoding::Converter::UNDEF_HEX_CHARREF
3330 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3331 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3332 * Encoding::Converter::CR_NEWLINE_DECORATOR
3333 * Encoding::Converter::XML_TEXT_DECORATOR
3334 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3335 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3337 * Encoding::Converter.new creates an instance of Encoding::Converter.
3339 * Source_encoding and destination_encoding should be a string or
3342 * opt should be nil, a hash or an integer.
3344 * convpath should be an array.
3345 * convpath may contain
3346 * - two-element arrays which contain encodings or encoding names, or
3347 * - strings representing decorator names.
3349 * Encoding::Converter.new optionally takes an option.
3350 * The option should be a hash or an integer.
3351 * The option hash can contain :invalid => nil, etc.
3352 * The option integer should be logical-or of constants such as
3353 * Encoding::Converter::INVALID_REPLACE, etc.
3356 * Raise an error on an invalid byte sequence. This is the default behavior.
3357 * [:invalid => :replace]
3358 * Replace invalid byte sequence by replacement string.
3360 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3361 * This is the default behavior.
3362 * [:undef => :replace]
3363 * Replace undefined character in destination_encoding with replacement string.
3364 * [:replace => string]
3365 * Specify the replacement string.
3366 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3367 * [:universal_newline => true]
3368 * Convert CRLF and CR to LF.
3369 * [:crlf_newline => true]
3370 * Convert LF to CRLF.
3371 * [:cr_newline => true]
3374 * Escape as XML CharData.
3375 * This form can be used as an HTML 4.0 #PCDATA.
3379 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3381 * Escape as XML AttValue.
3382 * The converted result is quoted as "...".
3383 * This form can be used as an HTML 4.0 attribute value.
3388 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3391 * # UTF-16BE to UTF-8
3392 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3394 * # Usually, decorators such as newline conversion are inserted last.
3395 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3396 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3397 * # "universal_newline"]
3399 * # But, if the last encoding is ASCII incompatible,
3400 * # decorators are inserted before the last conversion.
3401 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3402 * p ec.convpath #=> ["crlf_newline",
3403 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3405 * # Conversion path can be specified directly.
3406 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3407 * p ec.convpath #=> ["universal_newline",
3408 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3409 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
/*
 * econv_init: implementation of Encoding::Converter#initialize.
 *
 * Visible logic:
 *  - raises TypeError "already initialized" when self already wraps a
 *    converter;
 *  - a single Array argument selects the convpath form
 *    (rb_econv_init_by_convpath);
 *  - otherwise the arguments are parsed by econv_args() and the converter is
 *    opened with rb_econv_open_opts(); an exception object is prepared with
 *    rb_econv_open_exc() on the failure path;
 *  - unless the pair is a decorator, make_dummy_encoding() supplies
 *    senc/denc (the "only if still unset" guards are not shown);
 *  - the encodings are stored on ec and ec is stored in DATA_PTR(self).
 *
 * NOTE(review): declarations of ec/convpath/ecflags/ecopts, the failure
 * test after rb_econv_open_opts(), the raise of exc and the return value are
 * on lines this listing omits.
 */
3412 econv_init(int argc
, VALUE
*argv
, VALUE self
)
3415 VALUE snamev
, dnamev
;
3416 const char *sname
, *dname
;
3417 rb_encoding
*senc
, *denc
;
3422 if (rb_check_typeddata(self
, &econv_data_type
)) {
3423 rb_raise(rb_eTypeError
, "already initialized");
3426 if (argc
== 1 && !NIL_P(convpath
= rb_check_array_type(argv
[0]))) {
3427 ec
= rb_econv_init_by_convpath(self
, convpath
, &sname
, &dname
, &senc
, &denc
);
3432 econv_args(argc
, argv
, &snamev
, &dnamev
, &sname
, &dname
, &senc
, &denc
, &ecflags
, &ecopts
);
3433 ec
= rb_econv_open_opts(sname
, dname
, ecflags
, ecopts
);
3437 VALUE exc
= rb_econv_open_exc(sname
, dname
, ecflags
);
3438 RB_GC_GUARD(snamev
);
3439 RB_GC_GUARD(dnamev
);
3443 if (!DECORATOR_P(sname
, dname
)) {
3445 senc
= make_dummy_encoding(sname
);
3447 denc
= make_dummy_encoding(dname
);
3448 RB_GC_GUARD(snamev
);
3449 RB_GC_GUARD(dnamev
);
3452 ec
->source_encoding
= senc
;
3453 ec
->destination_encoding
= denc
;
3455 DATA_PTR(self
) = ec
;
3462 * ec.inspect -> string
3464 * Returns a printable version of <i>ec</i>
3466 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3467 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
/*
 * econv_inspect: implementation of Encoding::Converter#inspect.
 *
 * Produces "#<Class: uninitialized>" on one path; otherwise builds
 * "#<Class: " + description + ">" where the description is appended by
 * econv_description() from the source/destination names and ec->flags.
 *
 * NOTE(review): the declarations of ec/str, the test that selects the
 * "uninitialized" path (presumably a NULL ec) and the final return are on
 * lines this listing omits.
 */
3471 econv_inspect(VALUE self
)
3473 const char *cname
= rb_obj_classname(self
);
3476 TypedData_Get_Struct(self
, rb_econv_t
, &econv_data_type
, ec
);
3478 return rb_sprintf("#<%s: uninitialized>", cname
);
3480 const char *sname
= ec
->source_encoding_name
;
3481 const char *dname
= ec
->destination_encoding_name
;
3483 str
= rb_sprintf("#<%s: ", cname
);
3484 econv_description(sname
, dname
, ec
->flags
, str
);
3485 rb_str_cat2(str
, ">");
/*
 * check_econv: fetch the rb_econv_t wrapped by self, raising TypeError
 * "uninitialized encoding converter" on the failure path.
 * NOTE(review): the declaration of ec, the test guarding the raise
 * (presumably a NULL ec) and the return are on lines this listing omits.
 */
3491 check_econv(VALUE self
)
3495 TypedData_Get_Struct(self
, rb_econv_t
, &econv_data_type
, ec
);
3497 rb_raise(rb_eTypeError
, "uninitialized encoding converter");
3504 * ec.source_encoding -> encoding
3506 * Returns the source encoding as an Encoding object.
/*
 * econv_source_encoding: return the converter's source encoding as an
 * Encoding object.
 * NOTE(review): the value returned when ec->source_encoding is unset is on
 * a line this listing omits.
 */
3509 econv_source_encoding(VALUE self
)
3511 rb_econv_t
*ec
= check_econv(self
);
3512 if (!ec
->source_encoding
)
3514 return rb_enc_from_encoding(ec
->source_encoding
);
3519 * ec.destination_encoding -> encoding
3521 * Returns the destination encoding as an Encoding object.
/*
 * econv_destination_encoding: return the converter's destination encoding
 * as an Encoding object.
 * NOTE(review): the value returned when ec->destination_encoding is unset is
 * on a line this listing omits.
 */
3524 econv_destination_encoding(VALUE self
)
3526 rb_econv_t
*ec
= check_econv(self
);
3527 if (!ec
->destination_encoding
)
3529 return rb_enc_from_encoding(ec
->destination_encoding
);
3534 * ec.convpath -> ary
3536 * Returns the conversion path of ec.
3538 * The result is an array of conversions.
3540 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3542 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3543 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3546 * Each element of the array is a pair of encodings or a string.
3547 * A pair means an encoding conversion.
3548 * A string means a decorator.
3550 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3551 * a converter from ISO-8859-1 to UTF-8.
3552 * "crlf_newline" means newline converter from LF to CRLF.
/*
 * econv_convpath: implementation of Encoding::Converter#convpath.
 *
 * Walks ec->elems[0 .. num_trans-1]; a decorator transcoder contributes a
 * String with its dst_encoding name, any other transcoder contributes a
 * two-element array of Encoding objects.  Elements are pushed onto result
 * in order.
 *
 * NOTE(review): the declarations of result/i/v, the else keyword and the
 * return statement are on lines this listing omits.
 */
3555 econv_convpath(VALUE self
)
3557 rb_econv_t
*ec
= check_econv(self
);
3561 result
= rb_ary_new();
3562 for (i
= 0; i
< ec
->num_trans
; i
++) {
3563 const rb_transcoder
*tr
= ec
->elems
[i
].tc
->transcoder
;
3565 if (DECORATOR_P(tr
->src_encoding
, tr
->dst_encoding
))
3566 v
= rb_str_new_cstr(tr
->dst_encoding
);
3568 v
= rb_assoc_new(make_encobj(tr
->src_encoding
), make_encobj(tr
->dst_encoding
));
3569 rb_ary_push(result
, v
);
3576 * ec == other -> true or false
/*
 * econv_equal: implementation of Encoding::Converter#==.
 *
 * Compares, in order: source encoding name, destination encoding name,
 * flags, replacement encoding name, replacement length, replacement bytes,
 * number of transcoders and finally each transcoder pointer.  Name and byte
 * comparisons first test pointer identity and only then fall back to
 * strcmp()/memcmp().
 *
 * NOTE(review): strcmp() on replacement_enc runs whenever the two pointers
 * differ; if one converter can have a NULL replacement_enc while the other
 * has one set, this would pass NULL to strcmp() -- confirm that cannot
 * happen.
 *
 * NOTE(review): the declarations of ec2/i and most of the return statements
 * (including the result for a non-converter `other` and the final result)
 * are on lines this listing omits.
 */
3579 econv_equal(VALUE self
, VALUE other
)
3581 rb_econv_t
*ec1
= check_econv(self
);
3585 if (!rb_typeddata_is_kind_of(other
, &econv_data_type
)) {
3588 ec2
= DATA_PTR(other
);
3589 if (!ec2
) return Qfalse
;
3590 if (ec1
->source_encoding_name
!= ec2
->source_encoding_name
&&
3591 strcmp(ec1
->source_encoding_name
, ec2
->source_encoding_name
))
3593 if (ec1
->destination_encoding_name
!= ec2
->destination_encoding_name
&&
3594 strcmp(ec1
->destination_encoding_name
, ec2
->destination_encoding_name
))
3596 if (ec1
->flags
!= ec2
->flags
) return Qfalse
;
3597 if (ec1
->replacement_enc
!= ec2
->replacement_enc
&&
3598 strcmp(ec1
->replacement_enc
, ec2
->replacement_enc
))
3600 if (ec1
->replacement_len
!= ec2
->replacement_len
) return Qfalse
;
3601 if (ec1
->replacement_str
!= ec2
->replacement_str
&&
3602 memcmp(ec1
->replacement_str
, ec2
->replacement_str
, ec2
->replacement_len
))
3605 if (ec1
->num_trans
!= ec2
->num_trans
) return Qfalse
;
3606 for (i
= 0; i
< ec1
->num_trans
; i
++) {
3607 if (ec1
->elems
[i
].tc
->transcoder
!= ec2
->elems
[i
].tc
->transcoder
)
/*
 * econv_result_to_symbol: map an rb_econv_result_t to the matching Symbol
 * VALUE (sym_invalid_byte_sequence, sym_finished, ...).  An unknown value is
 * returned as an Integer via INT2NUM.
 */
3614 econv_result_to_symbol(rb_econv_result_t res
)
3617 case econv_invalid_byte_sequence
: return sym_invalid_byte_sequence
;
3618 case econv_incomplete_input
: return sym_incomplete_input
;
3619 case econv_undefined_conversion
: return sym_undefined_conversion
;
3620 case econv_destination_buffer_full
: return sym_destination_buffer_full
;
3621 case econv_source_buffer_empty
: return sym_source_buffer_empty
;
3622 case econv_finished
: return sym_finished
;
3623 case econv_after_output
: return sym_after_output
;
3624 default: return INT2NUM(res
); /* should not be reached */
3630 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3631 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3632 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3633 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3635 * possible opt elements:
3637 * :partial_input => true # source buffer may be part of larger source
3638 * :after_output => true # stop conversion after output before input
3640 * Encoding::Converter::PARTIAL_INPUT
3641 * Encoding::Converter::AFTER_OUTPUT
3644 * :invalid_byte_sequence
3646 * :undefined_conversion
3648 * :destination_buffer_full
3649 * :source_buffer_empty
3652 * primitive_convert converts source_buffer into destination_buffer.
3654 * source_buffer should be a string or nil.
3655 * nil means an empty string.
3657 * destination_buffer should be a string.
3659 * destination_byteoffset should be an integer or nil.
3660 * nil means the end of destination_buffer.
3661 * If it is omitted, nil is assumed.
3663 * destination_bytesize should be an integer or nil.
3664 * nil means unlimited.
3665 * If it is omitted, nil is assumed.
3667 * opt should be nil, a hash or an integer.
3668 * nil means no flags.
3669 * If it is omitted, nil is assumed.
3671 * primitive_convert converts the content of source_buffer from the beginning
3672 * and stores the result into destination_buffer.
3674 * destination_byteoffset and destination_bytesize specify the region in which
3675 * the converted result is stored.
3676 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3677 * If destination_byteoffset is nil,
3678 * destination_buffer.bytesize is used for appending the result.
3679 * destination_bytesize specifies maximum number of bytes.
3680 * If destination_bytesize is nil,
3681 * destination size is unlimited.
3682 * After conversion, destination_buffer is resized to
3683 * destination_byteoffset + actually produced number of bytes.
3684 * Also destination_buffer's encoding is set to destination_encoding.
3686 * primitive_convert drops the converted part of source_buffer.
3687 * The dropped part is either converted into destination_buffer or
3688 * buffered in the Encoding::Converter object.
3690 * primitive_convert stops conversion when one of the following conditions is met.
3691 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3692 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3693 * - unexpected end of source buffer (:incomplete_input)
3694 * this occurs only when :partial_input is not specified.
3695 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3696 * - character not representable in output encoding (:undefined_conversion)
3697 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3698 * - after some output is generated, before input is done (:after_output)
3699 * this occurs only when :after_output is specified.
3700 * - destination buffer is full (:destination_buffer_full)
3701 * this occurs only when destination_bytesize is non-nil.
3702 * - source buffer is empty (:source_buffer_empty)
3703 * this occurs only when :partial_input is specified.
3704 * - conversion is finished (:finished)
3707 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3708 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3709 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3711 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3712 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3713 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3714 * ret = ec.primitive_convert(src, dst="", nil, 1)
3715 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3716 * ret = ec.primitive_convert(src, dst="", nil, 1)
3717 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3718 * ret = ec.primitive_convert(src, dst="", nil, 1)
3719 * p [ret, src, dst] #=> [:finished, "", "i"]
/*
 * econv_primitive_convert: implementation of
 * Encoding::Converter#primitive_convert.
 *
 * Visible logic:
 *  - arguments are scanned with "23:" (input, output, optional byte offset,
 *    byte size, integer flags, trailing option hash);
 *  - a nil offset/size is given a placeholder 0 and resolved later;
 *  - flags come from the integer argument or from the :partial_input /
 *    :after_output keys of the option hash;
 *  - output is coerced to a String and made modifiable;
 *  - with a nil size, the size starts from the string capacity or
 *    RSTRING_EMBED_LEN_MAX and is raised to at least the input length;
 *  - with a nil offset, output is appended to (offset = current length);
 *  - offset and size are range-checked, and their sum is checked for
 *    wrap-around and for exceeding LONG_MAX, before output is resized;
 *  - rb_econv_convert() runs, output's length is set to what was produced
 *    and the consumed prefix of input is dropped;
 *  - when the size was nil and the buffer filled up, the size is doubled
 *    (guarded against exceeding LONG_MAX / 2) and the offset is reset to
 *    nil, presumably so that the conversion is retried -- the jump itself is
 *    not shown;
 *  - output is tagged with the destination encoding when one is known, and
 *    the result is returned as a Symbol.
 *
 * NOTE(review): the declarations of flags/v, the guards around several
 * assignments (e.g. the two output_bytesize initialisations and the nil
 * input case) and the retry control flow are on lines this listing omits.
 */
3723 econv_primitive_convert(int argc
, VALUE
*argv
, VALUE self
)
3725 VALUE input
, output
, output_byteoffset_v
, output_bytesize_v
, opt
, flags_v
;
3726 rb_econv_t
*ec
= check_econv(self
);
3727 rb_econv_result_t res
;
3728 const unsigned char *ip
, *is
;
3729 unsigned char *op
, *os
;
3730 long output_byteoffset
, output_bytesize
;
3731 unsigned long output_byteend
;
3734 argc
= rb_scan_args(argc
, argv
, "23:", &input
, &output
, &output_byteoffset_v
, &output_bytesize_v
, &flags_v
, &opt
);
3736 if (NIL_P(output_byteoffset_v
))
3737 output_byteoffset
= 0; /* dummy */
3739 output_byteoffset
= NUM2LONG(output_byteoffset_v
);
3741 if (NIL_P(output_bytesize_v
))
3742 output_bytesize
= 0; /* dummy */
3744 output_bytesize
= NUM2LONG(output_bytesize_v
);
3746 if (!NIL_P(flags_v
)) {
3748 rb_error_arity(argc
+ 1, 2, 5);
3750 flags
= NUM2INT(rb_to_int(flags_v
));
3752 else if (!NIL_P(opt
)) {
3755 v
= rb_hash_aref(opt
, sym_partial_input
);
3757 flags
|= ECONV_PARTIAL_INPUT
;
3758 v
= rb_hash_aref(opt
, sym_after_output
);
3760 flags
|= ECONV_AFTER_OUTPUT
;
3766 StringValue(output
);
3769 rb_str_modify(output
);
3771 if (NIL_P(output_bytesize_v
)) {
3773 output_bytesize
= rb_str_capacity(output
);
3775 output_bytesize
= RSTRING_EMBED_LEN_MAX
;
3777 if (!NIL_P(input
) && output_bytesize
< RSTRING_LEN(input
))
3778 output_bytesize
= RSTRING_LEN(input
);
3783 if (NIL_P(output_byteoffset_v
))
3784 output_byteoffset
= RSTRING_LEN(output
);
3786 if (output_byteoffset
< 0)
3787 rb_raise(rb_eArgError
, "negative output_byteoffset");
3789 if (RSTRING_LEN(output
) < output_byteoffset
)
3790 rb_raise(rb_eArgError
, "output_byteoffset too big");
3792 if (output_bytesize
< 0)
3793 rb_raise(rb_eArgError
, "negative output_bytesize");
/* Computed in unsigned arithmetic so that wrap-around can be detected. */
3795 output_byteend
= (unsigned long)output_byteoffset
+
3796 (unsigned long)output_bytesize
;
3798 if (output_byteend
< (unsigned long)output_byteoffset
||
3799 LONG_MAX
< output_byteend
)
3800 rb_raise(rb_eArgError
, "output_byteoffset+output_bytesize too big");
3802 if (rb_str_capacity(output
) < output_byteend
)
3803 rb_str_resize(output
, output_byteend
);
3809 ip
= (const unsigned char *)RSTRING_PTR(input
);
3810 is
= ip
+ RSTRING_LEN(input
);
3813 op
= (unsigned char *)RSTRING_PTR(output
) + output_byteoffset
;
3814 os
= op
+ output_bytesize
;
3816 res
= rb_econv_convert(ec
, &ip
, is
, &op
, os
, flags
);
3817 rb_str_set_len(output
, op
-(unsigned char *)RSTRING_PTR(output
));
3818 if (!NIL_P(input
)) {
3819 rb_str_drop_bytes(input
, ip
- (unsigned char *)RSTRING_PTR(input
));
3822 if (NIL_P(output_bytesize_v
) && res
== econv_destination_buffer_full
) {
3823 if (LONG_MAX
/ 2 < output_bytesize
)
3824 rb_raise(rb_eArgError
, "too long conversion result");
3825 output_bytesize
*= 2;
3826 output_byteoffset_v
= Qnil
;
3830 if (ec
->destination_encoding
) {
3831 rb_enc_associate(output
, ec
->destination_encoding
);
3834 return econv_result_to_symbol(res
);
3839 * ec.convert(source_string) -> destination_string
3841 * Convert source_string and return destination_string.
3843 * source_string is assumed to be a part of the source,
3844 * i.e. :partial_input=>true is specified internally.
3845 * The finish method should be called last.
3847 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3848 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3849 * puts ec.finish.dump #=> ""
3851 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3852 * puts ec.convert("\xA4").dump #=> ""
3853 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3854 * puts ec.finish.dump #=> ""
3856 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3857 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3858 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3859 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3860 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3862 * If a conversion error occurs,
3863 * Encoding::UndefinedConversionError or
3864 * Encoding::InvalidByteSequenceError is raised.
3865 * Encoding::Converter#convert doesn't supply methods to recover or restart
3866 * from these exceptions.
3867 * When you want to handle these conversion errors,
3868 * use Encoding::Converter#primitive_convert.
3872 econv_convert(VALUE self
, VALUE source_string
)
3877 rb_econv_t
*ec
= check_econv(self
);
3879 StringValue(source_string
);
3881 dst
= rb_str_new(NULL
, 0);
3883 av
[0] = rb_str_dup(source_string
);
3887 av
[4] = INT2NUM(ECONV_PARTIAL_INPUT
);
3890 ret
= econv_primitive_convert(ac
, av
, self
);
3892 if (ret
== sym_invalid_byte_sequence
||
3893 ret
== sym_undefined_conversion
||
3894 ret
== sym_incomplete_input
) {
3895 VALUE exc
= make_econv_exception(ec
);
3899 if (ret
== sym_finished
) {
3900 rb_raise(rb_eArgError
, "converter already finished");
3903 if (ret
!= sym_source_buffer_empty
) {
3904 rb_bug("unexpected result of econv_primitive_convert");
3912 * ec.finish -> string
3914 * Finishes the converter.
3915 * It returns the last part of the converted string.
3917 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3918 * p ec.convert("\u3042") #=> "\e$B$\""
3919 * p ec.finish #=> "\e(B"
3922 econv_finish(VALUE self
)
3927 rb_econv_t
*ec
= check_econv(self
);
3929 dst
= rb_str_new(NULL
, 0);
3938 ret
= econv_primitive_convert(ac
, av
, self
);
3940 if (ret
== sym_invalid_byte_sequence
||
3941 ret
== sym_undefined_conversion
||
3942 ret
== sym_incomplete_input
) {
3943 VALUE exc
= make_econv_exception(ec
);
3947 if (ret
!= sym_finished
) {
3948 rb_bug("unexpected result of econv_primitive_convert");
3956 * ec.primitive_errinfo -> array
3958 * primitive_errinfo returns important information regarding the last error
3959 * as a 5-element array:
3961 * [result, enc1, enc2, error_bytes, readagain_bytes]
3963 * result is the last result of primitive_convert.
3965 * Other elements are only meaningful when result is
3966 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3968 * enc1 and enc2 indicate a conversion step as a pair of strings.
3969 * For example, a converter from EUC-JP to ISO-8859-1 converts
3970 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3971 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3973 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3974 * error_bytes is the discarded portion.
3975 * readagain_bytes is the buffered portion which is read again on the next conversion.
3979 * # \xff is invalid as EUC-JP.
3980 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3981 * ec.primitive_convert(src="\xff", dst="", nil, 10)
3982 * p ec.primitive_errinfo
3983 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
3985 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3986 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3987 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3988 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3989 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3990 * p ec.primitive_errinfo
3991 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3993 * # partial character is invalid
3994 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3995 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3996 * p ec.primitive_errinfo
3997 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3999 * # Encoding::Converter::PARTIAL_INPUT prevents errors caused by
4000 * # partial characters.
4001 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4002 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4003 * p ec.primitive_errinfo
4004 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4006 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4007 * # there is no low surrogate after the high surrogate (\xd8\x00).
4008 * # It is detected by the 3rd byte (\x00), which is part of the next character.
4009 * # So the high surrogate (\xd8\x00) is discarded and
4010 * # the 3rd byte is read again later.
4011 * # Since the byte is buffered in ec, it is dropped from src.
4012 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4013 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4014 * p ec.primitive_errinfo
4015 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4019 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4020 * # The problem is detected by 4th byte.
4021 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4022 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4023 * p ec.primitive_errinfo
4024 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4030 econv_primitive_errinfo(VALUE self
)
4032 rb_econv_t
*ec
= check_econv(self
);
4036 ary
= rb_ary_new2(5);
4038 rb_ary_store(ary
, 0, econv_result_to_symbol(ec
->last_error
.result
));
4039 rb_ary_store(ary
, 4, Qnil
);
4041 if (ec
->last_error
.source_encoding
)
4042 rb_ary_store(ary
, 1, rb_str_new2(ec
->last_error
.source_encoding
));
4044 if (ec
->last_error
.destination_encoding
)
4045 rb_ary_store(ary
, 2, rb_str_new2(ec
->last_error
.destination_encoding
));
4047 if (ec
->last_error
.error_bytes_start
) {
4048 rb_ary_store(ary
, 3, rb_str_new((const char *)ec
->last_error
.error_bytes_start
, ec
->last_error
.error_bytes_len
));
4049 rb_ary_store(ary
, 4, rb_str_new((const char *)ec
->last_error
.error_bytes_start
+ ec
->last_error
.error_bytes_len
, ec
->last_error
.readagain_len
));
4057 * ec.insert_output(string) -> nil
4059 * Inserts string into the encoding converter.
4060 * The string will be converted to the destination encoding and
4061 * output on later conversions.
4063 * If the destination encoding is stateful,
4064 * string is converted according to the state and the state is updated.
4066 * This method should be used only when a conversion error occurs.
4068 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4069 * src = "HIRAGANA LETTER A is \u{3042}."
4071 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4072 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4073 * ec.insert_output("<err>")
4074 * p ec.primitive_convert(src, dst) #=> :finished
4075 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4077 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4078 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4080 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4081 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4082 * ec.insert_output "?" # state change required to output "?".
4083 * p ec.primitive_convert(src, dst) #=> :finished
4084 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4088 econv_insert_output(VALUE self
, VALUE string
)
4090 const char *insert_enc
;
4094 rb_econv_t
*ec
= check_econv(self
);
4096 StringValue(string
);
4097 insert_enc
= rb_econv_encoding_to_insert_output(ec
);
4098 string
= rb_str_encode(string
, rb_enc_from_encoding(rb_enc_find(insert_enc
)), 0, Qnil
);
4100 ret
= rb_econv_insert_output(ec
, (const unsigned char *)RSTRING_PTR(string
), RSTRING_LEN(string
), insert_enc
);
4102 rb_raise(rb_eArgError
, "too big string");
4110 * ec.putback -> string
4111 * ec.putback(max_numbytes) -> string
4113 * Put back the bytes which will be converted.
4115 * The bytes are caused by an invalid_byte_sequence error.
4116 * When an invalid_byte_sequence error occurs, some bytes are discarded and
4117 * some bytes are buffered to be converted later.
4118 * The latter bytes can be put back.
4119 * It can be observed by
4120 * Encoding::InvalidByteSequenceError#readagain_bytes and
4121 * Encoding::Converter#primitive_errinfo.
4123 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4124 * src = "\x00\xd8\x61\x00"
4126 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4127 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4128 * p ec.putback #=> "a\x00"
4129 * p ec.putback #=> "" # no more bytes to put back
4133 econv_putback(int argc
, VALUE
*argv
, VALUE self
)
4135 rb_econv_t
*ec
= check_econv(self
);
4140 if (!rb_check_arity(argc
, 0, 1) || NIL_P(max
= argv
[0])) {
4141 n
= rb_econv_putbackable(ec
);
4145 putbackable
= rb_econv_putbackable(ec
);
4146 if (putbackable
< n
)
4150 str
= rb_str_new(NULL
, n
);
4151 rb_econv_putback(ec
, (unsigned char *)RSTRING_PTR(str
), n
);
4153 if (ec
->source_encoding
) {
4154 rb_enc_associate(str
, ec
->source_encoding
);
4162 * ec.last_error -> exception or nil
4164 * Returns an exception object for the last conversion.
4165 * Returns nil if the last conversion did not produce an error.
4167 * "error" means
4168 * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
4169 * Encoding::Converter#convert, and
4170 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
4171 * Encoding::Converter#primitive_convert.
4173 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4174 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4175 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4176 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4177 * p ec.last_error #=> nil
4181 econv_last_error(VALUE self
)
4183 rb_econv_t
*ec
= check_econv(self
);
4186 exc
= make_econv_exception(ec
);
4194 * ec.replacement -> string
4196 * Returns the replacement string.
4198 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4199 * p ec.replacement #=> "?"
4201 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4202 * p ec.replacement #=> "\uFFFD"
4205 econv_get_replacement(VALUE self
)
4207 rb_econv_t
*ec
= check_econv(self
);
4211 ret
= make_replacement(ec
);
4213 rb_raise(rb_eUndefinedConversionError
, "replacement character setup failed");
4216 enc
= rb_enc_find(ec
->replacement_enc
);
4217 return rb_enc_str_new((const char *)ec
->replacement_str
, (long)ec
->replacement_len
, enc
);
4222 * ec.replacement = string
4224 * Sets the replacement string.
4226 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4227 * ec.replacement = "<undef>"
4228 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4231 econv_set_replacement(VALUE self
, VALUE arg
)
4233 rb_econv_t
*ec
= check_econv(self
);
4238 StringValue(string
);
4239 enc
= rb_enc_get(string
);
4241 ret
= rb_econv_set_replacement(ec
,
4242 (const unsigned char *)RSTRING_PTR(string
),
4243 RSTRING_LEN(string
),
4247 /* xxx: rb_eInvalidByteSequenceError? */
4248 rb_raise(rb_eUndefinedConversionError
, "replacement character setup failed");
4255 rb_econv_make_exception(rb_econv_t
*ec
)
4257 return make_econv_exception(ec
);
4261 rb_econv_check_error(rb_econv_t
*ec
)
4265 exc
= make_econv_exception(ec
);
4273 * ecerr.source_encoding_name -> string
4275 * Returns the source encoding name as a string.
4278 ecerr_source_encoding_name(VALUE self
)
4280 return rb_attr_get(self
, id_source_encoding_name
);
4285 * ecerr.source_encoding -> encoding
4287 * Returns the source encoding as an encoding object.
4289 * Note that the result may not be equal to the source encoding of
4290 * the encoding converter if the conversion has multiple steps.
4292 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4294 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4295 * rescue Encoding::UndefinedConversionError
4296 * p $!.source_encoding #=> #<Encoding:UTF-8>
4297 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4298 * p $!.source_encoding_name #=> "UTF-8"
4299 * p $!.destination_encoding_name #=> "EUC-JP"
4304 ecerr_source_encoding(VALUE self
)
4306 return rb_attr_get(self
, id_source_encoding
);
4311 * ecerr.destination_encoding_name -> string
4313 * Returns the destination encoding name as a string.
4316 ecerr_destination_encoding_name(VALUE self
)
4318 return rb_attr_get(self
, id_destination_encoding_name
);
4323 * ecerr.destination_encoding -> encoding
4325 * Returns the destination encoding as an encoding object.
4328 ecerr_destination_encoding(VALUE self
)
4330 return rb_attr_get(self
, id_destination_encoding
);
4335 * ecerr.error_char -> string
4337 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4339 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4341 * ec.convert("\xa0")
4342 * rescue Encoding::UndefinedConversionError
4343 * puts $!.error_char.dump #=> "\xC2\xA0"
4344 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4349 ecerr_error_char(VALUE self
)
4351 return rb_attr_get(self
, id_error_char
);
4356 * ecerr.error_bytes -> string
4358 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4360 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4362 * ec.convert("abc\xA1\xFFdef")
4363 * rescue Encoding::InvalidByteSequenceError
4364 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4365 * puts $!.error_bytes.dump #=> "\xA1"
4366 * puts $!.readagain_bytes.dump #=> "\xFF"
4370 ecerr_error_bytes(VALUE self
)
4372 return rb_attr_get(self
, id_error_bytes
);
4377 * ecerr.readagain_bytes -> string
4379 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4382 ecerr_readagain_bytes(VALUE self
)
4384 return rb_attr_get(self
, id_readagain_bytes
);
4389 * ecerr.incomplete_input? -> true or false
4391 * Returns true if the invalid byte sequence error is caused by
4392 * premature end of string.
4394 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4397 * ec.convert("abc\xA1z")
4398 * rescue Encoding::InvalidByteSequenceError
4399 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4400 * p $!.incomplete_input? #=> false
4404 * ec.convert("abc\xA1")
4406 * rescue Encoding::InvalidByteSequenceError
4407 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4408 * p $!.incomplete_input? #=> true
4412 ecerr_incomplete_input(VALUE self
)
4414 return rb_attr_get(self
, id_incomplete_input
);
4418 * Document-class: Encoding::UndefinedConversionError
4420 * Raised by Encoding and String methods when a transcoding operation
4425 * Document-class: Encoding::InvalidByteSequenceError
4427 * Raised by Encoding and String methods when the string being
4428 * transcoded contains a byte invalid for either the source or
4433 * Document-class: Encoding::ConverterNotFoundError
4435 * Raised by transcoding methods when a named encoding does not
4436 * correspond with a known converter.
4440 Init_transcode(void)
4442 transcoder_table
= st_init_strcasetable();
4444 id_destination_encoding
= rb_intern_const("destination_encoding");
4445 id_destination_encoding_name
= rb_intern_const("destination_encoding_name");
4446 id_error_bytes
= rb_intern_const("error_bytes");
4447 id_error_char
= rb_intern_const("error_char");
4448 id_incomplete_input
= rb_intern_const("incomplete_input");
4449 id_readagain_bytes
= rb_intern_const("readagain_bytes");
4450 id_source_encoding
= rb_intern_const("source_encoding");
4451 id_source_encoding_name
= rb_intern_const("source_encoding_name");
4453 sym_invalid
= ID2SYM(rb_intern_const("invalid"));
4454 sym_undef
= ID2SYM(rb_intern_const("undef"));
4455 sym_replace
= ID2SYM(rb_intern_const("replace"));
4456 sym_fallback
= ID2SYM(rb_intern_const("fallback"));
4457 sym_xml
= ID2SYM(rb_intern_const("xml"));
4458 sym_text
= ID2SYM(rb_intern_const("text"));
4459 sym_attr
= ID2SYM(rb_intern_const("attr"));
4461 sym_invalid_byte_sequence
= ID2SYM(rb_intern_const("invalid_byte_sequence"));
4462 sym_undefined_conversion
= ID2SYM(rb_intern_const("undefined_conversion"));
4463 sym_destination_buffer_full
= ID2SYM(rb_intern_const("destination_buffer_full"));
4464 sym_source_buffer_empty
= ID2SYM(rb_intern_const("source_buffer_empty"));
4465 sym_finished
= ID2SYM(rb_intern_const("finished"));
4466 sym_after_output
= ID2SYM(rb_intern_const("after_output"));
4467 sym_incomplete_input
= ID2SYM(rb_intern_const("incomplete_input"));
4468 sym_universal_newline
= ID2SYM(rb_intern_const("universal_newline"));
4469 sym_crlf_newline
= ID2SYM(rb_intern_const("crlf_newline"));
4470 sym_cr_newline
= ID2SYM(rb_intern_const("cr_newline"));
4471 sym_partial_input
= ID2SYM(rb_intern_const("partial_input"));
4473 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4474 sym_newline
= ID2SYM(rb_intern_const("newline"));
4475 sym_universal
= ID2SYM(rb_intern_const("universal"));
4476 sym_crlf
= ID2SYM(rb_intern_const("crlf"));
4477 sym_cr
= ID2SYM(rb_intern_const("cr"));
4478 sym_lf
= ID2SYM(rb_intern_const("lf"));
4485 InitVM_transcode(void)
4487 rb_eUndefinedConversionError
= rb_define_class_under(rb_cEncoding
, "UndefinedConversionError", rb_eEncodingError
);
4488 rb_eInvalidByteSequenceError
= rb_define_class_under(rb_cEncoding
, "InvalidByteSequenceError", rb_eEncodingError
);
4489 rb_eConverterNotFoundError
= rb_define_class_under(rb_cEncoding
, "ConverterNotFoundError", rb_eEncodingError
);
4491 rb_define_method(rb_cString
, "encode", str_encode
, -1);
4492 rb_define_method(rb_cString
, "encode!", str_encode_bang
, -1);
4494 rb_cEncodingConverter
= rb_define_class_under(rb_cEncoding
, "Converter", rb_cObject
);
4495 rb_define_alloc_func(rb_cEncodingConverter
, econv_s_allocate
);
4496 rb_define_singleton_method(rb_cEncodingConverter
, "asciicompat_encoding", econv_s_asciicompat_encoding
, 1);
4497 rb_define_singleton_method(rb_cEncodingConverter
, "search_convpath", econv_s_search_convpath
, -1);
4498 rb_define_method(rb_cEncodingConverter
, "initialize", econv_init
, -1);
4499 rb_define_method(rb_cEncodingConverter
, "inspect", econv_inspect
, 0);
4500 rb_define_method(rb_cEncodingConverter
, "convpath", econv_convpath
, 0);
4501 rb_define_method(rb_cEncodingConverter
, "source_encoding", econv_source_encoding
, 0);
4502 rb_define_method(rb_cEncodingConverter
, "destination_encoding", econv_destination_encoding
, 0);
4503 rb_define_method(rb_cEncodingConverter
, "primitive_convert", econv_primitive_convert
, -1);
4504 rb_define_method(rb_cEncodingConverter
, "convert", econv_convert
, 1);
4505 rb_define_method(rb_cEncodingConverter
, "finish", econv_finish
, 0);
4506 rb_define_method(rb_cEncodingConverter
, "primitive_errinfo", econv_primitive_errinfo
, 0);
4507 rb_define_method(rb_cEncodingConverter
, "insert_output", econv_insert_output
, 1);
4508 rb_define_method(rb_cEncodingConverter
, "putback", econv_putback
, -1);
4509 rb_define_method(rb_cEncodingConverter
, "last_error", econv_last_error
, 0);
4510 rb_define_method(rb_cEncodingConverter
, "replacement", econv_get_replacement
, 0);
4511 rb_define_method(rb_cEncodingConverter
, "replacement=", econv_set_replacement
, 1);
4512 rb_define_method(rb_cEncodingConverter
, "==", econv_equal
, 1);
4514 /* Document-const: INVALID_MASK
4516 * Mask for invalid byte sequences
4518 rb_define_const(rb_cEncodingConverter
, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK
));
4520 /* Document-const: INVALID_REPLACE
4522 * Replace invalid byte sequences
4524 rb_define_const(rb_cEncodingConverter
, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE
));
4526 /* Document-const: UNDEF_MASK
4528 * Mask for a valid character in the source encoding but no related
4529 * character(s) in destination encoding.
4531 rb_define_const(rb_cEncodingConverter
, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK
));
4533 /* Document-const: UNDEF_REPLACE
4535 * Replace byte sequences that are undefined in the destination encoding.
4537 rb_define_const(rb_cEncodingConverter
, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE
));
4539 /* Document-const: UNDEF_HEX_CHARREF
4541 * Replace byte sequences that are undefined in the destination encoding
4542 * with an XML hexadecimal character reference. This is valid for XML
4545 rb_define_const(rb_cEncodingConverter
, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF
));
4547 /* Document-const: PARTIAL_INPUT
4549 * Indicates the source may be part of a larger string. See
4550 * primitive_convert for an example.
4552 rb_define_const(rb_cEncodingConverter
, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT
));
4554 /* Document-const: AFTER_OUTPUT
4556 * Stop converting after some output is complete but before all of the
4557 * input was consumed. See primitive_convert for an example.
4559 rb_define_const(rb_cEncodingConverter
, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT
));
4561 /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4563 * Decorator for converting CRLF and CR to LF
4565 rb_define_const(rb_cEncodingConverter
, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR
));
4567 /* Document-const: CRLF_NEWLINE_DECORATOR
4569 * Decorator for converting LF to CRLF
4571 rb_define_const(rb_cEncodingConverter
, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR
));
4573 /* Document-const: CR_NEWLINE_DECORATOR
4575 * Decorator for converting LF to CR
4577 rb_define_const(rb_cEncodingConverter
, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR
));
4579 /* Document-const: XML_TEXT_DECORATOR
4581 * Escape as XML CharData
4583 rb_define_const(rb_cEncodingConverter
, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR
));
4585 /* Document-const: XML_ATTR_CONTENT_DECORATOR
4587 * Escape as XML AttValue
4589 rb_define_const(rb_cEncodingConverter
, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR
));
4591 /* Document-const: XML_ATTR_QUOTE_DECORATOR
4593 * Escape as XML AttValue
4595 rb_define_const(rb_cEncodingConverter
, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR
));
4597 rb_define_method(rb_eUndefinedConversionError
, "source_encoding_name", ecerr_source_encoding_name
, 0);
4598 rb_define_method(rb_eUndefinedConversionError
, "destination_encoding_name", ecerr_destination_encoding_name
, 0);
4599 rb_define_method(rb_eUndefinedConversionError
, "source_encoding", ecerr_source_encoding
, 0);
4600 rb_define_method(rb_eUndefinedConversionError
, "destination_encoding", ecerr_destination_encoding
, 0);
4601 rb_define_method(rb_eUndefinedConversionError
, "error_char", ecerr_error_char
, 0);
4603 rb_define_method(rb_eInvalidByteSequenceError
, "source_encoding_name", ecerr_source_encoding_name
, 0);
4604 rb_define_method(rb_eInvalidByteSequenceError
, "destination_encoding_name", ecerr_destination_encoding_name
, 0);
4605 rb_define_method(rb_eInvalidByteSequenceError
, "source_encoding", ecerr_source_encoding
, 0);
4606 rb_define_method(rb_eInvalidByteSequenceError
, "destination_encoding", ecerr_destination_encoding
, 0);
4607 rb_define_method(rb_eInvalidByteSequenceError
, "error_bytes", ecerr_error_bytes
, 0);
4608 rb_define_method(rb_eInvalidByteSequenceError
, "readagain_bytes", ecerr_readagain_bytes
, 0);
4609 rb_define_method(rb_eInvalidByteSequenceError
, "incomplete_input?", ecerr_incomplete_input
, 0);