[ruby/win32ole] Undefine allocator of WIN32OLE_VARIABLE to get rid of warning
[ruby-80x24.org.git] / encoding.c
blob2737108379037c5b92654cf493bf8425755d0d80
/**********************************************************************

  encoding.c -

  $Author$
  created at: Thu May 24 17:23:27 JST 2007

  Copyright (C) 2007 Yukihiro Matsumoto

**********************************************************************/
12 #include "ruby/internal/config.h"
14 #include <ctype.h>
16 #include "encindex.h"
17 #include "internal.h"
18 #include "internal/enc.h"
19 #include "internal/encoding.h"
20 #include "internal/inits.h"
21 #include "internal/load.h"
22 #include "internal/object.h"
23 #include "internal/string.h"
24 #include "internal/vm.h"
25 #include "regenc.h"
26 #include "ruby/encoding.h"
27 #include "ruby/util.h"
28 #include "ruby_assert.h"
29 #include "vm_sync.h"
31 #ifndef ENC_DEBUG
32 #define ENC_DEBUG 0
33 #endif
34 #define ENC_ASSERT(expr) RUBY_ASSERT_WHEN(ENC_DEBUG, expr)
35 #define MUST_STRING(str) (ENC_ASSERT(RB_TYPE_P(str, T_STRING)), str)
37 #undef rb_ascii8bit_encindex
38 #undef rb_utf8_encindex
39 #undef rb_usascii_encindex
41 typedef OnigEncodingType rb_raw_encoding;
43 #if defined __GNUC__ && __GNUC__ >= 4
44 #pragma GCC visibility push(default)
45 int rb_enc_register(const char *name, rb_encoding *encoding);
46 void rb_enc_set_base(const char *name, const char *orig);
47 int rb_enc_set_dummy(int index);
48 void rb_encdb_declare(const char *name);
49 int rb_encdb_replicate(const char *name, const char *orig);
50 int rb_encdb_dummy(const char *name);
51 int rb_encdb_alias(const char *alias, const char *orig);
52 void rb_encdb_set_unicode(int index);
53 #pragma GCC visibility pop
54 #endif
56 static ID id_encoding;
57 VALUE rb_cEncoding;
59 #define DEFAULT_ENCODING_LIST_CAPA 128
60 static VALUE rb_default_encoding_list;
61 static VALUE rb_additional_encoding_list;
63 struct rb_encoding_entry {
64 const char *name;
65 rb_encoding *enc;
66 rb_encoding *base;
69 static struct enc_table {
70 struct rb_encoding_entry *list;
71 int count;
72 int size;
73 st_table *names;
74 } global_enc_table;
76 static rb_encoding *global_enc_ascii,
77 *global_enc_utf_8,
78 *global_enc_us_ascii;
/*
 * GLOBAL_ENC_TABLE_ENTER declares a local pointer to the global table and
 * takes the VM lock; GLOBAL_ENC_TABLE_LEAVE releases it.  The two must be
 * used as a pair in the same scope.  GLOBAL_ENC_TABLE_EVAL wraps a single
 * expression between them.
 */
#define GLOBAL_ENC_TABLE_ENTER(enc_table) struct enc_table *enc_table = &global_enc_table; RB_VM_LOCK_ENTER()
#define GLOBAL_ENC_TABLE_LEAVE()          RB_VM_LOCK_LEAVE()
#define GLOBAL_ENC_TABLE_EVAL(enc_table, expr) do { \
    GLOBAL_ENC_TABLE_ENTER(enc_table); \
    { \
        expr; \
    } \
    GLOBAL_ENC_TABLE_LEAVE(); \
} while (0)
/* ruby_encoding_index packs the table index (low 24 bits) and a dummy flag. */
#define ENC_DUMMY_FLAG (1<<24)
#define ENC_INDEX_MASK (~(~0U<<24))

#define ENC_TO_ENCINDEX(enc) (int)((enc)->ruby_encoding_index & ENC_INDEX_MASK)
#define ENC_DUMMY_P(enc)     ((enc)->ruby_encoding_index & ENC_DUMMY_FLAG)
#define ENC_SET_DUMMY(enc)   ((enc)->ruby_encoding_index |= ENC_DUMMY_FLAG)

#define ENCODING_COUNT       ENCINDEX_BUILTIN_MAX
#define UNSPECIFIED_ENCODING INT_MAX

/* A name is acceptable when it is non-NULL and at most ENCODING_NAMELEN_MAX bytes. */
#define ENCODING_NAMELEN_MAX 63
#define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
104 static const rb_data_type_t encoding_data_type = {
105 "encoding",
106 {0, 0, 0,},
107 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
110 #define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type)
111 #define is_obj_encoding(obj) (RB_TYPE_P((obj), T_DATA) && is_data_encoding(obj))
114 rb_data_is_encoding(VALUE obj)
116 return is_data_encoding(obj);
119 static VALUE
120 enc_new(rb_encoding *encoding)
122 VALUE enc = TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, (void *)encoding);
123 rb_obj_freeze(enc);
124 FL_SET_RAW(enc, RUBY_FL_SHAREABLE);
125 return enc;
128 static void
129 enc_list_update(int index, rb_raw_encoding *encoding)
131 if (index < DEFAULT_ENCODING_LIST_CAPA) {
132 VALUE list = rb_default_encoding_list;
133 if (list && NIL_P(rb_ary_entry(list, index))) {
134 /* initialize encoding data */
135 rb_ary_store(list, index, enc_new(encoding));
138 else {
139 RB_VM_LOCK_ENTER();
141 VALUE list = rb_additional_encoding_list;
142 if (list && NIL_P(rb_ary_entry(list, index))) {
143 /* initialize encoding data */
144 rb_ary_store(list, index - DEFAULT_ENCODING_LIST_CAPA, enc_new(encoding));
147 RB_VM_LOCK_LEAVE();
151 static VALUE
152 enc_list_lookup(int idx)
154 VALUE list, enc;
156 if (idx < DEFAULT_ENCODING_LIST_CAPA) {
157 if (!(list = rb_default_encoding_list)) {
158 rb_bug("rb_enc_from_encoding_index(%d): no rb_default_encoding_list", idx);
160 enc = rb_ary_entry(list, idx);
162 else {
163 RB_VM_LOCK_ENTER();
165 if (!(list = rb_additional_encoding_list)) {
166 rb_bug("rb_enc_from_encoding_index(%d): no rb_additional_encoding_list", idx);
168 enc = rb_ary_entry(list, idx - DEFAULT_ENCODING_LIST_CAPA);
170 RB_VM_LOCK_LEAVE();
173 if (NIL_P(enc)) {
174 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
176 else {
177 return enc;
181 static VALUE
182 rb_enc_from_encoding_index(int idx)
184 return enc_list_lookup(idx);
187 VALUE
188 rb_enc_from_encoding(rb_encoding *encoding)
190 int idx;
191 if (!encoding) return Qnil;
192 idx = ENC_TO_ENCINDEX(encoding);
193 return rb_enc_from_encoding_index(idx);
197 rb_enc_to_index(rb_encoding *enc)
199 return enc ? ENC_TO_ENCINDEX(enc) : 0;
203 rb_enc_dummy_p(rb_encoding *enc)
205 return ENC_DUMMY_P(enc) != 0;
208 static int
209 check_encoding(rb_encoding *enc)
211 int index = rb_enc_to_index(enc);
212 if (rb_enc_from_index(index) != enc)
213 return -1;
214 if (rb_enc_autoload_p(enc)) {
215 index = rb_enc_autoload(enc);
217 return index;
220 static int
221 enc_check_encoding(VALUE obj)
223 if (!is_obj_encoding(obj)) {
224 return -1;
226 return check_encoding(RDATA(obj)->data);
229 NORETURN(static void not_encoding(VALUE enc));
230 static void
231 not_encoding(VALUE enc)
233 rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Encoding)",
234 rb_obj_class(enc));
237 static rb_encoding *
238 must_encoding(VALUE enc)
240 int index = enc_check_encoding(enc);
241 if (index < 0) {
242 not_encoding(enc);
244 return DATA_PTR(enc);
247 static rb_encoding *
248 must_encindex(int index)
250 rb_encoding *enc = rb_enc_from_index(index);
251 if (!enc) {
252 rb_raise(rb_eEncodingError, "encoding index out of bound: %d",
253 index);
255 if (ENC_TO_ENCINDEX(enc) != (int)(index & ENC_INDEX_MASK)) {
256 rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)",
257 index, rb_enc_name(enc), ENC_TO_ENCINDEX(enc));
259 if (rb_enc_autoload_p(enc) && rb_enc_autoload(enc) == -1) {
260 rb_loaderror("failed to load encoding (%s)",
261 rb_enc_name(enc));
263 return enc;
267 rb_to_encoding_index(VALUE enc)
269 int idx;
270 const char *name;
272 idx = enc_check_encoding(enc);
273 if (idx >= 0) {
274 return idx;
276 else if (NIL_P(enc = rb_check_string_type(enc))) {
277 return -1;
279 if (!rb_enc_asciicompat(rb_enc_get(enc))) {
280 return -1;
282 if (!(name = rb_str_to_cstr(enc))) {
283 return -1;
285 return rb_enc_find_index(name);
288 static const char *
289 name_for_encoding(volatile VALUE *enc)
291 VALUE name = StringValue(*enc);
292 const char *n;
294 if (!rb_enc_asciicompat(rb_enc_get(name))) {
295 rb_raise(rb_eArgError, "invalid encoding name (non ASCII)");
297 if (!(n = rb_str_to_cstr(name))) {
298 rb_raise(rb_eArgError, "invalid encoding name (NUL byte)");
300 return n;
303 /* Returns encoding index or UNSPECIFIED_ENCODING */
304 static int
305 str_find_encindex(VALUE enc)
307 int idx = rb_enc_find_index(name_for_encoding(&enc));
308 RB_GC_GUARD(enc);
309 return idx;
312 static int
313 str_to_encindex(VALUE enc)
315 int idx = str_find_encindex(enc);
316 if (idx < 0) {
317 rb_raise(rb_eArgError, "unknown encoding name - %"PRIsVALUE, enc);
319 return idx;
322 static rb_encoding *
323 str_to_encoding(VALUE enc)
325 return rb_enc_from_index(str_to_encindex(enc));
328 rb_encoding *
329 rb_to_encoding(VALUE enc)
331 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
332 return str_to_encoding(enc);
335 rb_encoding *
336 rb_find_encoding(VALUE enc)
338 int idx;
339 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
340 idx = str_find_encindex(enc);
341 if (idx < 0) return NULL;
342 return rb_enc_from_index(idx);
345 static int
346 enc_table_expand(struct enc_table *enc_table, int newsize)
348 struct rb_encoding_entry *ent;
349 int count = newsize;
351 if (enc_table->size >= newsize) return newsize;
352 newsize = (newsize + 7) / 8 * 8;
353 ent = REALLOC_N(enc_table->list, struct rb_encoding_entry, newsize);
354 memset(ent + enc_table->size, 0, sizeof(*ent)*(newsize - enc_table->size));
355 enc_table->list = ent;
356 enc_table->size = newsize;
357 return count;
360 static int
361 enc_register_at(struct enc_table *enc_table, int index, const char *name, rb_encoding *base_encoding)
363 struct rb_encoding_entry *ent = &enc_table->list[index];
364 rb_raw_encoding *encoding;
366 if (!valid_encoding_name_p(name)) return -1;
367 if (!ent->name) {
368 ent->name = name = strdup(name);
370 else if (STRCASECMP(name, ent->name)) {
371 return -1;
373 encoding = (rb_raw_encoding *)ent->enc;
374 if (!encoding) {
375 encoding = xmalloc(sizeof(rb_encoding));
377 if (base_encoding) {
378 *encoding = *base_encoding;
380 else {
381 memset(encoding, 0, sizeof(*ent->enc));
383 encoding->name = name;
384 encoding->ruby_encoding_index = index;
385 ent->enc = encoding;
386 st_insert(enc_table->names, (st_data_t)name, (st_data_t)index);
388 enc_list_update(index, encoding);
389 return index;
392 static int
393 enc_register(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
395 int index = enc_table->count;
397 enc_table->count = enc_table_expand(enc_table, index + 1);
398 return enc_register_at(enc_table, index, name, encoding);
401 static void set_encoding_const(const char *, rb_encoding *);
402 static int enc_registered(struct enc_table *enc_table, const char *name);
404 static rb_encoding *
405 enc_from_index(struct enc_table *enc_table, int index)
407 if (UNLIKELY(index < 0 || enc_table->count <= (index &= ENC_INDEX_MASK))) {
408 return 0;
410 return enc_table->list[index].enc;
413 rb_encoding *
414 rb_enc_from_index(int index)
416 rb_encoding *enc;
418 switch (index) {
419 case ENCINDEX_ASCII: return global_enc_ascii;
420 case ENCINDEX_UTF_8: return global_enc_utf_8;
421 case ENCINDEX_US_ASCII: return global_enc_us_ascii;
422 default:
423 GLOBAL_ENC_TABLE_EVAL(enc_table,
424 enc = enc_from_index(enc_table, index));
425 return enc;
430 rb_enc_register(const char *name, rb_encoding *encoding)
432 int index;
434 GLOBAL_ENC_TABLE_ENTER(enc_table);
436 index = enc_registered(enc_table, name);
438 if (index >= 0) {
439 rb_encoding *oldenc = enc_from_index(enc_table, index);
440 if (STRCASECMP(name, rb_enc_name(oldenc))) {
441 index = enc_register(enc_table, name, encoding);
443 else if (rb_enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
444 enc_register_at(enc_table, index, name, encoding);
446 else {
447 rb_raise(rb_eArgError, "encoding %s is already registered", name);
450 else {
451 index = enc_register(enc_table, name, encoding);
452 set_encoding_const(name, rb_enc_from_index(index));
455 GLOBAL_ENC_TABLE_LEAVE();
456 return index;
460 enc_registered(struct enc_table *enc_table, const char *name)
462 st_data_t idx = 0;
464 if (!name) return -1;
465 if (!enc_table->list) return -1;
466 if (st_lookup(enc_table->names, (st_data_t)name, &idx)) {
467 return (int)idx;
469 return -1;
472 void
473 rb_encdb_declare(const char *name)
475 GLOBAL_ENC_TABLE_ENTER(enc_table);
477 int idx = enc_registered(enc_table, name);
478 if (idx < 0) {
479 idx = enc_register(enc_table, name, 0);
481 set_encoding_const(name, rb_enc_from_index(idx));
483 GLOBAL_ENC_TABLE_LEAVE();
486 static void
487 enc_check_duplication(struct enc_table *enc_table, const char *name)
489 if (enc_registered(enc_table, name) >= 0) {
490 rb_raise(rb_eArgError, "encoding %s is already registered", name);
494 static rb_encoding*
495 set_base_encoding(struct enc_table *enc_table, int index, rb_encoding *base)
497 rb_encoding *enc = enc_table->list[index].enc;
499 ASSUME(enc);
500 enc_table->list[index].base = base;
501 if (ENC_DUMMY_P(base)) ENC_SET_DUMMY((rb_raw_encoding *)enc);
502 return enc;
505 /* for encdb.h
506 * Set base encoding for encodings which are not replicas
507 * but not in their own files.
509 void
510 rb_enc_set_base(const char *name, const char *orig)
512 GLOBAL_ENC_TABLE_ENTER(enc_table);
514 int idx = enc_registered(enc_table, name);
515 int origidx = enc_registered(enc_table, orig);
516 set_base_encoding(enc_table, idx, rb_enc_from_index(origidx));
518 GLOBAL_ENC_TABLE_LEAVE();
521 /* for encdb.h
522 * Set encoding dummy.
525 rb_enc_set_dummy(int index)
527 rb_encoding *enc;
529 GLOBAL_ENC_TABLE_EVAL(enc_table,
530 enc = enc_table->list[index].enc);
532 ENC_SET_DUMMY((rb_raw_encoding *)enc);
533 return index;
536 static int
537 enc_replicate(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
539 int idx;
541 enc_check_duplication(enc_table, name);
542 idx = enc_register(enc_table, name, encoding);
543 if (idx < 0) rb_raise(rb_eArgError, "invalid encoding name: %s", name);
544 set_base_encoding(enc_table, idx, encoding);
545 set_encoding_const(name, rb_enc_from_index(idx));
546 return idx;
550 rb_enc_replicate(const char *name, rb_encoding *encoding)
552 int r;
554 GLOBAL_ENC_TABLE_EVAL(enc_table,
555 r = enc_replicate(enc_table, name, encoding));
557 return r;
561 * call-seq:
562 * enc.replicate(name) -> encoding
564 * Returns a replicated encoding of _enc_ whose name is _name_.
565 * The new encoding should have the same byte structure of _enc_.
566 * If _name_ is used by another encoding, raise ArgumentError.
569 static VALUE
570 enc_replicate_m(VALUE encoding, VALUE name)
572 int idx = rb_enc_replicate(name_for_encoding(&name), rb_to_encoding(encoding));
573 RB_GC_GUARD(name);
574 return rb_enc_from_encoding_index(idx);
577 static int
578 enc_replicate_with_index(struct enc_table *enc_table, const char *name, rb_encoding *origenc, int idx)
580 if (idx < 0) {
581 idx = enc_register(enc_table, name, origenc);
583 else {
584 idx = enc_register_at(enc_table, idx, name, origenc);
586 if (idx >= 0) {
587 set_base_encoding(enc_table, idx, origenc);
588 set_encoding_const(name, rb_enc_from_index(idx));
590 else {
591 rb_raise(rb_eArgError, "failed to replicate encoding");
593 return idx;
597 rb_encdb_replicate(const char *name, const char *orig)
599 int r;
601 GLOBAL_ENC_TABLE_ENTER(enc_table);
603 int origidx = enc_registered(enc_table, orig);
604 int idx = enc_registered(enc_table, name);
606 if (origidx < 0) {
607 origidx = enc_register(enc_table, orig, 0);
609 r = enc_replicate_with_index(enc_table, name, rb_enc_from_index(origidx), idx);
611 GLOBAL_ENC_TABLE_LEAVE();
613 return r;
617 rb_define_dummy_encoding(const char *name)
619 int index;
621 GLOBAL_ENC_TABLE_ENTER(enc_table);
623 index = enc_replicate(enc_table, name, rb_ascii8bit_encoding());
624 rb_encoding *enc = enc_table->list[index].enc;
625 ENC_SET_DUMMY((rb_raw_encoding *)enc);
627 GLOBAL_ENC_TABLE_LEAVE();
629 return index;
633 rb_encdb_dummy(const char *name)
635 int index;
637 GLOBAL_ENC_TABLE_ENTER(enc_table);
639 index = enc_replicate_with_index(enc_table, name,
640 rb_ascii8bit_encoding(),
641 enc_registered(enc_table, name));
642 rb_encoding *enc = enc_table->list[index].enc;
643 ENC_SET_DUMMY((rb_raw_encoding *)enc);
645 GLOBAL_ENC_TABLE_LEAVE();
647 return index;
651 * call-seq:
652 * enc.dummy? -> true or false
654 * Returns true for dummy encodings.
655 * A dummy encoding is an encoding for which character handling is not properly
656 * implemented.
657 * It is used for stateful encodings.
659 * Encoding::ISO_2022_JP.dummy? #=> true
660 * Encoding::UTF_8.dummy? #=> false
663 static VALUE
664 enc_dummy_p(VALUE enc)
666 return RBOOL(ENC_DUMMY_P(must_encoding(enc)));
670 * call-seq:
671 * enc.ascii_compatible? -> true or false
673 * Returns whether ASCII-compatible or not.
675 * Encoding::UTF_8.ascii_compatible? #=> true
676 * Encoding::UTF_16BE.ascii_compatible? #=> false
679 static VALUE
680 enc_ascii_compatible_p(VALUE enc)
682 return RBOOL(rb_enc_asciicompat(must_encoding(enc)));
686 * Returns non-zero when the encoding is Unicode series other than UTF-7 else 0.
689 rb_enc_unicode_p(rb_encoding *enc)
691 return ONIGENC_IS_UNICODE(enc);
694 static st_data_t
695 enc_dup_name(st_data_t name)
697 return (st_data_t)strdup((const char *)name);
701 * Returns copied alias name when the key is added for st_table,
702 * else returns NULL.
704 static int
705 enc_alias_internal(struct enc_table *enc_table, const char *alias, int idx)
707 return st_insert2(enc_table->names, (st_data_t)alias, (st_data_t)idx,
708 enc_dup_name);
/*
 * Adds `alias` for the encoding at `idx`.  Returns -1 for an invalid alias
 * name, otherwise `idx`.  The constant is defined only when
 * enc_alias_internal() reports zero.
 */
static int
enc_alias(struct enc_table *enc_table, const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) return -1;

    if (enc_alias_internal(enc_table, alias, idx) == 0) {
        set_encoding_const(alias, enc_from_index(enc_table, idx));
    }
    return idx;
}
721 rb_enc_alias(const char *alias, const char *orig)
723 int idx, r;
725 GLOBAL_ENC_TABLE_ENTER(enc_table);
727 enc_check_duplication(enc_table, alias);
728 if ((idx = rb_enc_find_index(orig)) < 0) {
729 r = -1;
731 else {
732 r = enc_alias(enc_table, alias, idx);
735 GLOBAL_ENC_TABLE_LEAVE();
737 return r;
741 rb_encdb_alias(const char *alias, const char *orig)
743 int r;
745 GLOBAL_ENC_TABLE_ENTER(enc_table);
747 int idx = enc_registered(enc_table, orig);
749 if (idx < 0) {
750 idx = enc_register(enc_table, orig, 0);
752 r = enc_alias(enc_table, alias, idx);
754 GLOBAL_ENC_TABLE_LEAVE();
756 return r;
759 void
760 rb_encdb_set_unicode(int index)
762 rb_raw_encoding *enc = (rb_raw_encoding *)rb_enc_from_index(index);
763 ASSUME(enc);
764 enc->flags |= ONIGENC_FLAG_UNICODE;
767 static void
768 rb_enc_init(struct enc_table *enc_table)
770 enc_table_expand(enc_table, ENCODING_COUNT + 1);
771 if (!enc_table->names) {
772 enc_table->names = st_init_strcasetable();
774 #define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
775 ENC_REGISTER(ASCII);
776 ENC_REGISTER(UTF_8);
777 ENC_REGISTER(US_ASCII);
778 global_enc_ascii = enc_table->list[ENCINDEX_ASCII].enc;
779 global_enc_utf_8 = enc_table->list[ENCINDEX_UTF_8].enc;
780 global_enc_us_ascii = enc_table->list[ENCINDEX_US_ASCII].enc;
781 #undef ENC_REGISTER
782 #define ENCDB_REGISTER(name, enc) enc_register_at(enc_table, ENCINDEX_##enc, name, NULL)
783 ENCDB_REGISTER("UTF-16BE", UTF_16BE);
784 ENCDB_REGISTER("UTF-16LE", UTF_16LE);
785 ENCDB_REGISTER("UTF-32BE", UTF_32BE);
786 ENCDB_REGISTER("UTF-32LE", UTF_32LE);
787 ENCDB_REGISTER("UTF-16", UTF_16);
788 ENCDB_REGISTER("UTF-32", UTF_32);
789 ENCDB_REGISTER("UTF8-MAC", UTF8_MAC);
791 ENCDB_REGISTER("EUC-JP", EUC_JP);
792 ENCDB_REGISTER("Windows-31J", Windows_31J);
793 #undef ENCDB_REGISTER
794 enc_table->count = ENCINDEX_BUILTIN_MAX;
797 rb_encoding *
798 rb_enc_get_from_index(int index)
800 return must_encindex(index);
803 int rb_require_internal_silent(VALUE fname);
805 static int
806 load_encoding(const char *name)
808 VALUE enclib = rb_sprintf("enc/%s.so", name);
809 VALUE debug = ruby_debug;
810 VALUE errinfo;
811 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
812 int loaded;
813 int idx;
815 while (s < e) {
816 if (!ISALNUM(*s)) *s = '_';
817 else if (ISUPPER(*s)) *s = (char)TOLOWER(*s);
818 ++s;
820 enclib = rb_fstring(enclib);
821 ruby_debug = Qfalse;
822 errinfo = rb_errinfo();
823 loaded = rb_require_internal_silent(enclib);
824 ruby_debug = debug;
825 rb_set_errinfo(errinfo);
827 GLOBAL_ENC_TABLE_ENTER(enc_table);
829 if (loaded < 0 || 1 < loaded) {
830 idx = -1;
832 else if ((idx = enc_registered(enc_table, name)) < 0) {
833 idx = -1;
835 else if (rb_enc_autoload_p(enc_table->list[idx].enc)) {
836 idx = -1;
839 GLOBAL_ENC_TABLE_LEAVE();
841 return idx;
844 static int
845 enc_autoload_body(struct enc_table *enc_table, rb_encoding *enc)
847 rb_encoding *base = enc_table->list[ENC_TO_ENCINDEX(enc)].base;
849 if (base) {
850 int i = 0;
851 do {
852 if (i >= enc_table->count) return -1;
853 } while (enc_table->list[i].enc != base && (++i, 1));
854 if (rb_enc_autoload_p(base)) {
855 if (rb_enc_autoload(base) < 0) return -1;
857 i = enc->ruby_encoding_index;
858 enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base);
859 ((rb_raw_encoding *)enc)->ruby_encoding_index = i;
860 i &= ENC_INDEX_MASK;
861 return i;
863 else {
864 return -2;
869 rb_enc_autoload(rb_encoding *enc)
871 int i;
872 GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_autoload_body(enc_table, enc));
873 if (i == -2) {
874 i = load_encoding(rb_enc_name(enc));
876 return i;
879 /* Return encoding index or UNSPECIFIED_ENCODING from encoding name */
881 rb_enc_find_index(const char *name)
883 int i;
884 rb_encoding *enc;
886 GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_registered(enc_table, name));
888 if (i < 0) {
889 i = load_encoding(name);
891 else if (!(enc = rb_enc_from_index(i))) {
892 if (i != UNSPECIFIED_ENCODING) {
893 rb_raise(rb_eArgError, "encoding %s is not registered", name);
896 else if (rb_enc_autoload_p(enc)) {
897 if (rb_enc_autoload(enc) < 0) {
898 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
899 name);
900 return 0;
903 return i;
907 rb_enc_find_index2(const char *name, long len)
909 char buf[ENCODING_NAMELEN_MAX+1];
911 if (len > ENCODING_NAMELEN_MAX) return -1;
912 memcpy(buf, name, len);
913 buf[len] = '\0';
914 return rb_enc_find_index(buf);
917 rb_encoding *
918 rb_enc_find(const char *name)
920 int idx = rb_enc_find_index(name);
921 if (idx < 0) idx = 0;
922 return rb_enc_from_index(idx);
925 static inline int
926 enc_capable(VALUE obj)
928 if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj);
929 switch (BUILTIN_TYPE(obj)) {
930 case T_STRING:
931 case T_REGEXP:
932 case T_FILE:
933 case T_SYMBOL:
934 return TRUE;
935 case T_DATA:
936 if (is_data_encoding(obj)) return TRUE;
937 default:
938 return FALSE;
943 rb_enc_capable(VALUE obj)
945 return enc_capable(obj);
949 rb_id_encoding(void)
951 CONST_ID(id_encoding, "encoding");
952 return id_encoding;
955 static int
956 enc_get_index_str(VALUE str)
958 int i = ENCODING_GET_INLINED(str);
959 if (i == ENCODING_INLINE_MAX) {
960 VALUE iv;
962 #if 0
963 iv = rb_ivar_get(str, rb_id_encoding());
964 i = NUM2INT(iv);
965 #else
967 * Tentatively, assume ASCII-8BIT, if encoding index instance
968 * variable is not found. This can happen when freeing after
969 * all instance variables are removed in `obj_free`.
971 iv = rb_attr_get(str, rb_id_encoding());
972 i = NIL_P(iv) ? ENCINDEX_ASCII : NUM2INT(iv);
973 #endif
975 return i;
979 rb_enc_get_index(VALUE obj)
981 int i = -1;
982 VALUE tmp;
984 if (SPECIAL_CONST_P(obj)) {
985 if (!SYMBOL_P(obj)) return -1;
986 obj = rb_sym2str(obj);
988 switch (BUILTIN_TYPE(obj)) {
989 case T_STRING:
990 case T_SYMBOL:
991 case T_REGEXP:
992 i = enc_get_index_str(obj);
993 break;
994 case T_FILE:
995 tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
996 if (NIL_P(tmp)) {
997 tmp = rb_funcallv(obj, rb_intern("external_encoding"), 0, 0);
999 if (is_obj_encoding(tmp)) {
1000 i = enc_check_encoding(tmp);
1002 break;
1003 case T_DATA:
1004 if (is_data_encoding(obj)) {
1005 i = enc_check_encoding(obj);
1007 break;
1008 default:
1009 break;
1011 return i;
1014 static void
1015 enc_set_index(VALUE obj, int idx)
1017 if (!enc_capable(obj)) {
1018 rb_raise(rb_eArgError, "cannot set encoding on non-encoding capable object");
1021 if (idx < ENCODING_INLINE_MAX) {
1022 ENCODING_SET_INLINED(obj, idx);
1023 return;
1025 ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
1026 rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
1029 void
1030 rb_enc_set_index(VALUE obj, int idx)
1032 rb_check_frozen(obj);
1033 must_encindex(idx);
1034 enc_set_index(obj, idx);
1037 VALUE
1038 rb_enc_associate_index(VALUE obj, int idx)
1040 rb_encoding *enc;
1041 int oldidx, oldtermlen, termlen;
1043 /* enc_check_capable(obj);*/
1044 rb_check_frozen(obj);
1045 oldidx = rb_enc_get_index(obj);
1046 if (oldidx == idx)
1047 return obj;
1048 if (SPECIAL_CONST_P(obj)) {
1049 rb_raise(rb_eArgError, "cannot set encoding");
1051 enc = must_encindex(idx);
1052 if (!ENC_CODERANGE_ASCIIONLY(obj) ||
1053 !rb_enc_asciicompat(enc)) {
1054 ENC_CODERANGE_CLEAR(obj);
1056 termlen = rb_enc_mbminlen(enc);
1057 oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx));
1058 if (oldtermlen != termlen && RB_TYPE_P(obj, T_STRING)) {
1059 rb_str_change_terminator_length(obj, oldtermlen, termlen);
1061 enc_set_index(obj, idx);
1062 return obj;
1065 VALUE
1066 rb_enc_associate(VALUE obj, rb_encoding *enc)
1068 return rb_enc_associate_index(obj, rb_enc_to_index(enc));
1071 rb_encoding*
1072 rb_enc_get(VALUE obj)
1074 return rb_enc_from_index(rb_enc_get_index(obj));
1077 static rb_encoding*
1078 rb_encoding_check(rb_encoding* enc, VALUE str1, VALUE str2)
1080 if (!enc)
1081 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1082 rb_enc_name(rb_enc_get(str1)),
1083 rb_enc_name(rb_enc_get(str2)));
1084 return enc;
1087 static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
1089 rb_encoding*
1090 rb_enc_check_str(VALUE str1, VALUE str2)
1092 rb_encoding *enc = enc_compatible_str(MUST_STRING(str1), MUST_STRING(str2));
1093 return rb_encoding_check(enc, str1, str2);
1096 rb_encoding*
1097 rb_enc_check(VALUE str1, VALUE str2)
1099 rb_encoding *enc = rb_enc_compatible(str1, str2);
1100 return rb_encoding_check(enc, str1, str2);
1103 static rb_encoding*
1104 enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
1106 int isstr1, isstr2;
1107 rb_encoding *enc1 = rb_enc_from_index(idx1);
1108 rb_encoding *enc2 = rb_enc_from_index(idx2);
1110 isstr2 = RB_TYPE_P(str2, T_STRING);
1111 if (isstr2 && RSTRING_LEN(str2) == 0)
1112 return enc1;
1113 isstr1 = RB_TYPE_P(str1, T_STRING);
1114 if (isstr1 && isstr2 && RSTRING_LEN(str1) == 0)
1115 return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
1116 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
1117 return 0;
1120 /* objects whose encoding is the same of contents */
1121 if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
1122 return enc1;
1123 if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
1124 return enc2;
1126 if (!isstr1) {
1127 VALUE tmp = str1;
1128 int idx0 = idx1;
1129 str1 = str2;
1130 str2 = tmp;
1131 idx1 = idx2;
1132 idx2 = idx0;
1133 idx0 = isstr1;
1134 isstr1 = isstr2;
1135 isstr2 = idx0;
1137 if (isstr1) {
1138 int cr1, cr2;
1140 cr1 = rb_enc_str_coderange(str1);
1141 if (isstr2) {
1142 cr2 = rb_enc_str_coderange(str2);
1143 if (cr1 != cr2) {
1144 /* may need to handle ENC_CODERANGE_BROKEN */
1145 if (cr1 == ENC_CODERANGE_7BIT) return enc2;
1146 if (cr2 == ENC_CODERANGE_7BIT) return enc1;
1148 if (cr2 == ENC_CODERANGE_7BIT) {
1149 return enc1;
1152 if (cr1 == ENC_CODERANGE_7BIT)
1153 return enc2;
1155 return 0;
1158 static rb_encoding*
1159 enc_compatible_str(VALUE str1, VALUE str2)
1161 int idx1 = enc_get_index_str(str1);
1162 int idx2 = enc_get_index_str(str2);
1164 if (idx1 < 0 || idx2 < 0)
1165 return 0;
1167 if (idx1 == idx2) {
1168 return rb_enc_from_index(idx1);
1170 else {
1171 return enc_compatible_latter(str1, str2, idx1, idx2);
1175 rb_encoding*
1176 rb_enc_compatible(VALUE str1, VALUE str2)
1178 int idx1 = rb_enc_get_index(str1);
1179 int idx2 = rb_enc_get_index(str2);
1181 if (idx1 < 0 || idx2 < 0)
1182 return 0;
1184 if (idx1 == idx2) {
1185 return rb_enc_from_index(idx1);
1188 return enc_compatible_latter(str1, str2, idx1, idx2);
1191 void
1192 rb_enc_copy(VALUE obj1, VALUE obj2)
1194 rb_enc_associate_index(obj1, rb_enc_get_index(obj2));
1199 * call-seq:
1200 * obj.encoding -> encoding
1202 * Returns the Encoding object that represents the encoding of obj.
1205 VALUE
1206 rb_obj_encoding(VALUE obj)
1208 int idx = rb_enc_get_index(obj);
1209 if (idx < 0) {
1210 rb_raise(rb_eTypeError, "unknown encoding");
1212 return rb_enc_from_encoding_index(idx & ENC_INDEX_MASK);
1216 rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
1218 return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1222 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
1224 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1225 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
1226 return MBCLEN_CHARFOUND_LEN(n);
1227 else {
1228 int min = rb_enc_mbminlen(enc);
1229 return min <= e-p ? min : (int)(e-p);
1234 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
1236 int n;
1237 if (e <= p)
1238 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
1239 n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1240 if (e-p < n)
1241 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
1242 return n;
1246 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
1248 unsigned int c;
1249 int l;
1250 if (e <= p)
1251 return -1;
1252 if (rb_enc_asciicompat(enc)) {
1253 c = (unsigned char)*p;
1254 if (!ISASCII(c))
1255 return -1;
1256 if (len) *len = 1;
1257 return c;
1259 l = rb_enc_precise_mbclen(p, e, enc);
1260 if (!MBCLEN_CHARFOUND_P(l))
1261 return -1;
1262 c = rb_enc_mbc_to_codepoint(p, e, enc);
1263 if (!rb_enc_isascii(c, enc))
1264 return -1;
1265 if (len) *len = l;
1266 return c;
1269 unsigned int
1270 rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
1272 int r;
1273 if (e <= p)
1274 rb_raise(rb_eArgError, "empty string");
1275 r = rb_enc_precise_mbclen(p, e, enc);
1276 if (!MBCLEN_CHARFOUND_P(r)) {
1277 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
1279 if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
1280 return rb_enc_mbc_to_codepoint(p, e, enc);
1284 rb_enc_codelen(int c, rb_encoding *enc)
1286 int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
1287 if (n == 0) {
1288 rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
1290 return n;
1294 rb_enc_toupper(int c, rb_encoding *enc)
1296 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
1300 rb_enc_tolower(int c, rb_encoding *enc)
1302 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
1306 * call-seq:
1307 * enc.inspect -> string
1309 * Returns a string which represents the encoding for programmers.
1311 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
1312 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
1314 static VALUE
1315 enc_inspect(VALUE self)
1317 rb_encoding *enc;
1319 if (!is_data_encoding(self)) {
1320 not_encoding(self);
1322 if (!(enc = DATA_PTR(self)) || rb_enc_from_index(rb_enc_to_index(enc)) != enc) {
1323 rb_raise(rb_eTypeError, "broken Encoding");
1325 return rb_enc_sprintf(rb_usascii_encoding(),
1326 "#<%"PRIsVALUE":%s%s%s>", rb_obj_class(self),
1327 rb_enc_name(enc),
1328 (ENC_DUMMY_P(enc) ? " (dummy)" : ""),
1329 rb_enc_autoload_p(enc) ? " (autoload)" : "");
1333 * call-seq:
1334 * enc.name -> string
1335 * enc.to_s -> string
1337 * Returns the name of the encoding.
1339 * Encoding::UTF_8.name #=> "UTF-8"
1341 static VALUE
1342 enc_name(VALUE self)
1344 return rb_fstring_cstr(rb_enc_name((rb_encoding*)DATA_PTR(self)));
1347 static int
1348 enc_names_i(st_data_t name, st_data_t idx, st_data_t args)
1350 VALUE *arg = (VALUE *)args;
1352 if ((int)idx == (int)arg[0]) {
1353 VALUE str = rb_fstring_cstr((char *)name);
1354 rb_ary_push(arg[1], str);
1356 return ST_CONTINUE;
1360 * call-seq:
1361 * enc.names -> array
1363 * Returns the list of name and aliases of the encoding.
1365 * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"]
1367 static VALUE
1368 enc_names(VALUE self)
1370 VALUE args[2];
1372 args[0] = (VALUE)rb_to_encoding_index(self);
1373 args[1] = rb_ary_new2(0);
1375 GLOBAL_ENC_TABLE_EVAL(enc_table,
1376 st_foreach(enc_table->names, enc_names_i, (st_data_t)args));
1378 return args[1];
1382 * call-seq:
1383 * Encoding.list -> [enc1, enc2, ...]
1385 * Returns the list of loaded encodings.
1387 * Encoding.list
1388 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1389 * #<Encoding:ISO-2022-JP (dummy)>]
1391 * Encoding.find("US-ASCII")
1392 * #=> #<Encoding:US-ASCII>
1394 * Encoding.list
1395 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1396 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
1399 static VALUE
1400 enc_list(VALUE klass)
1402 VALUE ary = rb_ary_new2(0);
1404 RB_VM_LOCK_ENTER();
1406 rb_ary_replace(ary, rb_default_encoding_list);
1407 rb_ary_concat(ary, rb_additional_encoding_list);
1409 RB_VM_LOCK_LEAVE();
1411 return ary;
1415 * call-seq:
1416 * Encoding.find(string) -> enc
1418 * Search the encoding with specified <i>name</i>.
1419 * <i>name</i> should be a string.
1421 * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII>
1423 * Names which this method accepts are encoding names and aliases,
1424 * including the following special aliases:
1426 * "external":: default external encoding
1427 * "internal":: default internal encoding
1428 * "locale":: locale encoding
1429 * "filesystem":: filesystem encoding
1431 * An ArgumentError is raised when there is no encoding with <i>name</i>.
1432 * Only <code>Encoding.find("internal")</code> however returns nil
1433 * when no encoding named "internal", in other words, when Ruby has no
1434 * default internal encoding.
1436 static VALUE
1437 enc_find(VALUE klass, VALUE enc)
1439 int idx;
1440 if (is_obj_encoding(enc))
1441 return enc;
1442 idx = str_to_encindex(enc);
1443 if (idx == UNSPECIFIED_ENCODING) return Qnil;
1444 return rb_enc_from_encoding_index(idx);
1448 * call-seq:
1449 * Encoding.compatible?(obj1, obj2) -> enc or nil
1451 * Checks the compatibility of two objects.
1453 * If the objects are both strings they are compatible when they are
1454 * concatenatable. The encoding of the concatenated string will be returned
1455 * if they are compatible, nil if they are not.
1457 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
1458 * #=> #<Encoding:ISO-8859-1>
1460 * Encoding.compatible?(
1461 * "\xa1".force_encoding("iso-8859-1"),
1462 * "\xa1\xa1".force_encoding("euc-jp"))
1463 * #=> nil
1465 * If the objects are non-strings their encodings are compatible when they
1466 * have an encoding and:
1467 * * Either encoding is US-ASCII compatible
1468 * * One of the encodings is a 7-bit encoding
1471 static VALUE
1472 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
1474 rb_encoding *enc;
1476 if (!enc_capable(str1)) return Qnil;
1477 if (!enc_capable(str2)) return Qnil;
1478 enc = rb_enc_compatible(str1, str2);
1479 if (!enc) return Qnil;
1480 return rb_enc_from_encoding(enc);
1483 NORETURN(static VALUE enc_s_alloc(VALUE klass));
1484 /* :nodoc: */
1485 static VALUE
1486 enc_s_alloc(VALUE klass)
1488 rb_undefined_alloc(klass);
1489 UNREACHABLE_RETURN(Qnil);
1492 /* :nodoc: */
1493 static VALUE
1494 enc_dump(int argc, VALUE *argv, VALUE self)
1496 rb_check_arity(argc, 0, 1);
1497 return enc_name(self);
1500 /* :nodoc: */
1501 static VALUE
1502 enc_load(VALUE klass, VALUE str)
1504 return str;
1507 /* :nodoc: */
1508 static VALUE
1509 enc_m_loader(VALUE klass, VALUE str)
1511 return enc_find(klass, str);
1514 rb_encoding *
1515 rb_ascii8bit_encoding(void)
1517 return global_enc_ascii;
1521 rb_ascii8bit_encindex(void)
1523 return ENCINDEX_ASCII;
1526 rb_encoding *
1527 rb_utf8_encoding(void)
1529 return global_enc_utf_8;
1533 rb_utf8_encindex(void)
1535 return ENCINDEX_UTF_8;
1538 rb_encoding *
1539 rb_usascii_encoding(void)
1541 return global_enc_us_ascii;
1545 rb_usascii_encindex(void)
1547 return ENCINDEX_US_ASCII;
1550 int rb_locale_charmap_index(void);
1553 rb_locale_encindex(void)
1555 int idx = rb_locale_charmap_index();
1557 if (idx < 0) idx = ENCINDEX_UTF_8;
1559 GLOBAL_ENC_TABLE_ENTER(enc_table);
1560 if (enc_registered(enc_table, "locale") < 0) {
1561 # if defined _WIN32
1562 void Init_w32_codepage(void);
1563 Init_w32_codepage();
1564 # endif
1565 enc_alias_internal(enc_table, "locale", idx);
1567 GLOBAL_ENC_TABLE_LEAVE();
1569 return idx;
1572 rb_encoding *
1573 rb_locale_encoding(void)
1575 return rb_enc_from_index(rb_locale_encindex());
1579 rb_filesystem_encindex(void)
1581 int idx;
1583 GLOBAL_ENC_TABLE_EVAL(enc_table,
1584 idx = enc_registered(enc_table, "filesystem"));
1586 if (idx < 0)
1587 idx = ENCINDEX_ASCII;
1588 return idx;
1591 rb_encoding *
1592 rb_filesystem_encoding(void)
1594 return rb_enc_from_index(rb_filesystem_encindex());
1597 struct default_encoding {
1598 int index; /* -2 => not yet set, -1 => nil */
1599 rb_encoding *enc;
1602 static struct default_encoding default_external = {0};
1604 static int
1605 enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name)
1607 int overridden = FALSE;
1609 if (def->index != -2)
1610 /* Already set */
1611 overridden = TRUE;
1613 GLOBAL_ENC_TABLE_ENTER(enc_table);
1615 if (NIL_P(encoding)) {
1616 def->index = -1;
1617 def->enc = 0;
1618 st_insert(enc_table->names, (st_data_t)strdup(name),
1619 (st_data_t)UNSPECIFIED_ENCODING);
1621 else {
1622 def->index = rb_enc_to_index(rb_to_encoding(encoding));
1623 def->enc = 0;
1624 enc_alias_internal(enc_table, name, def->index);
1627 if (def == &default_external) {
1628 enc_alias_internal(enc_table, "filesystem", Init_enc_set_filesystem_encoding());
1631 GLOBAL_ENC_TABLE_LEAVE();
1633 return overridden;
1636 rb_encoding *
1637 rb_default_external_encoding(void)
1639 if (default_external.enc) return default_external.enc;
1641 if (default_external.index >= 0) {
1642 default_external.enc = rb_enc_from_index(default_external.index);
1643 return default_external.enc;
1645 else {
1646 return rb_locale_encoding();
1650 VALUE
1651 rb_enc_default_external(void)
1653 return rb_enc_from_encoding(rb_default_external_encoding());
1657 * call-seq:
1658 * Encoding.default_external -> enc
1660 * Returns default external encoding.
1662 * The default external encoding is used by default for strings created from
1663 * the following locations:
1665 * * CSV
1666 * * File data read from disk
1667 * * SDBM
1668 * * StringIO
1669 * * Zlib::GzipReader
1670 * * Zlib::GzipWriter
1671 * * String#inspect
1672 * * Regexp#inspect
1674 * While strings created from these locations will have this encoding, the
1675 * encoding may not be valid. Be sure to check String#valid_encoding?.
1677 * File data written to disk will be transcoded to the default external
1678 * encoding when written, if default_internal is not nil.
1680 * The default external encoding is initialized by the -E option.
1681 * If -E isn't set, it is initialized to UTF-8 on Windows and the locale on
1682 * other operating systems.
1684 static VALUE
1685 get_default_external(VALUE klass)
1687 return rb_enc_default_external();
1690 void
1691 rb_enc_set_default_external(VALUE encoding)
1693 if (NIL_P(encoding)) {
1694 rb_raise(rb_eArgError, "default external can not be nil");
1696 enc_set_default_encoding(&default_external, encoding,
1697 "external");
1701 * call-seq:
1702 * Encoding.default_external = enc
1704 * Sets default external encoding. You should not set
1705 * Encoding::default_external in ruby code as strings created before changing
1706 * the value may have a different encoding from strings created after the value
1707 * was changed. Instead, you should use <tt>ruby -E</tt> to invoke ruby with
1708 * the correct default_external.
1710 * See Encoding::default_external for information on how the default external
1711 * encoding is used.
1713 static VALUE
1714 set_default_external(VALUE klass, VALUE encoding)
1716 rb_warning("setting Encoding.default_external");
1717 rb_enc_set_default_external(encoding);
1718 return encoding;
1721 static struct default_encoding default_internal = {-2};
1723 rb_encoding *
1724 rb_default_internal_encoding(void)
1726 if (!default_internal.enc && default_internal.index >= 0) {
1727 default_internal.enc = rb_enc_from_index(default_internal.index);
1729 return default_internal.enc; /* can be NULL */
1732 VALUE
1733 rb_enc_default_internal(void)
1735 /* Note: These functions cope with default_internal not being set */
1736 return rb_enc_from_encoding(rb_default_internal_encoding());
1740 * call-seq:
1741 * Encoding.default_internal -> enc
1743 * Returns default internal encoding. Strings will be transcoded to the
1744 * default internal encoding in the following places if the default internal
1745 * encoding is not nil:
1747 * * CSV
1748 * * Etc.sysconfdir and Etc.systmpdir
1749 * * File data read from disk
1750 * * File names from Dir
1751 * * Integer#chr
1752 * * String#inspect and Regexp#inspect
1753 * * Strings returned from Readline
1754 * * Strings returned from SDBM
1755 * * Time#zone
1756 * * Values from ENV
1757 * * Values in ARGV including $PROGRAM_NAME
1759 * Additionally String#encode and String#encode! use the default internal
1760 * encoding if no encoding is given.
1762 * The script encoding (__ENCODING__), not default_internal, is used as the
1763 * encoding of created strings.
1765 * Encoding::default_internal is initialized with -E option or nil otherwise.
1767 static VALUE
1768 get_default_internal(VALUE klass)
1770 return rb_enc_default_internal();
1773 void
1774 rb_enc_set_default_internal(VALUE encoding)
1776 enc_set_default_encoding(&default_internal, encoding,
1777 "internal");
1781 * call-seq:
1782 * Encoding.default_internal = enc or nil
1784 * Sets default internal encoding or removes default internal encoding when
1785 * passed nil. You should not set Encoding::default_internal in ruby code as
1786 * strings created before changing the value may have a different encoding
1787 * from strings created after the change. Instead you should use
1788 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal.
1790 * See Encoding::default_internal for information on how the default internal
1791 * encoding is used.
1793 static VALUE
1794 set_default_internal(VALUE klass, VALUE encoding)
1796 rb_warning("setting Encoding.default_internal");
1797 rb_enc_set_default_internal(encoding);
1798 return encoding;
1801 static void
1802 set_encoding_const(const char *name, rb_encoding *enc)
1804 VALUE encoding = rb_enc_from_encoding(enc);
1805 char *s = (char *)name;
1806 int haslower = 0, hasupper = 0, valid = 0;
1808 if (ISDIGIT(*s)) return;
1809 if (ISUPPER(*s)) {
1810 hasupper = 1;
1811 while (*++s && (ISALNUM(*s) || *s == '_')) {
1812 if (ISLOWER(*s)) haslower = 1;
1815 if (!*s) {
1816 if (s - name > ENCODING_NAMELEN_MAX) return;
1817 valid = 1;
1818 rb_define_const(rb_cEncoding, name, encoding);
1820 if (!valid || haslower) {
1821 size_t len = s - name;
1822 if (len > ENCODING_NAMELEN_MAX) return;
1823 if (!haslower || !hasupper) {
1824 do {
1825 if (ISLOWER(*s)) haslower = 1;
1826 if (ISUPPER(*s)) hasupper = 1;
1827 } while (*++s && (!haslower || !hasupper));
1828 len = s - name;
1830 len += strlen(s);
1831 if (len++ > ENCODING_NAMELEN_MAX) return;
1832 MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1833 name = s;
1834 if (!valid) {
1835 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1836 for (; *s; ++s) {
1837 if (!ISALNUM(*s)) *s = '_';
1839 if (hasupper) {
1840 rb_define_const(rb_cEncoding, name, encoding);
1843 if (haslower) {
1844 for (s = (char *)name; *s; ++s) {
1845 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1847 rb_define_const(rb_cEncoding, name, encoding);
1852 static int
1853 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1855 VALUE ary = (VALUE)arg;
1856 VALUE str = rb_fstring_cstr((char *)name);
1857 rb_ary_push(ary, str);
1858 return ST_CONTINUE;
1862 * call-seq:
1863 * Encoding.name_list -> ["enc1", "enc2", ...]
1865 * Returns the list of available encoding names.
1867 * Encoding.name_list
1868 * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1869 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1870 * "Windows-31J",
1871 * "BINARY", "CP932", "eucJP"]
1875 static VALUE
1876 rb_enc_name_list(VALUE klass)
1878 VALUE ary;
1880 GLOBAL_ENC_TABLE_ENTER(enc_table);
1882 ary = rb_ary_new2(enc_table->names->num_entries);
1883 st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary);
1885 GLOBAL_ENC_TABLE_LEAVE();
1887 return ary;
1890 static int
1891 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1893 VALUE *p = (VALUE *)arg;
1894 VALUE aliases = p[0], ary = p[1];
1895 int idx = (int)orig;
1896 VALUE key, str = rb_ary_entry(ary, idx);
1898 if (NIL_P(str)) {
1899 rb_encoding *enc = rb_enc_from_index(idx);
1901 if (!enc) return ST_CONTINUE;
1902 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1903 return ST_CONTINUE;
1905 str = rb_fstring_cstr(rb_enc_name(enc));
1906 rb_ary_store(ary, idx, str);
1908 key = rb_fstring_cstr((char *)name);
1909 rb_hash_aset(aliases, key, str);
1910 return ST_CONTINUE;
1914 * call-seq:
1915 * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1917 * Returns the hash of available encoding aliases and their original encoding names.
1919 * Encoding.aliases
1920 * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
1921 * "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1925 static VALUE
1926 rb_enc_aliases(VALUE klass)
1928 VALUE aliases[2];
1929 aliases[0] = rb_hash_new();
1930 aliases[1] = rb_ary_new();
1932 GLOBAL_ENC_TABLE_EVAL(enc_table,
1933 st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases));
1935 return aliases[0];
1939 * An Encoding instance represents a character encoding usable in Ruby. It is
1940 * defined as a constant under the Encoding namespace. It has a name and
1941 * optionally, aliases:
1943 * Encoding::ISO_8859_1.name
1944 * #=> "ISO-8859-1"
1946 * Encoding::ISO_8859_1.names
1947 * #=> ["ISO-8859-1", "ISO8859-1"]
1949 * Ruby methods dealing with encodings return or accept Encoding instances as
1950 * arguments (when a method accepts an Encoding instance as an argument, it
1951 * can be passed an Encoding name or alias instead).
1953 * "some string".encoding
1954 * #=> #<Encoding:UTF-8>
1956 * string = "some string".encode(Encoding::ISO_8859_1)
1957 * #=> "some string"
1958 * string.encoding
1959 * #=> #<Encoding:ISO-8859-1>
1961 * "some string".encode "ISO-8859-1"
1962 * #=> "some string"
1964 * Encoding::ASCII_8BIT is a special encoding that is usually used for
1965 * a byte string, not a character string. But as the name insists, its
1966 * characters in the range of ASCII are considered as ASCII
1967 * characters. This is useful when you use ASCII-8BIT characters with
1968 * other ASCII compatible characters.
1970 * == Changing an encoding
1972 * The associated Encoding of a String can be changed in two different ways.
1974 * First, it is possible to set the Encoding of a string to a new Encoding
1975 * without changing the internal byte representation of the string, with
1976 * String#force_encoding. This is how you can tell Ruby the correct encoding
1977 * of a string.
1979 * string
1980 * #=> "R\xC3\xA9sum\xC3\xA9"
1981 * string.encoding
1982 * #=> #<Encoding:ISO-8859-1>
1983 * string.force_encoding(Encoding::UTF_8)
1984 * #=> "R\u00E9sum\u00E9"
1986 * Second, it is possible to transcode a string, i.e. translate its internal
1987 * byte representation to another encoding. Its associated encoding is also
1988 * set to the other encoding. See String#encode for the various forms of
1989 * transcoding, and the Encoding::Converter class for additional control over
1990 * the transcoding process.
1992 * string
1993 * #=> "R\u00E9sum\u00E9"
1994 * string.encoding
1995 * #=> #<Encoding:UTF-8>
1996 * string = string.encode!(Encoding::ISO_8859_1)
1997 * #=> "R\xE9sum\xE9"
1998 * string.encoding
1999 * #=> #<Encoding:ISO-8859-1>
2001 * == Script encoding
2003 * All Ruby script code has an associated Encoding which any String literal
2004 * created in the source code will be associated to.
2006 * The default script encoding is Encoding::UTF_8 after v2.0, but it
2007 * can be changed by a magic comment on the first line of the source
2008 * code file (or second line, if there is a shebang line on the
2009 * first). The comment must contain the word <code>coding</code> or
2010 * <code>encoding</code>, followed by a colon, space and the Encoding
2011 * name or alias:
2013 * # encoding: UTF-8
2015 * "some string".encoding
2016 * #=> #<Encoding:UTF-8>
2018 * The <code>__ENCODING__</code> keyword returns the script encoding of the file
2019 * in which the keyword is written:
2021 * # encoding: ISO-8859-1
2023 * __ENCODING__
2024 * #=> #<Encoding:ISO-8859-1>
2026 * <code>ruby -K</code> will change the default locale encoding, but this is
2027 * not recommended. Ruby source files should declare their script encoding by a
2028 * magic comment even when they only depend on US-ASCII strings or regular
2029 * expressions.
2031 * == Locale encoding
2033 * The default encoding of the environment. Usually derived from locale.
2035 * see Encoding.locale_charmap, Encoding.find('locale')
2037 * == Filesystem encoding
2039 * The default encoding of strings from the filesystem of the environment.
2040 * This is used for strings of file names or paths.
2042 * see Encoding.find('filesystem')
2044 * == External encoding
2046 * Each IO object has an external encoding which indicates the encoding that
2047 * Ruby will use to read its data. By default Ruby sets the external encoding
2048 * of an IO object to the default external encoding. The default external
2049 * encoding is set by locale encoding or the interpreter <code>-E</code> option.
2050 * Encoding.default_external returns the current value of the external
2051 * encoding.
2053 * ENV["LANG"]
2054 * #=> "UTF-8"
2055 * Encoding.default_external
2056 * #=> #<Encoding:UTF-8>
2058 * $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
2059 * #<Encoding:ISO-8859-1>
2061 * $ LANG=C ruby -e 'p Encoding.default_external'
2062 * #<Encoding:US-ASCII>
2064 * The default external encoding may also be set through
2065 * Encoding.default_external=, but you should not do this as strings created
2066 * before and after the change will have inconsistent encodings. Instead use
2067 * <code>ruby -E</code> to invoke ruby with the correct external encoding.
2069 * When you know that the actual encoding of the data of an IO object is not
2070 * the default external encoding, you can reset its external encoding with
2071 * IO#set_encoding or set it at IO object creation (see IO.new options).
2073 * == Internal encoding
2075 * To process the data of an IO object which has an encoding different
2076 * from its external encoding, you can set its internal encoding. Ruby will use
2077 * this internal encoding to transcode the data when it is read from the IO
2078 * object.
2080 * Conversely, when data is written to the IO object it is transcoded from the
2081 * internal encoding to the external encoding of the IO object.
2083 * The internal encoding of an IO object can be set with
2084 * IO#set_encoding or at IO object creation (see IO.new options).
2086 * The internal encoding is optional and when not set, the Ruby default
2087 * internal encoding is used. If not explicitly set this default internal
2088 * encoding is +nil+ meaning that by default, no transcoding occurs.
2090 * The default internal encoding can be set with the interpreter option
2091 * <code>-E</code>. Encoding.default_internal returns the current internal
2092 * encoding.
2094 * $ ruby -e 'p Encoding.default_internal'
2095 * nil
2097 * $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
2098 * Encoding.default_internal]"
2099 * [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
2101 * The default internal encoding may also be set through
2102 * Encoding.default_internal=, but you should not do this as strings created
2103 * before and after the change will have inconsistent encodings. Instead use
2104 * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
2106 * == IO encoding example
2108 * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is transcoded for
2109 * output to ISO-8859-1 encoding, then read back in and transcoded to UTF-8:
2111 * string = "R\u00E9sum\u00E9"
2113 * open("transcoded.txt", "w:ISO-8859-1") do |io|
2114 * io.write(string)
2115 * end
2117 * puts "raw text:"
2118 * p File.binread("transcoded.txt")
2119 * puts
2121 * open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
2122 * puts "transcoded text:"
2123 * p io.read
2124 * end
2126 * While writing the file, the internal encoding is not specified as it is
2127 * only necessary for reading. While reading the file both the internal and
2128 * external encoding must be specified to obtain the correct result.
2130 * $ ruby t.rb
2131 * raw text:
2132 * "R\xE9sum\xE9"
2134 * transcoded text:
2135 * "R\u00E9sum\u00E9"
2139 void
2140 Init_Encoding(void)
2142 VALUE list;
2143 int i;
2145 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
2146 rb_define_alloc_func(rb_cEncoding, enc_s_alloc);
2147 rb_undef_method(CLASS_OF(rb_cEncoding), "new");
2148 rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
2149 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
2150 rb_define_method(rb_cEncoding, "name", enc_name, 0);
2151 rb_define_method(rb_cEncoding, "names", enc_names, 0);
2152 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
2153 rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
2154 rb_define_method(rb_cEncoding, "replicate", enc_replicate_m, 1);
2155 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
2156 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
2157 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
2158 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
2159 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
2161 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
2162 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
2164 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
2165 rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
2166 rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0);
2167 rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
2168 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); /* in localeinit.c */
2170 struct enc_table *enc_table = &global_enc_table;
2172 if (DEFAULT_ENCODING_LIST_CAPA < enc_table->count) rb_bug("DEFAULT_ENCODING_LIST_CAPA is too small");
2174 list = rb_additional_encoding_list = rb_ary_new();
2175 RBASIC_CLEAR_CLASS(list);
2176 rb_gc_register_mark_object(list);
2178 list = rb_default_encoding_list = rb_ary_new2(DEFAULT_ENCODING_LIST_CAPA);
2179 RBASIC_CLEAR_CLASS(list);
2180 rb_gc_register_mark_object(list);
2182 for (i = 0; i < enc_table->count; ++i) {
2183 rb_ary_push(list, enc_new(enc_table->list[i].enc));
2186 rb_marshal_define_compat(rb_cEncoding, Qnil, 0, enc_m_loader);
2189 void
2190 Init_encodings(void)
2192 rb_enc_init(&global_enc_table);
2195 /* locale insensitive ctype functions */
2197 void
2198 rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
2200 GLOBAL_ENC_TABLE_EVAL(enc_table, st_foreach(enc_table->names, func, arg));