1 /**********************************************************************
6 created at: Thu May 24 17:23:27 JST 2007
8 Copyright (C) 2007 Yukihiro Matsumoto
10 **********************************************************************/
12 #include "ruby/internal/config.h"
18 #include "internal/enc.h"
19 #include "internal/encoding.h"
20 #include "internal/inits.h"
21 #include "internal/load.h"
22 #include "internal/object.h"
23 #include "internal/string.h"
24 #include "internal/vm.h"
26 #include "ruby/encoding.h"
27 #include "ruby/util.h"
28 #include "ruby_assert.h"
34 #define ENC_ASSERT(expr) RUBY_ASSERT_WHEN(ENC_DEBUG, expr)
35 #define MUST_STRING(str) (ENC_ASSERT(RB_TYPE_P(str, T_STRING)), str)
37 #undef rb_ascii8bit_encindex
38 #undef rb_utf8_encindex
39 #undef rb_usascii_encindex
41 typedef OnigEncodingType rb_raw_encoding
;
43 #if defined __GNUC__ && __GNUC__ >= 4
44 #pragma GCC visibility push(default)
45 int rb_enc_register(const char *name
, rb_encoding
*encoding
);
46 void rb_enc_set_base(const char *name
, const char *orig
);
47 int rb_enc_set_dummy(int index
);
48 void rb_encdb_declare(const char *name
);
49 int rb_encdb_replicate(const char *name
, const char *orig
);
50 int rb_encdb_dummy(const char *name
);
51 int rb_encdb_alias(const char *alias
, const char *orig
);
52 void rb_encdb_set_unicode(int index
);
53 #pragma GCC visibility pop
56 static ID id_encoding
;
59 #define DEFAULT_ENCODING_LIST_CAPA 128
60 static VALUE rb_default_encoding_list
;
61 static VALUE rb_additional_encoding_list
;
63 struct rb_encoding_entry
{
69 static struct enc_table
{
70 struct rb_encoding_entry
*list
;
76 static rb_encoding
*global_enc_ascii
,
80 #define GLOBAL_ENC_TABLE_ENTER(enc_table) struct enc_table *enc_table = &global_enc_table; RB_VM_LOCK_ENTER()
81 #define GLOBAL_ENC_TABLE_LEAVE() RB_VM_LOCK_LEAVE()
82 #define GLOBAL_ENC_TABLE_EVAL(enc_table, expr) do { \
83 GLOBAL_ENC_TABLE_ENTER(enc_table); \
87 GLOBAL_ENC_TABLE_LEAVE(); \
91 #define ENC_DUMMY_FLAG (1<<24)
92 #define ENC_INDEX_MASK (~(~0U<<24))
94 #define ENC_TO_ENCINDEX(enc) (int)((enc)->ruby_encoding_index & ENC_INDEX_MASK)
95 #define ENC_DUMMY_P(enc) ((enc)->ruby_encoding_index & ENC_DUMMY_FLAG)
96 #define ENC_SET_DUMMY(enc) ((enc)->ruby_encoding_index |= ENC_DUMMY_FLAG)
98 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
99 #define UNSPECIFIED_ENCODING INT_MAX
101 #define ENCODING_NAMELEN_MAX 63
102 #define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
104 static const rb_data_type_t encoding_data_type
= {
107 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
110 #define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type)
111 #define is_obj_encoding(obj) (RB_TYPE_P((obj), T_DATA) && is_data_encoding(obj))
114 rb_data_is_encoding(VALUE obj
)
116 return is_data_encoding(obj
);
120 enc_new(rb_encoding
*encoding
)
122 VALUE enc
= TypedData_Wrap_Struct(rb_cEncoding
, &encoding_data_type
, (void *)encoding
);
124 FL_SET_RAW(enc
, RUBY_FL_SHAREABLE
);
129 enc_list_update(int index
, rb_raw_encoding
*encoding
)
131 if (index
< DEFAULT_ENCODING_LIST_CAPA
) {
132 VALUE list
= rb_default_encoding_list
;
133 if (list
&& NIL_P(rb_ary_entry(list
, index
))) {
134 /* initialize encoding data */
135 rb_ary_store(list
, index
, enc_new(encoding
));
141 VALUE list
= rb_additional_encoding_list
;
142 if (list
&& NIL_P(rb_ary_entry(list
, index
))) {
143 /* initialize encoding data */
144 rb_ary_store(list
, index
- DEFAULT_ENCODING_LIST_CAPA
, enc_new(encoding
));
152 enc_list_lookup(int idx
)
156 if (idx
< DEFAULT_ENCODING_LIST_CAPA
) {
157 if (!(list
= rb_default_encoding_list
)) {
158 rb_bug("rb_enc_from_encoding_index(%d): no rb_default_encoding_list", idx
);
160 enc
= rb_ary_entry(list
, idx
);
165 if (!(list
= rb_additional_encoding_list
)) {
166 rb_bug("rb_enc_from_encoding_index(%d): no rb_additional_encoding_list", idx
);
168 enc
= rb_ary_entry(list
, idx
- DEFAULT_ENCODING_LIST_CAPA
);
174 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx
);
182 rb_enc_from_encoding_index(int idx
)
184 return enc_list_lookup(idx
);
188 rb_enc_from_encoding(rb_encoding
*encoding
)
191 if (!encoding
) return Qnil
;
192 idx
= ENC_TO_ENCINDEX(encoding
);
193 return rb_enc_from_encoding_index(idx
);
197 rb_enc_to_index(rb_encoding
*enc
)
199 return enc
? ENC_TO_ENCINDEX(enc
) : 0;
203 rb_enc_dummy_p(rb_encoding
*enc
)
205 return ENC_DUMMY_P(enc
) != 0;
209 check_encoding(rb_encoding
*enc
)
211 int index
= rb_enc_to_index(enc
);
212 if (rb_enc_from_index(index
) != enc
)
214 if (rb_enc_autoload_p(enc
)) {
215 index
= rb_enc_autoload(enc
);
221 enc_check_encoding(VALUE obj
)
223 if (!is_obj_encoding(obj
)) {
226 return check_encoding(RDATA(obj
)->data
);
229 NORETURN(static void not_encoding(VALUE enc
));
231 not_encoding(VALUE enc
)
233 rb_raise(rb_eTypeError
, "wrong argument type %"PRIsVALUE
" (expected Encoding)",
238 must_encoding(VALUE enc
)
240 int index
= enc_check_encoding(enc
);
244 return DATA_PTR(enc
);
248 must_encindex(int index
)
250 rb_encoding
*enc
= rb_enc_from_index(index
);
252 rb_raise(rb_eEncodingError
, "encoding index out of bound: %d",
255 if (ENC_TO_ENCINDEX(enc
) != (int)(index
& ENC_INDEX_MASK
)) {
256 rb_raise(rb_eEncodingError
, "wrong encoding index %d for %s (expected %d)",
257 index
, rb_enc_name(enc
), ENC_TO_ENCINDEX(enc
));
259 if (rb_enc_autoload_p(enc
) && rb_enc_autoload(enc
) == -1) {
260 rb_loaderror("failed to load encoding (%s)",
267 rb_to_encoding_index(VALUE enc
)
272 idx
= enc_check_encoding(enc
);
276 else if (NIL_P(enc
= rb_check_string_type(enc
))) {
279 if (!rb_enc_asciicompat(rb_enc_get(enc
))) {
282 if (!(name
= rb_str_to_cstr(enc
))) {
285 return rb_enc_find_index(name
);
289 name_for_encoding(volatile VALUE
*enc
)
291 VALUE name
= StringValue(*enc
);
294 if (!rb_enc_asciicompat(rb_enc_get(name
))) {
295 rb_raise(rb_eArgError
, "invalid encoding name (non ASCII)");
297 if (!(n
= rb_str_to_cstr(name
))) {
298 rb_raise(rb_eArgError
, "invalid encoding name (NUL byte)");
303 /* Returns encoding index or UNSPECIFIED_ENCODING */
305 str_find_encindex(VALUE enc
)
307 int idx
= rb_enc_find_index(name_for_encoding(&enc
));
313 str_to_encindex(VALUE enc
)
315 int idx
= str_find_encindex(enc
);
317 rb_raise(rb_eArgError
, "unknown encoding name - %"PRIsVALUE
, enc
);
323 str_to_encoding(VALUE enc
)
325 return rb_enc_from_index(str_to_encindex(enc
));
329 rb_to_encoding(VALUE enc
)
331 if (enc_check_encoding(enc
) >= 0) return RDATA(enc
)->data
;
332 return str_to_encoding(enc
);
336 rb_find_encoding(VALUE enc
)
339 if (enc_check_encoding(enc
) >= 0) return RDATA(enc
)->data
;
340 idx
= str_find_encindex(enc
);
341 if (idx
< 0) return NULL
;
342 return rb_enc_from_index(idx
);
346 enc_table_expand(struct enc_table
*enc_table
, int newsize
)
348 struct rb_encoding_entry
*ent
;
351 if (enc_table
->size
>= newsize
) return newsize
;
352 newsize
= (newsize
+ 7) / 8 * 8;
353 ent
= REALLOC_N(enc_table
->list
, struct rb_encoding_entry
, newsize
);
354 memset(ent
+ enc_table
->size
, 0, sizeof(*ent
)*(newsize
- enc_table
->size
));
355 enc_table
->list
= ent
;
356 enc_table
->size
= newsize
;
361 enc_register_at(struct enc_table
*enc_table
, int index
, const char *name
, rb_encoding
*base_encoding
)
363 struct rb_encoding_entry
*ent
= &enc_table
->list
[index
];
364 rb_raw_encoding
*encoding
;
366 if (!valid_encoding_name_p(name
)) return -1;
368 ent
->name
= name
= strdup(name
);
370 else if (STRCASECMP(name
, ent
->name
)) {
373 encoding
= (rb_raw_encoding
*)ent
->enc
;
375 encoding
= xmalloc(sizeof(rb_encoding
));
378 *encoding
= *base_encoding
;
381 memset(encoding
, 0, sizeof(*ent
->enc
));
383 encoding
->name
= name
;
384 encoding
->ruby_encoding_index
= index
;
386 st_insert(enc_table
->names
, (st_data_t
)name
, (st_data_t
)index
);
388 enc_list_update(index
, encoding
);
393 enc_register(struct enc_table
*enc_table
, const char *name
, rb_encoding
*encoding
)
395 int index
= enc_table
->count
;
397 enc_table
->count
= enc_table_expand(enc_table
, index
+ 1);
398 return enc_register_at(enc_table
, index
, name
, encoding
);
401 static void set_encoding_const(const char *, rb_encoding
*);
402 static int enc_registered(struct enc_table
*enc_table
, const char *name
);
405 enc_from_index(struct enc_table
*enc_table
, int index
)
407 if (UNLIKELY(index
< 0 || enc_table
->count
<= (index
&= ENC_INDEX_MASK
))) {
410 return enc_table
->list
[index
].enc
;
414 rb_enc_from_index(int index
)
419 case ENCINDEX_ASCII
: return global_enc_ascii
;
420 case ENCINDEX_UTF_8
: return global_enc_utf_8
;
421 case ENCINDEX_US_ASCII
: return global_enc_us_ascii
;
423 GLOBAL_ENC_TABLE_EVAL(enc_table
,
424 enc
= enc_from_index(enc_table
, index
));
430 rb_enc_register(const char *name
, rb_encoding
*encoding
)
434 GLOBAL_ENC_TABLE_ENTER(enc_table
);
436 index
= enc_registered(enc_table
, name
);
439 rb_encoding
*oldenc
= enc_from_index(enc_table
, index
);
440 if (STRCASECMP(name
, rb_enc_name(oldenc
))) {
441 index
= enc_register(enc_table
, name
, encoding
);
443 else if (rb_enc_autoload_p(oldenc
) || !ENC_DUMMY_P(oldenc
)) {
444 enc_register_at(enc_table
, index
, name
, encoding
);
447 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
451 index
= enc_register(enc_table
, name
, encoding
);
452 set_encoding_const(name
, rb_enc_from_index(index
));
455 GLOBAL_ENC_TABLE_LEAVE();
460 enc_registered(struct enc_table
*enc_table
, const char *name
)
464 if (!name
) return -1;
465 if (!enc_table
->list
) return -1;
466 if (st_lookup(enc_table
->names
, (st_data_t
)name
, &idx
)) {
473 rb_encdb_declare(const char *name
)
475 GLOBAL_ENC_TABLE_ENTER(enc_table
);
477 int idx
= enc_registered(enc_table
, name
);
479 idx
= enc_register(enc_table
, name
, 0);
481 set_encoding_const(name
, rb_enc_from_index(idx
));
483 GLOBAL_ENC_TABLE_LEAVE();
487 enc_check_duplication(struct enc_table
*enc_table
, const char *name
)
489 if (enc_registered(enc_table
, name
) >= 0) {
490 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
495 set_base_encoding(struct enc_table
*enc_table
, int index
, rb_encoding
*base
)
497 rb_encoding
*enc
= enc_table
->list
[index
].enc
;
500 enc_table
->list
[index
].base
= base
;
501 if (ENC_DUMMY_P(base
)) ENC_SET_DUMMY((rb_raw_encoding
*)enc
);
506 * Set base encoding for encodings which are not replicas
507 * but not in their own files.
510 rb_enc_set_base(const char *name
, const char *orig
)
512 GLOBAL_ENC_TABLE_ENTER(enc_table
);
514 int idx
= enc_registered(enc_table
, name
);
515 int origidx
= enc_registered(enc_table
, orig
);
516 set_base_encoding(enc_table
, idx
, rb_enc_from_index(origidx
));
518 GLOBAL_ENC_TABLE_LEAVE();
522 * Set encoding dummy.
525 rb_enc_set_dummy(int index
)
529 GLOBAL_ENC_TABLE_EVAL(enc_table
,
530 enc
= enc_table
->list
[index
].enc
);
532 ENC_SET_DUMMY((rb_raw_encoding
*)enc
);
537 enc_replicate(struct enc_table
*enc_table
, const char *name
, rb_encoding
*encoding
)
541 enc_check_duplication(enc_table
, name
);
542 idx
= enc_register(enc_table
, name
, encoding
);
543 if (idx
< 0) rb_raise(rb_eArgError
, "invalid encoding name: %s", name
);
544 set_base_encoding(enc_table
, idx
, encoding
);
545 set_encoding_const(name
, rb_enc_from_index(idx
));
550 rb_enc_replicate(const char *name
, rb_encoding
*encoding
)
554 GLOBAL_ENC_TABLE_EVAL(enc_table
,
555 r
= enc_replicate(enc_table
, name
, encoding
));
562 * enc.replicate(name) -> encoding
564 * Returns a replicated encoding of _enc_ whose name is _name_.
565 * The new encoding should have the same byte structure as _enc_.
566 * If _name_ is used by another encoding, raises ArgumentError.
570 enc_replicate_m(VALUE encoding
, VALUE name
)
572 int idx
= rb_enc_replicate(name_for_encoding(&name
), rb_to_encoding(encoding
));
574 return rb_enc_from_encoding_index(idx
);
578 enc_replicate_with_index(struct enc_table
*enc_table
, const char *name
, rb_encoding
*origenc
, int idx
)
581 idx
= enc_register(enc_table
, name
, origenc
);
584 idx
= enc_register_at(enc_table
, idx
, name
, origenc
);
587 set_base_encoding(enc_table
, idx
, origenc
);
588 set_encoding_const(name
, rb_enc_from_index(idx
));
591 rb_raise(rb_eArgError
, "failed to replicate encoding");
597 rb_encdb_replicate(const char *name
, const char *orig
)
601 GLOBAL_ENC_TABLE_ENTER(enc_table
);
603 int origidx
= enc_registered(enc_table
, orig
);
604 int idx
= enc_registered(enc_table
, name
);
607 origidx
= enc_register(enc_table
, orig
, 0);
609 r
= enc_replicate_with_index(enc_table
, name
, rb_enc_from_index(origidx
), idx
);
611 GLOBAL_ENC_TABLE_LEAVE();
617 rb_define_dummy_encoding(const char *name
)
621 GLOBAL_ENC_TABLE_ENTER(enc_table
);
623 index
= enc_replicate(enc_table
, name
, rb_ascii8bit_encoding());
624 rb_encoding
*enc
= enc_table
->list
[index
].enc
;
625 ENC_SET_DUMMY((rb_raw_encoding
*)enc
);
627 GLOBAL_ENC_TABLE_LEAVE();
633 rb_encdb_dummy(const char *name
)
637 GLOBAL_ENC_TABLE_ENTER(enc_table
);
639 index
= enc_replicate_with_index(enc_table
, name
,
640 rb_ascii8bit_encoding(),
641 enc_registered(enc_table
, name
));
642 rb_encoding
*enc
= enc_table
->list
[index
].enc
;
643 ENC_SET_DUMMY((rb_raw_encoding
*)enc
);
645 GLOBAL_ENC_TABLE_LEAVE();
652 * enc.dummy? -> true or false
654 * Returns true for dummy encodings.
655 * A dummy encoding is an encoding for which character handling is not properly
657 * It is used for stateful encodings.
659 * Encoding::ISO_2022_JP.dummy? #=> true
660 * Encoding::UTF_8.dummy? #=> false
664 enc_dummy_p(VALUE enc
)
666 return RBOOL(ENC_DUMMY_P(must_encoding(enc
)));
671 * enc.ascii_compatible? -> true or false
673 * Returns whether ASCII-compatible or not.
675 * Encoding::UTF_8.ascii_compatible? #=> true
676 * Encoding::UTF_16BE.ascii_compatible? #=> false
680 enc_ascii_compatible_p(VALUE enc
)
682 return RBOOL(rb_enc_asciicompat(must_encoding(enc
)));
686 * Returns non-zero when the encoding belongs to the Unicode series other than UTF-7, else 0.
689 rb_enc_unicode_p(rb_encoding
*enc
)
691 return ONIGENC_IS_UNICODE(enc
);
695 enc_dup_name(st_data_t name
)
697 return (st_data_t
)strdup((const char *)name
);
701 * Returns copied alias name when the key is added for st_table,
705 enc_alias_internal(struct enc_table
*enc_table
, const char *alias
, int idx
)
707 return st_insert2(enc_table
->names
, (st_data_t
)alias
, (st_data_t
)idx
,
712 enc_alias(struct enc_table
*enc_table
, const char *alias
, int idx
)
714 if (!valid_encoding_name_p(alias
)) return -1;
715 if (!enc_alias_internal(enc_table
, alias
, idx
))
716 set_encoding_const(alias
, enc_from_index(enc_table
, idx
));
721 rb_enc_alias(const char *alias
, const char *orig
)
725 GLOBAL_ENC_TABLE_ENTER(enc_table
);
727 enc_check_duplication(enc_table
, alias
);
728 if ((idx
= rb_enc_find_index(orig
)) < 0) {
732 r
= enc_alias(enc_table
, alias
, idx
);
735 GLOBAL_ENC_TABLE_LEAVE();
741 rb_encdb_alias(const char *alias
, const char *orig
)
745 GLOBAL_ENC_TABLE_ENTER(enc_table
);
747 int idx
= enc_registered(enc_table
, orig
);
750 idx
= enc_register(enc_table
, orig
, 0);
752 r
= enc_alias(enc_table
, alias
, idx
);
754 GLOBAL_ENC_TABLE_LEAVE();
760 rb_encdb_set_unicode(int index
)
762 rb_raw_encoding
*enc
= (rb_raw_encoding
*)rb_enc_from_index(index
);
764 enc
->flags
|= ONIGENC_FLAG_UNICODE
;
768 rb_enc_init(struct enc_table
*enc_table
)
770 enc_table_expand(enc_table
, ENCODING_COUNT
+ 1);
771 if (!enc_table
->names
) {
772 enc_table
->names
= st_init_strcasetable();
774 #define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
777 ENC_REGISTER(US_ASCII
);
778 global_enc_ascii
= enc_table
->list
[ENCINDEX_ASCII
].enc
;
779 global_enc_utf_8
= enc_table
->list
[ENCINDEX_UTF_8
].enc
;
780 global_enc_us_ascii
= enc_table
->list
[ENCINDEX_US_ASCII
].enc
;
782 #define ENCDB_REGISTER(name, enc) enc_register_at(enc_table, ENCINDEX_##enc, name, NULL)
783 ENCDB_REGISTER("UTF-16BE", UTF_16BE
);
784 ENCDB_REGISTER("UTF-16LE", UTF_16LE
);
785 ENCDB_REGISTER("UTF-32BE", UTF_32BE
);
786 ENCDB_REGISTER("UTF-32LE", UTF_32LE
);
787 ENCDB_REGISTER("UTF-16", UTF_16
);
788 ENCDB_REGISTER("UTF-32", UTF_32
);
789 ENCDB_REGISTER("UTF8-MAC", UTF8_MAC
);
791 ENCDB_REGISTER("EUC-JP", EUC_JP
);
792 ENCDB_REGISTER("Windows-31J", Windows_31J
);
793 #undef ENCDB_REGISTER
794 enc_table
->count
= ENCINDEX_BUILTIN_MAX
;
798 rb_enc_get_from_index(int index
)
800 return must_encindex(index
);
803 int rb_require_internal_silent(VALUE fname
);
806 load_encoding(const char *name
)
808 VALUE enclib
= rb_sprintf("enc/%s.so", name
);
809 VALUE debug
= ruby_debug
;
811 char *s
= RSTRING_PTR(enclib
) + 4, *e
= RSTRING_END(enclib
) - 3;
816 if (!ISALNUM(*s
)) *s
= '_';
817 else if (ISUPPER(*s
)) *s
= (char)TOLOWER(*s
);
820 enclib
= rb_fstring(enclib
);
822 errinfo
= rb_errinfo();
823 loaded
= rb_require_internal_silent(enclib
);
825 rb_set_errinfo(errinfo
);
827 GLOBAL_ENC_TABLE_ENTER(enc_table
);
829 if (loaded
< 0 || 1 < loaded
) {
832 else if ((idx
= enc_registered(enc_table
, name
)) < 0) {
835 else if (rb_enc_autoload_p(enc_table
->list
[idx
].enc
)) {
839 GLOBAL_ENC_TABLE_LEAVE();
845 enc_autoload_body(struct enc_table
*enc_table
, rb_encoding
*enc
)
847 rb_encoding
*base
= enc_table
->list
[ENC_TO_ENCINDEX(enc
)].base
;
852 if (i
>= enc_table
->count
) return -1;
853 } while (enc_table
->list
[i
].enc
!= base
&& (++i
, 1));
854 if (rb_enc_autoload_p(base
)) {
855 if (rb_enc_autoload(base
) < 0) return -1;
857 i
= enc
->ruby_encoding_index
;
858 enc_register_at(enc_table
, i
& ENC_INDEX_MASK
, rb_enc_name(enc
), base
);
859 ((rb_raw_encoding
*)enc
)->ruby_encoding_index
= i
;
869 rb_enc_autoload(rb_encoding
*enc
)
872 GLOBAL_ENC_TABLE_EVAL(enc_table
, i
= enc_autoload_body(enc_table
, enc
));
874 i
= load_encoding(rb_enc_name(enc
));
879 /* Return encoding index or UNSPECIFIED_ENCODING from encoding name */
881 rb_enc_find_index(const char *name
)
886 GLOBAL_ENC_TABLE_EVAL(enc_table
, i
= enc_registered(enc_table
, name
));
889 i
= load_encoding(name
);
891 else if (!(enc
= rb_enc_from_index(i
))) {
892 if (i
!= UNSPECIFIED_ENCODING
) {
893 rb_raise(rb_eArgError
, "encoding %s is not registered", name
);
896 else if (rb_enc_autoload_p(enc
)) {
897 if (rb_enc_autoload(enc
) < 0) {
898 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
907 rb_enc_find_index2(const char *name
, long len
)
909 char buf
[ENCODING_NAMELEN_MAX
+1];
911 if (len
> ENCODING_NAMELEN_MAX
) return -1;
912 memcpy(buf
, name
, len
);
914 return rb_enc_find_index(buf
);
918 rb_enc_find(const char *name
)
920 int idx
= rb_enc_find_index(name
);
921 if (idx
< 0) idx
= 0;
922 return rb_enc_from_index(idx
);
926 enc_capable(VALUE obj
)
928 if (SPECIAL_CONST_P(obj
)) return SYMBOL_P(obj
);
929 switch (BUILTIN_TYPE(obj
)) {
936 if (is_data_encoding(obj
)) return TRUE
;
943 rb_enc_capable(VALUE obj
)
945 return enc_capable(obj
);
951 CONST_ID(id_encoding
, "encoding");
956 enc_get_index_str(VALUE str
)
958 int i
= ENCODING_GET_INLINED(str
);
959 if (i
== ENCODING_INLINE_MAX
) {
963 iv
= rb_ivar_get(str
, rb_id_encoding());
967 * Tentatively, assume ASCII-8BIT, if encoding index instance
968 * variable is not found. This can happen when freeing after
969 * all instance variables are removed in `obj_free`.
971 iv
= rb_attr_get(str
, rb_id_encoding());
972 i
= NIL_P(iv
) ? ENCINDEX_ASCII
: NUM2INT(iv
);
979 rb_enc_get_index(VALUE obj
)
984 if (SPECIAL_CONST_P(obj
)) {
985 if (!SYMBOL_P(obj
)) return -1;
986 obj
= rb_sym2str(obj
);
988 switch (BUILTIN_TYPE(obj
)) {
992 i
= enc_get_index_str(obj
);
995 tmp
= rb_funcallv(obj
, rb_intern("internal_encoding"), 0, 0);
997 tmp
= rb_funcallv(obj
, rb_intern("external_encoding"), 0, 0);
999 if (is_obj_encoding(tmp
)) {
1000 i
= enc_check_encoding(tmp
);
1004 if (is_data_encoding(obj
)) {
1005 i
= enc_check_encoding(obj
);
1015 enc_set_index(VALUE obj
, int idx
)
1017 if (!enc_capable(obj
)) {
1018 rb_raise(rb_eArgError
, "cannot set encoding on non-encoding capable object");
1021 if (idx
< ENCODING_INLINE_MAX
) {
1022 ENCODING_SET_INLINED(obj
, idx
);
1025 ENCODING_SET_INLINED(obj
, ENCODING_INLINE_MAX
);
1026 rb_ivar_set(obj
, rb_id_encoding(), INT2NUM(idx
));
1030 rb_enc_set_index(VALUE obj
, int idx
)
1032 rb_check_frozen(obj
);
1034 enc_set_index(obj
, idx
);
1038 rb_enc_associate_index(VALUE obj
, int idx
)
1041 int oldidx
, oldtermlen
, termlen
;
1043 /* enc_check_capable(obj);*/
1044 rb_check_frozen(obj
);
1045 oldidx
= rb_enc_get_index(obj
);
1048 if (SPECIAL_CONST_P(obj
)) {
1049 rb_raise(rb_eArgError
, "cannot set encoding");
1051 enc
= must_encindex(idx
);
1052 if (!ENC_CODERANGE_ASCIIONLY(obj
) ||
1053 !rb_enc_asciicompat(enc
)) {
1054 ENC_CODERANGE_CLEAR(obj
);
1056 termlen
= rb_enc_mbminlen(enc
);
1057 oldtermlen
= rb_enc_mbminlen(rb_enc_from_index(oldidx
));
1058 if (oldtermlen
!= termlen
&& RB_TYPE_P(obj
, T_STRING
)) {
1059 rb_str_change_terminator_length(obj
, oldtermlen
, termlen
);
1061 enc_set_index(obj
, idx
);
1066 rb_enc_associate(VALUE obj
, rb_encoding
*enc
)
1068 return rb_enc_associate_index(obj
, rb_enc_to_index(enc
));
1072 rb_enc_get(VALUE obj
)
1074 return rb_enc_from_index(rb_enc_get_index(obj
));
1078 rb_encoding_check(rb_encoding
* enc
, VALUE str1
, VALUE str2
)
1081 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
1082 rb_enc_name(rb_enc_get(str1
)),
1083 rb_enc_name(rb_enc_get(str2
)));
1087 static rb_encoding
* enc_compatible_str(VALUE str1
, VALUE str2
);
1090 rb_enc_check_str(VALUE str1
, VALUE str2
)
1092 rb_encoding
*enc
= enc_compatible_str(MUST_STRING(str1
), MUST_STRING(str2
));
1093 return rb_encoding_check(enc
, str1
, str2
);
1097 rb_enc_check(VALUE str1
, VALUE str2
)
1099 rb_encoding
*enc
= rb_enc_compatible(str1
, str2
);
1100 return rb_encoding_check(enc
, str1
, str2
);
1104 enc_compatible_latter(VALUE str1
, VALUE str2
, int idx1
, int idx2
)
1107 rb_encoding
*enc1
= rb_enc_from_index(idx1
);
1108 rb_encoding
*enc2
= rb_enc_from_index(idx2
);
1110 isstr2
= RB_TYPE_P(str2
, T_STRING
);
1111 if (isstr2
&& RSTRING_LEN(str2
) == 0)
1113 isstr1
= RB_TYPE_P(str1
, T_STRING
);
1114 if (isstr1
&& isstr2
&& RSTRING_LEN(str1
) == 0)
1115 return (rb_enc_asciicompat(enc1
) && rb_enc_str_asciionly_p(str2
)) ? enc1
: enc2
;
1116 if (!rb_enc_asciicompat(enc1
) || !rb_enc_asciicompat(enc2
)) {
1120 /* objects whose encoding is the same of contents */
1121 if (!isstr2
&& idx2
== ENCINDEX_US_ASCII
)
1123 if (!isstr1
&& idx1
== ENCINDEX_US_ASCII
)
1140 cr1
= rb_enc_str_coderange(str1
);
1142 cr2
= rb_enc_str_coderange(str2
);
1144 /* may need to handle ENC_CODERANGE_BROKEN */
1145 if (cr1
== ENC_CODERANGE_7BIT
) return enc2
;
1146 if (cr2
== ENC_CODERANGE_7BIT
) return enc1
;
1148 if (cr2
== ENC_CODERANGE_7BIT
) {
1152 if (cr1
== ENC_CODERANGE_7BIT
)
1159 enc_compatible_str(VALUE str1
, VALUE str2
)
1161 int idx1
= enc_get_index_str(str1
);
1162 int idx2
= enc_get_index_str(str2
);
1164 if (idx1
< 0 || idx2
< 0)
1168 return rb_enc_from_index(idx1
);
1171 return enc_compatible_latter(str1
, str2
, idx1
, idx2
);
1176 rb_enc_compatible(VALUE str1
, VALUE str2
)
1178 int idx1
= rb_enc_get_index(str1
);
1179 int idx2
= rb_enc_get_index(str2
);
1181 if (idx1
< 0 || idx2
< 0)
1185 return rb_enc_from_index(idx1
);
1188 return enc_compatible_latter(str1
, str2
, idx1
, idx2
);
1192 rb_enc_copy(VALUE obj1
, VALUE obj2
)
1194 rb_enc_associate_index(obj1
, rb_enc_get_index(obj2
));
1200 * obj.encoding -> encoding
1202 * Returns the Encoding object that represents the encoding of obj.
1206 rb_obj_encoding(VALUE obj
)
1208 int idx
= rb_enc_get_index(obj
);
1210 rb_raise(rb_eTypeError
, "unknown encoding");
1212 return rb_enc_from_encoding_index(idx
& ENC_INDEX_MASK
);
1216 rb_enc_fast_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
1218 return ONIGENC_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
1222 rb_enc_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
1224 int n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
1225 if (MBCLEN_CHARFOUND_P(n
) && MBCLEN_CHARFOUND_LEN(n
) <= e
-p
)
1226 return MBCLEN_CHARFOUND_LEN(n
);
1228 int min
= rb_enc_mbminlen(enc
);
1229 return min
<= e
-p
? min
: (int)(e
-p
);
1234 rb_enc_precise_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
1238 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
1239 n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
1241 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n
-(int)(e
-p
));
1246 rb_enc_ascget(const char *p
, const char *e
, int *len
, rb_encoding
*enc
)
1252 if (rb_enc_asciicompat(enc
)) {
1253 c
= (unsigned char)*p
;
1259 l
= rb_enc_precise_mbclen(p
, e
, enc
);
1260 if (!MBCLEN_CHARFOUND_P(l
))
1262 c
= rb_enc_mbc_to_codepoint(p
, e
, enc
);
1263 if (!rb_enc_isascii(c
, enc
))
1270 rb_enc_codepoint_len(const char *p
, const char *e
, int *len_p
, rb_encoding
*enc
)
1274 rb_raise(rb_eArgError
, "empty string");
1275 r
= rb_enc_precise_mbclen(p
, e
, enc
);
1276 if (!MBCLEN_CHARFOUND_P(r
)) {
1277 rb_raise(rb_eArgError
, "invalid byte sequence in %s", rb_enc_name(enc
));
1279 if (len_p
) *len_p
= MBCLEN_CHARFOUND_LEN(r
);
1280 return rb_enc_mbc_to_codepoint(p
, e
, enc
);
1284 rb_enc_codelen(int c
, rb_encoding
*enc
)
1286 int n
= ONIGENC_CODE_TO_MBCLEN(enc
,c
);
1288 rb_raise(rb_eArgError
, "invalid codepoint 0x%x in %s", c
, rb_enc_name(enc
));
1294 rb_enc_toupper(int c
, rb_encoding
*enc
)
1296 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c
):(c
));
1300 rb_enc_tolower(int c
, rb_encoding
*enc
)
1302 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c
):(c
));
1307 * enc.inspect -> string
1309 * Returns a string which represents the encoding for programmers.
1311 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
1312 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
1315 enc_inspect(VALUE self
)
1319 if (!is_data_encoding(self
)) {
1322 if (!(enc
= DATA_PTR(self
)) || rb_enc_from_index(rb_enc_to_index(enc
)) != enc
) {
1323 rb_raise(rb_eTypeError
, "broken Encoding");
1325 return rb_enc_sprintf(rb_usascii_encoding(),
1326 "#<%"PRIsVALUE
":%s%s%s>", rb_obj_class(self
),
1328 (ENC_DUMMY_P(enc
) ? " (dummy)" : ""),
1329 rb_enc_autoload_p(enc
) ? " (autoload)" : "");
1334 * enc.name -> string
1335 * enc.to_s -> string
1337 * Returns the name of the encoding.
1339 * Encoding::UTF_8.name #=> "UTF-8"
1342 enc_name(VALUE self
)
1344 return rb_fstring_cstr(rb_enc_name((rb_encoding
*)DATA_PTR(self
)));
1348 enc_names_i(st_data_t name
, st_data_t idx
, st_data_t args
)
1350 VALUE
*arg
= (VALUE
*)args
;
1352 if ((int)idx
== (int)arg
[0]) {
1353 VALUE str
= rb_fstring_cstr((char *)name
);
1354 rb_ary_push(arg
[1], str
);
1361 * enc.names -> array
1363 * Returns the list of name and aliases of the encoding.
1365 * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"]
1368 enc_names(VALUE self
)
1372 args
[0] = (VALUE
)rb_to_encoding_index(self
);
1373 args
[1] = rb_ary_new2(0);
1375 GLOBAL_ENC_TABLE_EVAL(enc_table
,
1376 st_foreach(enc_table
->names
, enc_names_i
, (st_data_t
)args
));
1383 * Encoding.list -> [enc1, enc2, ...]
1385 * Returns the list of loaded encodings.
1388 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1389 * #<Encoding:ISO-2022-JP (dummy)>]
1391 * Encoding.find("US-ASCII")
1392 * #=> #<Encoding:US-ASCII>
1395 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1396 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
1400 enc_list(VALUE klass
)
1402 VALUE ary
= rb_ary_new2(0);
1406 rb_ary_replace(ary
, rb_default_encoding_list
);
1407 rb_ary_concat(ary
, rb_additional_encoding_list
);
1416 * Encoding.find(string) -> enc
1418 * Searches for the encoding with the specified <i>name</i>.
1419 * <i>name</i> should be a string.
1421 * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII>
1423 * Names which this method accepts are encoding names and aliases,
1424 * including the following special aliases:
1426 * "external":: default external encoding
1427 * "internal":: default internal encoding
1428 * "locale":: locale encoding
1429 * "filesystem":: filesystem encoding
1431 * An ArgumentError is raised when there is no encoding with <i>name</i>.
1432 * Only <code>Encoding.find("internal")</code>, however, returns nil
1433 * when there is no encoding named "internal", in other words, when Ruby has no
1434 * default internal encoding.
1437 enc_find(VALUE klass
, VALUE enc
)
1440 if (is_obj_encoding(enc
))
1442 idx
= str_to_encindex(enc
);
1443 if (idx
== UNSPECIFIED_ENCODING
) return Qnil
;
1444 return rb_enc_from_encoding_index(idx
);
1449 * Encoding.compatible?(obj1, obj2) -> enc or nil
1451 * Checks the compatibility of two objects.
1453 * If the objects are both strings they are compatible when they are
1454 * concatenatable. The encoding of the concatenated string will be returned
1455 * if they are compatible, nil if they are not.
1457 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
1458 * #=> #<Encoding:ISO-8859-1>
1460 * Encoding.compatible?(
1461 * "\xa1".force_encoding("iso-8859-1"),
1462 * "\xa1\xa1".force_encoding("euc-jp"))
1465 * If the objects are non-strings their encodings are compatible when they
1466 * have an encoding and:
1467 * * Either encoding is US-ASCII compatible
1468 * * One of the encodings is a 7-bit encoding
1472 enc_compatible_p(VALUE klass
, VALUE str1
, VALUE str2
)
1476 if (!enc_capable(str1
)) return Qnil
;
1477 if (!enc_capable(str2
)) return Qnil
;
1478 enc
= rb_enc_compatible(str1
, str2
);
1479 if (!enc
) return Qnil
;
1480 return rb_enc_from_encoding(enc
);
1483 NORETURN(static VALUE
enc_s_alloc(VALUE klass
));
1486 enc_s_alloc(VALUE klass
)
1488 rb_undefined_alloc(klass
);
1489 UNREACHABLE_RETURN(Qnil
);
1494 enc_dump(int argc
, VALUE
*argv
, VALUE self
)
1496 rb_check_arity(argc
, 0, 1);
1497 return enc_name(self
);
1502 enc_load(VALUE klass
, VALUE str
)
1509 enc_m_loader(VALUE klass
, VALUE str
)
1511 return enc_find(klass
, str
);
1515 rb_ascii8bit_encoding(void)
1517 return global_enc_ascii
;
1521 rb_ascii8bit_encindex(void)
1523 return ENCINDEX_ASCII
;
1527 rb_utf8_encoding(void)
1529 return global_enc_utf_8
;
1533 rb_utf8_encindex(void)
1535 return ENCINDEX_UTF_8
;
1539 rb_usascii_encoding(void)
1541 return global_enc_us_ascii
;
1545 rb_usascii_encindex(void)
1547 return ENCINDEX_US_ASCII
;
1550 int rb_locale_charmap_index(void);
1553 rb_locale_encindex(void)
1555 int idx
= rb_locale_charmap_index();
1557 if (idx
< 0) idx
= ENCINDEX_UTF_8
;
1559 GLOBAL_ENC_TABLE_ENTER(enc_table
);
1560 if (enc_registered(enc_table
, "locale") < 0) {
1562 void Init_w32_codepage(void);
1563 Init_w32_codepage();
1565 enc_alias_internal(enc_table
, "locale", idx
);
1567 GLOBAL_ENC_TABLE_LEAVE();
1573 rb_locale_encoding(void)
1575 return rb_enc_from_index(rb_locale_encindex());
1579 rb_filesystem_encindex(void)
1583 GLOBAL_ENC_TABLE_EVAL(enc_table
,
1584 idx
= enc_registered(enc_table
, "filesystem"));
1587 idx
= ENCINDEX_ASCII
;
1592 rb_filesystem_encoding(void)
1594 return rb_enc_from_index(rb_filesystem_encindex());
1597 struct default_encoding
{
1598 int index
; /* -2 => not yet set, -1 => nil */
1602 static struct default_encoding default_external
= {0};
1605 enc_set_default_encoding(struct default_encoding
*def
, VALUE encoding
, const char *name
)
1607 int overridden
= FALSE
;
1609 if (def
->index
!= -2)
1613 GLOBAL_ENC_TABLE_ENTER(enc_table
);
1615 if (NIL_P(encoding
)) {
1618 st_insert(enc_table
->names
, (st_data_t
)strdup(name
),
1619 (st_data_t
)UNSPECIFIED_ENCODING
);
1622 def
->index
= rb_enc_to_index(rb_to_encoding(encoding
));
1624 enc_alias_internal(enc_table
, name
, def
->index
);
1627 if (def
== &default_external
) {
1628 enc_alias_internal(enc_table
, "filesystem", Init_enc_set_filesystem_encoding());
1631 GLOBAL_ENC_TABLE_LEAVE();
/*
 * Returns the default external encoding.
 *
 * Uses default_external.enc when it is already set.  Otherwise, when
 * default_external.index is non-negative, resolves it with
 * rb_enc_from_index(), stores the result in default_external.enc and
 * returns it.  Falls back to rb_locale_encoding() in the remaining case.
 */
1637 rb_default_external_encoding(void)
1639 if (default_external
.enc
) return default_external
.enc
;
1641 if (default_external
.index
>= 0) {
1642 default_external
.enc
= rb_enc_from_index(default_external
.index
);
1643 return default_external
.enc
;
1646 return rb_locale_encoding();
/* Passes rb_default_external_encoding() through rb_enc_from_encoding(). */
1651 rb_enc_default_external(void)
1653 return rb_enc_from_encoding(rb_default_external_encoding());
1658 * Encoding.default_external -> enc
1660 * Returns default external encoding.
1662 * The default external encoding is used by default for strings created from
1663 * the following locations:
1666 * * File data read from disk
1669 * * Zlib::GzipReader
1670 * * Zlib::GzipWriter
1674 * While strings created from these locations will have this encoding, the
1675 * encoding may not be valid. Be sure to check String#valid_encoding?.
1677 * File data written to disk will be transcoded to the default external
1678 * encoding when written, if default_internal is not nil.
1680 * The default external encoding is initialized by the -E option.
1681 * If -E isn't set, it is initialized to UTF-8 on Windows and the locale on
1682 * other operating systems.
/*
 * Implementation registered as the singleton method "default_external" of
 * Encoding; klass is not used, the result is rb_enc_default_external().
 */
1685 get_default_external(VALUE klass
)
1687 return rb_enc_default_external();
/*
 * Sets the default external encoding.
 *
 * Raises rb_eArgError ("default external can not be nil") when encoding is
 * nil; otherwise forwards to enc_set_default_encoding() with
 * &default_external.
 *
 * NOTE(review): the remaining argument(s) of the enc_set_default_encoding()
 * call and the code after it are not visible in this listing.
 */
1691 rb_enc_set_default_external(VALUE encoding
)
1693 if (NIL_P(encoding
)) {
1694 rb_raise(rb_eArgError
, "default external can not be nil");
1696 enc_set_default_encoding(&default_external
, encoding
,
1702 * Encoding.default_external = enc
1704 * Sets default external encoding. You should not set
1705 * Encoding::default_external in ruby code as strings created before changing
1706 * the value may have a different encoding from strings created after the value
1707 * was changed. Instead, you should use <tt>ruby -E</tt> to invoke ruby with
1708 * the correct default_external.
1710 * See Encoding::default_external for information on how the default external
/*
 * Implementation registered as the singleton method "default_external=" of
 * Encoding: issues rb_warning("setting Encoding.default_external") and then
 * delegates to rb_enc_set_default_external(encoding).  klass is not used in
 * the visible lines.
 */
1714 set_default_external(VALUE klass
, VALUE encoding
)
1716 rb_warning("setting Encoding.default_external");
1717 rb_enc_set_default_external(encoding
);
/*
 * State of the default internal encoding; `index` starts at -2, which the
 * comment on struct default_encoding describes as "not yet set".
 */
1721 static struct default_encoding default_internal
= {-2};
/*
 * Returns the default internal encoding, or NULL when there is none.
 *
 * When default_internal.enc is unset and default_internal.index is
 * non-negative, the encoding is first resolved with rb_enc_from_index() and
 * stored in default_internal.enc.
 */
1724 rb_default_internal_encoding(void)
1726 if (!default_internal
.enc
&& default_internal
.index
>= 0) {
1727 default_internal
.enc
= rb_enc_from_index(default_internal
.index
);
1729 return default_internal
.enc
; /* can be NULL */
/*
 * Passes rb_default_internal_encoding() (which may be NULL) through
 * rb_enc_from_encoding().
 */
1733 rb_enc_default_internal(void)
1735 /* Note: These functions cope with default_internal not being set */
1736 return rb_enc_from_encoding(rb_default_internal_encoding());
1741 * Encoding.default_internal -> enc
1743 * Returns default internal encoding. Strings will be transcoded to the
1744 * default internal encoding in the following places if the default internal
1745 * encoding is not nil:
1748 * * Etc.sysconfdir and Etc.systmpdir
1749 * * File data read from disk
1750 * * File names from Dir
1752 * * String#inspect and Regexp#inspect
1753 * * Strings returned from Readline
1754 * * Strings returned from SDBM
1757 * * Values in ARGV including $PROGRAM_NAME
1759 * Additionally String#encode and String#encode! use the default internal
1760 * encoding if no encoding is given.
1762 * The script encoding (__ENCODING__), not default_internal, is used as the
1763 * encoding of created strings.
1765 * Encoding::default_internal is initialized with -E option or nil otherwise.
/*
 * Implementation registered as the singleton method "default_internal" of
 * Encoding; klass is not used, the result is rb_enc_default_internal().
 */
1768 get_default_internal(VALUE klass
)
1770 return rb_enc_default_internal();
/*
 * Sets the default internal encoding by forwarding to
 * enc_set_default_encoding() with &default_internal.  Unlike the external
 * setter, no nil check is visible here.
 *
 * NOTE(review): the remaining argument(s) of the call and the code after it
 * are not visible in this listing.
 */
1774 rb_enc_set_default_internal(VALUE encoding
)
1776 enc_set_default_encoding(&default_internal
, encoding
,
1782 * Encoding.default_internal = enc or nil
1784 * Sets default internal encoding or removes default internal encoding when
1785 * passed nil. You should not set Encoding::default_internal in ruby code as
1786 * strings created before changing the value may have a different encoding
1787 * from strings created after the change. Instead you should use
1788 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal.
1790 * See Encoding::default_internal for information on how the default internal
/*
 * Implementation registered as the singleton method "default_internal=" of
 * Encoding: issues rb_warning("setting Encoding.default_internal") and then
 * delegates to rb_enc_set_default_internal(encoding).  klass is not used in
 * the visible lines.
 */
1794 set_default_internal(VALUE klass
, VALUE encoding
)
1796 rb_warning("setting Encoding.default_internal");
1797 rb_enc_set_default_internal(encoding
);
/*
 * Defines constant(s) on rb_cEncoding that refer to the Encoding object of
 * enc, deriving the constant name(s) from name.
 *
 * Steps visible in this listing:
 *  - returns immediately when name begins with a digit;
 *  - scans the following characters while they are alphanumeric or '_',
 *    noting lowercase letters, and returns when the scanned length exceeds
 *    ENCODING_NAMELEN_MAX;
 *  - calls rb_define_const() with name;
 *  - when the name is not valid as is or contains lowercase letters, checks
 *    the length again, copies the name into an ALLOCA_N buffer, converts
 *    lowercase characters to upper case, replaces non-alphanumeric
 *    characters with '_', and calls rb_define_const() again.
 *
 * NOTE(review): many lines of this function are missing from this listing
 * (loop headers, the updates of `valid`, reassignments of `name`, closing
 * braces), so the exact condition under which each rb_define_const() call
 * runs must be confirmed against the full source.
 */
1802 set_encoding_const(const char *name
, rb_encoding
*enc
)
1804 VALUE encoding
= rb_enc_from_encoding(enc
);
1805 char *s
= (char *)name
;
1806 int haslower
= 0, hasupper
= 0, valid
= 0;
1808 if (ISDIGIT(*s
)) return;
1811 while (*++s
&& (ISALNUM(*s
) || *s
== '_')) {
1812 if (ISLOWER(*s
)) haslower
= 1;
1816 if (s
- name
> ENCODING_NAMELEN_MAX
) return;
1818 rb_define_const(rb_cEncoding
, name
, encoding
);
1820 if (!valid
|| haslower
) {
1821 size_t len
= s
- name
;
1822 if (len
> ENCODING_NAMELEN_MAX
) return;
1823 if (!haslower
|| !hasupper
) {
1825 if (ISLOWER(*s
)) haslower
= 1;
1826 if (ISUPPER(*s
)) hasupper
= 1;
1827 } while (*++s
&& (!haslower
|| !hasupper
));
1831 if (len
++ > ENCODING_NAMELEN_MAX
) return;
1832 MEMCPY(s
= ALLOCA_N(char, len
), name
, char, len
);
1835 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1837 if (!ISALNUM(*s
)) *s
= '_';
1840 rb_define_const(rb_cEncoding
, name
, encoding
);
1844 for (s
= (char *)name
; *s
; ++s
) {
1845 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1847 rb_define_const(rb_cEncoding
, name
, encoding
);
/*
 * st_foreach() callback used by rb_enc_name_list().
 *
 * name - table key, treated as a C string.
 * idx  - table value; not used in the visible lines.
 * arg  - the Ruby array (as VALUE) that collects the names.
 *
 * Converts name with rb_fstring_cstr() and pushes the string onto the array.
 * NOTE(review): the return statement is not visible in this listing.
 */
1853 rb_enc_name_list_i(st_data_t name
, st_data_t idx
, st_data_t arg
)
1855 VALUE ary
= (VALUE
)arg
;
1856 VALUE str
= rb_fstring_cstr((char *)name
);
1857 rb_ary_push(ary
, str
);
1863 * Encoding.name_list -> ["enc1", "enc2", ...]
1865 * Returns the list of available encoding names.
1867 * Encoding.name_list
1868 * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1869 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1871 * "BINARY", "CP932", "eucJP"]
/*
 * Implementation registered as the singleton method "name_list" of Encoding.
 *
 * Between GLOBAL_ENC_TABLE_ENTER and GLOBAL_ENC_TABLE_LEAVE, allocates an
 * array sized to the number of entries in enc_table->names and fills it by
 * running rb_enc_name_list_i over that table.  klass is not used in the
 * visible lines.
 * NOTE(review): the declaration of ary and the return statement are not
 * visible in this listing.
 */
1876 rb_enc_name_list(VALUE klass
)
1880 GLOBAL_ENC_TABLE_ENTER(enc_table
);
1882 ary
= rb_ary_new2(enc_table
->names
->num_entries
);
1883 st_foreach(enc_table
->names
, rb_enc_name_list_i
, (st_data_t
)ary
);
1885 GLOBAL_ENC_TABLE_LEAVE();
/*
 * st_foreach() callback used by rb_enc_aliases() to build the
 * alias => original-name hash.
 *
 * name - table key, treated as a C string.
 * orig - table value, converted to the int encoding index idx.
 * arg  - pointer to two VALUEs: [0] the result hash, [1] an array holding
 *        name strings by encoding index.
 *
 * Visible behavior: fetches the string stored for idx in the array; looks
 * up the encoding for idx and continues with the next entry (ST_CONTINUE)
 * when there is none; compares name with the encoding's own name
 * case-insensitively (STRCASECMP); stores rb_fstring_cstr(rb_enc_name(enc))
 * in the array at idx; and records aliases[name] = str.
 *
 * NOTE(review): the branch structure (what happens on a name match, when
 * the array entry is (re)computed) and the final return are missing from
 * this listing -- confirm against the full source.
 */
1891 rb_enc_aliases_enc_i(st_data_t name
, st_data_t orig
, st_data_t arg
)
1893 VALUE
*p
= (VALUE
*)arg
;
1894 VALUE aliases
= p
[0], ary
= p
[1];
1895 int idx
= (int)orig
;
1896 VALUE key
, str
= rb_ary_entry(ary
, idx
);
1899 rb_encoding
*enc
= rb_enc_from_index(idx
);
1901 if (!enc
) return ST_CONTINUE
;
1902 if (STRCASECMP((char*)name
, rb_enc_name(enc
)) == 0) {
1905 str
= rb_fstring_cstr(rb_enc_name(enc
));
1906 rb_ary_store(ary
, idx
, str
);
1908 key
= rb_fstring_cstr((char *)name
);
1909 rb_hash_aset(aliases
, key
, str
);
1915 * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1917 * Returns a hash that maps each available encoding alias to its original encoding name.
1920 * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
1921 * "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
/*
 * Implementation registered as the singleton method "aliases" of Encoding.
 *
 * Prepares a two-element work area -- [0] a new hash for the result, [1] a
 * new array used by the callback -- and runs rb_enc_aliases_enc_i over
 * enc_table->names inside GLOBAL_ENC_TABLE_EVAL.  klass is not used in the
 * visible lines.
 * NOTE(review): the declaration of `aliases` and the return statement are
 * not visible in this listing.
 */
1926 rb_enc_aliases(VALUE klass
)
1929 aliases
[0] = rb_hash_new();
1930 aliases
[1] = rb_ary_new();
1932 GLOBAL_ENC_TABLE_EVAL(enc_table
,
1933 st_foreach(enc_table
->names
, rb_enc_aliases_enc_i
, (st_data_t
)aliases
));
1939 * An Encoding instance represents a character encoding usable in Ruby. It is
1940 * defined as a constant under the Encoding namespace. It has a name and
1941 * optionally, aliases:
1943 * Encoding::ISO_8859_1.name
1946 * Encoding::ISO_8859_1.names
1947 * #=> ["ISO-8859-1", "ISO8859-1"]
1949 * Ruby methods dealing with encodings return or accept Encoding instances as
1950 * arguments (when a method accepts an Encoding instance as an argument, it
1951 * can be passed an Encoding name or alias instead).
1953 * "some string".encoding
1954 * #=> #<Encoding:UTF-8>
1956 * string = "some string".encode(Encoding::ISO_8859_1)
1959 * #=> #<Encoding:ISO-8859-1>
1961 * "some string".encode "ISO-8859-1"
1964 * Encoding::ASCII_8BIT is a special encoding that is usually used for
1965 * a byte string, not a character string. But as the name suggests, its
1966 * characters in the range of ASCII are considered as ASCII
1967 * characters. This is useful when you use ASCII-8BIT characters with
1968 * other ASCII compatible characters.
1970 * == Changing an encoding
1972 * The associated Encoding of a String can be changed in two different ways.
1974 * First, it is possible to set the Encoding of a string to a new Encoding
1975 * without changing the internal byte representation of the string, with
1976 * String#force_encoding. This is how you can tell Ruby the correct encoding
1980 * #=> "R\xC3\xA9sum\xC3\xA9"
1982 * #=> #<Encoding:ISO-8859-1>
1983 * string.force_encoding(Encoding::UTF_8)
1984 * #=> "R\u00E9sum\u00E9"
1986 * Second, it is possible to transcode a string, i.e. translate its internal
1987 * byte representation to another encoding. Its associated encoding is also
1988 * set to the other encoding. See String#encode for the various forms of
1989 * transcoding, and the Encoding::Converter class for additional control over
1990 * the transcoding process.
1993 * #=> "R\u00E9sum\u00E9"
1995 * #=> #<Encoding:UTF-8>
1996 * string = string.encode!(Encoding::ISO_8859_1)
1997 * #=> "R\xE9sum\xE9"
1999 * #=> #<Encoding:ISO-8859-1>
2001 * == Script encoding
2003 * All Ruby script code has an associated Encoding which any String literal
2004 * created in the source code will be associated to.
2006 * The default script encoding is Encoding::UTF_8 after v2.0, but it
2007 * can be changed by a magic comment on the first line of the source
2008 * code file (or second line, if there is a shebang line on the
2009 * first). The comment must contain the word <code>coding</code> or
2010 * <code>encoding</code>, followed by a colon, space and the Encoding
2015 * "some string".encoding
2016 * #=> #<Encoding:UTF-8>
2018 * The <code>__ENCODING__</code> keyword returns the script encoding of the file
2019 * in which the keyword is written:
2021 * # encoding: ISO-8859-1
2024 * #=> #<Encoding:ISO-8859-1>
2026 * <code>ruby -K</code> will change the default locale encoding, but this is
2027 * not recommended. Ruby source files should declare their script encoding by a
2028 * magic comment even when they only depend on US-ASCII strings or regular
2031 * == Locale encoding
2033 * The default encoding of the environment. Usually derived from locale.
2035 * see Encoding.locale_charmap, Encoding.find('locale')
2037 * == Filesystem encoding
2039 * The default encoding of strings from the filesystem of the environment.
2040 * This is used for strings of file names or paths.
2042 * see Encoding.find('filesystem')
2044 * == External encoding
2046 * Each IO object has an external encoding which indicates the encoding that
2047 * Ruby will use to read its data. By default Ruby sets the external encoding
2048 * of an IO object to the default external encoding. The default external
2049 * encoding is set by locale encoding or the interpreter <code>-E</code> option.
2050 * Encoding.default_external returns the current value of the external
2055 * Encoding.default_external
2056 * #=> #<Encoding:UTF-8>
2058 * $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
2059 * #<Encoding:ISO-8859-1>
2061 * $ LANG=C ruby -e 'p Encoding.default_external'
2062 * #<Encoding:US-ASCII>
2064 * The default external encoding may also be set through
2065 * Encoding.default_external=, but you should not do this as strings created
2066 * before and after the change will have inconsistent encodings. Instead use
2067 * <code>ruby -E</code> to invoke ruby with the correct external encoding.
2069 * When you know that the actual encoding of the data of an IO object is not
2070 * the default external encoding, you can reset its external encoding with
2071 * IO#set_encoding or set it at IO object creation (see IO.new options).
2073 * == Internal encoding
2075 * To process the data of an IO object which has an encoding different
2076 * from its external encoding, you can set its internal encoding. Ruby will use
2077 * this internal encoding to transcode the data when it is read from the IO
2080 * Conversely, when data is written to the IO object it is transcoded from the
2081 * internal encoding to the external encoding of the IO object.
2083 * The internal encoding of an IO object can be set with
2084 * IO#set_encoding or at IO object creation (see IO.new options).
2086 * The internal encoding is optional and when not set, the Ruby default
2087 * internal encoding is used. If not explicitly set this default internal
2088 * encoding is +nil+ meaning that by default, no transcoding occurs.
2090 * The default internal encoding can be set with the interpreter option
2091 * <code>-E</code>. Encoding.default_internal returns the current internal
2094 * $ ruby -e 'p Encoding.default_internal'
2097 * $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
2098 * Encoding.default_internal]"
2099 * [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
2101 * The default internal encoding may also be set through
2102 * Encoding.default_internal=, but you should not do this as strings created
2103 * before and after the change will have inconsistent encodings. Instead use
2104 * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
2106 * == IO encoding example
2108 * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is transcoded for
2109 * output to ISO-8859-1 encoding, then read back in and transcoded to UTF-8:
2111 * string = "R\u00E9sum\u00E9"
2113 * open("transcoded.txt", "w:ISO-8859-1") do |io|
2118 * p File.binread("transcoded.txt")
2121 * open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
2122 * puts "transcoded text:"
2126 * While writing the file, the internal encoding is not specified as it is
2127 * only necessary for reading. While reading the file both the internal and
2128 * external encoding must be specified to obtain the correct result.
2135 * "R\u00E9sum\u00E9"
/*
 * Setup of the Encoding class.
 * NOTE(review): the header of the enclosing function and its local
 * declarations (list, i) are not visible in this listing.
 *
 * Creates class "Encoding" under rb_cObject with enc_s_alloc as allocator,
 * undefines "new" on its singleton class, and registers the instance and
 * singleton methods listed below.
 */
2145 rb_cEncoding
= rb_define_class("Encoding", rb_cObject
);
2146 rb_define_alloc_func(rb_cEncoding
, enc_s_alloc
);
2147 rb_undef_method(CLASS_OF(rb_cEncoding
), "new");
2148 rb_define_method(rb_cEncoding
, "to_s", enc_name
, 0);
2149 rb_define_method(rb_cEncoding
, "inspect", enc_inspect
, 0);
2150 rb_define_method(rb_cEncoding
, "name", enc_name
, 0);
2151 rb_define_method(rb_cEncoding
, "names", enc_names
, 0);
2152 rb_define_method(rb_cEncoding
, "dummy?", enc_dummy_p
, 0);
2153 rb_define_method(rb_cEncoding
, "ascii_compatible?", enc_ascii_compatible_p
, 0);
2154 rb_define_method(rb_cEncoding
, "replicate", enc_replicate_m
, 1);
2155 rb_define_singleton_method(rb_cEncoding
, "list", enc_list
, 0);
2156 rb_define_singleton_method(rb_cEncoding
, "name_list", rb_enc_name_list
, 0);
2157 rb_define_singleton_method(rb_cEncoding
, "aliases", rb_enc_aliases
, 0);
2158 rb_define_singleton_method(rb_cEncoding
, "find", enc_find
, 1);
2159 rb_define_singleton_method(rb_cEncoding
, "compatible?", enc_compatible_p
, 2);
/* Marshal hooks: instance-side _dump and class-side _load. */
2161 rb_define_method(rb_cEncoding
, "_dump", enc_dump
, -1);
2162 rb_define_singleton_method(rb_cEncoding
, "_load", enc_load
, 1);
/* Accessors for the process-wide default external/internal encodings. */
2164 rb_define_singleton_method(rb_cEncoding
, "default_external", get_default_external
, 0);
2165 rb_define_singleton_method(rb_cEncoding
, "default_external=", set_default_external
, 1);
2166 rb_define_singleton_method(rb_cEncoding
, "default_internal", get_default_internal
, 0);
2167 rb_define_singleton_method(rb_cEncoding
, "default_internal=", set_default_internal
, 1);
2168 rb_define_singleton_method(rb_cEncoding
, "locale_charmap", rb_locale_charmap
, 0); /* in localeinit.c */
2170 struct enc_table
*enc_table
= &global_enc_table
;
/* The default list is created with DEFAULT_ENCODING_LIST_CAPA slots, so the
 * table must not already hold more entries than that. */
2172 if (DEFAULT_ENCODING_LIST_CAPA
< enc_table
->count
) rb_bug("DEFAULT_ENCODING_LIST_CAPA is too small");
/* Both list arrays get RBASIC_CLEAR_CLASS applied and are registered with
 * rb_gc_register_mark_object(). */
2174 list
= rb_additional_encoding_list
= rb_ary_new();
2175 RBASIC_CLEAR_CLASS(list
);
2176 rb_gc_register_mark_object(list
);
2178 list
= rb_default_encoding_list
= rb_ary_new2(DEFAULT_ENCODING_LIST_CAPA
);
2179 RBASIC_CLEAR_CLASS(list
);
2180 rb_gc_register_mark_object(list
);
/* Fill the default list with one enc_new() object per table entry. */
2182 for (i
= 0; i
< enc_table
->count
; ++i
) {
2183 rb_ary_push(list
, enc_new(enc_table
->list
[i
].enc
));
2186 rb_marshal_define_compat(rb_cEncoding
, Qnil
, 0, enc_m_loader
);
/* Initializes the global encoding table by calling rb_enc_init() on it. */
2190 Init_encodings(void)
2192 rb_enc_init(&global_enc_table
);
2195 /* locale insensitive ctype functions */
/*
 * Runs func over every entry of the encoding name table.
 *
 * func - st_foreach()-style callback receiving (name, idx, arg).
 * arg  - opaque value handed to func unchanged.
 *
 * The iteration happens inside GLOBAL_ENC_TABLE_EVAL, i.e. between
 * RB_VM_LOCK_ENTER() and RB_VM_LOCK_LEAVE() as defined by that macro.
 */
2198 rb_enc_foreach_name(int (*func
)(st_data_t name
, st_data_t idx
, st_data_t arg
), st_data_t arg
)
2200 GLOBAL_ENC_TABLE_EVAL(enc_table
, st_foreach(enc_table
->names
, func
, arg
));