* lisp/subr.el (filter): Remove. Use `cl-remove-if-not' or `seq-filter'.
[emacs.git] / src / charset.c
blob171a00f23d1116164885dc688a5198cd1a7cdfd1
1 /* Basic character set support.
3 Copyright (C) 2001-2014 Free Software Foundation, Inc.
5 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H14PRO021
10 Copyright (C) 2003, 2004
11 National Institute of Advanced Industrial Science and Technology (AIST)
12 Registration Number H13PRO009
14 This file is part of GNU Emacs.
16 GNU Emacs is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or
19 (at your option) any later version.
21 GNU Emacs is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
26 You should have received a copy of the GNU General Public License
27 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
29 #include <config.h>
31 #include <errno.h>
32 #include <stdio.h>
33 #include <unistd.h>
34 #include <limits.h>
35 #include <sys/types.h>
36 #include <c-ctype.h>
37 #include "lisp.h"
38 #include "character.h"
39 #include "charset.h"
40 #include "coding.h"
41 #include "disptab.h"
42 #include "buffer.h"
44 /*** GENERAL NOTES on CODED CHARACTER SETS (CHARSETS) ***
46 A coded character set ("charset" hereafter) is a meaningful
47 collection (i.e. language, culture, functionality, etc.) of
48 characters. Emacs handles multiple charsets at once. In Emacs Lisp
49 code, a charset is represented by a symbol. In C code, a charset is
50 represented by its ID number or by a pointer to a struct charset.
52 The actual information about each charset is stored in two places.
53 Lispy information is stored in the hash table Vcharset_hash_table as
54 a vector (charset attributes). The other information is stored in
55 charset_table as a struct charset.
59 /* Hash table that contains attributes of each charset. Keys are
60 charset symbols, and values are vectors of charset attributes. */
61 Lisp_Object Vcharset_hash_table;
63 /* Table of struct charset. */
64 struct charset *charset_table;
66 static ptrdiff_t charset_table_size;
67 static int charset_table_used;
69 Lisp_Object Qcharsetp;
71 /* Special charset symbols. */
72 Lisp_Object Qascii;
73 static Lisp_Object Qeight_bit;
74 static Lisp_Object Qiso_8859_1;
75 static Lisp_Object Qunicode;
76 static Lisp_Object Qemacs;
78 /* The corresponding charsets. */
79 int charset_ascii;
80 int charset_eight_bit;
81 static int charset_iso_8859_1;
82 int charset_unicode;
83 static int charset_emacs;
85 /* The other special charsets. */
86 int charset_jisx0201_roman;
87 int charset_jisx0208_1978;
88 int charset_jisx0208;
89 int charset_ksc5601;
91 /* Value of charset attribute `charset-iso-plane'. */
92 static Lisp_Object Qgl, Qgr;
94 /* Charset of unibyte characters. */
95 int charset_unibyte;
97 /* List of charsets ordered by the priority. */
98 Lisp_Object Vcharset_ordered_list;
100 /* Sub-list of Vcharset_ordered_list that contains all non-preferred
101 charsets. */
102 Lisp_Object Vcharset_non_preferred_head;
104 /* Incremented everytime we change Vcharset_ordered_list. This is
105 unsigned short so that it fits in Lisp_Int and never matches
106 -1. */
107 unsigned short charset_ordered_list_tick;
109 /* List of iso-2022 charsets. */
110 Lisp_Object Viso_2022_charset_list;
112 /* List of emacs-mule charsets. */
113 Lisp_Object Vemacs_mule_charset_list;
115 int emacs_mule_charset[256];
117 /* Mapping table from ISO2022's charset (specified by DIMENSION,
118 CHARS, and FINAL-CHAR) to Emacs' charset. */
119 int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
121 #define CODE_POINT_TO_INDEX(charset, code) \
122 ((charset)->code_linear_p \
123 ? (int) ((code) - (charset)->min_code) \
124 : (((charset)->code_space_mask[(code) >> 24] & 0x8) \
125 && ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4) \
126 && ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2) \
127 && ((charset)->code_space_mask[(code) & 0xFF] & 0x1)) \
128 ? (int) (((((code) >> 24) - (charset)->code_space[12]) \
129 * (charset)->code_space[11]) \
130 + (((((code) >> 16) & 0xFF) - (charset)->code_space[8]) \
131 * (charset)->code_space[7]) \
132 + (((((code) >> 8) & 0xFF) - (charset)->code_space[4]) \
133 * (charset)->code_space[3]) \
134 + (((code) & 0xFF) - (charset)->code_space[0]) \
135 - ((charset)->char_index_offset)) \
136 : -1)
139 /* Return the code-point for the character index IDX in CHARSET.
140 IDX should be an unsigned int variable in a valid range (which is
141 always in nonnegative int range too). IDX contains garbage afterwards. */
143 #define INDEX_TO_CODE_POINT(charset, idx) \
144 ((charset)->code_linear_p \
145 ? (idx) + (charset)->min_code \
146 : (idx += (charset)->char_index_offset, \
147 (((charset)->code_space[0] + (idx) % (charset)->code_space[2]) \
148 | (((charset)->code_space[4] \
149 + ((idx) / (charset)->code_space[3] % (charset)->code_space[6])) \
150 << 8) \
151 | (((charset)->code_space[8] \
152 + ((idx) / (charset)->code_space[7] % (charset)->code_space[10])) \
153 << 16) \
154 | (((charset)->code_space[12] + ((idx) / (charset)->code_space[11])) \
155 << 24))))
157 /* Structure to hold mapping tables for a charset. Used by temacs
158 invoked for dumping. */
160 static struct
162 /* The current charset for which the following tables are setup. */
163 struct charset *current;
165 /* 1 iff the following table is used for encoder. */
166 short for_encoder;
168 /* When the following table is used for encoding, minimum and
169 maximum character of the current charset. */
170 int min_char, max_char;
172 /* A Unicode character corresponding to the code index 0 (i.e. the
173 minimum code-point) of the current charset, or -1 if the code
174 index 0 is not a Unicode character. This is checked when
175 table.encoder[CHAR] is zero. */
176 int zero_index_char;
178 union {
179 /* Table mapping code-indices (not code-points) of the current
180 charset to Unicode characters. If decoder[CHAR] is -1, CHAR
181 doesn't belong to the current charset. */
182 int decoder[0x10000];
183 /* Table mapping Unicode characters to code-indices of the current
184 charset. The first 0x10000 elements are for BMP (0..0xFFFF),
185 and the last 0x10000 are for SMP (0x10000..0x1FFFF) or SIP
186 (0x20000..0x2FFFF). Note that there is no charset map that
187 uses both SMP and SIP. */
188 unsigned short encoder[0x20000];
189 } table;
190 } *temp_charset_work;
192 #define SET_TEMP_CHARSET_WORK_ENCODER(C, CODE) \
193 do { \
194 if ((CODE) == 0) \
195 temp_charset_work->zero_index_char = (C); \
196 else if ((C) < 0x20000) \
197 temp_charset_work->table.encoder[(C)] = (CODE); \
198 else \
199 temp_charset_work->table.encoder[(C) - 0x10000] = (CODE); \
200 } while (0)
202 #define GET_TEMP_CHARSET_WORK_ENCODER(C) \
203 ((C) == temp_charset_work->zero_index_char ? 0 \
204 : (C) < 0x20000 ? (temp_charset_work->table.encoder[(C)] \
205 ? (int) temp_charset_work->table.encoder[(C)] : -1) \
206 : temp_charset_work->table.encoder[(C) - 0x10000] \
207 ? temp_charset_work->table.encoder[(C) - 0x10000] : -1)
209 #define SET_TEMP_CHARSET_WORK_DECODER(C, CODE) \
210 (temp_charset_work->table.decoder[(CODE)] = (C))
212 #define GET_TEMP_CHARSET_WORK_DECODER(CODE) \
213 (temp_charset_work->table.decoder[(CODE)])
216 /* Set to 1 to warn that a charset map is loaded and thus a buffer
217 text and a string data may be relocated. */
218 bool charset_map_loaded;
220 struct charset_map_entries
222 struct {
223 unsigned from, to;
224 int c;
225 } entry[0x10000];
226 struct charset_map_entries *next;
229 /* Load the mapping information of CHARSET from ENTRIES for
230 initializing (CONTROL_FLAG == 0), decoding (CONTROL_FLAG == 1), and
231 encoding (CONTROL_FLAG == 2).
233 If CONTROL_FLAG is 0, setup CHARSET->min_char, CHARSET->max_char,
234 and CHARSET->fast_map.
236 If CONTROL_FLAG is 1, setup the following tables according to
237 CHARSET->method and inhibit_load_charset_map.
239 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
240 ----------------------+--------------------+---------------------------
241 CHARSET_METHOD_MAP | CHARSET->decoder | temp_charset_work->decoder
242 ----------------------+--------------------+---------------------------
243 CHARSET_METHOD_OFFSET | Vchar_unify_table | temp_charset_work->decoder
245 If CONTROL_FLAG is 2, setup the following tables.
247 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
248 ----------------------+--------------------+---------------------------
249 CHARSET_METHOD_MAP | CHARSET->encoder | temp_charset_work->encoder
250 ----------------------+--------------------+--------------------------
251 CHARSET_METHOD_OFFSET | CHARSET->deunifier | temp_charset_work->encoder
254 static void
255 load_charset_map (struct charset *charset, struct charset_map_entries *entries, int n_entries, int control_flag)
257 Lisp_Object vec IF_LINT (= Qnil), table IF_LINT (= Qnil);
258 unsigned max_code = CHARSET_MAX_CODE (charset);
259 bool ascii_compatible_p = charset->ascii_compatible_p;
260 int min_char, max_char, nonascii_min_char;
261 int i;
262 unsigned char *fast_map = charset->fast_map;
264 if (n_entries <= 0)
265 return;
267 if (control_flag)
269 if (! inhibit_load_charset_map)
271 if (control_flag == 1)
273 if (charset->method == CHARSET_METHOD_MAP)
275 int n = CODE_POINT_TO_INDEX (charset, max_code) + 1;
277 vec = Fmake_vector (make_number (n), make_number (-1));
278 set_charset_attr (charset, charset_decoder, vec);
280 else
282 char_table_set_range (Vchar_unify_table,
283 charset->min_char, charset->max_char,
284 Qnil);
287 else
289 table = Fmake_char_table (Qnil, Qnil);
290 set_charset_attr (charset,
291 (charset->method == CHARSET_METHOD_MAP
292 ? charset_encoder : charset_deunifier),
293 table);
296 else
298 if (! temp_charset_work)
299 temp_charset_work = xmalloc (sizeof *temp_charset_work);
300 if (control_flag == 1)
302 memset (temp_charset_work->table.decoder, -1,
303 sizeof (int) * 0x10000);
305 else
307 memset (temp_charset_work->table.encoder, 0,
308 sizeof (unsigned short) * 0x20000);
309 temp_charset_work->zero_index_char = -1;
311 temp_charset_work->current = charset;
312 temp_charset_work->for_encoder = (control_flag == 2);
313 control_flag += 2;
315 charset_map_loaded = 1;
318 min_char = max_char = entries->entry[0].c;
319 nonascii_min_char = MAX_CHAR;
320 for (i = 0; i < n_entries; i++)
322 unsigned from, to;
323 int from_index, to_index, lim_index;
324 int from_c, to_c;
325 int idx = i % 0x10000;
327 if (i > 0 && idx == 0)
328 entries = entries->next;
329 from = entries->entry[idx].from;
330 to = entries->entry[idx].to;
331 from_c = entries->entry[idx].c;
332 from_index = CODE_POINT_TO_INDEX (charset, from);
333 if (from == to)
335 to_index = from_index;
336 to_c = from_c;
338 else
340 to_index = CODE_POINT_TO_INDEX (charset, to);
341 to_c = from_c + (to_index - from_index);
343 if (from_index < 0 || to_index < 0)
344 continue;
345 lim_index = to_index + 1;
347 if (to_c > max_char)
348 max_char = to_c;
349 else if (from_c < min_char)
350 min_char = from_c;
352 if (control_flag == 1)
354 if (charset->method == CHARSET_METHOD_MAP)
355 for (; from_index < lim_index; from_index++, from_c++)
356 ASET (vec, from_index, make_number (from_c));
357 else
358 for (; from_index < lim_index; from_index++, from_c++)
359 CHAR_TABLE_SET (Vchar_unify_table,
360 CHARSET_CODE_OFFSET (charset) + from_index,
361 make_number (from_c));
363 else if (control_flag == 2)
365 if (charset->method == CHARSET_METHOD_MAP
366 && CHARSET_COMPACT_CODES_P (charset))
367 for (; from_index < lim_index; from_index++, from_c++)
369 unsigned code = from_index;
370 code = INDEX_TO_CODE_POINT (charset, code);
372 if (NILP (CHAR_TABLE_REF (table, from_c)))
373 CHAR_TABLE_SET (table, from_c, make_number (code));
375 else
376 for (; from_index < lim_index; from_index++, from_c++)
378 if (NILP (CHAR_TABLE_REF (table, from_c)))
379 CHAR_TABLE_SET (table, from_c, make_number (from_index));
382 else if (control_flag == 3)
383 for (; from_index < lim_index; from_index++, from_c++)
384 SET_TEMP_CHARSET_WORK_DECODER (from_c, from_index);
385 else if (control_flag == 4)
386 for (; from_index < lim_index; from_index++, from_c++)
387 SET_TEMP_CHARSET_WORK_ENCODER (from_c, from_index);
388 else /* control_flag == 0 */
390 if (ascii_compatible_p)
392 if (! ASCII_CHAR_P (from_c))
394 if (from_c < nonascii_min_char)
395 nonascii_min_char = from_c;
397 else if (! ASCII_CHAR_P (to_c))
399 nonascii_min_char = 0x80;
403 for (; from_c <= to_c; from_c++)
404 CHARSET_FAST_MAP_SET (from_c, fast_map);
408 if (control_flag == 0)
410 CHARSET_MIN_CHAR (charset) = (ascii_compatible_p
411 ? nonascii_min_char : min_char);
412 CHARSET_MAX_CHAR (charset) = max_char;
414 else if (control_flag == 4)
416 temp_charset_work->min_char = min_char;
417 temp_charset_work->max_char = max_char;
422 /* Read a hexadecimal number (preceded by "0x") from the file FP while
423 paying attention to comment character '#'. */
425 static unsigned
426 read_hex (FILE *fp, bool *eof, bool *overflow)
428 int c;
429 unsigned n;
431 while ((c = getc (fp)) != EOF)
433 if (c == '#')
435 while ((c = getc (fp)) != EOF && c != '\n');
437 else if (c == '0')
439 if ((c = getc (fp)) == EOF || c == 'x')
440 break;
443 if (c == EOF)
445 *eof = 1;
446 return 0;
448 n = 0;
449 while (c_isxdigit (c = getc (fp)))
451 if (UINT_MAX >> 4 < n)
452 *overflow = 1;
453 n = ((n << 4)
454 | (c - ('0' <= c && c <= '9' ? '0'
455 : 'A' <= c && c <= 'F' ? 'A' - 10
456 : 'a' - 10)));
458 if (c != EOF)
459 ungetc (c, fp);
460 return n;
463 /* Return a mapping vector for CHARSET loaded from MAPFILE.
464 Each line of MAPFILE has this form
465 0xAAAA 0xCCCC
466 where 0xAAAA is a code-point and 0xCCCC is the corresponding
467 character code, or this form
468 0xAAAA-0xBBBB 0xCCCC
469 where 0xAAAA and 0xBBBB are code-points specifying a range, and
470 0xCCCC is the first character code of the range.
472 The returned vector has this form:
473 [ CODE1 CHAR1 CODE2 CHAR2 .... ]
474 where CODE1 is a code-point or a cons of code-points specifying a
475 range.
477 Note that this function uses `openp' to open MAPFILE but ignores
478 `file-name-handler-alist' to avoid running any Lisp code. */
480 static void
481 load_charset_map_from_file (struct charset *charset, Lisp_Object mapfile,
482 int control_flag)
484 unsigned min_code = CHARSET_MIN_CODE (charset);
485 unsigned max_code = CHARSET_MAX_CODE (charset);
486 int fd;
487 FILE *fp;
488 struct charset_map_entries *head, *entries;
489 int n_entries;
490 AUTO_STRING (map, ".map");
491 AUTO_STRING (txt, ".txt");
492 AUTO_LIST2 (suffixes, map, txt);
493 ptrdiff_t count = SPECPDL_INDEX ();
494 record_unwind_protect_nothing ();
495 specbind (Qfile_name_handler_alist, Qnil);
496 fd = openp (Vcharset_map_path, mapfile, suffixes, NULL, Qnil, false);
497 fp = fd < 0 ? 0 : fdopen (fd, "r");
498 if (!fp)
500 int open_errno = errno;
501 emacs_close (fd);
502 report_file_errno ("Loading charset map", mapfile, open_errno);
504 set_unwind_protect_ptr (count, fclose_unwind, fp);
505 unbind_to (count + 1, Qnil);
507 /* Use record_xmalloc, as `charset_map_entries' is
508 large (larger than MAX_ALLOCA). */
509 head = record_xmalloc (sizeof *head);
510 entries = head;
511 memset (entries, 0, sizeof (struct charset_map_entries));
513 n_entries = 0;
514 while (1)
516 unsigned from, to, c;
517 int idx;
518 bool eof = 0, overflow = 0;
520 from = read_hex (fp, &eof, &overflow);
521 if (eof)
522 break;
523 if (getc (fp) == '-')
524 to = read_hex (fp, &eof, &overflow);
525 else
526 to = from;
527 if (eof)
528 break;
529 c = read_hex (fp, &eof, &overflow);
530 if (eof)
531 break;
533 if (overflow)
534 continue;
535 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
536 continue;
538 if (n_entries == 0x10000)
540 entries->next = record_xmalloc (sizeof *entries->next);
541 entries = entries->next;
542 memset (entries, 0, sizeof (struct charset_map_entries));
543 n_entries = 0;
545 idx = n_entries;
546 entries->entry[idx].from = from;
547 entries->entry[idx].to = to;
548 entries->entry[idx].c = c;
549 n_entries++;
551 fclose (fp);
552 clear_unwind_protect (count);
554 load_charset_map (charset, head, n_entries, control_flag);
555 unbind_to (count, Qnil);
558 static void
559 load_charset_map_from_vector (struct charset *charset, Lisp_Object vec, int control_flag)
561 unsigned min_code = CHARSET_MIN_CODE (charset);
562 unsigned max_code = CHARSET_MAX_CODE (charset);
563 struct charset_map_entries *head, *entries;
564 int n_entries;
565 int len = ASIZE (vec);
566 int i;
567 USE_SAFE_ALLOCA;
569 if (len % 2 == 1)
571 add_to_log ("Failure in loading charset map: %V", vec, Qnil);
572 return;
575 /* Use SAFE_ALLOCA instead of alloca, as `charset_map_entries' is
576 large (larger than MAX_ALLOCA). */
577 head = SAFE_ALLOCA (sizeof *head);
578 entries = head;
579 memset (entries, 0, sizeof (struct charset_map_entries));
581 n_entries = 0;
582 for (i = 0; i < len; i += 2)
584 Lisp_Object val, val2;
585 unsigned from, to;
586 EMACS_INT c;
587 int idx;
589 val = AREF (vec, i);
590 if (CONSP (val))
592 val2 = XCDR (val);
593 val = XCAR (val);
594 from = XFASTINT (val);
595 to = XFASTINT (val2);
597 else
598 from = to = XFASTINT (val);
599 val = AREF (vec, i + 1);
600 CHECK_NATNUM (val);
601 c = XFASTINT (val);
603 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
604 continue;
606 if (n_entries > 0 && (n_entries % 0x10000) == 0)
608 entries->next = SAFE_ALLOCA (sizeof *entries->next);
609 entries = entries->next;
610 memset (entries, 0, sizeof (struct charset_map_entries));
612 idx = n_entries % 0x10000;
613 entries->entry[idx].from = from;
614 entries->entry[idx].to = to;
615 entries->entry[idx].c = c;
616 n_entries++;
619 load_charset_map (charset, head, n_entries, control_flag);
620 SAFE_FREE ();
624 /* Load a mapping table for CHARSET. CONTROL-FLAG tells what kind of
625 map it is (see the comment of load_charset_map for the detail). */
627 static void
628 load_charset (struct charset *charset, int control_flag)
630 Lisp_Object map;
632 if (inhibit_load_charset_map
633 && temp_charset_work
634 && charset == temp_charset_work->current
635 && ((control_flag == 2) == temp_charset_work->for_encoder))
636 return;
638 if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
639 map = CHARSET_MAP (charset);
640 else
642 if (! CHARSET_UNIFIED_P (charset))
643 emacs_abort ();
644 map = CHARSET_UNIFY_MAP (charset);
646 if (STRINGP (map))
647 load_charset_map_from_file (charset, map, control_flag);
648 else
649 load_charset_map_from_vector (charset, map, control_flag);
653 DEFUN ("charsetp", Fcharsetp, Scharsetp, 1, 1, 0,
654 doc: /* Return non-nil if and only if OBJECT is a charset.*/)
655 (Lisp_Object object)
657 return (CHARSETP (object) ? Qt : Qnil);
661 static void
662 map_charset_for_dump (void (*c_function) (Lisp_Object, Lisp_Object),
663 Lisp_Object function, Lisp_Object arg,
664 unsigned int from, unsigned int to)
666 int from_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, from);
667 int to_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, to);
668 Lisp_Object range = Fcons (Qnil, Qnil);
669 int c, stop;
671 c = temp_charset_work->min_char;
672 stop = (temp_charset_work->max_char < 0x20000
673 ? temp_charset_work->max_char : 0xFFFF);
675 while (1)
677 int idx = GET_TEMP_CHARSET_WORK_ENCODER (c);
679 if (idx >= from_idx && idx <= to_idx)
681 if (NILP (XCAR (range)))
682 XSETCAR (range, make_number (c));
684 else if (! NILP (XCAR (range)))
686 XSETCDR (range, make_number (c - 1));
687 if (c_function)
688 (*c_function) (arg, range);
689 else
690 call2 (function, range, arg);
691 XSETCAR (range, Qnil);
693 if (c == stop)
695 if (c == temp_charset_work->max_char)
697 if (! NILP (XCAR (range)))
699 XSETCDR (range, make_number (c));
700 if (c_function)
701 (*c_function) (arg, range);
702 else
703 call2 (function, range, arg);
705 break;
707 c = 0x1FFFF;
708 stop = temp_charset_work->max_char;
710 c++;
714 void
715 map_charset_chars (void (*c_function)(Lisp_Object, Lisp_Object), Lisp_Object function,
716 Lisp_Object arg, struct charset *charset, unsigned from, unsigned to)
718 Lisp_Object range;
719 bool partial = (from > CHARSET_MIN_CODE (charset)
720 || to < CHARSET_MAX_CODE (charset));
722 if (CHARSET_METHOD (charset) == CHARSET_METHOD_OFFSET)
724 int from_idx = CODE_POINT_TO_INDEX (charset, from);
725 int to_idx = CODE_POINT_TO_INDEX (charset, to);
726 int from_c = from_idx + CHARSET_CODE_OFFSET (charset);
727 int to_c = to_idx + CHARSET_CODE_OFFSET (charset);
729 if (CHARSET_UNIFIED_P (charset))
731 if (! CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
732 load_charset (charset, 2);
733 if (CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
734 map_char_table_for_charset (c_function, function,
735 CHARSET_DEUNIFIER (charset), arg,
736 partial ? charset : NULL, from, to);
737 else
738 map_charset_for_dump (c_function, function, arg, from, to);
741 range = Fcons (make_number (from_c), make_number (to_c));
742 if (NILP (function))
743 (*c_function) (arg, range);
744 else
745 call2 (function, range, arg);
747 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
749 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
750 load_charset (charset, 2);
751 if (CHAR_TABLE_P (CHARSET_ENCODER (charset)))
752 map_char_table_for_charset (c_function, function,
753 CHARSET_ENCODER (charset), arg,
754 partial ? charset : NULL, from, to);
755 else
756 map_charset_for_dump (c_function, function, arg, from, to);
758 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_SUBSET)
760 Lisp_Object subset_info;
761 int offset;
763 subset_info = CHARSET_SUBSET (charset);
764 charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
765 offset = XINT (AREF (subset_info, 3));
766 from -= offset;
767 if (from < XFASTINT (AREF (subset_info, 1)))
768 from = XFASTINT (AREF (subset_info, 1));
769 to -= offset;
770 if (to > XFASTINT (AREF (subset_info, 2)))
771 to = XFASTINT (AREF (subset_info, 2));
772 map_charset_chars (c_function, function, arg, charset, from, to);
774 else /* i.e. CHARSET_METHOD_SUPERSET */
776 Lisp_Object parents;
778 for (parents = CHARSET_SUPERSET (charset); CONSP (parents);
779 parents = XCDR (parents))
781 int offset;
782 unsigned this_from, this_to;
784 charset = CHARSET_FROM_ID (XFASTINT (XCAR (XCAR (parents))));
785 offset = XINT (XCDR (XCAR (parents)));
786 this_from = from > offset ? from - offset : 0;
787 this_to = to > offset ? to - offset : 0;
788 if (this_from < CHARSET_MIN_CODE (charset))
789 this_from = CHARSET_MIN_CODE (charset);
790 if (this_to > CHARSET_MAX_CODE (charset))
791 this_to = CHARSET_MAX_CODE (charset);
792 map_charset_chars (c_function, function, arg, charset,
793 this_from, this_to);
798 DEFUN ("map-charset-chars", Fmap_charset_chars, Smap_charset_chars, 2, 5, 0,
799 doc: /* Call FUNCTION for all characters in CHARSET.
800 FUNCTION is called with an argument RANGE and the optional 3rd
801 argument ARG.
803 RANGE is a cons (FROM . TO), where FROM and TO indicate a range of
804 characters contained in CHARSET.
806 The optional 4th and 5th arguments FROM-CODE and TO-CODE specify the
807 range of code points (in CHARSET) of target characters. */)
808 (Lisp_Object function, Lisp_Object charset, Lisp_Object arg, Lisp_Object from_code, Lisp_Object to_code)
810 struct charset *cs;
811 unsigned from, to;
813 CHECK_CHARSET_GET_CHARSET (charset, cs);
814 if (NILP (from_code))
815 from = CHARSET_MIN_CODE (cs);
816 else
818 from = XINT (from_code);
819 if (from < CHARSET_MIN_CODE (cs))
820 from = CHARSET_MIN_CODE (cs);
822 if (NILP (to_code))
823 to = CHARSET_MAX_CODE (cs);
824 else
826 to = XINT (to_code);
827 if (to > CHARSET_MAX_CODE (cs))
828 to = CHARSET_MAX_CODE (cs);
830 map_charset_chars (NULL, function, arg, cs, from, to);
831 return Qnil;
835 /* Define a charset according to the arguments. The Nth argument is
836 the Nth attribute of the charset (the last attribute `charset-id'
837 is not included). See the docstring of `define-charset' for the
838 detail. */
840 DEFUN ("define-charset-internal", Fdefine_charset_internal,
841 Sdefine_charset_internal, charset_arg_max, MANY, 0,
842 doc: /* For internal use only.
843 usage: (define-charset-internal ...) */)
844 (ptrdiff_t nargs, Lisp_Object *args)
846 /* Charset attr vector. */
847 Lisp_Object attrs;
848 Lisp_Object val;
849 EMACS_UINT hash_code;
850 struct Lisp_Hash_Table *hash_table = XHASH_TABLE (Vcharset_hash_table);
851 int i, j;
852 struct charset charset;
853 int id;
854 int dimension;
855 bool new_definition_p;
856 int nchars;
858 if (nargs != charset_arg_max)
859 return Fsignal (Qwrong_number_of_arguments,
860 Fcons (intern ("define-charset-internal"),
861 make_number (nargs)));
863 attrs = Fmake_vector (make_number (charset_attr_max), Qnil);
865 CHECK_SYMBOL (args[charset_arg_name]);
866 ASET (attrs, charset_name, args[charset_arg_name]);
868 val = args[charset_arg_code_space];
869 for (i = 0, dimension = 0, nchars = 1; ; i++)
871 Lisp_Object min_byte_obj, max_byte_obj;
872 int min_byte, max_byte;
874 min_byte_obj = Faref (val, make_number (i * 2));
875 max_byte_obj = Faref (val, make_number (i * 2 + 1));
876 CHECK_RANGED_INTEGER (min_byte_obj, 0, 255);
877 min_byte = XINT (min_byte_obj);
878 CHECK_RANGED_INTEGER (max_byte_obj, min_byte, 255);
879 max_byte = XINT (max_byte_obj);
880 charset.code_space[i * 4] = min_byte;
881 charset.code_space[i * 4 + 1] = max_byte;
882 charset.code_space[i * 4 + 2] = max_byte - min_byte + 1;
883 if (max_byte > 0)
884 dimension = i + 1;
885 if (i == 3)
886 break;
887 nchars *= charset.code_space[i * 4 + 2];
888 charset.code_space[i * 4 + 3] = nchars;
891 val = args[charset_arg_dimension];
892 if (NILP (val))
893 charset.dimension = dimension;
894 else
896 CHECK_RANGED_INTEGER (val, 1, 4);
897 charset.dimension = XINT (val);
900 charset.code_linear_p
901 = (charset.dimension == 1
902 || (charset.code_space[2] == 256
903 && (charset.dimension == 2
904 || (charset.code_space[6] == 256
905 && (charset.dimension == 3
906 || charset.code_space[10] == 256)))));
908 if (! charset.code_linear_p)
910 charset.code_space_mask = xzalloc (256);
911 for (i = 0; i < 4; i++)
912 for (j = charset.code_space[i * 4]; j <= charset.code_space[i * 4 + 1];
913 j++)
914 charset.code_space_mask[j] |= (1 << i);
917 charset.iso_chars_96 = charset.code_space[2] == 96;
919 charset.min_code = (charset.code_space[0]
920 | (charset.code_space[4] << 8)
921 | (charset.code_space[8] << 16)
922 | ((unsigned) charset.code_space[12] << 24));
923 charset.max_code = (charset.code_space[1]
924 | (charset.code_space[5] << 8)
925 | (charset.code_space[9] << 16)
926 | ((unsigned) charset.code_space[13] << 24));
927 charset.char_index_offset = 0;
929 val = args[charset_arg_min_code];
930 if (! NILP (val))
932 unsigned code = cons_to_unsigned (val, UINT_MAX);
934 if (code < charset.min_code
935 || code > charset.max_code)
936 args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
937 make_fixnum_or_float (charset.max_code), val);
938 charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
939 charset.min_code = code;
942 val = args[charset_arg_max_code];
943 if (! NILP (val))
945 unsigned code = cons_to_unsigned (val, UINT_MAX);
947 if (code < charset.min_code
948 || code > charset.max_code)
949 args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
950 make_fixnum_or_float (charset.max_code), val);
951 charset.max_code = code;
954 charset.compact_codes_p = charset.max_code < 0x10000;
956 val = args[charset_arg_invalid_code];
957 if (NILP (val))
959 if (charset.min_code > 0)
960 charset.invalid_code = 0;
961 else
963 if (charset.max_code < UINT_MAX)
964 charset.invalid_code = charset.max_code + 1;
965 else
966 error ("Attribute :invalid-code must be specified");
969 else
970 charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
972 val = args[charset_arg_iso_final];
973 if (NILP (val))
974 charset.iso_final = -1;
975 else
977 CHECK_NUMBER (val);
978 if (XINT (val) < '0' || XINT (val) > 127)
979 error ("Invalid iso-final-char: %"pI"d", XINT (val));
980 charset.iso_final = XINT (val);
983 val = args[charset_arg_iso_revision];
984 if (NILP (val))
985 charset.iso_revision = -1;
986 else
988 CHECK_RANGED_INTEGER (val, -1, 63);
989 charset.iso_revision = XINT (val);
992 val = args[charset_arg_emacs_mule_id];
993 if (NILP (val))
994 charset.emacs_mule_id = -1;
995 else
997 CHECK_NATNUM (val);
998 if ((XINT (val) > 0 && XINT (val) <= 128) || XINT (val) >= 256)
999 error ("Invalid emacs-mule-id: %"pI"d", XINT (val));
1000 charset.emacs_mule_id = XINT (val);
1003 charset.ascii_compatible_p = ! NILP (args[charset_arg_ascii_compatible_p]);
1005 charset.supplementary_p = ! NILP (args[charset_arg_supplementary_p]);
1007 charset.unified_p = 0;
1009 memset (charset.fast_map, 0, sizeof (charset.fast_map));
1011 if (! NILP (args[charset_arg_code_offset]))
1013 val = args[charset_arg_code_offset];
1014 CHECK_CHARACTER (val);
1016 charset.method = CHARSET_METHOD_OFFSET;
1017 charset.code_offset = XINT (val);
1019 i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
1020 if (MAX_CHAR - charset.code_offset < i)
1021 error ("Unsupported max char: %d", charset.max_char);
1022 charset.max_char = i + charset.code_offset;
1023 i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
1024 charset.min_char = i + charset.code_offset;
1026 i = (charset.min_char >> 7) << 7;
1027 for (; i < 0x10000 && i <= charset.max_char; i += 128)
1028 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1029 i = (i >> 12) << 12;
1030 for (; i <= charset.max_char; i += 0x1000)
1031 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1032 if (charset.code_offset == 0 && charset.max_char >= 0x80)
1033 charset.ascii_compatible_p = 1;
1035 else if (! NILP (args[charset_arg_map]))
1037 val = args[charset_arg_map];
1038 ASET (attrs, charset_map, val);
1039 charset.method = CHARSET_METHOD_MAP;
1041 else if (! NILP (args[charset_arg_subset]))
1043 Lisp_Object parent;
1044 Lisp_Object parent_min_code, parent_max_code, parent_code_offset;
1045 struct charset *parent_charset;
1047 val = args[charset_arg_subset];
1048 parent = Fcar (val);
1049 CHECK_CHARSET_GET_CHARSET (parent, parent_charset);
1050 parent_min_code = Fnth (make_number (1), val);
1051 CHECK_NATNUM (parent_min_code);
1052 parent_max_code = Fnth (make_number (2), val);
1053 CHECK_NATNUM (parent_max_code);
1054 parent_code_offset = Fnth (make_number (3), val);
1055 CHECK_NUMBER (parent_code_offset);
1056 val = make_uninit_vector (4);
1057 ASET (val, 0, make_number (parent_charset->id));
1058 ASET (val, 1, parent_min_code);
1059 ASET (val, 2, parent_max_code);
1060 ASET (val, 3, parent_code_offset);
1061 ASET (attrs, charset_subset, val);
1063 charset.method = CHARSET_METHOD_SUBSET;
1064 /* Here, we just copy the parent's fast_map. It's not accurate,
1065 but at least it works for quickly detecting which character
1066 DOESN'T belong to this charset. */
1067 for (i = 0; i < 190; i++)
1068 charset.fast_map[i] = parent_charset->fast_map[i];
1070 /* We also copy these for parents. */
1071 charset.min_char = parent_charset->min_char;
1072 charset.max_char = parent_charset->max_char;
1074 else if (! NILP (args[charset_arg_superset]))
1076 val = args[charset_arg_superset];
1077 charset.method = CHARSET_METHOD_SUPERSET;
1078 val = Fcopy_sequence (val);
1079 ASET (attrs, charset_superset, val);
1081 charset.min_char = MAX_CHAR;
1082 charset.max_char = 0;
1083 for (; ! NILP (val); val = Fcdr (val))
1085 Lisp_Object elt, car_part, cdr_part;
1086 int this_id, offset;
1087 struct charset *this_charset;
1089 elt = Fcar (val);
1090 if (CONSP (elt))
1092 car_part = XCAR (elt);
1093 cdr_part = XCDR (elt);
1094 CHECK_CHARSET_GET_ID (car_part, this_id);
1095 CHECK_TYPE_RANGED_INTEGER (int, cdr_part);
1096 offset = XINT (cdr_part);
1098 else
1100 CHECK_CHARSET_GET_ID (elt, this_id);
1101 offset = 0;
1103 XSETCAR (val, Fcons (make_number (this_id), make_number (offset)));
1105 this_charset = CHARSET_FROM_ID (this_id);
1106 if (charset.min_char > this_charset->min_char)
1107 charset.min_char = this_charset->min_char;
1108 if (charset.max_char < this_charset->max_char)
1109 charset.max_char = this_charset->max_char;
1110 for (i = 0; i < 190; i++)
1111 charset.fast_map[i] |= this_charset->fast_map[i];
1114 else
1115 error ("None of :code-offset, :map, :parents are specified");
1117 val = args[charset_arg_unify_map];
1118 if (! NILP (val) && !STRINGP (val))
1119 CHECK_VECTOR (val);
1120 ASET (attrs, charset_unify_map, val);
1122 CHECK_LIST (args[charset_arg_plist]);
1123 ASET (attrs, charset_plist, args[charset_arg_plist]);
1125 charset.hash_index = hash_lookup (hash_table, args[charset_arg_name],
1126 &hash_code);
1127 if (charset.hash_index >= 0)
1129 new_definition_p = 0;
1130 id = XFASTINT (CHARSET_SYMBOL_ID (args[charset_arg_name]));
1131 set_hash_value_slot (hash_table, charset.hash_index, attrs);
1133 else
1135 charset.hash_index = hash_put (hash_table, args[charset_arg_name], attrs,
1136 hash_code);
1137 if (charset_table_used == charset_table_size)
1139 /* Ensure that charset IDs fit into 'int' as well as into the
1140 restriction imposed by fixnums. Although the 'int' restriction
1141 could be removed, too much other code would need altering; for
1142 example, the IDs are stuffed into struct
1143 coding_system.charbuf[i] entries, which are 'int'. */
1144 int old_size = charset_table_size;
1145 ptrdiff_t new_size = old_size;
1146 struct charset *new_table =
1147 xpalloc (0, &new_size, 1,
1148 min (INT_MAX, MOST_POSITIVE_FIXNUM),
1149 sizeof *charset_table);
1150 memcpy (new_table, charset_table, old_size * sizeof *new_table);
1151 charset_table = new_table;
1152 charset_table_size = new_size;
1153 /* FIXME: This leaks memory, as the old charset_table becomes
1154 unreachable. If the old charset table is charset_table_init
1155 then this leak is intentional; otherwise, it's unclear.
1156 If the latter memory leak is intentional, a
1157 comment should be added to explain this. If not, the old
1158 charset_table should be freed, by passing it as the 1st argument
1159 to xpalloc and removing the memcpy. */
1161 id = charset_table_used++;
1162 new_definition_p = 1;
1165 ASET (attrs, charset_id, make_number (id));
1166 charset.id = id;
1167 charset_table[id] = charset;
1169 if (charset.method == CHARSET_METHOD_MAP)
1171 load_charset (&charset, 0);
1172 charset_table[id] = charset;
1175 if (charset.iso_final >= 0)
1177 ISO_CHARSET_TABLE (charset.dimension, charset.iso_chars_96,
1178 charset.iso_final) = id;
1179 if (new_definition_p)
1180 Viso_2022_charset_list = nconc2 (Viso_2022_charset_list,
1181 list1 (make_number (id)));
1182 if (ISO_CHARSET_TABLE (1, 0, 'J') == id)
1183 charset_jisx0201_roman = id;
1184 else if (ISO_CHARSET_TABLE (2, 0, '@') == id)
1185 charset_jisx0208_1978 = id;
1186 else if (ISO_CHARSET_TABLE (2, 0, 'B') == id)
1187 charset_jisx0208 = id;
1188 else if (ISO_CHARSET_TABLE (2, 0, 'C') == id)
1189 charset_ksc5601 = id;
1192 if (charset.emacs_mule_id >= 0)
1194 emacs_mule_charset[charset.emacs_mule_id] = id;
1195 if (charset.emacs_mule_id < 0xA0)
1196 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 1;
1197 else
1198 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 2;
1199 if (new_definition_p)
1200 Vemacs_mule_charset_list = nconc2 (Vemacs_mule_charset_list,
1201 list1 (make_number (id)));
1204 if (new_definition_p)
1206 Vcharset_list = Fcons (args[charset_arg_name], Vcharset_list);
1207 if (charset.supplementary_p)
1208 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1209 list1 (make_number (id)));
1210 else
1212 Lisp_Object tail;
1214 for (tail = Vcharset_ordered_list; CONSP (tail); tail = XCDR (tail))
1216 struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (tail)));
1218 if (cs->supplementary_p)
1219 break;
1221 if (EQ (tail, Vcharset_ordered_list))
1222 Vcharset_ordered_list = Fcons (make_number (id),
1223 Vcharset_ordered_list);
1224 else if (NILP (tail))
1225 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1226 list1 (make_number (id)));
1227 else
1229 val = Fcons (XCAR (tail), XCDR (tail));
1230 XSETCDR (tail, val);
1231 XSETCAR (tail, make_number (id));
1234 charset_ordered_list_tick++;
1237 return Qnil;
1241 /* Same as Fdefine_charset_internal but arguments are more convenient
1242 to call from C (typically in syms_of_charset). This can define a
1243 charset of `offset' method only. Return the ID of the new
1244 charset. */
1246 static int
1247 define_charset_internal (Lisp_Object name,
1248 int dimension,
1249 const char *code_space_chars,
1250 unsigned min_code, unsigned max_code,
1251 int iso_final, int iso_revision, int emacs_mule_id,
1252 bool ascii_compatible, bool supplementary,
1253 int code_offset)
1255 const unsigned char *code_space = (const unsigned char *) code_space_chars;
1256 Lisp_Object args[charset_arg_max];
1257 Lisp_Object val;
1258 int i;
1260 args[charset_arg_name] = name;
1261 args[charset_arg_dimension] = make_number (dimension);
1262 val = make_uninit_vector (8);
1263 for (i = 0; i < 8; i++)
1264 ASET (val, i, make_number (code_space[i]));
1265 args[charset_arg_code_space] = val;
1266 args[charset_arg_min_code] = make_number (min_code);
1267 args[charset_arg_max_code] = make_number (max_code);
1268 args[charset_arg_iso_final]
1269 = (iso_final < 0 ? Qnil : make_number (iso_final));
1270 args[charset_arg_iso_revision] = make_number (iso_revision);
1271 args[charset_arg_emacs_mule_id]
1272 = (emacs_mule_id < 0 ? Qnil : make_number (emacs_mule_id));
1273 args[charset_arg_ascii_compatible_p] = ascii_compatible ? Qt : Qnil;
1274 args[charset_arg_supplementary_p] = supplementary ? Qt : Qnil;
1275 args[charset_arg_invalid_code] = Qnil;
1276 args[charset_arg_code_offset] = make_number (code_offset);
1277 args[charset_arg_map] = Qnil;
1278 args[charset_arg_subset] = Qnil;
1279 args[charset_arg_superset] = Qnil;
1280 args[charset_arg_unify_map] = Qnil;
1282 args[charset_arg_plist] =
1283 listn (CONSTYPE_HEAP, 14,
1284 intern_c_string (":name"),
1285 args[charset_arg_name],
1286 intern_c_string (":dimension"),
1287 args[charset_arg_dimension],
1288 intern_c_string (":code-space"),
1289 args[charset_arg_code_space],
1290 intern_c_string (":iso-final-char"),
1291 args[charset_arg_iso_final],
1292 intern_c_string (":emacs-mule-id"),
1293 args[charset_arg_emacs_mule_id],
1294 intern_c_string (":ascii-compatible-p"),
1295 args[charset_arg_ascii_compatible_p],
1296 intern_c_string (":code-offset"),
1297 args[charset_arg_code_offset]);
1298 Fdefine_charset_internal (charset_arg_max, args);
1300 return XINT (CHARSET_SYMBOL_ID (name));
1304 DEFUN ("define-charset-alias", Fdefine_charset_alias,
1305 Sdefine_charset_alias, 2, 2, 0,
1306 doc: /* Define ALIAS as an alias for charset CHARSET. */)
1307 (Lisp_Object alias, Lisp_Object charset)
1309 Lisp_Object attr;
1311 CHECK_CHARSET_GET_ATTR (charset, attr);
1312 Fputhash (alias, attr, Vcharset_hash_table);
1313 Vcharset_list = Fcons (alias, Vcharset_list);
1314 return Qnil;
1318 DEFUN ("charset-plist", Fcharset_plist, Scharset_plist, 1, 1, 0,
1319 doc: /* Return the property list of CHARSET. */)
1320 (Lisp_Object charset)
1322 Lisp_Object attrs;
1324 CHECK_CHARSET_GET_ATTR (charset, attrs);
1325 return CHARSET_ATTR_PLIST (attrs);
1329 DEFUN ("set-charset-plist", Fset_charset_plist, Sset_charset_plist, 2, 2, 0,
1330 doc: /* Set CHARSET's property list to PLIST. */)
1331 (Lisp_Object charset, Lisp_Object plist)
1333 Lisp_Object attrs;
1335 CHECK_CHARSET_GET_ATTR (charset, attrs);
1336 ASET (attrs, charset_plist, plist);
1337 return plist;
1341 DEFUN ("unify-charset", Funify_charset, Sunify_charset, 1, 3, 0,
1342 doc: /* Unify characters of CHARSET with Unicode.
1343 This means reading the relevant file and installing the table defined
1344 by CHARSET's `:unify-map' property.
1346 Optional second arg UNIFY-MAP is a file name string or a vector. It has
1347 the same meaning as the `:unify-map' attribute in the function
1348 `define-charset' (which see).
1350 Optional third argument DEUNIFY, if non-nil, means to de-unify CHARSET. */)
1351 (Lisp_Object charset, Lisp_Object unify_map, Lisp_Object deunify)
1353 int id;
1354 struct charset *cs;
1356 CHECK_CHARSET_GET_ID (charset, id);
1357 cs = CHARSET_FROM_ID (id);
1358 if (NILP (deunify)
1359 ? CHARSET_UNIFIED_P (cs) && ! NILP (CHARSET_DEUNIFIER (cs))
1360 : ! CHARSET_UNIFIED_P (cs))
1361 return Qnil;
1363 CHARSET_UNIFIED_P (cs) = 0;
1364 if (NILP (deunify))
1366 if (CHARSET_METHOD (cs) != CHARSET_METHOD_OFFSET
1367 || CHARSET_CODE_OFFSET (cs) < 0x110000)
1368 error ("Can't unify charset: %s", SDATA (SYMBOL_NAME (charset)));
1369 if (NILP (unify_map))
1370 unify_map = CHARSET_UNIFY_MAP (cs);
1371 else
1373 if (! STRINGP (unify_map) && ! VECTORP (unify_map))
1374 signal_error ("Bad unify-map", unify_map);
1375 set_charset_attr (cs, charset_unify_map, unify_map);
1377 if (NILP (Vchar_unify_table))
1378 Vchar_unify_table = Fmake_char_table (Qnil, Qnil);
1379 char_table_set_range (Vchar_unify_table,
1380 cs->min_char, cs->max_char, charset);
1381 CHARSET_UNIFIED_P (cs) = 1;
1383 else if (CHAR_TABLE_P (Vchar_unify_table))
1385 unsigned min_code = CHARSET_MIN_CODE (cs);
1386 unsigned max_code = CHARSET_MAX_CODE (cs);
1387 int min_char = DECODE_CHAR (cs, min_code);
1388 int max_char = DECODE_CHAR (cs, max_code);
1390 char_table_set_range (Vchar_unify_table, min_char, max_char, Qnil);
1393 return Qnil;
1396 /* Check that DIMENSION, CHARS, and FINAL_CHAR specify a valid ISO charset.
1397 Return true if it's a 96-character set, false if 94. */
1399 static bool
1400 check_iso_charset_parameter (Lisp_Object dimension, Lisp_Object chars,
1401 Lisp_Object final_char)
1403 CHECK_NUMBER (dimension);
1404 CHECK_NUMBER (chars);
1405 CHECK_CHARACTER (final_char);
1407 if (! (1 <= XINT (dimension) && XINT (dimension) <= 3))
1408 error ("Invalid DIMENSION %"pI"d, it should be 1, 2, or 3",
1409 XINT (dimension));
1411 bool chars_flag = XINT (chars) == 96;
1412 if (! (chars_flag || XINT (chars) == 94))
1413 error ("Invalid CHARS %"pI"d, it should be 94 or 96", XINT (chars));
1415 int final_ch = XFASTINT (final_char);
1416 if (! ('0' <= final_ch && final_ch <= '~'))
1417 error ("Invalid FINAL-CHAR '%c', it should be '0'..'~'", final_ch);
1419 return chars_flag;
1422 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
1423 Sget_unused_iso_final_char, 2, 2, 0,
1424 doc: /*
1425 Return an unused ISO final char for a charset of DIMENSION and CHARS.
1426 DIMENSION is the number of bytes to represent a character: 1 or 2.
1427 CHARS is the number of characters in a dimension: 94 or 96.
1429 This final char is for private use, thus the range is `0' (48) .. `?' (63).
1430 If there's no unused final char for the specified kind of charset,
1431 return nil. */)
1432 (Lisp_Object dimension, Lisp_Object chars)
1434 bool chars_flag = check_iso_charset_parameter (dimension, chars,
1435 make_number ('0'));
1436 for (int final_char = '0'; final_char <= '?'; final_char++)
1437 if (ISO_CHARSET_TABLE (XINT (dimension), chars_flag, final_char) < 0)
1438 return make_number (final_char);
1439 return Qnil;
1443 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
1444 4, 4, 0,
1445 doc: /* Declare an equivalent charset for ISO-2022 decoding.
1447 On decoding by an ISO-2022 base coding system, when a charset
1448 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
1449 if CHARSET is designated instead. */)
1450 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char, Lisp_Object charset)
1452 int id;
1454 CHECK_CHARSET_GET_ID (charset, id);
1455 bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
1456 ISO_CHARSET_TABLE (XINT (dimension), chars_flag, XFASTINT (final_char)) = id;
1457 return Qnil;
1461 /* Return information about charsets in the text at PTR of NBYTES
1462 bytes, which are NCHARS characters. The value is:
1464 0: Each character is represented by one byte. This is always
1465 true for a unibyte string. For a multibyte string, true if
1466 it contains only ASCII characters.
1468 1: No charsets other than ascii, control-1, and latin-1 are
1469 found.
1471 2: Otherwise.
1475 string_xstring_p (Lisp_Object string)
1477 const unsigned char *p = SDATA (string);
1478 const unsigned char *endp = p + SBYTES (string);
1480 if (SCHARS (string) == SBYTES (string))
1481 return 0;
1483 while (p < endp)
1485 int c = STRING_CHAR_ADVANCE (p);
1487 if (c >= 0x100)
1488 return 2;
1490 return 1;
1494 /* Find charsets in the string at PTR of NCHARS and NBYTES.
1496 CHARSETS is a vector. If Nth element is non-nil, it means the
1497 charset whose id is N is already found.
1499 It may lookup a translation table TABLE if supplied. */
1501 static void
1502 find_charsets_in_text (const unsigned char *ptr, ptrdiff_t nchars,
1503 ptrdiff_t nbytes, Lisp_Object charsets,
1504 Lisp_Object table, bool multibyte)
1506 const unsigned char *pend = ptr + nbytes;
1508 if (nchars == nbytes)
1510 if (multibyte)
1511 ASET (charsets, charset_ascii, Qt);
1512 else
1513 while (ptr < pend)
1515 int c = *ptr++;
1517 if (!NILP (table))
1518 c = translate_char (table, c);
1519 if (ASCII_CHAR_P (c))
1520 ASET (charsets, charset_ascii, Qt);
1521 else
1522 ASET (charsets, charset_eight_bit, Qt);
1525 else
1527 while (ptr < pend)
1529 int c = STRING_CHAR_ADVANCE (ptr);
1530 struct charset *charset;
1532 if (!NILP (table))
1533 c = translate_char (table, c);
1534 charset = CHAR_CHARSET (c);
1535 ASET (charsets, CHARSET_ID (charset), Qt);
1540 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
1541 2, 3, 0,
1542 doc: /* Return a list of charsets in the region between BEG and END.
1543 BEG and END are buffer positions.
1544 Optional arg TABLE if non-nil is a translation table to look up.
1546 If the current buffer is unibyte, the returned list may contain
1547 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1548 (Lisp_Object beg, Lisp_Object end, Lisp_Object table)
1550 Lisp_Object charsets;
1551 ptrdiff_t from, from_byte, to, stop, stop_byte;
1552 int i;
1553 Lisp_Object val;
1554 bool multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1556 validate_region (&beg, &end);
1557 from = XFASTINT (beg);
1558 stop = to = XFASTINT (end);
1560 if (from < GPT && GPT < to)
1562 stop = GPT;
1563 stop_byte = GPT_BYTE;
1565 else
1566 stop_byte = CHAR_TO_BYTE (stop);
1568 from_byte = CHAR_TO_BYTE (from);
1570 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
1571 while (1)
1573 find_charsets_in_text (BYTE_POS_ADDR (from_byte), stop - from,
1574 stop_byte - from_byte, charsets, table,
1575 multibyte);
1576 if (stop < to)
1578 from = stop, from_byte = stop_byte;
1579 stop = to, stop_byte = CHAR_TO_BYTE (stop);
1581 else
1582 break;
1585 val = Qnil;
1586 for (i = charset_table_used - 1; i >= 0; i--)
1587 if (!NILP (AREF (charsets, i)))
1588 val = Fcons (CHARSET_NAME (charset_table + i), val);
1589 return val;
1592 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1593 1, 2, 0,
1594 doc: /* Return a list of charsets in STR.
1595 Optional arg TABLE if non-nil is a translation table to look up.
1597 If STR is unibyte, the returned list may contain
1598 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1599 (Lisp_Object str, Lisp_Object table)
1601 Lisp_Object charsets;
1602 int i;
1603 Lisp_Object val;
1605 CHECK_STRING (str);
1607 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
1608 find_charsets_in_text (SDATA (str), SCHARS (str), SBYTES (str),
1609 charsets, table,
1610 STRING_MULTIBYTE (str));
1611 val = Qnil;
1612 for (i = charset_table_used - 1; i >= 0; i--)
1613 if (!NILP (AREF (charsets, i)))
1614 val = Fcons (CHARSET_NAME (charset_table + i), val);
1615 return val;
1620 /* Return a unified character code for C (>= 0x110000). VAL is a
1621 value of Vchar_unify_table for C; i.e. it is nil, an integer, or a
1622 charset symbol. */
1623 static int
1624 maybe_unify_char (int c, Lisp_Object val)
1626 struct charset *charset;
1628 if (INTEGERP (val))
1629 return XFASTINT (val);
1630 if (NILP (val))
1631 return c;
1633 CHECK_CHARSET_GET_CHARSET (val, charset);
1634 #ifdef REL_ALLOC
1635 /* The call to load_charset below can allocate memory, which screws
1636 callers of this function through STRING_CHAR_* macros that hold C
1637 pointers to buffer text, if REL_ALLOC is used. */
1638 r_alloc_inhibit_buffer_relocation (1);
1639 #endif
1640 load_charset (charset, 1);
1641 if (! inhibit_load_charset_map)
1643 val = CHAR_TABLE_REF (Vchar_unify_table, c);
1644 if (! NILP (val))
1645 c = XFASTINT (val);
1647 else
1649 int code_index = c - CHARSET_CODE_OFFSET (charset);
1650 int unified = GET_TEMP_CHARSET_WORK_DECODER (code_index);
1652 if (unified > 0)
1653 c = unified;
1655 #ifdef REL_ALLOC
1656 r_alloc_inhibit_buffer_relocation (0);
1657 #endif
1658 return c;
1662 /* Return a character corresponding to the code-point CODE of
1663 CHARSET. */
1666 decode_char (struct charset *charset, unsigned int code)
1668 int c, char_index;
1669 enum charset_method method = CHARSET_METHOD (charset);
1671 if (code < CHARSET_MIN_CODE (charset) || code > CHARSET_MAX_CODE (charset))
1672 return -1;
1674 if (method == CHARSET_METHOD_SUBSET)
1676 Lisp_Object subset_info;
1678 subset_info = CHARSET_SUBSET (charset);
1679 charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
1680 code -= XINT (AREF (subset_info, 3));
1681 if (code < XFASTINT (AREF (subset_info, 1))
1682 || code > XFASTINT (AREF (subset_info, 2)))
1683 c = -1;
1684 else
1685 c = DECODE_CHAR (charset, code);
1687 else if (method == CHARSET_METHOD_SUPERSET)
1689 Lisp_Object parents;
1691 parents = CHARSET_SUPERSET (charset);
1692 c = -1;
1693 for (; CONSP (parents); parents = XCDR (parents))
1695 int id = XINT (XCAR (XCAR (parents)));
1696 int code_offset = XINT (XCDR (XCAR (parents)));
1697 unsigned this_code = code - code_offset;
1699 charset = CHARSET_FROM_ID (id);
1700 if ((c = DECODE_CHAR (charset, this_code)) >= 0)
1701 break;
1704 else
1706 char_index = CODE_POINT_TO_INDEX (charset, code);
1707 if (char_index < 0)
1708 return -1;
1710 if (method == CHARSET_METHOD_MAP)
1712 Lisp_Object decoder;
1714 decoder = CHARSET_DECODER (charset);
1715 if (! VECTORP (decoder))
1717 load_charset (charset, 1);
1718 decoder = CHARSET_DECODER (charset);
1720 if (VECTORP (decoder))
1721 c = XINT (AREF (decoder, char_index));
1722 else
1723 c = GET_TEMP_CHARSET_WORK_DECODER (char_index);
1725 else /* method == CHARSET_METHOD_OFFSET */
1727 c = char_index + CHARSET_CODE_OFFSET (charset);
1728 if (CHARSET_UNIFIED_P (charset)
1729 && MAX_UNICODE_CHAR < c && c <= MAX_5_BYTE_CHAR)
1731 /* Unify C with a Unicode character if possible. */
1732 Lisp_Object val = CHAR_TABLE_REF (Vchar_unify_table, c);
1733 c = maybe_unify_char (c, val);
1738 return c;
1741 /* Variable used temporarily by the macro ENCODE_CHAR. */
1742 Lisp_Object charset_work;
1744 /* Return a code-point of C in CHARSET. If C doesn't belong to
1745 CHARSET, return CHARSET_INVALID_CODE (CHARSET). If STRICT is true,
1746 use CHARSET's strict_max_char instead of max_char. */
1748 unsigned
1749 encode_char (struct charset *charset, int c)
1751 unsigned code;
1752 enum charset_method method = CHARSET_METHOD (charset);
1754 if (CHARSET_UNIFIED_P (charset))
1756 Lisp_Object deunifier;
1757 int code_index = -1;
1759 deunifier = CHARSET_DEUNIFIER (charset);
1760 if (! CHAR_TABLE_P (deunifier))
1762 load_charset (charset, 2);
1763 deunifier = CHARSET_DEUNIFIER (charset);
1765 if (CHAR_TABLE_P (deunifier))
1767 Lisp_Object deunified = CHAR_TABLE_REF (deunifier, c);
1769 if (INTEGERP (deunified))
1770 code_index = XINT (deunified);
1772 else
1774 code_index = GET_TEMP_CHARSET_WORK_ENCODER (c);
1776 if (code_index >= 0)
1777 c = CHARSET_CODE_OFFSET (charset) + code_index;
1780 if (method == CHARSET_METHOD_SUBSET)
1782 Lisp_Object subset_info;
1783 struct charset *this_charset;
1785 subset_info = CHARSET_SUBSET (charset);
1786 this_charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
1787 code = ENCODE_CHAR (this_charset, c);
1788 if (code == CHARSET_INVALID_CODE (this_charset)
1789 || code < XFASTINT (AREF (subset_info, 1))
1790 || code > XFASTINT (AREF (subset_info, 2)))
1791 return CHARSET_INVALID_CODE (charset);
1792 code += XINT (AREF (subset_info, 3));
1793 return code;
1796 if (method == CHARSET_METHOD_SUPERSET)
1798 Lisp_Object parents;
1800 parents = CHARSET_SUPERSET (charset);
1801 for (; CONSP (parents); parents = XCDR (parents))
1803 int id = XINT (XCAR (XCAR (parents)));
1804 int code_offset = XINT (XCDR (XCAR (parents)));
1805 struct charset *this_charset = CHARSET_FROM_ID (id);
1807 code = ENCODE_CHAR (this_charset, c);
1808 if (code != CHARSET_INVALID_CODE (this_charset))
1809 return code + code_offset;
1811 return CHARSET_INVALID_CODE (charset);
1814 if (! CHARSET_FAST_MAP_REF ((c), charset->fast_map)
1815 || c < CHARSET_MIN_CHAR (charset) || c > CHARSET_MAX_CHAR (charset))
1816 return CHARSET_INVALID_CODE (charset);
1818 if (method == CHARSET_METHOD_MAP)
1820 Lisp_Object encoder;
1821 Lisp_Object val;
1823 encoder = CHARSET_ENCODER (charset);
1824 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
1826 load_charset (charset, 2);
1827 encoder = CHARSET_ENCODER (charset);
1829 if (CHAR_TABLE_P (encoder))
1831 val = CHAR_TABLE_REF (encoder, c);
1832 if (NILP (val))
1833 return CHARSET_INVALID_CODE (charset);
1834 code = XINT (val);
1835 if (! CHARSET_COMPACT_CODES_P (charset))
1836 code = INDEX_TO_CODE_POINT (charset, code);
1838 else
1840 code = GET_TEMP_CHARSET_WORK_ENCODER (c);
1841 code = INDEX_TO_CODE_POINT (charset, code);
1844 else /* method == CHARSET_METHOD_OFFSET */
1846 unsigned code_index = c - CHARSET_CODE_OFFSET (charset);
1848 code = INDEX_TO_CODE_POINT (charset, code_index);
1851 return code;
1855 DEFUN ("decode-char", Fdecode_char, Sdecode_char, 2, 3, 0,
1856 doc: /* Decode the pair of CHARSET and CODE-POINT into a character.
1857 Return nil if CODE-POINT is not valid in CHARSET.
1859 CODE-POINT may be a cons (HIGHER-16-BIT-VALUE . LOWER-16-BIT-VALUE). */)
1860 (Lisp_Object charset, Lisp_Object code_point, Lisp_Object restriction)
1862 int c, id;
1863 unsigned code;
1864 struct charset *charsetp;
1866 CHECK_CHARSET_GET_ID (charset, id);
1867 code = cons_to_unsigned (code_point, UINT_MAX);
1868 charsetp = CHARSET_FROM_ID (id);
1869 c = DECODE_CHAR (charsetp, code);
1870 return (c >= 0 ? make_number (c) : Qnil);
1874 DEFUN ("encode-char", Fencode_char, Sencode_char, 2, 3, 0,
1875 doc: /* Encode the character CH into a code-point of CHARSET.
1876 Return nil if CHARSET doesn't include CH. */)
1877 (Lisp_Object ch, Lisp_Object charset, Lisp_Object restriction)
1879 int c, id;
1880 unsigned code;
1881 struct charset *charsetp;
1883 CHECK_CHARSET_GET_ID (charset, id);
1884 CHECK_CHARACTER (ch);
1885 c = XFASTINT (ch);
1886 charsetp = CHARSET_FROM_ID (id);
1887 code = ENCODE_CHAR (charsetp, c);
1888 if (code == CHARSET_INVALID_CODE (charsetp))
1889 return Qnil;
1890 return INTEGER_TO_CONS (code);
1894 DEFUN ("make-char", Fmake_char, Smake_char, 1, 5, 0,
1895 doc:
1896 /* Return a character of CHARSET whose position codes are CODEn.
1898 CODE1 through CODE4 are optional, but if you don't supply sufficient
1899 position codes, it is assumed that the minimum code in each dimension
1900 is specified. */)
1901 (Lisp_Object charset, Lisp_Object code1, Lisp_Object code2, Lisp_Object code3, Lisp_Object code4)
1903 int id, dimension;
1904 struct charset *charsetp;
1905 unsigned code;
1906 int c;
1908 CHECK_CHARSET_GET_ID (charset, id);
1909 charsetp = CHARSET_FROM_ID (id);
1911 dimension = CHARSET_DIMENSION (charsetp);
1912 if (NILP (code1))
1913 code = (CHARSET_ASCII_COMPATIBLE_P (charsetp)
1914 ? 0 : CHARSET_MIN_CODE (charsetp));
1915 else
1917 CHECK_NATNUM (code1);
1918 if (XFASTINT (code1) >= 0x100)
1919 args_out_of_range (make_number (0xFF), code1);
1920 code = XFASTINT (code1);
1922 if (dimension > 1)
1924 code <<= 8;
1925 if (NILP (code2))
1926 code |= charsetp->code_space[(dimension - 2) * 4];
1927 else
1929 CHECK_NATNUM (code2);
1930 if (XFASTINT (code2) >= 0x100)
1931 args_out_of_range (make_number (0xFF), code2);
1932 code |= XFASTINT (code2);
1935 if (dimension > 2)
1937 code <<= 8;
1938 if (NILP (code3))
1939 code |= charsetp->code_space[(dimension - 3) * 4];
1940 else
1942 CHECK_NATNUM (code3);
1943 if (XFASTINT (code3) >= 0x100)
1944 args_out_of_range (make_number (0xFF), code3);
1945 code |= XFASTINT (code3);
1948 if (dimension > 3)
1950 code <<= 8;
1951 if (NILP (code4))
1952 code |= charsetp->code_space[0];
1953 else
1955 CHECK_NATNUM (code4);
1956 if (XFASTINT (code4) >= 0x100)
1957 args_out_of_range (make_number (0xFF), code4);
1958 code |= XFASTINT (code4);
1965 if (CHARSET_ISO_FINAL (charsetp) >= 0)
1966 code &= 0x7F7F7F7F;
1967 c = DECODE_CHAR (charsetp, code);
1968 if (c < 0)
1969 error ("Invalid code(s)");
1970 return make_number (c);
1974 /* Return the first charset in CHARSET_LIST that contains C.
1975 CHARSET_LIST is a list of charset IDs. If it is nil, use
1976 Vcharset_ordered_list. */
1978 struct charset *
1979 char_charset (int c, Lisp_Object charset_list, unsigned int *code_return)
1981 bool maybe_null = 0;
1983 if (NILP (charset_list))
1984 charset_list = Vcharset_ordered_list;
1985 else
1986 maybe_null = 1;
1988 while (CONSP (charset_list))
1990 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
1991 unsigned code = ENCODE_CHAR (charset, c);
1993 if (code != CHARSET_INVALID_CODE (charset))
1995 if (code_return)
1996 *code_return = code;
1997 return charset;
1999 charset_list = XCDR (charset_list);
2000 if (! maybe_null
2001 && c <= MAX_UNICODE_CHAR
2002 && EQ (charset_list, Vcharset_non_preferred_head))
2003 return CHARSET_FROM_ID (charset_unicode);
2005 return (maybe_null ? NULL
2006 : c <= MAX_5_BYTE_CHAR ? CHARSET_FROM_ID (charset_emacs)
2007 : CHARSET_FROM_ID (charset_eight_bit));
2011 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
2012 doc:
2013 /*Return list of charset and one to four position-codes of CH.
2014 The charset is decided by the current priority order of charsets.
2015 A position-code is a byte value of each dimension of the code-point of
2016 CH in the charset. */)
2017 (Lisp_Object ch)
2019 struct charset *charset;
2020 int c, dimension;
2021 unsigned code;
2022 Lisp_Object val;
2024 CHECK_CHARACTER (ch);
2025 c = XFASTINT (ch);
2026 charset = CHAR_CHARSET (c);
2027 if (! charset)
2028 emacs_abort ();
2029 code = ENCODE_CHAR (charset, c);
2030 if (code == CHARSET_INVALID_CODE (charset))
2031 emacs_abort ();
2032 dimension = CHARSET_DIMENSION (charset);
2033 for (val = Qnil; dimension > 0; dimension--)
2035 val = Fcons (make_number (code & 0xFF), val);
2036 code >>= 8;
2038 return Fcons (CHARSET_NAME (charset), val);
2042 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 2, 0,
2043 doc: /* Return the charset of highest priority that contains CH.
2044 ASCII characters are an exception: for them, this function always
2045 returns `ascii'.
2046 If optional 2nd arg RESTRICTION is non-nil, it is a list of charsets
2047 from which to find the charset. It may also be a coding system. In
2048 that case, find the charset from what supported by that coding system. */)
2049 (Lisp_Object ch, Lisp_Object restriction)
2051 struct charset *charset;
2053 CHECK_CHARACTER (ch);
2054 if (NILP (restriction))
2055 charset = CHAR_CHARSET (XINT (ch));
2056 else
2058 if (CONSP (restriction))
2060 int c = XFASTINT (ch);
2062 for (; CONSP (restriction); restriction = XCDR (restriction))
2064 struct charset *rcharset;
2066 CHECK_CHARSET_GET_CHARSET (XCAR (restriction), rcharset);
2067 if (ENCODE_CHAR (rcharset, c) != CHARSET_INVALID_CODE (rcharset))
2068 return XCAR (restriction);
2070 return Qnil;
2072 restriction = coding_system_charset_list (restriction);
2073 charset = char_charset (XINT (ch), restriction, NULL);
2074 if (! charset)
2075 return Qnil;
2077 return (CHARSET_NAME (charset));
2081 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
2082 doc: /*
2083 Return charset of a character in the current buffer at position POS.
2084 If POS is nil, it defaults to the current point.
2085 If POS is out of range, the value is nil. */)
2086 (Lisp_Object pos)
2088 Lisp_Object ch;
2089 struct charset *charset;
2091 ch = Fchar_after (pos);
2092 if (! INTEGERP (ch))
2093 return ch;
2094 charset = CHAR_CHARSET (XINT (ch));
2095 return (CHARSET_NAME (charset));
2099 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2100 doc: /*
2101 Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
2103 ISO 2022's designation sequence (escape sequence) distinguishes charsets
2104 by their DIMENSION, CHARS, and FINAL-CHAR,
2105 whereas Emacs distinguishes them by charset symbol.
2106 See the documentation of the function `charset-info' for the meanings of
2107 DIMENSION, CHARS, and FINAL-CHAR. */)
2108 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char)
2110 bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
2111 int id = ISO_CHARSET_TABLE (XINT (dimension), chars_flag,
2112 XFASTINT (final_char));
2113 return (id >= 0 ? CHARSET_NAME (CHARSET_FROM_ID (id)) : Qnil);
2117 DEFUN ("clear-charset-maps", Fclear_charset_maps, Sclear_charset_maps,
2118 0, 0, 0,
2119 doc: /*
2120 Internal use only.
2121 Clear temporary charset mapping tables.
2122 It should be called only from temacs invoked for dumping. */)
2123 (void)
2125 if (temp_charset_work)
2127 xfree (temp_charset_work);
2128 temp_charset_work = NULL;
2131 if (CHAR_TABLE_P (Vchar_unify_table))
2132 Foptimize_char_table (Vchar_unify_table, Qnil);
2134 return Qnil;
2137 DEFUN ("charset-priority-list", Fcharset_priority_list,
2138 Scharset_priority_list, 0, 1, 0,
2139 doc: /* Return the list of charsets ordered by priority.
2140 HIGHESTP non-nil means just return the highest priority one. */)
2141 (Lisp_Object highestp)
2143 Lisp_Object val = Qnil, list = Vcharset_ordered_list;
2145 if (!NILP (highestp))
2146 return CHARSET_NAME (CHARSET_FROM_ID (XINT (Fcar (list))));
2148 while (!NILP (list))
2150 val = Fcons (CHARSET_NAME (CHARSET_FROM_ID (XINT (XCAR (list)))), val);
2151 list = XCDR (list);
2153 return Fnreverse (val);
2156 DEFUN ("set-charset-priority", Fset_charset_priority, Sset_charset_priority,
2157 1, MANY, 0,
2158 doc: /* Assign higher priority to the charsets given as arguments.
2159 usage: (set-charset-priority &rest charsets) */)
2160 (ptrdiff_t nargs, Lisp_Object *args)
2162 Lisp_Object new_head, old_list, arglist[2];
2163 Lisp_Object list_2022, list_emacs_mule;
2164 ptrdiff_t i;
2165 int id;
2167 old_list = Fcopy_sequence (Vcharset_ordered_list);
2168 new_head = Qnil;
2169 for (i = 0; i < nargs; i++)
2171 CHECK_CHARSET_GET_ID (args[i], id);
2172 if (! NILP (Fmemq (make_number (id), old_list)))
2174 old_list = Fdelq (make_number (id), old_list);
2175 new_head = Fcons (make_number (id), new_head);
2178 arglist[0] = Fnreverse (new_head);
2179 arglist[1] = Vcharset_non_preferred_head = old_list;
2180 Vcharset_ordered_list = Fnconc (2, arglist);
2181 charset_ordered_list_tick++;
2183 charset_unibyte = -1;
2184 for (old_list = Vcharset_ordered_list, list_2022 = list_emacs_mule = Qnil;
2185 CONSP (old_list); old_list = XCDR (old_list))
2187 if (! NILP (Fmemq (XCAR (old_list), Viso_2022_charset_list)))
2188 list_2022 = Fcons (XCAR (old_list), list_2022);
2189 if (! NILP (Fmemq (XCAR (old_list), Vemacs_mule_charset_list)))
2190 list_emacs_mule = Fcons (XCAR (old_list), list_emacs_mule);
2191 if (charset_unibyte < 0)
2193 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (old_list)));
2195 if (CHARSET_DIMENSION (charset) == 1
2196 && CHARSET_ASCII_COMPATIBLE_P (charset)
2197 && CHARSET_MAX_CHAR (charset) >= 0x80)
2198 charset_unibyte = CHARSET_ID (charset);
2201 Viso_2022_charset_list = Fnreverse (list_2022);
2202 Vemacs_mule_charset_list = Fnreverse (list_emacs_mule);
2203 if (charset_unibyte < 0)
2204 charset_unibyte = charset_iso_8859_1;
2206 return Qnil;
2209 DEFUN ("charset-id-internal", Fcharset_id_internal, Scharset_id_internal,
2210 0, 1, 0,
2211 doc: /* Internal use only.
2212 Return charset identification number of CHARSET. */)
2213 (Lisp_Object charset)
2215 int id;
2217 CHECK_CHARSET_GET_ID (charset, id);
2218 return make_number (id);
2221 struct charset_sort_data
2223 Lisp_Object charset;
2224 int id;
2225 ptrdiff_t priority;
2228 static int
2229 charset_compare (const void *d1, const void *d2)
2231 const struct charset_sort_data *data1 = d1, *data2 = d2;
2232 if (data1->priority != data2->priority)
2233 return data1->priority < data2->priority ? -1 : 1;
2234 return 0;
2237 DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0,
2238 doc: /* Sort charset list CHARSETS by a priority of each charset.
2239 Return the sorted list. CHARSETS is modified by side effects.
2240 See also `charset-priority-list' and `set-charset-priority'. */)
2241 (Lisp_Object charsets)
2243 Lisp_Object len = Flength (charsets);
2244 ptrdiff_t n = XFASTINT (len), i, j;
2245 int done;
2246 Lisp_Object tail, elt, attrs;
2247 struct charset_sort_data *sort_data;
2248 int id, min_id = INT_MAX, max_id = INT_MIN;
2249 USE_SAFE_ALLOCA;
2251 if (n == 0)
2252 return Qnil;
2253 SAFE_NALLOCA (sort_data, 1, n);
2254 for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++)
2256 elt = XCAR (tail);
2257 CHECK_CHARSET_GET_ATTR (elt, attrs);
2258 sort_data[i].charset = elt;
2259 sort_data[i].id = id = XINT (CHARSET_ATTR_ID (attrs));
2260 if (id < min_id)
2261 min_id = id;
2262 if (id > max_id)
2263 max_id = id;
2265 for (done = 0, tail = Vcharset_ordered_list, i = 0;
2266 done < n && CONSP (tail); tail = XCDR (tail), i++)
2268 elt = XCAR (tail);
2269 id = XFASTINT (elt);
2270 if (id >= min_id && id <= max_id)
2271 for (j = 0; j < n; j++)
2272 if (sort_data[j].id == id)
2274 sort_data[j].priority = i;
2275 done++;
2278 qsort (sort_data, n, sizeof *sort_data, charset_compare);
2279 for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++)
2280 XSETCAR (tail, sort_data[i].charset);
2281 SAFE_FREE ();
2282 return charsets;
2286 void
2287 init_charset (void)
2289 Lisp_Object tempdir;
2290 tempdir = Fexpand_file_name (build_string ("charsets"), Vdata_directory);
2291 if (! file_accessible_directory_p (tempdir))
2293 /* This used to be non-fatal (dir_warning), but it should not
2294 happen, and if it does sooner or later it will cause some
2295 obscure problem (eg bug#6401), so better abort. */
2296 fprintf (stderr, "Error: charsets directory not found:\n\
2297 %s\n\
2298 Emacs will not function correctly without the character map files.\n%s\
2299 Please check your installation!\n",
2300 SDATA (tempdir),
2301 egetenv("EMACSDATA") ? "The EMACSDATA environment \
2302 variable is set, maybe it has the wrong value?\n" : "");
2303 exit (1);
2306 Vcharset_map_path = list1 (tempdir);
2310 void
2311 init_charset_once (void)
2313 int i, j, k;
2315 for (i = 0; i < ISO_MAX_DIMENSION; i++)
2316 for (j = 0; j < ISO_MAX_CHARS; j++)
2317 for (k = 0; k < ISO_MAX_FINAL; k++)
2318 iso_charset_table[i][j][k] = -1;
2320 for (i = 0; i < 256; i++)
2321 emacs_mule_charset[i] = -1;
2323 charset_jisx0201_roman = -1;
2324 charset_jisx0208_1978 = -1;
2325 charset_jisx0208 = -1;
2326 charset_ksc5601 = -1;
2329 #ifdef emacs
2331 /* Allocate an initial charset table that is large enough to handle
2332 Emacs while it is bootstrapping. As of September 2011, the size
2333 needs to be at least 166; make it a bit bigger to allow for future
2334 expansion.
2336 Don't make the value so small that the table is reallocated during
2337 bootstrapping, as glibc malloc calls larger than just under 64 KiB
2338 during an initial bootstrap wreak havoc after dumping; see the
2339 M_MMAP_THRESHOLD value in alloc.c, plus there is a extra overhead
2340 internal to glibc malloc and perhaps to Emacs malloc debugging. */
2341 static struct charset charset_table_init[180];
2343 void
2344 syms_of_charset (void)
2346 DEFSYM (Qcharsetp, "charsetp");
2348 DEFSYM (Qascii, "ascii");
2349 DEFSYM (Qunicode, "unicode");
2350 DEFSYM (Qemacs, "emacs");
2351 DEFSYM (Qeight_bit, "eight-bit");
2352 DEFSYM (Qiso_8859_1, "iso-8859-1");
2354 DEFSYM (Qgl, "gl");
2355 DEFSYM (Qgr, "gr");
2357 staticpro (&Vcharset_ordered_list);
2358 Vcharset_ordered_list = Qnil;
2360 staticpro (&Viso_2022_charset_list);
2361 Viso_2022_charset_list = Qnil;
2363 staticpro (&Vemacs_mule_charset_list);
2364 Vemacs_mule_charset_list = Qnil;
2366 /* Don't staticpro them here. It's done in syms_of_fns. */
2367 QCtest = intern_c_string (":test");
2368 Qeq = intern_c_string ("eq");
2370 staticpro (&Vcharset_hash_table);
2372 Lisp_Object args[2];
2373 args[0] = QCtest;
2374 args[1] = Qeq;
2375 Vcharset_hash_table = Fmake_hash_table (2, args);
2378 charset_table = charset_table_init;
2379 charset_table_size = ARRAYELTS (charset_table_init);
2380 charset_table_used = 0;
2382 defsubr (&Scharsetp);
2383 defsubr (&Smap_charset_chars);
2384 defsubr (&Sdefine_charset_internal);
2385 defsubr (&Sdefine_charset_alias);
2386 defsubr (&Scharset_plist);
2387 defsubr (&Sset_charset_plist);
2388 defsubr (&Sunify_charset);
2389 defsubr (&Sget_unused_iso_final_char);
2390 defsubr (&Sdeclare_equiv_charset);
2391 defsubr (&Sfind_charset_region);
2392 defsubr (&Sfind_charset_string);
2393 defsubr (&Sdecode_char);
2394 defsubr (&Sencode_char);
2395 defsubr (&Ssplit_char);
2396 defsubr (&Smake_char);
2397 defsubr (&Schar_charset);
2398 defsubr (&Scharset_after);
2399 defsubr (&Siso_charset);
2400 defsubr (&Sclear_charset_maps);
2401 defsubr (&Scharset_priority_list);
2402 defsubr (&Sset_charset_priority);
2403 defsubr (&Scharset_id_internal);
2404 defsubr (&Ssort_charsets);
2406 DEFVAR_LISP ("charset-map-path", Vcharset_map_path,
2407 doc: /* List of directories to search for charset map files. */);
2408 Vcharset_map_path = Qnil;
2410 DEFVAR_BOOL ("inhibit-load-charset-map", inhibit_load_charset_map,
2411 doc: /* Inhibit loading of charset maps. Used when dumping Emacs. */);
2412 inhibit_load_charset_map = 0;
2414 DEFVAR_LISP ("charset-list", Vcharset_list,
2415 doc: /* List of all charsets ever defined. */);
2416 Vcharset_list = Qnil;
2418 DEFVAR_LISP ("current-iso639-language", Vcurrent_iso639_language,
2419 doc: /* ISO639 language mnemonic symbol for the current language environment.
2420 If the current language environment is for multiple languages (e.g. "Latin-1"),
2421 the value may be a list of mnemonics. */);
2422 Vcurrent_iso639_language = Qnil;
2424 charset_ascii
2425 = define_charset_internal (Qascii, 1, "\x00\x7F\0\0\0\0\0",
2426 0, 127, 'B', -1, 0, 1, 0, 0);
2427 charset_iso_8859_1
2428 = define_charset_internal (Qiso_8859_1, 1, "\x00\xFF\0\0\0\0\0",
2429 0, 255, -1, -1, -1, 1, 0, 0);
2430 charset_unicode
2431 = define_charset_internal (Qunicode, 3, "\x00\xFF\x00\xFF\x00\x10\0",
2432 0, MAX_UNICODE_CHAR, -1, 0, -1, 1, 0, 0);
2433 charset_emacs
2434 = define_charset_internal (Qemacs, 3, "\x00\xFF\x00\xFF\x00\x3F\0",
2435 0, MAX_5_BYTE_CHAR, -1, 0, -1, 1, 1, 0);
2436 charset_eight_bit
2437 = define_charset_internal (Qeight_bit, 1, "\x80\xFF\0\0\0\0\0",
2438 128, 255, -1, 0, -1, 0, 1,
2439 MAX_5_BYTE_CHAR + 1);
2440 charset_unibyte = charset_iso_8859_1;
2443 #endif /* emacs */