2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
17 * CHARMAP file handling for iconv.
31 #include "parser.tab.h"
34 enum cmap_pass cmap_pass
;
35 static avl_tree_t cmap_sym
;
36 static avl_tree_t cmap_mbs
;
38 typedef struct charmap
{
40 struct charmap
*cm_alias_of
;
41 avl_node_t cm_avl_sym
;
42 avl_node_t cm_avl_mbs
;
46 char cm_frmbs
[MB_LEN_MAX
+ 1]; /* input */
47 char cm_tombs
[MB_LEN_MAX
+ 1]; /* output */
50 static void add_charmap_impl_fr(char *sym
, char *mbs
, int mbs_len
, int nodups
);
51 static void add_charmap_impl_to(char *sym
, char *mbs
, int mbs_len
, int nodups
);
54 * Array of POSIX specific portable characters.
59 } portable_chars
[] = {
62 { "backspace", '\b' },
64 { "carriage-return", '\r' },
66 { "vertical-tab", '\v' },
67 { "form-feed", '\f' },
69 { "exclamation-mark", '!' },
70 { "quotation-mark", '"' },
71 { "number-sign", '#' },
72 { "dollar-sign", '$' },
73 { "percent-sign", '%' },
75 { "apostrophe", '\'' },
76 { "left-parenthesis", '(' },
77 { "right-parenthesis", '(' },
81 { "hyphen-minus", '-' },
99 { "less-than-sign", '<' },
100 { "equals-sign", '=' },
101 { "greater-than-sign", '>' },
102 { "question-mark", '?' },
103 { "commercial-at", '@' },
104 { "left-square-bracket", '[' },
105 { "backslash", '\\' },
106 { "reverse-solidus", '\\' },
107 { "right-square-bracket", ']' },
108 { "circumflex", '^' },
109 { "circumflex-accent", '^' },
111 { "underscore", '_' },
112 { "grave-accent", '`' },
113 { "left-brace", '{' },
114 { "left-curly-bracket", '{' },
115 { "vertical-line", '|' },
116 { "right-brace", '}' },
117 { "right-curly-bracket", '}' },
175 cmap_compare_sym(const void *n1
, const void *n2
)
177 const charmap_t
*c1
= n1
;
178 const charmap_t
*c2
= n2
;
181 rv
= strcmp(c1
->cm_name
, c2
->cm_name
);
182 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
186 * In order for partial match searches to work,
187 * we need these sorted by mbs contents.
190 cmap_compare_mbs(const void *n1
, const void *n2
)
192 const charmap_t
*c1
= n1
;
193 const charmap_t
*c2
= n2
;
196 len
= c1
->cm_frmbs_len
;
197 if (len
< c2
->cm_frmbs_len
)
198 len
= c2
->cm_frmbs_len
;
199 rv
= memcmp(c1
->cm_frmbs
, c2
->cm_frmbs
, len
);
204 /* they match through length */
205 if (c1
->cm_frmbs_len
< c2
->cm_frmbs_len
)
207 if (c2
->cm_frmbs_len
< c1
->cm_frmbs_len
)
213 charmap_init(char *to_map
, char *from_map
)
215 avl_create(&cmap_sym
, cmap_compare_sym
, sizeof (charmap_t
),
216 offsetof(charmap_t
, cm_avl_sym
));
218 avl_create(&cmap_mbs
, cmap_compare_mbs
, sizeof (charmap_t
),
219 offsetof(charmap_t
, cm_avl_mbs
));
221 cmap_pass
= CMAP_PASS_FROM
;
222 reset_scanner(from_map
);
226 cmap_pass
= CMAP_PASS_TO
;
227 reset_scanner(to_map
);
237 cm
= avl_first(&cmap_mbs
);
239 (void) printf("name=\"%s\"\n", cm
->cm_name
);
241 (void) printf("\timbs=\"");
242 for (i
= 0; i
< cm
->cm_frmbs_len
; i
++)
243 (void) printf("\\x%02x", cm
->cm_frmbs
[i
] & 0xFF);
244 (void) printf("\"\n");
246 (void) printf("\tombs=\"");
247 for (i
= 0; i
< cm
->cm_tombs_len
; i
++)
248 (void) printf("\\x%02x", cm
->cm_tombs
[i
] & 0xFF);
249 (void) printf("\"\n");
251 cm
= AVL_NEXT(&cmap_mbs
, cm
);
256 * We parse two charmap files: First the "from" map, where we build
257 * cmap_mbs and cmap_sym which we'll later use to translate the input
258 * stream (mbs encodings) to symbols. Second, we parse the "to" map,
259 * where we fill in the tombs members of entries in cmap_sym, (which
 * must already exist) used later to write the output encoding.
263 add_charmap_impl(char *sym
, char *mbs
, int mbs_len
, int nodups
)
267 * While parsing both the "from" and "to" cmaps,
268 * require both the symbol and encoding.
270 if (sym
== NULL
|| mbs
== NULL
) {
271 errf(_("invalid charmap entry"));
277 add_charmap_impl_fr(sym
, mbs
, mbs_len
, nodups
);
280 add_charmap_impl_to(sym
, mbs
, mbs_len
, nodups
);
289 add_charmap_impl_fr(char *sym
, char *mbs
, int mbs_len
, int nodups
)
291 charmap_t
*m
, *n
, *s
;
292 avl_index_t where_sym
, where_mbs
;
294 if ((n
= calloc(1, sizeof (*n
))) == NULL
) {
295 errf(_("out of memory"));
300 assert(0 < mbs_len
&& mbs_len
<= MB_LEN_MAX
);
301 (void) memcpy(n
->cm_frmbs
, mbs
, mbs_len
);
302 n
->cm_frmbs_len
= mbs_len
;
304 m
= avl_find(&cmap_mbs
, n
, &where_mbs
);
305 s
= avl_find(&cmap_sym
, n
, &where_sym
);
308 * If we found the symbol, this is a dup.
312 warn(_("%s: duplicate character symbol"), sym
);
319 * If we found the mbs, the new one is an alias,
320 * which we'll add _only_ to the symbol AVL.
323 /* The new one is an alias of the original. */
325 avl_insert(&cmap_sym
, n
, where_sym
);
329 avl_insert(&cmap_sym
, n
, where_sym
);
330 avl_insert(&cmap_mbs
, n
, where_mbs
);
334 add_charmap_impl_to(char *sym
, char *mbs
, int mbs_len
, int nodups
)
336 charmap_t srch
= {0};
339 assert(0 < mbs_len
&& mbs_len
<= MB_LEN_MAX
);
343 m
= avl_find(&cmap_sym
, &srch
, NULL
);
346 warn(_("%s: symbol not found"), sym
);
349 if (m
->cm_alias_of
!= NULL
) {
352 /* don't warn for dups with aliases */
353 if (m
->cm_tombs_len
!= 0)
357 if (m
->cm_tombs_len
!= 0) {
359 warn(_("%s: duplicate encoding for"), sym
);
364 (void) memcpy(m
->cm_tombs
, mbs
, mbs_len
);
365 m
->cm_tombs_len
= mbs_len
;
/*
 * Add a single charmap entry for symbol sym.  The first byte of mbs
 * holds the encoding length and the encoding bytes follow it.  The
 * entry is passed to add_charmap_impl() with nodups set to 1.
 */
void
add_charmap(char *sym, char *mbs)
{
	/* mbs[0] is the length */
	int enc_len = mbs[0];
	char *enc = mbs + 1;

	assert(0 < enc_len && enc_len <= MB_LEN_MAX);
	add_charmap_impl(sym, enc, enc_len, 1);
}
379 * This is called by the parser with start/end symbol strings (ssym, esym),
380 * which are allocated in the scanner (T_SYMBOL) and free'd here.
383 add_charmap_range(char *ssym
, char *esym
, char *mbs
)
390 char tmbs
[MB_LEN_MAX
+1];
393 static const char *digits
= "0123456789";
395 /* mbs[0] is the length */
397 assert(0 < mbs_len
&& mbs_len
<= MB_LEN_MAX
);
398 (void) memcpy(tmbs
, mbs
, mbs_len
);
399 mb_last
= tmbs
+ mbs_len
- 1;
404 if (((si
= strcspn(ssym
, digits
)) == 0) || (si
== ls
) ||
405 (strncmp(ssym
, esym
, si
) != 0) ||
406 (strspn(ssym
+ si
, digits
) != (ls
- si
)) ||
407 (strspn(esym
+ si
, digits
) != (le
- si
)) ||
408 ((sn
= atoi(ssym
+ si
)) > ((en
= atoi(esym
+ si
))))) {
409 errf(_("malformed charmap range"));
414 for (i
= sn
; i
<= en
; i
++) {
416 (void) asprintf(&nn
, "%s%0*u", ssym
, ls
- si
, i
);
418 errf(_("out of memory"));
422 add_charmap_impl(nn
, tmbs
, mbs_len
, 1);
430 add_charmap_char(char *name
, int c
)
432 char mbs
[MB_LEN_MAX
+1];
436 add_charmap_impl(name
, mbs
, 1, 0);
440 * POSIX insists that certain entries be present, even when not in the
 * original charmap file.
444 add_charmap_posix(void)
448 for (i
= 0; portable_chars
[i
].name
; i
++) {
449 add_charmap_char(portable_chars
[i
].name
, portable_chars
[i
].ch
);
454 * This is called with a buffer of (typically) MB_LEN_MAX bytes,
455 * which is potentially a multi-byte symbol, but often contains
456 * extra bytes. Find and return the longest match in the charmap.
459 find_mbs(const char *mbs
, int len
)
461 charmap_t srch
= {0};
462 charmap_t
*cm
= NULL
;
465 (void) memcpy(srch
.cm_frmbs
, mbs
, len
);
466 srch
.cm_frmbs_len
= len
;
467 cm
= avl_find(&cmap_mbs
, &srch
, NULL
);
477 * Return true if this sequence matches the initial part
478 * of any sequence known in this charmap.
481 find_mbs_partial(const char *mbs
, int len
)
483 charmap_t srch
= {0};
487 (void) memcpy(srch
.cm_frmbs
, mbs
, len
);
488 srch
.cm_frmbs_len
= len
;
489 cm
= avl_find(&cmap_mbs
, &srch
, &where
);
491 /* full match - not expected, but OK */
494 cm
= avl_nearest(&cmap_mbs
, where
, AVL_AFTER
);
495 if (cm
!= NULL
&& 0 == memcmp(cm
->cm_frmbs
, mbs
, len
))
502 * Do like iconv(3), but with charmaps.
505 cm_iconv(const char **iptr
, size_t *ileft
, char **optr
, size_t *oleft
)
510 /* Ignore state reset requests. */
511 if (iptr
== NULL
|| *iptr
== NULL
)
514 if (*oleft
< MB_LEN_MAX
) {
519 while (*ileft
> 0 && *oleft
>= MB_LEN_MAX
) {
520 mbs_len
= MB_LEN_MAX
;
521 if (mbs_len
> *ileft
)
523 cm
= find_mbs(*iptr
, mbs_len
);
525 if (mbs_len
< MB_LEN_MAX
&&
526 find_mbs_partial(*iptr
, mbs_len
)) {
527 /* incomplete sequence */
534 assert(cm
->cm_frmbs_len
> 0);
535 if (cm
->cm_tombs_len
== 0) {
536 if (sflag
== 0 && cm
->cm_warned
== 0) {
538 warn(_("To-map does not encode <%s>\n"),
545 /* just skip this input seq. */
546 *iptr
+= cm
->cm_frmbs_len
;
547 *ileft
-= cm
->cm_frmbs_len
;
551 *iptr
+= cm
->cm_frmbs_len
;
552 *ileft
-= cm
->cm_frmbs_len
;
553 (void) memcpy(*optr
, cm
->cm_tombs
, cm
->cm_tombs_len
);
554 *optr
+= cm
->cm_tombs_len
;
555 *oleft
-= cm
->cm_tombs_len
;