beta-0.89.2
[luatex.git] / source / texk / web2c / luatexdir / font / tounicode.w
blobc213eee72c5d24ced03a6f8606e18e276fea72a3
1 % tounicode.w
3 % Copyright 2006 Han The Thanh, <thanh@@pdftex.org>
4 % Copyright 2006-2010 Taco Hoekwater <taco@@luatex.org>
6 % This file is part of LuaTeX.
8 % LuaTeX is free software; you can redistribute it and/or modify it under
9 % the terms of the GNU General Public License as published by the Free
10 % Software Foundation; either version 2 of the License, or (at your
11 % option) any later version.
13 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
14 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 % FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 % License for more details.
18 % You should have received a copy of the GNU General Public License along
19 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
21 @ @c
24 #include "ptexlib.h"
26 @ @c
/* upper-case hexadecimal digit test; the argument |c| is evaluated more
   than once, so it must be free of side effects */
#define isXdigit(c) (isdigit(c) || ((c) >= 'A' && (c) <= 'F'))

/* negative sentinels stored in the |code| field of a |glyph_unicode_entry| */
#define UNI_UNDEF        (-1)
#define UNI_STRING       (-2)   /* string allocated by |def_tounicode()| */
#define UNI_EXTRA_STRING (-3)   /* string allocated by |set_glyph_unicode()| */

/* database of glyph name to unicode mappings; stays |NULL| until
   |def_tounicode()| creates it */
static struct avl_table *glyph_unicode_tree = NULL;
34 static int comp_glyph_unicode_entry(const void *pa, const void *pb, void *p)
36 (void) p;
37 return strcmp(((const glyph_unicode_entry *) pa)->name,
38 ((const glyph_unicode_entry *) pb)->name);
41 static glyph_unicode_entry *new_glyph_unicode_entry(void)
43 glyph_unicode_entry *e;
44 e = xtalloc(1, glyph_unicode_entry);
45 e->name = NULL;
46 e->code = UNI_UNDEF;
47 e->unicode_seq = NULL;
48 return e;
51 static void destroy_glyph_unicode_entry(void *pa, void *pb)
53 glyph_unicode_entry *e = (glyph_unicode_entry *) pa;
54 (void) pb;
55 xfree(e->name);
56 if (e->code == UNI_STRING) {
57 assert(e->unicode_seq != NULL);
58 xfree(e->unicode_seq);
62 void glyph_unicode_free(void)
64 if (glyph_unicode_tree != NULL)
65 avl_destroy(glyph_unicode_tree, destroy_glyph_unicode_entry);
68 @ @c
69 void def_tounicode(str_number glyph, str_number unistr)
71 char buf[SMALL_BUF_SIZE], *p, *ph;
72 char buf2[SMALL_BUF_SIZE], *q;
73 int valid_unistr; /* 0: invalid; 1: unicode value; 2: string */
74 int i, l;
75 glyph_unicode_entry *gu, t;
76 void **aa;
78 p = makecstring(glyph);
79 assert(strlen(p) < SMALL_BUF_SIZE);
80 strcpy(buf, p);
81 free(p);
82 p = makecstring(unistr);
83 ph = p;
84 while (*p == ' ')
85 p++; /* ignore leading spaces */
86 l = (int) strlen(p);
87 while (l > 0 && p[l - 1] == ' ')
88 l--; /* ignore traling spaces */
89 valid_unistr = 1; /* a unicode value is the most common case */
90 for (i = 0; i < l; i++) {
91 if (p[i] == ' ')
92 valid_unistr = 2; /* if a space occurs we treat this entry as a string */
93 else if (!isXdigit((unsigned char)p[i])) {
94 valid_unistr = 0;
95 break;
98 if (l == 0 || valid_unistr == 0 || strlen(buf) == 0 || strcmp(buf, notdef) == 0) {
99 formatted_warning("tounicode", "invalid parameter(s): %s -> %s", buf, p);
100 return;
102 if (glyph_unicode_tree == NULL) {
103 glyph_unicode_tree =
104 avl_create(comp_glyph_unicode_entry, NULL, &avl_xallocator);
105 assert(glyph_unicode_tree != NULL);
107 t.name = buf;
108 /* allow overriding existing entries */
109 if ((gu = (glyph_unicode_entry *) avl_find(glyph_unicode_tree, &t)) != NULL) {
110 if (gu->code == UNI_STRING) {
111 assert(gu->unicode_seq != NULL);
112 xfree(gu->unicode_seq);
114 } else { /* make new entry */
115 gu = new_glyph_unicode_entry();
116 gu->name = xstrdup(buf);
118 if (valid_unistr == 2) { /* a string with space(s) */
119 /* copy p to buf2, ignoring spaces */
120 for (q = buf2; *p != 0; p++)
121 if (*p != ' ')
122 *q++ = *p;
123 *q = 0;
124 gu->code = UNI_STRING;
125 gu->unicode_seq = xstrdup(buf2);
126 } else {
127 i = sscanf(p, "%lX", &(gu->code));
128 assert(i == 1);
130 aa = avl_probe(glyph_unicode_tree, gu);
131 assert(aa != NULL);
132 free(ph);
136 @ @c
137 static long check_unicode_value(char *s, boolean multiple_value)
139 int l = (int) strlen(s);
140 int i;
141 long code = 0; /* anything that is not |UNI_UNDEF| will do */
143 if (l == 0)
144 return UNI_UNDEF;
145 if (multiple_value && l % 4 != 0)
146 return UNI_UNDEF;
147 if (!multiple_value && !(4 <= l && l <= 6))
148 return UNI_UNDEF;
150 for (i = 0; i < l; i++) {
151 if (!isXdigit((unsigned char)s[i]))
152 return UNI_UNDEF;
153 if (multiple_value) {
154 if (i % 4 == 3) {
155 if (sscanf(s + i - 3, "%4lX", &code) != 1)
156 return UNI_UNDEF;
157 if (!((0x0000 <= code && code <= 0xD7FF) ||
158 (0xE000 <= code && code <= 0xFFFF)))
159 return UNI_UNDEF;
161 } else { /* single value */
162 if (i == l - 1) {
163 if (sscanf(s, "%lX", &code) != 1)
164 return UNI_UNDEF;
165 if (!((0x0000 <= code && code <= 0xD7FF) ||
166 (0xE000 <= code && code <= 0x10FFFF)))
167 return UNI_UNDEF;
171 return code;
@ This function sets the fields of |*gp| based on the glyph name |s|; if it
returns with |gp->code == UNI_EXTRA_STRING| then the caller is responsible
for freeing |gp->unicode_seq| too.
178 static void set_glyph_unicode(char *s, glyph_unicode_entry * gp)
180 char buf[SMALL_BUF_SIZE], buf2[SMALL_BUF_SIZE], *p;
181 long code;
182 boolean last_component;
183 glyph_unicode_entry tmp, *ptmp;
185 /* skip dummy entries */
186 if (s == NULL || s == notdef)
187 return;
189 /* strip everything after the first dot */
190 p = strchr(s, '.');
191 if (p != NULL) {
192 *buf = 0;
193 strncat(buf, s, (size_t) (p - s));
194 s = buf;
197 if (strlen(s) == 0)
198 return;
200 /* check for case of multiple components separated by |'_'| */
201 p = strchr(s, '_');
202 if (p != NULL) {
203 assert(strlen(s) < sizeof(buf));
204 if (s != buf) {
205 strcpy(buf, s);
206 p = strchr(buf, '_');
207 s = buf;
209 *buf2 = 0;
210 last_component = false;
211 for (;;) {
212 *p = 0;
213 tmp.code = UNI_UNDEF;
214 set_glyph_unicode(s, &tmp);
215 switch (tmp.code) {
216 case UNI_UNDEF: /* not found, do nothing */
217 break;
218 case UNI_STRING: /* s matched an entry with string value in the database */
219 assert(tmp.unicode_seq != NULL);
220 assert(strlen(buf2) + strlen(tmp.unicode_seq) < sizeof(buf2));
221 strcat(buf2, tmp.unicode_seq);
222 break;
223 case UNI_EXTRA_STRING: /* s is a multiple value of form "uniXXXX" */
224 assert(strlen(buf2) + strlen(tmp.unicode_seq) < sizeof(buf2));
225 strcat(buf2, tmp.unicode_seq);
226 xfree(tmp.unicode_seq);
227 break;
228 default: /* s matched an entry with numeric value in the
229 database, or a value derived from "uXXXX" */
230 assert(tmp.code >= 0);
231 strcat(buf2, utf16be_str(tmp.code));
233 if (last_component)
234 break;
235 s = p + 1;
236 p = strchr(s, '_');
237 if (p == NULL) {
238 p = strend(s);
239 last_component = true;
242 gp->code = UNI_EXTRA_STRING;
243 gp->unicode_seq = xstrdup(buf2);
244 return;
247 /* lookup for glyph name in the database */
248 tmp.name = s;
249 tmp.code = UNI_UNDEF;
250 ptmp = (glyph_unicode_entry *) avl_find(glyph_unicode_tree, &tmp);
251 if (ptmp != NULL) {
252 gp->code = ptmp->code;
253 gp->unicode_seq = ptmp->unicode_seq;
254 return;
257 /* check for case of "uniXXXX" (multiple 4-hex-digit values allowed) */
258 if (str_prefix(s, "uni")) {
259 p = s + strlen("uni");
260 code = check_unicode_value(p, true);
261 if (code != UNI_UNDEF) {
262 if (strlen(p) == 4) /* single value */
263 gp->code = code;
264 else { /* multiple value */
265 gp->code = UNI_EXTRA_STRING;
266 gp->unicode_seq = xstrdup(p);
269 return; /* since the last case cannot happen */
272 /* check for case of "uXXXX" (single value up to 6 hex digits) */
273 if (str_prefix(s, "u")) {
274 p = s + strlen("u");
275 code = check_unicode_value(p, false);
276 if (code != UNI_UNDEF) {
277 assert(code >= 0);
278 gp->code = code;
283 @ @c
284 static void set_cid_glyph_unicode(long index, glyph_unicode_entry * gp,
285 internal_font_number f)
287 char *s;
288 if (font_tounicode(f) &&
289 (s = get_charinfo_tounicode(char_info(f, (int) index))) != NULL) {
290 gp->code = UNI_EXTRA_STRING;
291 gp->unicode_seq = xstrdup(s);
292 } else {
293 gp->code = index; /* fallback */
298 @ @c
299 int write_tounicode(PDF pdf, char **glyph_names, char *name)
301 char buf[SMALL_BUF_SIZE], *p;
302 static char builtin_suffix[] = "-builtin";
303 short range_size[257];
304 glyph_unicode_entry gtab[257];
305 int objnum;
306 int i, j;
307 int bfchar_count, bfrange_count, subrange_count;
308 assert(strlen(name) + strlen(builtin_suffix) < SMALL_BUF_SIZE);
309 if (glyph_unicode_tree == NULL) {
310 pdf->gen_tounicode = 0;
311 return 0;
313 strcpy(buf, name);
314 if ((p = strrchr(buf, '.')) != NULL && strcmp(p, ".enc") == 0)
315 *p = 0; /* strip ".enc" from encoding name */
316 else
317 strcat(buf, builtin_suffix); /* ".enc" not present, this is a builtin
318 encoding so the name is eg "cmr10-builtin" */
319 objnum = pdf_create_obj(pdf, obj_type_others, 0);
320 pdf_begin_obj(pdf, objnum, OBJSTM_NEVER);
321 pdf_begin_dict(pdf);
322 pdf_dict_add_streaminfo(pdf);
323 pdf_end_dict(pdf);
324 pdf_begin_stream(pdf);
325 pdf_printf(pdf, "%%!PS-Adobe-3.0 Resource-CMap\n"@/
326 "%%%%DocumentNeededResources: ProcSet (CIDInit)\n"@/
327 "%%%%IncludeResource: ProcSet (CIDInit)\n"@/
328 "%%%%BeginResource: CMap (TeX-%s-0)\n"@/
329 "%%%%Title: (TeX-%s-0 TeX %s 0)\n"@/
330 "%%%%Version: 1.000\n"@/
331 "%%%%EndComments\n"@/
332 "/CIDInit /ProcSet findresource begin\n"@/
333 "12 dict begin\n"@/
334 "begincmap\n"@/
335 "/CIDSystemInfo\n"@/
336 "<< /Registry (TeX)\n"@/
337 "/Ordering (%s)\n"@/
338 "/Supplement 0\n"@/
339 ">> def\n"@/
340 "/CMapName /TeX-%s-0 def\n"@/
341 "/CMapType 2 def\n"@/
342 "1 begincodespacerange\n"@/
343 "<00> <FF>\n" "endcodespacerange\n", buf, buf, buf, buf, buf);
345 /* set gtab */
346 for (i = 0; i < 256; ++i) {
347 gtab[i].code = UNI_UNDEF;
348 set_glyph_unicode(glyph_names[i], &gtab[i]);
350 gtab[256].code = UNI_UNDEF;
352 /* set |range_size| */
353 for (i = 0; i < 256;) {
354 if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
355 range_size[i] = 1; /* single entry */
356 i++;
357 } else if (gtab[i].code == UNI_UNDEF) {
358 range_size[i] = 0; /* no entry */
359 i++;
360 } else { /* gtab[i].code >= 0 */
361 j = i;
362 while (i < 256 && gtab[i + 1].code >= 0 &&
363 gtab[i].code + 1 == gtab[i + 1].code)
364 i++;
365 /* at this point i is the last entry of the subrange */
366 i++; /* move i to the next entry */
367 range_size[j] = (short) (i - j);
371 /* calculate |bfrange_count| and |bfchar_count| */
372 bfrange_count = 0;
373 bfchar_count = 0;
374 for (i = 0; i < 256;) {
375 if (range_size[i] == 1) {
376 bfchar_count++;
377 i++;
378 } else if (range_size[i] > 1) {
379 bfrange_count++;
380 i += range_size[i];
381 } else
382 i++;
385 /* write out bfrange */
386 i = 0;
387 write_bfrange:
388 if (bfrange_count > 100)
389 subrange_count = 100;
390 else
391 subrange_count = bfrange_count;
392 bfrange_count -= subrange_count;
393 pdf_printf(pdf, "%i beginbfrange\n", subrange_count);
394 for (j = 0; j < subrange_count; j++) {
395 while (range_size[i] <= 1 && i < 256)
396 i++;
397 assert(i < 256);
398 pdf_printf(pdf, "<%02X> <%02X> <%s>\n", i, i + range_size[i] - 1,
399 utf16be_str(gtab[i].code));
400 i += range_size[i];
402 pdf_printf(pdf, "endbfrange\n");
403 if (bfrange_count > 0)
404 goto write_bfrange;
406 /* write out bfchar */
407 i = 0;
408 write_bfchar:
409 if (bfchar_count > 100)
410 subrange_count = 100;
411 else
412 subrange_count = bfchar_count;
413 bfchar_count -= subrange_count;
414 pdf_printf(pdf, "%i beginbfchar\n", subrange_count);
415 for (j = 0; j < subrange_count; j++) {
416 while (i < 256) {
417 if (range_size[i] > 1)
418 i += range_size[i];
419 else if (range_size[i] == 0)
420 i++;
421 else /* |range_size[i] == 1| */
422 break;
424 assert(i < 256 && gtab[i].code != UNI_UNDEF);
425 if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
426 assert(gtab[i].unicode_seq != NULL);
427 pdf_printf(pdf, "<%02X> <%s>\n", i, gtab[i].unicode_seq);
428 } else
429 pdf_printf(pdf, "<%02X> <%s>\n", i, utf16be_str(gtab[i].code));
430 i++;
432 pdf_printf(pdf, "endbfchar\n");
433 if (bfchar_count > 0)
434 goto write_bfchar;
436 /* free strings allocated by |set_glyph_unicode()| */
437 for (i = 0; i < 256; ++i) {
438 if (gtab[i].code == UNI_EXTRA_STRING)
439 xfree(gtab[i].unicode_seq);
442 pdf_printf(pdf, "endcmap\n"
443 "CMapName currentdict /CMap defineresource pop\n"
444 "end\n" "end\n" "%%%%EndResource\n" "%%%%EOF\n");
445 pdf_end_stream(pdf);
446 pdf_end_obj(pdf);
447 return objnum;
450 @ @c
451 int write_cid_tounicode(PDF pdf, fo_entry * fo, internal_font_number f)
454 static int range_size[65537];
455 static glyph_unicode_entry gtab[65537];
456 int objnum;
457 int i, j, k;
458 int bfchar_count, bfrange_count, subrange_count;
459 char *buf;
461 assert(fo->fd->fontname);
462 buf = xmalloc((unsigned) (strlen(fo->fd->fontname) + 8));
463 sprintf(buf, "%s-%s",
464 (fo->fd->subset_tag != NULL ? fo->fd->subset_tag : "UCS"),
465 fo->fd->fontname);
467 objnum = pdf_create_obj(pdf, obj_type_others, 0);
468 pdf_begin_obj(pdf, objnum, OBJSTM_NEVER);
469 pdf_begin_dict(pdf);
470 pdf_dict_add_streaminfo(pdf);
471 pdf_end_dict(pdf);
472 pdf_begin_stream(pdf);
473 pdf_printf(pdf, "%%!PS-Adobe-3.0 Resource-CMap\n"@/
474 "%%%%DocumentNeededResources: ProcSet (CIDInit)\n"@/
475 "%%%%IncludeResource: ProcSet (CIDInit)\n"@/
476 "%%%%BeginResource: CMap (TeX-%s-0)\n"@/
477 "%%%%Title: (TeX-%s-0 TeX %s 0)\n"@/
478 "%%%%Version: 1.000\n"@/
479 "%%%%EndComments\n"@/
480 "/CIDInit /ProcSet findresource begin\n"@/
481 "12 dict begin\n"@/
482 "begincmap\n"@/
483 "/CIDSystemInfo\n"@/
484 "<< /Registry (TeX)\n"@/
485 "/Ordering (%s)\n"@/
486 "/Supplement 0\n"@/
487 ">> def\n"@/
488 "/CMapName /TeX-Identity-%s def\n"@/
489 "/CMapType 2 def\n"@/
490 "1 begincodespacerange\n"@/
491 "<0000> <FFFF>\n"@/
492 "endcodespacerange\n", buf, buf, buf, buf, buf);
493 xfree(buf);
494 /* set up gtab */
495 for (i = 0; i < 65537; ++i) {
496 gtab[i].code = UNI_UNDEF;
498 for (k = 1; k <= max_font_id(); k++) {
499 if (k == f || -f == pdf_font_num(k)) {
500 for (i = font_bc(k); i <= font_ec(k); i++) {
501 if (quick_char_exists(k, i) && char_used(k, i)) {
502 j = char_index(k, i);
503 if (gtab[j].code == UNI_UNDEF) {
504 set_cid_glyph_unicode(i, &gtab[j], f);
511 /* set |range_size| */
512 for (i = 0; i < 65536;) {
513 if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
514 range_size[i] = 1; /* single entry */
515 i++;
516 } else if (gtab[i].code == UNI_UNDEF) {
517 range_size[i] = 0; /* no entry */
518 i++;
519 } else { /* |gtab[i].code >= 0| */
520 j = i;
521 k = i % 256;
522 while (i < 65536 && k<255 && gtab[i + 1].code >= 0 &&
523 gtab[i].code + 1 == gtab[i + 1].code) {
524 i++; k++;
526 /* at this point i is the last entry of the subrange */
527 i++; /* move i to the next entry */
528 range_size[j] = i - j;
532 /* calculate |bfrange_count| and |bfchar_count| */
533 bfrange_count = 0;
534 bfchar_count = 0;
535 for (i = 0; i < 65536;) {
536 if (range_size[i] == 1) {
537 bfchar_count++;
538 i++;
539 } else if (range_size[i] > 1) {
540 bfrange_count++;
541 i += range_size[i];
542 } else
543 i++;
546 /* write out bfrange */
547 i = 0;
548 write_bfrange:
549 if (bfrange_count > 100)
550 subrange_count = 100;
551 else
552 subrange_count = bfrange_count;
553 bfrange_count -= subrange_count;
554 pdf_printf(pdf, "%i beginbfrange\n", subrange_count);
555 for (j = 0; j < subrange_count; j++) {
556 while (range_size[i] <= 1 && i < 65536)
557 i++;
558 assert(i < 65536);
559 pdf_printf(pdf, "<%04X> <%04X> <%s>\n", i, i + range_size[i] - 1,
560 utf16be_str(gtab[i].code));
561 i += range_size[i];
563 pdf_printf(pdf, "endbfrange\n");
564 if (bfrange_count > 0)
565 goto write_bfrange;
567 /* write out bfchar */
568 i = 0;
569 write_bfchar:
570 if (bfchar_count > 100)
571 subrange_count = 100;
572 else
573 subrange_count = bfchar_count;
574 bfchar_count -= subrange_count;
575 pdf_printf(pdf, "%i beginbfchar\n", subrange_count);
576 for (j = 0; j < subrange_count; j++) {
577 while (i < 65536) {
578 if (range_size[i] > 1)
579 i += range_size[i];
580 else if (range_size[i] == 0)
581 i++;
582 else /* |range_size[i] == 1| */
583 break;
585 assert(i < 65536 && gtab[i].code != UNI_UNDEF);
586 if (gtab[i].code == UNI_STRING || gtab[i].code == UNI_EXTRA_STRING) {
587 assert(gtab[i].unicode_seq != NULL);
588 pdf_printf(pdf, "<%04X> <%s>\n", i, gtab[i].unicode_seq);
589 } else
590 pdf_printf(pdf, "<%04X> <%s>\n", i, utf16be_str(gtab[i].code));
591 i++;
593 pdf_printf(pdf, "endbfchar\n");
594 if (bfchar_count > 0)
595 goto write_bfchar;
597 /* free strings allocated by |set_glyph_unicode()| */
598 for (i = 0; i < 65536; ++i) {
599 if (gtab[i].code == UNI_EXTRA_STRING)
600 xfree(gtab[i].unicode_seq);
603 pdf_printf(pdf, "endcmap\n"
604 "CMapName currentdict /CMap defineresource pop\n"
605 "end\n" "end\n" "%%%%EndResource\n" "%%%%EOF\n");
606 pdf_end_stream(pdf);
607 pdf_end_obj(pdf);
608 return objnum;