1 //========================================================================
3 // CharCodeToUnicode.cc
5 // Copyright 2001-2003 Glyph & Cog, LLC
7 //========================================================================
9 //========================================================================
11 // Modified under the Poppler project - http://poppler.freedesktop.org
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
16 // Copyright (C) 2006, 2008-2010, 2012 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2007 Julien Rebetez <julienr@svn.gnome.org>
18 // Copyright (C) 2007 Koji Otani <sho@bbr.jp>
19 // Copyright (C) 2008 Michael Vrable <mvrable@cs.ucsd.edu>
20 // Copyright (C) 2008 Vasile Gaburici <gaburici@cs.umd.edu>
21 // Copyright (C) 2010 William Bader <williambader@hotmail.com>
22 // Copyright (C) 2010 Jakub Wilk <jwilk@jwilk.net>
23 // Copyright (C) 2012 Thomas Freitag <Thomas.Freitag@alfa.de>
24 // Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
25 // Copyright (C) 2014 Jiri Slaby <jirislaby@gmail.com>
26 // Copyright (C) 2015 Marek Kasik <mkasik@redhat.com>
28 // To see a description of the changes please see the Changelog file that
29 // came with your tarball or type make ChangeLog if you are building from git
31 //========================================================================
35 #ifdef USE_GCC_PRAGMAS
36 #pragma implementation
42 #include "goo/gfile.h"
43 #include "goo/GooLikely.h"
44 #include "goo/GooString.h"
46 #include "GlobalParams.h"
47 #include "PSTokenizer.h"
48 #include "CharCodeToUnicode.h"
51 //------------------------------------------------------------------------
53 struct CharCodeToUnicodeString
{
59 //------------------------------------------------------------------------
61 static int getCharFromString(void *data
) {
// Character-source callback for file input: <data> is cast to a FILE*
// and the next character is read from it with fgetc(). The signature
// matches the int (*)(void *) callback type accepted by parseCMap1.
75 static int getCharFromFile(void *data
) {
76 return fgetc((FILE *)data
);
79 //------------------------------------------------------------------------
// Lookup table indexed by byte value (0..255). Entries for '0'-'9'
// (0x30-0x39) hold 0-9, entries for 'A'-'F' (0x41-0x46) and 'a'-'f'
// (0x61-0x66) hold 10-15, and every other entry is -1.
81 static int hexCharVals
[256] = {
82 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x
83 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 1x
84 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 2x
85 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 3x
86 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 4x
87 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 5x
88 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 6x
89 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 7x
90 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 8x
91 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 9x
92 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ax
93 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bx
94 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Cx
95 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Dx
96 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ex
97 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // Fx
100 // Parse a <len>-byte hex string <s> into *<val>. Returns false on error.
// Converts the first <len> characters of <s> to a number stored in *<val>,
// one hex digit per character, using the hexCharVals table.
102 static GBool
parseHex(char *s
, int len
, Guint
*val
) {
// Visit each of the <len> characters in order.
106 for (i
= 0; i
< len
; ++i
) {
// Mask with 0xff so the index stays within the 256-entry table.
107 x
= hexCharVals
[s
[i
] & 0xff];
// Shift the accumulated value up by one hex digit and add the new one.
111 *val
= (*val
<< 4) + x
;
116 //------------------------------------------------------------------------
118 CharCodeToUnicode
*CharCodeToUnicode::makeIdentityMapping() {
119 CharCodeToUnicode
*ctu
= new CharCodeToUnicode();
120 ctu
->isIdentity
= gTrue
;
122 ctu
->map
= (Unicode
*)gmallocn(ctu
->mapLen
, sizeof(Unicode
));
126 CharCodeToUnicode
*CharCodeToUnicode::parseCIDToUnicode(GooString
*fileName
,
127 GooString
*collection
) {
130 CharCode size
, mapLenA
;
133 CharCodeToUnicode
*ctu
;
135 if (!(f
= openFile(fileName
->getCString(), "r"))) {
136 error(errIO
, -1, "Couldn't open cidToUnicode file '{0:t}'",
142 mapA
= (Unicode
*)gmallocn(size
, sizeof(Unicode
));
145 while (getLine(buf
, sizeof(buf
), f
)) {
146 if (mapLenA
== size
) {
148 mapA
= (Unicode
*)greallocn(mapA
, size
, sizeof(Unicode
));
150 if (sscanf(buf
, "%x", &u
) == 1) {
153 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in cidToUnicode file '{1:t}'",
154 (int)(mapLenA
+ 1), fileName
);
161 ctu
= new CharCodeToUnicode(collection
->copy(), mapA
, mapLenA
, gTrue
,
167 CharCodeToUnicode
*CharCodeToUnicode::parseUnicodeToUnicode(
168 GooString
*fileName
) {
171 CharCodeToUnicodeString
*sMapA
;
172 CharCode size
, oldSize
, len
, sMapSizeA
, sMapLenA
;
177 Unicode
*uBuf
= (Unicode
*)gmallocn(uBufSize
, sizeof(Unicode
));
178 CharCodeToUnicode
*ctu
;
182 if (!(f
= openFile(fileName
->getCString(), "r"))) {
184 error(errIO
, -1, "Couldn't open unicodeToUnicode file '{0:t}'",
190 mapA
= (Unicode
*)gmallocn(size
, sizeof(Unicode
));
191 memset(mapA
, 0, size
* sizeof(Unicode
));
194 sMapSizeA
= sMapLenA
= 0;
197 while (getLine(buf
, sizeof(buf
), f
)) {
199 if (!(tok
= strtok_r(buf
, " \t\r\n", &tokptr
)) ||
200 !parseHex(tok
, strlen(tok
), &u0
)) {
201 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in unicodeToUnicode file '{1:t}'",
206 while ((tok
= strtok_r(NULL
, " \t\r\n", &tokptr
))) {
210 uBuf
= (Unicode
*)greallocn(uBuf
, uBufSize
, sizeof(Unicode
));
212 if (!parseHex(tok
, strlen(tok
), &uBuf
[n
])) {
213 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in unicodeToUnicode file '{1:t}'",
220 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in unicodeToUnicode file '{1:t}'",
229 mapA
= (Unicode
*)greallocn(mapA
, size
, sizeof(Unicode
));
230 memset(mapA
+ oldSize
, 0, (size
- oldSize
) * sizeof(Unicode
));
236 if (sMapLenA
== sMapSizeA
) {
238 sMapA
= (CharCodeToUnicodeString
*)
239 greallocn(sMapA
, sMapSizeA
, sizeof(CharCodeToUnicodeString
));
241 sMapA
[sMapLenA
].c
= u0
;
242 sMapA
[sMapLenA
].u
= (Unicode
*)gmallocn(n
, sizeof(Unicode
));
243 for (i
= 0; i
< n
; ++i
) {
244 sMapA
[sMapLenA
].u
[i
] = uBuf
[i
];
246 sMapA
[sMapLenA
].len
= n
;
255 ctu
= new CharCodeToUnicode(fileName
->copy(), mapA
, len
, gTrue
,
256 sMapA
, sMapLenA
, sMapSizeA
);
// Creates a CharCodeToUnicode from the caller's <toUnicode> table: NULL
// tag, map length 256, copyMap = gTrue, and no sMap entries (NULL, 0, 0).
// NOTE(review): presumably <toUnicode> holds at least 256 entries --
// confirm against callers.
262 CharCodeToUnicode
*CharCodeToUnicode::make8BitToUnicode(Unicode
*toUnicode
) {
263 return new CharCodeToUnicode(NULL
, toUnicode
, 256, gTrue
, NULL
, 0, 0);
// Creates a new CharCodeToUnicode with a NULL tag and populates it by
// running parseCMap1 over the C string held in <buf>, with
// getCharFromString as the character source. <nBits> is forwarded to
// parseCMap1 unchanged.
266 CharCodeToUnicode
*CharCodeToUnicode::parseCMap(GooString
*buf
, int nBits
) {
267 CharCodeToUnicode
*ctu
;
270 ctu
= new CharCodeToUnicode(NULL
);
// parseCMap1 receives the address of the cursor <p>, not the string itself.
271 p
= buf
->getCString();
272 ctu
->parseCMap1(&getCharFromString
, &p
, nBits
);
276 CharCodeToUnicode
*CharCodeToUnicode::parseCMapFromFile(GooString
*fileName
,
278 CharCodeToUnicode
*ctu
;
281 ctu
= new CharCodeToUnicode(NULL
);
282 if ((f
= globalParams
->findToUnicodeFile(fileName
))) {
283 ctu
->parseCMap1(&getCharFromFile
, f
, nBits
);
286 error(errSyntaxError
, -1, "Couldn't find ToUnicode CMap file for '{0:t}'",
// Runs parseCMap1 on this existing object over the C string held in
// <buf>, with getCharFromString as the character source. <nBits> is
// forwarded to parseCMap1 unchanged.
292 void CharCodeToUnicode::mergeCMap(GooString
*buf
, int nBits
) {
// parseCMap1 receives the address of the cursor <p>, not the string itself.
295 p
= buf
->getCString();
296 parseCMap1(&getCharFromString
, &p
, nBits
);
299 void CharCodeToUnicode::parseCMap1(int (*getCharFunc
)(void *), void *data
,
302 char tok1
[256], tok2
[256], tok3
[256];
305 CharCode maxCode
, code1
, code2
;
309 maxCode
= (nBits
== 8) ? 0xff : (nBits
== 16) ? 0xffff : 0xffffffff;
310 pst
= new PSTokenizer(getCharFunc
, data
);
311 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
312 while (pst
->getToken(tok2
, sizeof(tok2
), &n2
)) {
313 if (!strcmp(tok2
, "usecmap")) {
314 if (tok1
[0] == '/') {
315 name
= new GooString(tok1
+ 1);
316 if ((f
= globalParams
->findToUnicodeFile(name
))) {
317 parseCMap1(&getCharFromFile
, f
, nBits
);
320 error(errSyntaxError
, -1, "Couldn't find ToUnicode CMap file for '{0:t}'",
325 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
326 } else if (!strcmp(tok2
, "beginbfchar")) {
327 while (pst
->getToken(tok1
, sizeof(tok1
), &n1
)) {
328 if (!strcmp(tok1
, "endbfchar")) {
331 if (!pst
->getToken(tok2
, sizeof(tok2
), &n2
) ||
332 !strcmp(tok2
, "endbfchar")) {
333 error(errSyntaxWarning
, -1, "Illegal entry in bfchar block in ToUnicode CMap");
336 if (!(tok1
[0] == '<' && tok1
[n1
- 1] == '>' &&
337 tok2
[0] == '<' && tok2
[n2
- 1] == '>')) {
338 error(errSyntaxWarning
, -1, "Illegal entry in bfchar block in ToUnicode CMap");
341 tok1
[n1
- 1] = tok2
[n2
- 1] = '\0';
342 if (!parseHex(tok1
+ 1, n1
- 2, &code1
)) {
343 error(errSyntaxWarning
, -1, "Illegal entry in bfchar block in ToUnicode CMap");
346 if (code1
> maxCode
) {
347 error(errSyntaxWarning
, -1,
348 "Invalid entry in bfchar block in ToUnicode CMap");
350 addMapping(code1
, tok2
+ 1, n2
- 2, 0);
352 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
353 } else if (!strcmp(tok2
, "beginbfrange")) {
354 while (pst
->getToken(tok1
, sizeof(tok1
), &n1
)) {
355 if (!strcmp(tok1
, "endbfrange")) {
358 if (!pst
->getToken(tok2
, sizeof(tok2
), &n2
) ||
359 !strcmp(tok2
, "endbfrange") ||
360 !pst
->getToken(tok3
, sizeof(tok3
), &n3
) ||
361 !strcmp(tok3
, "endbfrange")) {
362 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
365 if (!(tok1
[0] == '<' && tok1
[n1
- 1] == '>' &&
366 tok2
[0] == '<' && tok2
[n2
- 1] == '>')) {
367 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
370 tok1
[n1
- 1] = tok2
[n2
- 1] = '\0';
371 if (!parseHex(tok1
+ 1, n1
- 2, &code1
) ||
372 !parseHex(tok2
+ 1, n2
- 2, &code2
)) {
373 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
376 if (code1
> maxCode
|| code2
> maxCode
) {
377 error(errSyntaxWarning
, -1,
378 "Invalid entry in bfrange block in ToUnicode CMap");
379 if (code1
> maxCode
) {
382 if (code2
> maxCode
) {
386 if (!strcmp(tok3
, "[")) {
388 while (pst
->getToken(tok1
, sizeof(tok1
), &n1
) &&
389 code1
+ i
<= code2
) {
390 if (!strcmp(tok1
, "]")) {
393 if (tok1
[0] == '<' && tok1
[n1
- 1] == '>') {
395 addMapping(code1
+ i
, tok1
+ 1, n1
- 2, 0);
397 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
401 } else if (tok3
[0] == '<' && tok3
[n3
- 1] == '>') {
403 for (i
= 0; code1
<= code2
; ++code1
, ++i
) {
404 addMapping(code1
, tok3
+ 1, n3
- 2, i
);
408 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
411 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
419 void CharCodeToUnicode::addMapping(CharCode code
, char *uStr
, int n
,
425 if (code
> 0xffffff) {
426 // This is an arbitrary limit to avoid integer overflow issues.
427 // (I've seen CMaps with mappings for <ffffffff>.)
430 if (code
>= mapLen
) {
432 mapLen
= mapLen
? 2 * mapLen
: 256;
433 if (code
>= mapLen
) {
434 mapLen
= (code
+ 256) & ~255;
436 if (unlikely(code
>= mapLen
)) {
437 error(errSyntaxWarning
, -1, "Illegal code value in CharCodeToUnicode::addMapping");
440 map
= (Unicode
*)greallocn(map
, mapLen
, sizeof(Unicode
));
441 for (i
= oldLen
; i
< mapLen
; ++i
) {
447 if (!parseHex(uStr
, n
, &u
)) {
448 error(errSyntaxWarning
, -1, "Illegal entry in ToUnicode CMap");
451 map
[code
] = u
+ offset
;
452 if (!UnicodeIsValid(map
[code
])) {
456 if (sMapLen
>= sMapSize
) {
457 sMapSize
= sMapSize
+ 16;
458 sMap
= (CharCodeToUnicodeString
*)
459 greallocn(sMap
, sMapSize
, sizeof(CharCodeToUnicodeString
));
462 sMap
[sMapLen
].c
= code
;
463 int utf16Len
= n
/ 4;
464 Unicode
*utf16
= (Unicode
*)gmallocn(utf16Len
, sizeof(Unicode
));
465 for (j
= 0; j
< utf16Len
; ++j
) {
466 if (!parseHex(uStr
+ j
*4, 4, &utf16
[j
])) {
468 error(errSyntaxWarning
, -1, "Illegal entry in ToUnicode CMap");
472 utf16
[utf16Len
- 1] += offset
;
473 sMap
[sMapLen
].len
= UTF16toUCS4(utf16
, utf16Len
, &sMap
[sMapLen
].u
);
479 CharCodeToUnicode::CharCodeToUnicode() {
484 sMapLen
= sMapSize
= 0;
492 CharCodeToUnicode::CharCodeToUnicode(GooString
*tagA
) {
497 map
= (Unicode
*)gmallocn(mapLen
, sizeof(Unicode
));
498 for (i
= 0; i
< mapLen
; ++i
) {
502 sMapLen
= sMapSize
= 0;
510 CharCodeToUnicode::CharCodeToUnicode(GooString
*tagA
, Unicode
*mapA
,
511 CharCode mapLenA
, GBool copyMap
,
512 CharCodeToUnicodeString
*sMapA
,
513 int sMapLenA
, int sMapSizeA
) {
517 map
= (Unicode
*)gmallocn(mapLen
, sizeof(Unicode
));
518 memcpy(map
, mapA
, mapLen
* sizeof(Unicode
));
524 sMapSize
= sMapSizeA
;
532 CharCodeToUnicode::~CharCodeToUnicode() {
538 for (int i
= 0; i
< sMapLen
; ++i
) gfree(sMap
[i
].u
);
542 gDestroyMutex(&mutex
);
546 void CharCodeToUnicode::incRefCnt() {
552 gUnlockMutex(&mutex
);
556 void CharCodeToUnicode::decRefCnt() {
562 done
= --refCnt
== 0;
564 gUnlockMutex(&mutex
);
// Returns true only when this object has a non-NULL tag and
// tag->cmp(tagA) yields 0; an object without a tag never matches.
571 GBool
CharCodeToUnicode::match(GooString
*tagA
) {
572 return tag
&& !tag
->cmp(tagA
);
575 void CharCodeToUnicode::setMapping(CharCode c
, Unicode
*u
, int len
) {
578 if (!map
|| isIdentity
) {
584 for (i
= 0; i
< sMapLen
; ++i
) {
585 if (sMap
[i
].c
== c
) {
591 if (sMapLen
== sMapSize
) {
593 sMap
= (CharCodeToUnicodeString
*)
594 greallocn(sMap
, sMapSize
, sizeof(CharCodeToUnicodeString
));
601 sMap
[i
].u
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
602 for (j
= 0; j
< len
; ++j
) {
603 if (UnicodeIsValid(u
[j
])) {
606 sMap
[i
].u
[j
] = 0xfffd;
612 int CharCodeToUnicode::mapToUnicode(CharCode c
, Unicode
**u
) {
627 for (i
= sMapLen
- 1; i
>= 0; --i
) { // in reverse so CMap takes precedence
628 if (sMap
[i
].c
== c
) {
636 int CharCodeToUnicode::mapToCharCode(Unicode
* u
, CharCode
*c
, int usize
) {
637 //look for charcode in map
638 if (usize
== 1 || (usize
> 1 && !(*u
& ~0xff))) {
643 for (CharCode i
=0; i
<mapLen
; i
++) {
652 //for each entry in the sMap
653 for (i
=0; i
<sMapLen
; i
++) {
654 //if the entry's unicode length isn't the same as usize, the strings
655 // are obviously different
656 if (sMap
[i
].len
!= usize
) continue;
657 //compare the string char by char
658 for (j
=0; j
<sMap
[i
].len
; j
++) {
659 if (sMap
[i
].u
[j
] != u
[j
]) {
664 //we have the same strings
665 if (j
==sMap
[i
].len
) {
674 //------------------------------------------------------------------------
676 CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA
) {
680 cache
= (CharCodeToUnicode
**)gmallocn(size
, sizeof(CharCodeToUnicode
*));
681 for (i
= 0; i
< size
; ++i
) {
686 CharCodeToUnicodeCache::~CharCodeToUnicodeCache() {
689 for (i
= 0; i
< size
; ++i
) {
691 cache
[i
]->decRefCnt();
697 CharCodeToUnicode
*CharCodeToUnicodeCache::getCharCodeToUnicode(GooString
*tag
) {
698 CharCodeToUnicode
*ctu
;
701 if (cache
[0] && cache
[0]->match(tag
)) {
702 cache
[0]->incRefCnt();
705 for (i
= 1; i
< size
; ++i
) {
706 if (cache
[i
] && cache
[i
]->match(tag
)) {
708 for (j
= i
; j
>= 1; --j
) {
709 cache
[j
] = cache
[j
- 1];
719 void CharCodeToUnicodeCache::add(CharCodeToUnicode
*ctu
) {
722 if (cache
[size
- 1]) {
723 cache
[size
- 1]->decRefCnt();
725 for (i
= size
- 1; i
>= 1; --i
) {
726 cache
[i
] = cache
[i
- 1];