1 //========================================================================
3 // CharCodeToUnicode.cc
5 // Copyright 2001-2003 Glyph & Cog, LLC
7 //========================================================================
9 //========================================================================
11 // Modified under the Poppler project - http://poppler.freedesktop.org
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
16 // Copyright (C) 2006, 2008-2010, 2012 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2007 Julien Rebetez <julienr@svn.gnome.org>
18 // Copyright (C) 2007 Koji Otani <sho@bbr.jp>
19 // Copyright (C) 2008 Michael Vrable <mvrable@cs.ucsd.edu>
20 // Copyright (C) 2008 Vasile Gaburici <gaburici@cs.umd.edu>
21 // Copyright (C) 2010 William Bader <williambader@hotmail.com>
22 // Copyright (C) 2010 Jakub Wilk <jwilk@jwilk.net>
23 // Copyright (C) 2012 Thomas Freitag <Thomas.Freitag@alfa.de>
24 // Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
25 // Copyright (C) 2014 Jiri Slaby <jirislaby@gmail.com>
27 // To see a description of the changes please see the Changelog file that
28 // came with your tarball or type make ChangeLog if you are building from git
30 //========================================================================
34 #ifdef USE_GCC_PRAGMAS
35 #pragma implementation
41 #include "goo/gfile.h"
42 #include "goo/GooLikely.h"
43 #include "goo/GooString.h"
45 #include "GlobalParams.h"
46 #include "PSTokenizer.h"
47 #include "CharCodeToUnicode.h"
50 //------------------------------------------------------------------------
52 struct CharCodeToUnicodeString
{
58 //------------------------------------------------------------------------
60 static int getCharFromString(void *data
) {
74 static int getCharFromFile(void *data
) {
75 return fgetc((FILE *)data
);
78 //------------------------------------------------------------------------
80 static int hexCharVals
[256] = {
81 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x
82 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 1x
83 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 2x
84 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 3x
85 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 4x
86 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 5x
87 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 6x
88 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 7x
89 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 8x
90 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 9x
91 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ax
92 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bx
93 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Cx
94 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Dx
95 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ex
96 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // Fx
99 // Parse a <len>-byte hex string <s> into *<val>. Returns false on error.
101 static GBool
parseHex(char *s
, int len
, Guint
*val
) {
105 for (i
= 0; i
< len
; ++i
) {
106 x
= hexCharVals
[s
[i
] & 0xff];
110 *val
= (*val
<< 4) + x
;
115 //------------------------------------------------------------------------
117 CharCodeToUnicode
*CharCodeToUnicode::makeIdentityMapping() {
118 CharCodeToUnicode
*ctu
= new CharCodeToUnicode();
119 ctu
->isIdentity
= gTrue
;
121 ctu
->map
= (Unicode
*)gmallocn(ctu
->mapLen
, sizeof(Unicode
));
125 CharCodeToUnicode
*CharCodeToUnicode::parseCIDToUnicode(GooString
*fileName
,
126 GooString
*collection
) {
129 CharCode size
, mapLenA
;
132 CharCodeToUnicode
*ctu
;
134 if (!(f
= openFile(fileName
->getCString(), "r"))) {
135 error(errIO
, -1, "Couldn't open cidToUnicode file '{0:t}'",
141 mapA
= (Unicode
*)gmallocn(size
, sizeof(Unicode
));
144 while (getLine(buf
, sizeof(buf
), f
)) {
145 if (mapLenA
== size
) {
147 mapA
= (Unicode
*)greallocn(mapA
, size
, sizeof(Unicode
));
149 if (sscanf(buf
, "%x", &u
) == 1) {
152 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in cidToUnicode file '{1:t}'",
153 (int)(mapLenA
+ 1), fileName
);
160 ctu
= new CharCodeToUnicode(collection
->copy(), mapA
, mapLenA
, gTrue
,
166 CharCodeToUnicode
*CharCodeToUnicode::parseUnicodeToUnicode(
167 GooString
*fileName
) {
170 CharCodeToUnicodeString
*sMapA
;
171 CharCode size
, oldSize
, len
, sMapSizeA
, sMapLenA
;
176 Unicode
*uBuf
= (Unicode
*)gmallocn(uBufSize
, sizeof(Unicode
));
177 CharCodeToUnicode
*ctu
;
181 if (!(f
= openFile(fileName
->getCString(), "r"))) {
183 error(errIO
, -1, "Couldn't open unicodeToUnicode file '{0:t}'",
189 mapA
= (Unicode
*)gmallocn(size
, sizeof(Unicode
));
190 memset(mapA
, 0, size
* sizeof(Unicode
));
193 sMapSizeA
= sMapLenA
= 0;
196 while (getLine(buf
, sizeof(buf
), f
)) {
198 if (!(tok
= strtok_r(buf
, " \t\r\n", &tokptr
)) ||
199 !parseHex(tok
, strlen(tok
), &u0
)) {
200 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in unicodeToUnicode file '{1:t}'",
205 while ((tok
= strtok_r(NULL
, " \t\r\n", &tokptr
))) {
209 uBuf
= (Unicode
*)greallocn(uBuf
, uBufSize
, sizeof(Unicode
));
211 if (!parseHex(tok
, strlen(tok
), &uBuf
[n
])) {
212 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in unicodeToUnicode file '{1:t}'",
219 error(errSyntaxWarning
, -1, "Bad line ({0:d}) in unicodeToUnicode file '{1:t}'",
228 mapA
= (Unicode
*)greallocn(mapA
, size
, sizeof(Unicode
));
229 memset(mapA
+ oldSize
, 0, (size
- oldSize
) * sizeof(Unicode
));
235 if (sMapLenA
== sMapSizeA
) {
237 sMapA
= (CharCodeToUnicodeString
*)
238 greallocn(sMapA
, sMapSizeA
, sizeof(CharCodeToUnicodeString
));
240 sMapA
[sMapLenA
].c
= u0
;
241 sMapA
[sMapLenA
].u
= (Unicode
*)gmallocn(n
, sizeof(Unicode
));
242 for (i
= 0; i
< n
; ++i
) {
243 sMapA
[sMapLenA
].u
[i
] = uBuf
[i
];
245 sMapA
[sMapLenA
].len
= n
;
254 ctu
= new CharCodeToUnicode(fileName
->copy(), mapA
, len
, gTrue
,
255 sMapA
, sMapLenA
, sMapSizeA
);
261 CharCodeToUnicode
*CharCodeToUnicode::make8BitToUnicode(Unicode
*toUnicode
) {
262 return new CharCodeToUnicode(NULL
, toUnicode
, 256, gTrue
, NULL
, 0, 0);
265 CharCodeToUnicode
*CharCodeToUnicode::parseCMap(GooString
*buf
, int nBits
) {
266 CharCodeToUnicode
*ctu
;
269 ctu
= new CharCodeToUnicode(NULL
);
270 p
= buf
->getCString();
271 ctu
->parseCMap1(&getCharFromString
, &p
, nBits
);
275 CharCodeToUnicode
*CharCodeToUnicode::parseCMapFromFile(GooString
*fileName
,
277 CharCodeToUnicode
*ctu
;
280 ctu
= new CharCodeToUnicode(NULL
);
281 if ((f
= globalParams
->findToUnicodeFile(fileName
))) {
282 ctu
->parseCMap1(&getCharFromFile
, f
, nBits
);
285 error(errSyntaxError
, -1, "Couldn't find ToUnicode CMap file for '{0:t}'",
291 void CharCodeToUnicode::mergeCMap(GooString
*buf
, int nBits
) {
294 p
= buf
->getCString();
295 parseCMap1(&getCharFromString
, &p
, nBits
);
298 void CharCodeToUnicode::parseCMap1(int (*getCharFunc
)(void *), void *data
,
301 char tok1
[256], tok2
[256], tok3
[256];
304 CharCode maxCode
, code1
, code2
;
308 maxCode
= (nBits
== 8) ? 0xff : (nBits
== 16) ? 0xffff : 0xffffffff;
309 pst
= new PSTokenizer(getCharFunc
, data
);
310 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
311 while (pst
->getToken(tok2
, sizeof(tok2
), &n2
)) {
312 if (!strcmp(tok2
, "usecmap")) {
313 if (tok1
[0] == '/') {
314 name
= new GooString(tok1
+ 1);
315 if ((f
= globalParams
->findToUnicodeFile(name
))) {
316 parseCMap1(&getCharFromFile
, f
, nBits
);
319 error(errSyntaxError
, -1, "Couldn't find ToUnicode CMap file for '{0:t}'",
324 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
325 } else if (!strcmp(tok2
, "beginbfchar")) {
326 while (pst
->getToken(tok1
, sizeof(tok1
), &n1
)) {
327 if (!strcmp(tok1
, "endbfchar")) {
330 if (!pst
->getToken(tok2
, sizeof(tok2
), &n2
) ||
331 !strcmp(tok2
, "endbfchar")) {
332 error(errSyntaxWarning
, -1, "Illegal entry in bfchar block in ToUnicode CMap");
335 if (!(tok1
[0] == '<' && tok1
[n1
- 1] == '>' &&
336 tok2
[0] == '<' && tok2
[n2
- 1] == '>')) {
337 error(errSyntaxWarning
, -1, "Illegal entry in bfchar block in ToUnicode CMap");
340 tok1
[n1
- 1] = tok2
[n2
- 1] = '\0';
341 if (!parseHex(tok1
+ 1, n1
- 2, &code1
)) {
342 error(errSyntaxWarning
, -1, "Illegal entry in bfchar block in ToUnicode CMap");
345 if (code1
> maxCode
) {
346 error(errSyntaxWarning
, -1,
347 "Invalid entry in bfchar block in ToUnicode CMap");
349 addMapping(code1
, tok2
+ 1, n2
- 2, 0);
351 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
352 } else if (!strcmp(tok2
, "beginbfrange")) {
353 while (pst
->getToken(tok1
, sizeof(tok1
), &n1
)) {
354 if (!strcmp(tok1
, "endbfrange")) {
357 if (!pst
->getToken(tok2
, sizeof(tok2
), &n2
) ||
358 !strcmp(tok2
, "endbfrange") ||
359 !pst
->getToken(tok3
, sizeof(tok3
), &n3
) ||
360 !strcmp(tok3
, "endbfrange")) {
361 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
364 if (!(tok1
[0] == '<' && tok1
[n1
- 1] == '>' &&
365 tok2
[0] == '<' && tok2
[n2
- 1] == '>')) {
366 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
369 tok1
[n1
- 1] = tok2
[n2
- 1] = '\0';
370 if (!parseHex(tok1
+ 1, n1
- 2, &code1
) ||
371 !parseHex(tok2
+ 1, n2
- 2, &code2
)) {
372 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
375 if (code1
> maxCode
|| code2
> maxCode
) {
376 error(errSyntaxWarning
, -1,
377 "Invalid entry in bfrange block in ToUnicode CMap");
378 if (code1
> maxCode
) {
381 if (code2
> maxCode
) {
385 if (!strcmp(tok3
, "[")) {
387 while (pst
->getToken(tok1
, sizeof(tok1
), &n1
) &&
388 code1
+ i
<= code2
) {
389 if (!strcmp(tok1
, "]")) {
392 if (tok1
[0] == '<' && tok1
[n1
- 1] == '>') {
394 addMapping(code1
+ i
, tok1
+ 1, n1
- 2, 0);
396 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
400 } else if (tok3
[0] == '<' && tok3
[n3
- 1] == '>') {
402 for (i
= 0; code1
<= code2
; ++code1
, ++i
) {
403 addMapping(code1
, tok3
+ 1, n3
- 2, i
);
407 error(errSyntaxWarning
, -1, "Illegal entry in bfrange block in ToUnicode CMap");
410 pst
->getToken(tok1
, sizeof(tok1
), &n1
);
418 void CharCodeToUnicode::addMapping(CharCode code
, char *uStr
, int n
,
424 if (code
> 0xffffff) {
425 // This is an arbitrary limit to avoid integer overflow issues.
426 // (I've seen CMaps with mappings for <ffffffff>.)
429 if (code
>= mapLen
) {
431 mapLen
= mapLen
? 2 * mapLen
: 256;
432 if (code
>= mapLen
) {
433 mapLen
= (code
+ 256) & ~255;
435 if (unlikely(code
>= mapLen
)) {
436 error(errSyntaxWarning
, -1, "Illegal code value in CharCodeToUnicode::addMapping");
439 map
= (Unicode
*)greallocn(map
, mapLen
, sizeof(Unicode
));
440 for (i
= oldLen
; i
< mapLen
; ++i
) {
446 if (!parseHex(uStr
, n
, &u
)) {
447 error(errSyntaxWarning
, -1, "Illegal entry in ToUnicode CMap");
450 map
[code
] = u
+ offset
;
451 if (!UnicodeIsValid(map
[code
])) {
455 if (sMapLen
>= sMapSize
) {
456 sMapSize
= sMapSize
+ 16;
457 sMap
= (CharCodeToUnicodeString
*)
458 greallocn(sMap
, sMapSize
, sizeof(CharCodeToUnicodeString
));
461 sMap
[sMapLen
].c
= code
;
462 int utf16Len
= n
/ 4;
463 Unicode
*utf16
= (Unicode
*)gmallocn(utf16Len
, sizeof(Unicode
));
464 for (j
= 0; j
< utf16Len
; ++j
) {
465 if (!parseHex(uStr
+ j
*4, 4, &utf16
[j
])) {
467 error(errSyntaxWarning
, -1, "Illegal entry in ToUnicode CMap");
471 utf16
[utf16Len
- 1] += offset
;
472 sMap
[sMapLen
].len
= UTF16toUCS4(utf16
, utf16Len
, &sMap
[sMapLen
].u
);
478 CharCodeToUnicode::CharCodeToUnicode() {
483 sMapLen
= sMapSize
= 0;
491 CharCodeToUnicode::CharCodeToUnicode(GooString
*tagA
) {
496 map
= (Unicode
*)gmallocn(mapLen
, sizeof(Unicode
));
497 for (i
= 0; i
< mapLen
; ++i
) {
501 sMapLen
= sMapSize
= 0;
509 CharCodeToUnicode::CharCodeToUnicode(GooString
*tagA
, Unicode
*mapA
,
510 CharCode mapLenA
, GBool copyMap
,
511 CharCodeToUnicodeString
*sMapA
,
512 int sMapLenA
, int sMapSizeA
) {
516 map
= (Unicode
*)gmallocn(mapLen
, sizeof(Unicode
));
517 memcpy(map
, mapA
, mapLen
* sizeof(Unicode
));
523 sMapSize
= sMapSizeA
;
531 CharCodeToUnicode::~CharCodeToUnicode() {
537 for (int i
= 0; i
< sMapLen
; ++i
) gfree(sMap
[i
].u
);
541 gDestroyMutex(&mutex
);
545 void CharCodeToUnicode::incRefCnt() {
551 gUnlockMutex(&mutex
);
555 void CharCodeToUnicode::decRefCnt() {
561 done
= --refCnt
== 0;
563 gUnlockMutex(&mutex
);
570 GBool
CharCodeToUnicode::match(GooString
*tagA
) {
571 return tag
&& !tag
->cmp(tagA
);
574 void CharCodeToUnicode::setMapping(CharCode c
, Unicode
*u
, int len
) {
577 if (!map
|| isIdentity
) {
583 for (i
= 0; i
< sMapLen
; ++i
) {
584 if (sMap
[i
].c
== c
) {
590 if (sMapLen
== sMapSize
) {
592 sMap
= (CharCodeToUnicodeString
*)
593 greallocn(sMap
, sMapSize
, sizeof(CharCodeToUnicodeString
));
600 sMap
[i
].u
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
601 for (j
= 0; j
< len
; ++j
) {
602 if (UnicodeIsValid(u
[j
])) {
605 sMap
[i
].u
[j
] = 0xfffd;
611 int CharCodeToUnicode::mapToUnicode(CharCode c
, Unicode
**u
) {
626 for (i
= sMapLen
- 1; i
>= 0; --i
) { // in reverse so CMap takes precedence
627 if (sMap
[i
].c
== c
) {
635 int CharCodeToUnicode::mapToCharCode(Unicode
* u
, CharCode
*c
, int usize
) {
636 //look for charcode in map
637 if (usize
== 1 || (usize
> 1 && !(*u
& ~0xff))) {
642 for (CharCode i
=0; i
<mapLen
; i
++) {
651 //for each entry in the sMap
652 for (i
=0; i
<sMapLen
; i
++) {
653 //if the entry's unicode length isn't the same as usize, the strings
654 // are obviously different
655 if (sMap
[i
].len
!= usize
) continue;
656 //compare the string char by char
657 for (j
=0; j
<sMap
[i
].len
; j
++) {
658 if (sMap
[i
].u
[j
] != u
[j
]) {
663 //we have the same strings
664 if (j
==sMap
[i
].len
) {
673 //------------------------------------------------------------------------
675 CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA
) {
679 cache
= (CharCodeToUnicode
**)gmallocn(size
, sizeof(CharCodeToUnicode
*));
680 for (i
= 0; i
< size
; ++i
) {
685 CharCodeToUnicodeCache::~CharCodeToUnicodeCache() {
688 for (i
= 0; i
< size
; ++i
) {
690 cache
[i
]->decRefCnt();
696 CharCodeToUnicode
*CharCodeToUnicodeCache::getCharCodeToUnicode(GooString
*tag
) {
697 CharCodeToUnicode
*ctu
;
700 if (cache
[0] && cache
[0]->match(tag
)) {
701 cache
[0]->incRefCnt();
704 for (i
= 1; i
< size
; ++i
) {
705 if (cache
[i
] && cache
[i
]->match(tag
)) {
707 for (j
= i
; j
>= 1; --j
) {
708 cache
[j
] = cache
[j
- 1];
718 void CharCodeToUnicodeCache::add(CharCodeToUnicode
*ctu
) {
721 if (cache
[size
- 1]) {
722 cache
[size
- 1]->decRefCnt();
724 for (i
= size
- 1; i
>= 1; --i
) {
725 cache
[i
] = cache
[i
- 1];