1 //========================================================================
5 // Copyright 2001-2003 Glyph & Cog, LLC
7 //========================================================================
9 //========================================================================
11 // Modified under the Poppler project - http://poppler.freedesktop.org
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
16 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
17 // Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
18 // Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
20 // To see a description of the changes please see the Changelog file that
21 // came with your tarball or type make ChangeLog if you are building from git
23 //========================================================================
26 #include "PDFDocEncoding.h"
29 bool UnicodeIsValid(Unicode ucs4
)
31 return (ucs4
< 0x110000) &&
32 ((ucs4
& 0xfffff800) != 0xd800) &&
33 (ucs4
< 0xfdd0 || ucs4
> 0xfdef) &&
34 ((ucs4
& 0xfffe) != 0xfffe);
// Walks utf16Len UTF-16 code units from utf16 and decodes them into a
// buffer u of len Unicode elements obtained from gmallocn.
// NOTE(review): the embedded line numbering in this text has gaps
// (e.g. 47 to 54, 61 to 64, 70 to 75), so the local declarations, the
// statements following several conditions, and the return are not
// visible here. How len is computed and how the result reaches the
// caller through ucs4 must be confirmed against the complete file.
37 int UTF16toUCS4(const Unicode
*utf16
, int utf16Len
, Unicode
**ucs4
)
// First pass over the input: when a unit in 0xd800-0xdbff is directly
// followed by a unit in 0xdc00-0xdfff, i is advanced one extra step so
// the pair is visited once.
// NOTE(review): presumably this pass counts output elements into len;
// the counting statement is not visible here.
44 for (i
= 0; i
< utf16Len
; i
++) {
45 if (utf16
[i
] >= 0xd800 && utf16
[i
] < 0xdc00 && i
+ 1 < utf16Len
&&
46 utf16
[i
+1] >= 0xdc00 && utf16
[i
+1] < 0xe000) {
47 i
++; /* surrogate pair */
// Allocate room for len Unicode elements.
54 u
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
// Second pass over the input: classify each unit and fill u[n].
57 for (i
= 0; i
< utf16Len
; i
++) {
58 if (utf16
[i
] >= 0xd800 && utf16
[i
] < 0xdc00) { /* surrogate pair */
59 if (i
+ 1 < utf16Len
&& utf16
[i
+1] >= 0xdc00 && utf16
[i
+1] < 0xe000) {
60 /* next code is a low surrogate */
// Join the low 10 bits of both units and add 0x10000.
61 u
[n
] = (((utf16
[i
] & 0x3ff) << 10) | (utf16
[i
+1] & 0x3ff)) + 0x10000;
64 /* missing low surrogate
65 replace it with REPLACEMENT CHARACTER (U+FFFD) */
68 } else if (utf16
[i
] >= 0xdc00 && utf16
[i
] < 0xe000) {
69 /* invalid low surrogate
70 replace it with REPLACEMENT CHARACTER (U+FFFD) */
// Test the stored value with UnicodeIsValid.
// NOTE(review): the body of this branch is not visible here.
75 if (!UnicodeIsValid(u
[n
])) {
84 int TextStringToUCS4(GooString
*textStr
, Unicode
**ucs4
)
90 len
= textStr
->getLength();
91 s
= textStr
->getCString();
95 if (textStr
->hasUnicodeMarker()) {
99 utf16
= new Unicode
[len
];
100 for (i
= 0 ; i
< len
; i
++) {
101 utf16
[i
] = (s
[2 + i
*2] & 0xff) << 8 | (s
[3 + i
*2] & 0xff);
103 len
= UTF16toUCS4(utf16
, len
, &u
);
109 u
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
110 for (i
= 0 ; i
< len
; i
++) {
111 u
[i
] = pdfDocEncoding
[s
[i
] & 0xff];