beta-0.89.2
[luatex.git] / source / libs / poppler / poppler-src / poppler / UTF.cc
blob46007b730069198b99a0eee75b2402f5c74af65f
1 //========================================================================
2 //
3 // UTF.cc
4 //
5 // Copyright 2001-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
9 //========================================================================
11 // Modified under the Poppler project - http://poppler.freedesktop.org
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
16 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
17 // Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
18 // Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
20 // To see a description of the changes please see the Changelog file that
21 // came with your tarball or type make ChangeLog if you are building from git
23 //========================================================================
25 #include "goo/gmem.h"
26 #include "PDFDocEncoding.h"
27 #include "UTF.h"
29 bool UnicodeIsValid(Unicode ucs4)
31 return (ucs4 < 0x110000) &&
32 ((ucs4 & 0xfffff800) != 0xd800) &&
33 (ucs4 < 0xfdd0 || ucs4 > 0xfdef) &&
34 ((ucs4 & 0xfffe) != 0xfffe);
37 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
39 int i, n, len;
40 Unicode *u;
42 // count characters
43 len = 0;
44 for (i = 0; i < utf16Len; i++) {
45 if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
46 utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
47 i++; /* surrogate pair */
49 len++;
51 if (ucs4 == NULL)
52 return len;
54 u = (Unicode*)gmallocn(len, sizeof(Unicode));
55 n = 0;
56 // convert string
57 for (i = 0; i < utf16Len; i++) {
58 if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
59 if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
60 /* next code is a low surrogate */
61 u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
62 ++i;
63 } else {
64 /* missing low surrogate
65 replace it with REPLACEMENT CHARACTER (U+FFFD) */
66 u[n] = 0xfffd;
68 } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
69 /* invalid low surrogate
70 replace it with REPLACEMENT CHARACTER (U+FFFD) */
71 u[n] = 0xfffd;
72 } else {
73 u[n] = utf16[i];
75 if (!UnicodeIsValid(u[n])) {
76 u[n] = 0xfffd;
78 n++;
80 *ucs4 = u;
81 return len;
84 int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
86 int i, len;
87 const char *s;
88 Unicode *u;
90 len = textStr->getLength();
91 s = textStr->getCString();
92 if (len == 0)
93 return 0;
95 if (textStr->hasUnicodeMarker()) {
96 Unicode *utf16;
97 len = len/2 - 1;
98 if (len > 0) {
99 utf16 = new Unicode[len];
100 for (i = 0 ; i < len; i++) {
101 utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
103 len = UTF16toUCS4(utf16, len, &u);
104 delete[] utf16;
105 } else {
106 u = NULL;
108 } else {
109 u = (Unicode*)gmallocn(len, sizeof(Unicode));
110 for (i = 0 ; i < len; i++) {
111 u[i] = pdfDocEncoding[s[i] & 0xff];
114 *ucs4 = u;
115 return len;