beta-0.89.2
[luatex.git] / source / texk / web2c / luatexdir / utils / unistring.w
bloba5b365a5f2e58915878f2fe7907a92e7f9ba9917
1 % unistring.w
3 % Copyright 2013 Taco Hoekwater <taco@@luatex.org>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software; you can redistribute it and/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation; either version 2 of the License, or (at your
10 % option) any later version.
12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
20 @ @c
23 @ @c
24 #include "ptexlib.h"
25 #include <string.h>
27 @ @c
28 static void utf_error(void)
30 const char *hlp[] =
31 { "A funny symbol that I can't read has just been (re)read.",
32 "Just continue, I'll change it to 0xFFFD.",
33 NULL
35 deletions_allowed = false;
36 tex_error("String contains an invalid utf-8 sequence", hlp);
37 deletions_allowed = true;
40 @ @c
/* Decode the single UTF-8 sequence that starts at |k| and return its
   scalar value.

   Any sequence that cannot be decoded (stray trail byte, missing or
   malformed trail bytes, 5- and 6-byte lead bytes) yields 0xFFFD and
   triggers |utf_error|.  Note that a correctly encoded U+FFFD also ends
   up reporting the error, because the check is made on the result.

   Trail bytes are examined strictly left to right and decoding stops at
   the first byte that is not a trail byte.  A sequence cut short by the
   string terminator is therefore never read beyond that terminator.

   Overlong forms and 4-byte values above 0x10FFFF are not rejected. */
unsigned str2uni(const unsigned char *k)
{
    const unsigned char *text = k + 1;  /* bytes following the lead byte */
    const unsigned ch = k[0];
    unsigned val = 0xFFFD;

    if (ch < 0x80) {
        val = ch;
    } else if (ch <= 0xbf) {
        /* error: a trail byte cannot start a sequence */
    } else if (ch <= 0xdf) {
        if ((text[0] & 0xc0) == 0x80) {
            val = ((ch & 0x1f) << 6) | (text[0] & 0x3fu);
        }
    } else if (ch <= 0xef) {
        if ((text[0] & 0xc0) == 0x80 && (text[1] & 0xc0) == 0x80) {
            val = ((ch & 0xf) << 12) | ((text[0] & 0x3fu) << 6) |
                (text[1] & 0x3fu);
        }
    } else if (ch <= 0xf7) {
        /* validate first (short-circuit), only then combine the bits;
           this avoids both the read past a terminator and the left
           shift of a negative intermediate value */
        if ((text[0] & 0xc0) == 0x80 && (text[1] & 0xc0) == 0x80 &&
            (text[2] & 0xc0) == 0x80) {
            val = ((ch & 0x7) << 18) | ((text[0] & 0x3fu) << 12) |
                ((text[1] & 0x3fu) << 6) | (text[2] & 0x3fu);
        }
    } else {
        /* the 5- and 6-byte UTF-8 sequences generate integers
           that are outside of the valid UCS range, and therefore
           unsupported
         */
    }
    if (val == 0xFFFD) {
        utf_error();
    }
    return val;
}
77 @ This is a very basic helper
/* Encode |unic| as a freshly allocated, NUL-terminated byte string.

   Values below 0x110000 are written as a 1- to 4-byte UTF-8 sequence.
   Values of 0x110000 and above are written as the single byte
   |unic - 0x110000| (truncated to |unsigned char|).  The buffer comes
   from |xmalloc(5)|. */
unsigned char *uni2str(unsigned unic)
{
    unsigned char *result = xmalloc(5);
    unsigned char *out = result;

    if (unic >= 0x110000) {
        /* out-of-range values map onto one raw byte */
        *out++ = (unsigned char) (unic - 0x110000);
    } else if (unic >= 0x10000) {
        /* four bytes: 3 + 6 + 6 + 6 payload bits */
        *out++ = (unsigned char) (0xf0 | (unic >> 18));
        *out++ = (unsigned char) (0x80 | ((unic >> 12) & 0x3f));
        *out++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3f));
        *out++ = (unsigned char) (0x80 | (unic & 0x3f));
    } else if (unic >= 0x800) {
        /* three bytes: 4 + 6 + 6 payload bits */
        *out++ = (unsigned char) (0xe0 | (unic >> 12));
        *out++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3f));
        *out++ = (unsigned char) (0x80 | (unic & 0x3f));
    } else if (unic >= 0x80) {
        /* two bytes: 5 + 6 payload bits */
        *out++ = (unsigned char) (0xc0 | (unic >> 6));
        *out++ = (unsigned char) (0x80 | (unic & 0x3f));
    } else {
        *out++ = (unsigned char) unic;
    }
    *out = '\0';
    return result;
}
109 @ |buffer_to_unichar| converts a sequence of bytes in the |buffer|
110 into a unicode character value. It does not check for overflow
111 of the |buffer|, but it is careful to check the validity of the
112 UTF-8 encoding.
115 int buffer_to_unichar(int k)
117 return str2uni((const unsigned char *)(buffer+k));
121 @ These came from texlang.w
/* Deposit the UTF-8 encoding of |ch| at |utf8_text| and return the
   position just past the last byte written.  No terminator is added.
   Values of 0x110000 and above write nothing and return |utf8_text|
   unchanged. */
char *uni2string(char *utf8_text, unsigned ch)
{
    char *out = utf8_text;

    if (ch > 0x10ffff) {
        return out;
    }

    if (ch < 0x80) {
        *out++ = (char) ch;
    } else if (ch < 0x800) {
        *out++ = (char) (0xc0 | (ch >> 6));
        *out++ = (char) (0x80 | (ch & 0x3f));
    } else if (ch < 0x10000) {
        *out++ = (char) (0xe0 | (ch >> 12));
        *out++ = (char) (0x80 | ((ch >> 6) & 0x3f));
        *out++ = (char) (0x80 | (ch & 0x3f));
    } else {
        /* supplementary planes: 3 + 6 + 6 + 6 payload bits */
        *out++ = (char) (0xf0 | (ch >> 18));
        *out++ = (char) (0x80 | ((ch >> 12) & 0x3f));
        *out++ = (char) (0x80 | ((ch >> 6) & 0x3f));
        *out++ = (char) (0x80 | (ch & 0x3f));
    }
    return out;
}
/* Count the elements of |str| that precede the first zero element. */
unsigned u_length(register unsigned int *str)
{
    unsigned count = 0;

    for (; str[count] != 0; ++count) {
        /* the loop header does all the work */
    }
    return count;
}
/* Convert the NUL-terminated UTF-8 string |utf8buf| into a
   zero-terminated array of code points stored at |ubuf|.

   At most |strlen(utf8buf)| values plus the terminating zero are
   written, so |ubuf| must have room for |strlen(utf8buf) + 1| elements.

   The lead byte alone selects the sequence length (1, 2, 3 or 4 bytes);
   trail bytes are not validated.  If the input ends in the middle of a
   sequence, U+FFFD is stored for the incomplete sequence and conversion
   stops, so no byte at or beyond the terminator is ever read. */
void utf2uni_strcpy(unsigned int *ubuf, const char *utf8buf)
{
    const size_t nbytes = strlen(utf8buf);
    const unsigned char *pt = (const unsigned char *) utf8buf;
    const unsigned char *const end = pt + nbytes;
    unsigned int *upt = ubuf;
    unsigned int *const uend = ubuf + nbytes;

    while (pt < end && upt < uend) {
        const unsigned lead = *pt;
        size_t need;

        if (lead <= 0x7f) {
            need = 1;
        } else if (lead <= 0xdf) {
            need = 2;
        } else if (lead <= 0xef) {
            need = 3;
        } else {
            need = 4;
        }

        if ((size_t) (end - pt) < need) {
            /* truncated sequence: substitute and stop */
            *upt++ = 0xFFFD;
            break;
        }

        switch (need) {
        case 1:
            *upt = lead;
            break;
        case 2:
            *upt = ((lead & 0x1f) << 6) | (pt[1] & 0x3fu);
            break;
        case 3:
            *upt = ((lead & 0xf) << 12) | ((pt[1] & 0x3fu) << 6) |
                (pt[2] & 0x3fu);
            break;
        default:
            /* assembled directly, avoiding a shift of a negative value */
            *upt = ((lead & 0x7) << 18) | ((pt[1] & 0x3fu) << 12) |
                ((pt[2] & 0x3fu) << 6) | (pt[3] & 0x3fu);
            break;
        }
        pt += need;
        ++upt;
    }
    *upt = 0;
}
190 @ @c
191 char *utf16be_str(long code)
193 static char buf[SMALL_BUF_SIZE];
194 long v;
195 unsigned vh, vl;
197 assert(code >= 0);
199 if (code <= 0xFFFF)
200 sprintf(buf, "%04lX", code);
201 else {
202 v = code - 0x10000;
203 vh = (unsigned) (v / 0x400 + 0xD800);
204 vl = (unsigned) (v % 0x400 + 0xDC00);
205 sprintf(buf, "%04X%04X", vh, vl);
207 return buf;