Initial commit of newLISP.
[newlisp.git] / nl-utf8.c
blob40327b35b561917330079a8ce9ff005a6f606f8f
1 /* nl-utf8.c --- functions for UTF-8 unicode support
3 Copyright (C) 2008 Lutz Mueller
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 // portions are copied from pcre.c by: Philip Hazel <ph10@cam.ac.uk>
20 // and Copyright (c) 1997-2003 University of Cambridge
25 #include "newlisp.h"
26 #include <wchar.h>
27 #include <wctype.h>
28 #include "protos.h"
31 /*************************************************
32 * Macros and tables for character handling *
33 * by Philip Hazel <ph10@cam.ac.uk> *
34 *************************************************/
/* Upper bounds of the character values that fit into UTF-8 sequences of
   increasing length. Entry i is the largest value that can be encoded
   with i additional bytes, i.e. in i + 1 bytes in total. */

static const int utf8_table1[] = {
    0x7f,          /* 1 byte  */
    0x7ff,         /* 2 bytes */
    0xffff,        /* 3 bytes */
    0x1fffff,      /* 4 bytes */
    0x3ffffff,     /* 5 bytes */
    0x7fffffff     /* 6 bytes */
};
/* First-byte helpers, both indexed by the number of additional bytes that
   follow the first byte of a character:
     utf8_table2  indicator bits to set in the first byte
     utf8_table3  mask selecting the data bits of the first byte */

static const int utf8_table2[] = {
    0,       /* no additional byte */
    0xc0,    /* 1 additional byte  */
    0xe0,    /* 2 additional bytes */
    0xf0,    /* 3 additional bytes */
    0xf8,    /* 4 additional bytes */
    0xfc     /* 5 additional bytes */
};

static const int utf8_table3[] = {
    0xff,    /* no additional byte */
    0x1f,    /* 1 additional byte  */
    0x0f,    /* 2 additional bytes */
    0x07,    /* 3 additional bytes */
    0x03,    /* 4 additional bytes */
    0x01     /* 5 additional bytes */
};
/* Number of additional bytes that follow a multi-byte first byte. The table
   is indexed by the first byte masked with 0x3f. The highest index for a
   valid UTF-8 character is in fact 0x3d. */

static const char utf8_table4[] = {
    1,1,1,1,1,1,1,1,    /* index 0x00 - 0x07 (first byte 0xc0 - 0xc7) */
    1,1,1,1,1,1,1,1,    /* index 0x08 - 0x0f (first byte 0xc8 - 0xcf) */
    1,1,1,1,1,1,1,1,    /* index 0x10 - 0x17 (first byte 0xd0 - 0xd7) */
    1,1,1,1,1,1,1,1,    /* index 0x18 - 0x1f (first byte 0xd8 - 0xdf) */
    2,2,2,2,2,2,2,2,    /* index 0x20 - 0x27 (first byte 0xe0 - 0xe7) */
    2,2,2,2,2,2,2,2,    /* index 0x28 - 0x2f (first byte 0xe8 - 0xef) */
    3,3,3,3,3,3,3,3,    /* index 0x30 - 0x37 (first byte 0xf0 - 0xf7) */
    4,4,4,4,5,5,5,5     /* index 0x38 - 0x3f (first byte 0xf8 - 0xff) */
};
/* Get the next UTF-8 character from eptr, store its value in c and advance
   eptr past it. This is for data already known to be UTF-8: the bytes are
   not validated. The only check is that decoding stops at a zero byte, so a
   multi-byte sequence cut short by the string terminator never moves eptr
   beyond that terminator.

   c must be a modifiable integer lvalue and eptr a modifiable char pointer
   lvalue. Both are evaluated several times, so neither may have side
   effects. The expansion is an assignment statement followed by an
   if-block, not a single expression: it is complete without a trailing
   semicolon and must not be used as the unbraced body of an if, else or
   loop. */

#define GETCHARINC(c, eptr) \
    (c) = (unsigned char)*(eptr)++; \
    if (((c) & 0xc0) == 0xc0) \
        { \
        int gcaa = utf8_table4[(c) & 0x3f];  /* number of additional bytes */ \
        int gcss = 6*gcaa;                   /* shift for the first-byte bits */ \
        (c) = ((c) & utf8_table3[gcaa]) << gcss; \
        while (gcaa-- > 0 && *(eptr) != 0)   /* never step over the terminator */ \
            { \
            gcss -= 6; \
            (c) |= (*(eptr)++ & 0x3f) << gcss; \
            } \
        }
75 /* This function takes an integer value in the range 0 - 0x7fffffff
76 and encodes it as a UTF-8 character in 0 to 6 bytes.
78 Arguments:
79 cvalue the character value
80 buffer pointer to buffer for result - at least 6 bytes long
82 Returns: number of characters placed in the buffer
85 int wchar_utf8(int cvalue, char *buffer)
87 register int i, j;
88 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
89 if (cvalue <= utf8_table1[i]) break;
90 buffer += i;
91 for (j = i; j > 0; j--)
93 *buffer-- = 0x80 | (cvalue & 0x3f);
94 cvalue >>= 6;
96 *buffer = utf8_table2[i] | cvalue;
97 return i + 1;
101 /* ---------------------- UTF-8 utility functions --------------------------- */
103 /* get utf8 string from unicode wide character
105 * int wchar_utf8(int wchar, char * utf8str)
107 * the string is not null-terminated for contiguous filling
108 * of longer strings
109 * returns number of bytes placed in utf8str
/* Decode the first character of a UTF-8 string.
 *
 * utf8str  points at the first byte of the character to decode
 * chr      receives the decoded character value
 *
 * Returns the string pointer advanced past the decoded character.
 */

char * utf8_wchar(char * utf8str, int * chr)
{
    int decoded;

    GETCHARINC(decoded, utf8str);
    *chr = decoded;

    return utf8str;
}
124 /* return the number of characters encoded in utf8 string
125 * without counting the zero terminator
128 size_t utf8_wlen(char * utf8str)
130 int gcaa;
131 int c;
132 size_t count = 0;
134 while((c = *utf8str++) != 0)
136 count++;
137 if ((c & 0xc0) == 0xc0)
139 gcaa = utf8_table4[c & 0x3f];
140 utf8str += gcaa;
144 return(count);
148 /* return the length of the first utf8 character
151 int utf8_1st_len(char * utf8str)
153 int c;
155 if((c = *utf8str) != 0)
157 if((c & 0xc0) == 0xc0)
158 return(utf8_table4[c & 0x3f] + 1);
159 else return(1);
162 return(0);
/* Convert a zero terminated UTF-8 string into a vector of character values.
 *
 * unicode  receives the decoded characters followed by a zero int; for a
 *          non-negative maxwc this takes up to maxwc + 1 ints
 * utf8str  the UTF-8 string to decode
 * maxwc    maximum number of characters to decode
 *
 * Returns the number of characters stored, excluding the zero int.
 *
 * NOTE(review): the loop ends only when the remaining count reaches exactly
 * zero, so a negative maxwc does not limit the output.
 */

int utf8_wstr(int * unicode, char * utf8str, int maxwc)
{
    int * out = unicode;
    int decoded;

    for (; maxwc != 0 && *utf8str != 0; maxwc--)
        {
        GETCHARINC(decoded, utf8str);
        *out++ = decoded;
        }

    *out = 0;

    return (int)(out - unicode);
}
/* Convert a zero terminated vector of character values into a UTF-8 string.
 *
 * utf8str  receives the encoded bytes followed by a zero terminator; needs
 *          room for maxstr + 1 bytes
 * unicode  the zero terminated vector of character values
 * maxstr   maximum number of bytes to store, excluding the zero terminator
 *
 * Returns the number of bytes stored in utf8str, excluding the terminator.
 *
 * Conversion stops at the first character whose encoding would not fit
 * completely into the remaining space, so the output never exceeds maxstr
 * bytes and never ends in a partial character.
 */

int wstr_utf8(char * utf8str, int * unicode, int maxstr)
{
    char encoded[6];    /* wchar_utf8() is documented to need 6 bytes */
    int size = 0;

    while (*unicode != 0)
        {
        int len = wchar_utf8(*unicode, encoded);
        int i;

        /* Check the limit after the length is known; checking only the
           bytes written so far would let a multi-byte character run past
           maxstr. */
        if (len > maxstr - size)
            break;

        for (i = 0; i < len; i++)
            utf8str[size + i] = encoded[i];

        size += len;
        unicode++;
        }

    utf8str[size] = 0;

    return size;
}
210 /* -------------------------------------- newLISP API -----------------------------------*/
212 CELL * p_unicode(CELL * params)
214 char * utf8str;
215 size_t size;
216 int * unicode;
217 CELL * cell;
219 getStringSize(params, &utf8str, &size, TRUE);
220 unicode = allocMemory((size + 1) * sizeof(int));
222 size = utf8_wstr(unicode, utf8str, size);
223 unicode = reallocMemory(unicode, (size + 1) * sizeof(int) + 1);
225 cell = getCell(CELL_STRING);
226 cell->contents = (UINT)unicode;
227 cell->aux = (size + 1) * sizeof(int) + 1;
229 return(cell);
233 CELL * p_utf8(CELL * params)
235 int * unicode;
236 size_t size;
237 char * utf8str;
238 CELL *cell;
240 getStringSize(params, (void *)&unicode, &size, TRUE);
241 utf8str = allocMemory(size * UTF8_MAX_BYTES + 1);
243 size = wstr_utf8(utf8str, unicode, size);
244 utf8str = reallocMemory(utf8str, size + 1);
246 cell = getCell(CELL_STRING);
247 cell->contents = (UINT)utf8str;
248 cell->aux = size + 1;
250 return(cell);
254 CELL * p_utf8len(CELL * params)
256 char * str;
258 getString(params, &str);
260 return(stuffInteger(utf8_wlen(str)));
263 /* eof */