doc: Add Unicode Strings to Unicode index
[nasm.git] / quote.c
blobdc880442f2f0a10c253625d3e6d6a6791bc2ef34
1 /* quote.c library routines for the Netwide Assembler
3 * The Netwide Assembler is copyright (C) 1996 Simon Tatham and
4 * Julian Hall. All rights reserved. The software is
5 * redistributable under the license given in the file "LICENSE"
6 * distributed in the NASM archive.
7 */
9 #include "compiler.h"
11 #include <assert.h>
12 #include <stdlib.h>
14 #include "nasmlib.h"
15 #include "quote.h"
/*
 * Numeric value of a hexadecimal digit character.  The argument is
 * evaluated more than once, so it must be free of side effects, and
 * the result is only meaningful for 0-9, A-F and a-f.
 */
#define numvalue(c) \
    ((c) < 'A' ? (c) - '0' : (c) < 'a' ? (c) - 'A' + 10 : (c) - 'a' + 10)
/*
 * Build a quoted representation of the LEN bytes starting at STR and
 * return it as a NUL-terminated string in a buffer obtained from
 * nasm_malloc().
 *
 * If the data consists only of printable ASCII and does not contain
 * both kinds of plain quote, it is copied verbatim between '...' or
 * "..." (single quotes preferred).  Otherwise the `...` form is used,
 * with backquote and backslash escaped, the usual control characters
 * written as \a \b \t \n \v \f \r \e, and every other byte outside
 * the ' '..'~' range written as an octal escape.
 *
 * Bytes are examined as unsigned char: with a signed plain char the
 * bytes 0x80..0xff would compare as negative numbers and the length
 * pre-pass would reserve fewer characters for them than the octal
 * escape that is actually emitted.
 */
char *nasm_quote(char *str, size_t len)
{
    unsigned char c, c1;
    char *p, *q, *nstr, *ep;
    bool sq_ok, dq_ok;
    size_t qlen;

    sq_ok = dq_ok = true;
    ep = str+len;
    qlen = 0;                   /* Length if we need `...` quotes */
    for (p = str; p < ep; p++) {
        c = (unsigned char)*p;
        switch (c) {
        case '\'':
            sq_ok = false;
            qlen++;
            break;
        case '\"':
            dq_ok = false;
            qlen++;
            break;
        case '`':
        case '\\':
            qlen += 2;
            break;
        default:
            if (c < ' ' || c > '~') {
                sq_ok = dq_ok = false;
                switch (c) {
                case '\a':
                case '\b':
                case '\t':
                case '\n':
                case '\v':
                case '\f':
                case '\r':
                case 27:
                    qlen += 2;
                    break;
                default:
                    /*
                     * Octal escape.  If the next input character is an
                     * octal digit the escape must be padded to three
                     * digits so that digit is not absorbed into it.
                     */
                    c1 = (p+1 < ep) ? (unsigned char)p[1] : 0;
                    if (c > 077 || (c1 >= '0' && c1 <= '7'))
                        qlen += 4;      /* Must use the full form */
                    else if (c > 07)
                        qlen += 3;
                    else
                        qlen += 2;
                    break;
                }
            } else {
                qlen++;
            }
            break;
        }
    }

    if (sq_ok || dq_ok) {
        /* Use '...' or "..." */
        nstr = nasm_malloc(len+3);
        nstr[0] = nstr[len+1] = sq_ok ? '\'' : '\"';
        nstr[len+2] = '\0';
        memcpy(nstr+1, str, len);
    } else {
        /* Need to use `...` quoted syntax */
        nstr = nasm_malloc(qlen+3);
        q = nstr;
        *q++ = '`';
        for (p = str; p < ep; p++) {
            c = (unsigned char)*p;
            switch (c) {
            case '`':
            case '\\':
                *q++ = '\\';
                *q++ = (char)c;
                break;
            case '\a':
                *q++ = '\\';
                *q++ = 'a';
                break;
            case '\b':
                *q++ = '\\';
                *q++ = 'b';
                break;
            case '\t':
                *q++ = '\\';
                *q++ = 't';
                break;
            case '\n':
                *q++ = '\\';
                *q++ = 'n';
                break;
            case '\v':
                *q++ = '\\';
                *q++ = 'v';
                break;
            case '\f':
                *q++ = '\\';
                *q++ = 'f';
                break;
            case '\r':
                *q++ = '\\';
                *q++ = 'r';
                break;
            case 27:
                *q++ = '\\';
                *q++ = 'e';
                break;
            default:
                if (c < ' ' || c > '~') {
                    c1 = (p+1 < ep) ? (unsigned char)p[1] : 0;
                    /*
                     * sprintf() also stores a NUL after the digits; that
                     * byte always lies inside the buffer because room for
                     * the closing quote and terminator is still reserved.
                     */
                    if (c1 >= '0' && c1 <= '7')
                        q += sprintf(q, "\\%03o", (unsigned int)c);
                    else
                        q += sprintf(q, "\\%o", (unsigned int)c);
                } else {
                    *q++ = (char)c;
                }
                break;
            }
        }
        *q++ = '`';
        *q++ = '\0';
        /* The emit pass must agree exactly with the length pre-pass */
        assert((size_t)(q-nstr) == qlen+3);
    }
    return nstr;
}
/*
 * Store the code point V at Q using the original (up to six byte)
 * UTF-8 scheme and return the position just past the last byte
 * written.  A negative V writes nothing and returns Q unchanged.
 */
static char *emit_utf8(char *q, int32_t v)
{
    int trail;          /* Number of continuation bytes to follow */
    int lead;           /* Marker bits of the leading byte */

    if (v < 0)
        return q;       /* Impossible - do nothing */

    if (v <= 0x7f) {
        *q++ = (char)v;
        return q;
    }

    if (v <= 0x000007ff) {
        trail = 1;
        lead = 0xc0;
    } else if (v <= 0x0000ffff) {
        trail = 2;
        lead = 0xe0;
    } else if (v <= 0x001fffff) {
        trail = 3;
        lead = 0xf0;
    } else if (v <= 0x03ffffff) {
        trail = 4;
        lead = 0xf8;
    } else {
        trail = 5;
        lead = 0xfc;
    }

    /* Leading byte carries the topmost bits ... */
    *q++ = (char)(lead | (v >> (6 * trail)));

    /* ... and each continuation byte the next six, high to low */
    while (trail-- > 0)
        *q++ = (char)(0x80 | ((v >> (6 * trail)) & 63));

    return q;
}
/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (the result may contain embedded nulls.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.  For both quoted forms
 * the write position stays behind the read position, so a terminating
 * null is also stored after the result without touching unread input.
 *
 * If ep is not NULL, *ep is set to point to the final quote, or to the
 * null if the string is improperly quoted, empty, or not quoted at all.
 *
 * '...' and "..." strings are copied verbatim.  `...` strings accept
 * the escapes \a \b \e \f \n \r \t \v, up to three octal digits,
 * \x or \X with up to two hex digits, and \u / \U with up to four /
 * eight hex digits (stored as UTF-8).  A backslash before any other
 * character yields that character; \x, \u or \U without any hex digit
 * yields the letter itself.
 */
size_t nasm_unquote(char *str, char **ep)
{
    char bq;
    char *p, *q;
    char *escp = NULL;
    char c;
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs,
    } state;
    int ndig = 0;
    int32_t nval = 0;

    p = q = str;

    bq = *p++;
    if (!bq) {
        /* Empty input: still give the caller a defined end pointer */
        if (ep)
            *ep = str;
        return 0;
    }

    switch (bq) {
    case '\'':
    case '\"':
        /* '...' or "..." string */
        while ((c = *p) && c != bq) {
            p++;
            *q++ = c;
        }
        *q = '\0';
        break;

    case '`':
        /* `...` string */
        state = st_start;

        while ((c = *p)) {
            p++;
            switch (state) {
            case st_start:
                switch (c) {
                case '\\':
                    state = st_backslash;
                    break;
                case '`':
                    p--;        /* Leave p on the closing quote */
                    goto out;
                default:
                    *q++ = c;
                    break;
                }
                break;

            case st_backslash:
                state = st_start;
                escp = p;       /* Beginning of argument sequence */
                nval = 0;
                switch (c) {
                case 'a':
                    *q++ = 7;
                    break;
                case 'b':
                    *q++ = 8;
                    break;
                case 'e':
                    *q++ = 27;
                    break;
                case 'f':
                    *q++ = 12;
                    break;
                case 'n':
                    *q++ = 10;
                    break;
                case 'r':
                    *q++ = 13;
                    break;
                case 't':
                    *q++ = 9;
                    break;
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    break;
                case 'v':
                    *q++ = 11;
                    break;
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 2;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 2;   /* Up to two more digits */
                    nval = c - '0';
                    break;
                default:
                    *q++ = c;
                    break;
                }
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    nval = (nval << 3) + (c - '0');
                    if (!--ndig) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    *q++ = nval;
                    state = st_start;
                }
                break;

            case st_hex:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    nval = (nval << 4) + numvalue(c);
                    if (!--ndig) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    /* No digit seen at all: emit the escape letter */
                    *q++ = (p > escp) ? nval : escp[-1];
                    state = st_start;
                }
                break;

            case st_ucs:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    /*
                     * Shift as unsigned: with eight significant digits
                     * the value no longer fits in int32_t, and a signed
                     * left shift that overflows is undefined behaviour.
                     */
                    nval = (int32_t)(((uint32_t)nval << 4) +
                                     (uint32_t)numvalue(c));
                    if (!--ndig) {
                        q = emit_utf8(q, nval);
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    if (p > escp)
                        q = emit_utf8(q, nval);
                    else
                        *q++ = escp[-1];
                    state = st_start;
                }
                break;
            }
        }

        /* Input ended inside an escape: flush what was collected */
        switch (state) {
        case st_start:
        case st_backslash:
            break;
        case st_oct:
            *q++ = nval;
            break;
        case st_hex:
            *q++ = (p > escp) ? nval : escp[-1];
            break;
        case st_ucs:
            if (p > escp)
                q = emit_utf8(q, nval);
            else
                *q++ = escp[-1];
            break;
        }
    out:
        /* Terminate the result, as the '...' / "..." branch does */
        *q = '\0';
        break;

    default:
        /* Not a quoted string, just return the input... */
        p = q = strchr(str, '\0');
        break;
    }

    if (ep)
        *ep = p;
    return q-str;
}
/*
 * Find the end of a quoted string; returns the pointer to the terminating
 * character (either the ending quote or the null character, if unterminated.)
 *
 * If str does not begin with ', " or `, str itself is returned.
 *
 * Inside a `...` string a backslash protects the character that follows
 * it, so an escaped backquote does not end the string.  Only that one
 * character needs to be skipped: whatever longer escape sequence it may
 * introduce cannot contain a backquote or a backslash.
 */
char *nasm_skip_string(char *str)
{
    char bq = str[0];
    char *p;

    if (bq == '\'' || bq == '\"') {
        /* '...' or "..." string: no escapes, just look for the mate */
        for (p = str+1; *p && *p != bq; p++)
            ;
        return p;
    }

    if (bq == '`') {
        /* `...` string */
        bool escaped = false;   /* Previous character was a live backslash */

        for (p = str+1; *p; p++) {
            if (escaped)
                escaped = false;
            else if (*p == '\\')
                escaped = true;
            else if (*p == '`')
                break;          /* Found the end */
        }
        /*
         * Either the closing backquote or, for an unterminated string,
         * the terminating null itself -- never the byte after it.
         */
        return p;
    }

    return str;                 /* Not a string... */
}