Fix bug where the WinHelp backend corrupts the internal data
[nasm/autotest.git] / quote.c
blobf926b85a8defa6d4efee2a54e58675db088230b5
1 /* quote.c library routines for the Netwide Assembler
3 * The Netwide Assembler is copyright (C) 1996 Simon Tatham and
4 * Julian Hall. All rights reserved. The software is
5 * redistributable under the license given in the file "LICENSE"
6 * distributed in the NASM archive.
7 */
9 #include "compiler.h"
11 #include <assert.h>
12 #include <stdlib.h>
14 #include "nasmlib.h"
15 #include "quote.h"
/*
 * Numeric value of the hexadecimal digit character c.
 * No validation is performed: c is expected to be one of 0-9, A-F or a-f,
 * and the result is not meaningful for any other character.
 * The argument is evaluated more than once, so it must be free of side
 * effects.
 */
#define numvalue(c)                             \
    (((c) >= 'a') ? ((c) - 'a' + 10) :          \
     ((c) >= 'A') ? ((c) - 'A' + 10) :          \
                    ((c) - '0'))
/*
 * Build a quoted, NUL-terminated copy of the len bytes starting at str.
 *
 * If the data consists only of printable ASCII and does not contain both
 * kinds of plain quote, it is wrapped verbatim in '...' (preferred) or
 * "...".  Otherwise it is wrapped in `...` with backquote, backslash and
 * non-printable bytes written as C-style escapes.
 *
 * The result is obtained from nasm_malloc(); the caller owns it.
 * NOTE(review): presumably nasm_malloc() never returns NULL -- confirm.
 */
char *nasm_quote(char *str, size_t len)
{
    char c, c1, *p, *q, *nstr, *ep;
    unsigned char uc;
    bool sq_ok, dq_ok;
    size_t qlen;

    sq_ok = dq_ok = true;
    ep = str + len;
    qlen = 0;                   /* Length if we need `...` quotes */

    /* Pass 1: decide which quote style works and size the `...` form */
    for (p = str; p < ep; p++) {
        c = *p;
        switch (c) {
        case '\'':
            sq_ok = false;
            qlen++;
            break;
        case '\"':
            dq_ok = false;
            qlen++;
            break;
        case '`':
        case '\\':
            qlen += 2;
            break;
        default:
            if (c < ' ' || c > '~') {
                sq_ok = dq_ok = false;
                switch (c) {
                case '\a':
                case '\b':
                case '\t':
                case '\n':
                case '\v':
                case '\f':
                case '\r':
                case 27:
                    qlen += 2;
                    break;
                default:
                    c1 = (p + 1 < ep) ? p[1] : 0;
                    /*
                     * The emit pass prints (unsigned char)c in octal, so
                     * the digit count must be derived from the unsigned
                     * value too.  With a signed plain char, bytes >= 0x80
                     * are negative and would otherwise be under-counted,
                     * overflowing the buffer allocated below.
                     */
                    uc = (unsigned char)c;
                    if (uc > 077 || (c1 >= '0' && c1 <= '7'))
                        qlen += 4;      /* Must use the full form */
                    else if (uc > 07)
                        qlen += 3;
                    else
                        qlen += 2;
                    break;
                }
            } else {
                qlen++;
            }
            break;
        }
    }

    if (sq_ok || dq_ok) {
        /* Use '...' or "..." : quote + data + quote + NUL */
        nstr = nasm_malloc(len + 3);
        nstr[0] = nstr[len + 1] = sq_ok ? '\'' : '\"';
        nstr[len + 2] = '\0';
        memcpy(nstr + 1, str, len);
    } else {
        /* Need to use `...` quoted syntax */
        nstr = nasm_malloc(qlen + 3);
        q = nstr;
        *q++ = '`';
        for (p = str; p < ep; p++) {
            c = *p;
            switch (c) {
            case '`':
            case '\\':
                *q++ = '\\';
                *q++ = c;
                break;
            case '\a':
                *q++ = '\\';
                *q++ = 'a';
                break;
            case '\b':
                *q++ = '\\';
                *q++ = 'b';
                break;
            case '\t':
                *q++ = '\\';
                *q++ = 't';
                break;
            case '\n':
                *q++ = '\\';
                *q++ = 'n';
                break;
            case '\v':
                *q++ = '\\';
                *q++ = 'v';
                break;
            case '\f':
                *q++ = '\\';
                *q++ = 'f';
                break;
            case '\r':
                *q++ = '\\';
                *q++ = 'r';
                break;
            case 27:
                *q++ = '\\';
                *q++ = 'e';
                break;
            default:
                if (c < ' ' || c > '~') {
                    /*
                     * If an octal digit follows, pad to three digits so
                     * that digit cannot be read as part of this escape.
                     */
                    c1 = (p + 1 < ep) ? p[1] : 0;
                    if (c1 >= '0' && c1 <= '7')
                        q += sprintf(q, "\\%03o", (unsigned char)c);
                    else
                        q += sprintf(q, "\\%o", (unsigned char)c);
                } else {
                    *q++ = c;
                }
                break;
            }
        }
        *q++ = '`';
        *q++ = '\0';
        /* Both passes must agree on the exact output size */
        assert((size_t)(q - nstr) == qlen + 3);
    }
    return nstr;
}
/*
 * Store v at q using the UTF-8 bit layout and return the pointer just past
 * the last byte written.  Values above 0x1fffff use the 5- and 6-byte
 * forms; a negative v writes nothing and returns q unchanged.
 */
static char *emit_utf8(char *q, int32_t v)
{
    if (v < 0) {
        /* Impossible - do nothing */
    } else if (v <= 0x7f) {
        *q++ = (char)v;
    } else if (v <= 0x000007ff) {
        *q++ = (char)(0xc0 | (v >> 6));
        *q++ = (char)(0x80 | (v & 63));
    } else if (v <= 0x0000ffff) {
        *q++ = (char)(0xe0 | (v >> 12));
        *q++ = (char)(0x80 | ((v >> 6) & 63));
        *q++ = (char)(0x80 | (v & 63));
    } else if (v <= 0x001fffff) {
        *q++ = (char)(0xf0 | (v >> 18));
        *q++ = (char)(0x80 | ((v >> 12) & 63));
        *q++ = (char)(0x80 | ((v >> 6) & 63));
        *q++ = (char)(0x80 | (v & 63));
    } else if (v <= 0x03ffffff) {
        *q++ = (char)(0xf8 | (v >> 24));
        *q++ = (char)(0x80 | ((v >> 18) & 63));
        *q++ = (char)(0x80 | ((v >> 12) & 63));
        *q++ = (char)(0x80 | ((v >> 6) & 63));
        *q++ = (char)(0x80 | (v & 63));
    } else {
        *q++ = (char)(0xfc | (v >> 30));
        *q++ = (char)(0x80 | ((v >> 24) & 63));
        *q++ = (char)(0x80 | ((v >> 18) & 63));
        *q++ = (char)(0x80 | ((v >> 12) & 63));
        *q++ = (char)(0x80 | ((v >> 6) & 63));
        *q++ = (char)(0x80 | (v & 63));
    }
    return q;
}

/* Value (0-15) of the hexadecimal digit c, or -1 if c is not a hex digit. */
static int unq_hexval(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}

/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (the result may contain embedded nulls.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.
 *
 * - '...' and "..." (same quote at both ends, at least two characters):
 *   the quotes are stripped and the contents kept verbatim.
 * - If the first or the last character is a backquote, the first and last
 *   characters are dropped and the backslash escapes in between are
 *   decoded (\a \b \e \f \n \r \t \v, \ooo, \xHH, \uHHHH, \UHHHHHHHH; any
 *   other escaped character stands for itself).
 * - Anything else is left untouched and its length is returned.
 */
size_t nasm_unquote(char *str)
{
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs
    } state;
    size_t ln;
    char bq, eq;
    char *p, *q, *ep;
    char *escp = NULL;          /* Points at the letter after the backslash */
    char c;
    int d;
    int ndig = 0;
    int32_t nval = 0;

    bq = str[0];
    if (!bq)
        return 0;
    ln = strlen(str);
    eq = str[ln - 1];

    /*
     * ln >= 2 matters: for the one-character strings "'" and "\"" the first
     * and last character are the same byte, and ln-2 would wrap around.
     */
    if (ln >= 2 && (bq == '\'' || bq == '\"') && bq == eq) {
        /* '...' or "..." string */
        memmove(str, str + 1, ln - 2);
        str[ln - 2] = '\0';
        return ln - 2;
    }

    if (bq != '`' && eq != '`') {
        /* Otherwise, just return the input... */
        return ln;
    }

    /* `...` string: q is the write cursor, p the read cursor (q <= p) */
    q = str;
    p = str + 1;
    ep = str + ln - 1;
    state = st_start;

    while (p < ep) {
        c = *p++;
        switch (state) {
        case st_start:
            if (c == '\\')
                state = st_backslash;
            else
                *q++ = c;
            break;

        case st_backslash:
            state = st_start;
            escp = p - 1;
            switch (c) {
            case 'a':
                *q++ = 7;
                break;
            case 'b':
                *q++ = 8;
                break;
            case 'e':
                *q++ = 27;
                break;
            case 'f':
                *q++ = 12;
                break;
            case 'n':
                *q++ = 10;
                break;
            case 'r':
                *q++ = 13;
                break;
            case 't':
                *q++ = 9;
                break;
            case 'u':
                state = st_ucs;
                ndig = 4;       /* Counts down the digits still allowed */
                nval = 0;
                break;
            case 'U':
                state = st_ucs;
                ndig = 8;
                nval = 0;
                break;
            case 'v':
                *q++ = 11;
                break;          /* Must not fall into the \x handling */
            case 'x':
            case 'X':
                state = st_hex;
                ndig = 0;       /* Counts up the digits seen so far */
                nval = 0;
                break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                state = st_oct;
                ndig = 1;
                nval = c - '0';
                break;
            default:
                *q++ = c;
                break;
            }
            break;

        case st_oct:
            if (c >= '0' && c <= '7') {
                nval = (nval << 3) + (c - '0');
                if (++ndig >= 3) {
                    *q++ = (char)nval;
                    state = st_start;
                }
            } else {
                p--;            /* Process this character again */
                *q++ = (char)nval;
                state = st_start;
            }
            break;

        case st_hex:
            d = unq_hexval(c);
            if (d >= 0) {
                nval = (nval << 4) + d;
                if (++ndig >= 2) {
                    *q++ = (char)nval;
                    state = st_start;
                }
            } else {
                p--;            /* Process this character again */
                /* \x without digits stands for the letter itself */
                *q++ = ndig ? (char)nval : *escp;
                state = st_start;
            }
            break;

        case st_ucs:
            d = unq_hexval(c);
            if (d >= 0) {
                /*
                 * Shift as unsigned: eight digits can set the top bit,
                 * and overflowing a signed left shift is undefined.
                 */
                nval = (int32_t)(((uint32_t)nval << 4) + (uint32_t)d);
                if (!--ndig) {
                    q = emit_utf8(q, nval);
                    state = st_start;
                }
            } else {
                p--;            /* Process this character again */
                if (p > escp + 1)
                    q = emit_utf8(q, nval);
                else
                    *q++ = *escp;       /* No digits: literal u/U */
                state = st_start;
            }
            break;
        }
    }

    /* Flush an escape that was cut short by the end of the string */
    switch (state) {
    case st_start:
    case st_backslash:
        break;
    case st_oct:
        *q++ = (char)nval;
        break;
    case st_hex:
        *q++ = ndig ? (char)nval : *escp;
        break;
    case st_ucs:
        /*
         * ndig counts down here and is never 0 in this state, so use the
         * same "were any digits read?" test as inside the loop.
         */
        if (p > escp + 1)
            q = emit_utf8(q, nval);
        else
            *q++ = *escp;
        break;
    }
    *q = '\0';
    return (size_t)(q - str);
}
/*
 * Find the end of a quoted string; returns the pointer to the terminating
 * character (either the ending quote or the null character, if
 * unterminated.)  If str does not begin with ', " or `, str itself is
 * returned.
 *
 * In '...' and "..." strings the first matching quote ends the string.
 * In `...` strings the same escape syntax as nasm_unquote() is tracked, so
 * that an escaped backquote does not end the string.
 */
char *nasm_skip_string(char *str)
{
    char bq;
    char *p;
    char c;
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs
    } state;
    int ndig = 0;

    bq = str[0];
    if (bq == '\'' || bq == '\"') {
        /* '...' or "..." string */
        for (p = str + 1; *p && *p != bq; p++)
            ;
        return p;
    }

    if (bq != '`')
        return str;             /* Not a string... */

    /* `...` string */
    p = str + 1;
    state = st_start;

    while ((c = *p++)) {
        switch (state) {
        case st_start:
            switch (c) {
            case '\\':
                state = st_backslash;
                break;
            case '`':
                return p - 1;   /* Found the end */
            default:
                break;
            }
            break;

        case st_backslash:
            switch (c) {
            case 'u':
                state = st_ucs;
                ndig = 4;
                break;
            case 'U':
                state = st_ucs;
                ndig = 8;
                break;
            case 'x':
            case 'X':
                state = st_hex;
                ndig = 0;
                break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                state = st_oct;
                ndig = 1;
                break;
            default:
                /* Single-character escape, including \` and \\ */
                state = st_start;
                break;
            }
            break;

        case st_oct:
            if (c >= '0' && c <= '7') {
                if (++ndig >= 3)
                    state = st_start;
            } else {
                p--;            /* Process this character again */
                state = st_start;
            }
            break;

        case st_hex:
            if ((c >= '0' && c <= '9') ||
                (c >= 'A' && c <= 'F') ||
                (c >= 'a' && c <= 'f')) {
                if (++ndig >= 2)
                    state = st_start;
            } else {
                p--;            /* Process this character again */
                state = st_start;
            }
            break;

        case st_ucs:
            if ((c >= '0' && c <= '9') ||
                (c >= 'A' && c <= 'F') ||
                (c >= 'a' && c <= 'f')) {
                if (!--ndig)
                    state = st_start;
            } else {
                p--;            /* Process this character again */
                state = st_start;
            }
            break;
        }
    }

    /*
     * Unterminated string: the loop's post-increment has already stepped
     * past the null character, so back up to point at it.
     */
    return p - 1;
}