bumped version
[gnutls.git] / src / libopts / tokenize.c
blob9563713e80711f2d7b7e5f94fc47df2d2d6016c6
1 /*
2 * This file defines the string_tokenize interface
3 * Time-stamp: "2012-03-04 13:23:50 bkorb"
5 * This file is part of AutoOpts, a companion to AutoGen.
6 * AutoOpts is free software.
7 * AutoOpts is Copyright (c) 1992-2012 by Bruce Korb - all rights reserved
9 * AutoOpts is available under any one of two licenses. The license
10 * in use must be one of these two and the choice is under the control
11 * of the user of the license.
13 * The GNU Lesser General Public License, version 3 or later
14 * See the files "COPYING.lgplv3" and "COPYING.gplv3"
16 * The Modified Berkeley Software Distribution License
17 * See the file "COPYING.mbsd"
19 * These files have the following md5sums:
21 * 43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3
22 * 06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3
23 * 66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
26 #include <errno.h>
27 #include <stdlib.h>
29 #define cc_t const unsigned char
30 #define ch_t unsigned char
32 /* = = = START-STATIC-FORWARD = = = */
33 static void
34 copy_cooked(ch_t** ppDest, char const ** ppSrc);
36 static void
37 copy_raw(ch_t** ppDest, char const ** ppSrc);
39 static token_list_t *
40 alloc_token_list(char const * str);
41 /* = = = END-STATIC-FORWARD = = = */
43 static void
44 copy_cooked(ch_t** ppDest, char const ** ppSrc)
46 ch_t* pDest = (ch_t*)*ppDest;
47 const ch_t* pSrc = (const ch_t*)(*ppSrc + 1);
49 for (;;) {
50 ch_t ch = *(pSrc++);
51 switch (ch) {
52 case NUL: *ppSrc = NULL; return;
53 case '"': goto done;
54 case '\\':
55 pSrc += ao_string_cook_escape_char((char*)pSrc, (char*)&ch, 0x7F);
56 if (ch == 0x7F)
57 break;
58 /* FALLTHROUGH */
60 default:
61 *(pDest++) = ch;
65 done:
66 *ppDest = (ch_t*)pDest; /* next spot for storing character */
67 *ppSrc = (char const *)pSrc; /* char following closing quote */
71 static void
72 copy_raw(ch_t** ppDest, char const ** ppSrc)
74 ch_t* pDest = *ppDest;
75 cc_t* pSrc = (cc_t*) (*ppSrc + 1);
77 for (;;) {
78 ch_t ch = *(pSrc++);
79 switch (ch) {
80 case NUL: *ppSrc = NULL; return;
81 case '\'': goto done;
82 case '\\':
84 * *Four* escapes are handled: newline removal, escape char
85 * quoting and apostrophe quoting
87 switch (*pSrc) {
88 case NUL: *ppSrc = NULL; return;
89 case '\r':
90 if (*(++pSrc) == NL)
91 ++pSrc;
92 continue;
94 case NL:
95 ++pSrc;
96 continue;
98 case '\'':
99 ch = '\'';
100 /* FALLTHROUGH */
102 case '\\':
103 ++pSrc;
104 break;
106 /* FALLTHROUGH */
108 default:
109 *(pDest++) = ch;
113 done:
114 *ppDest = pDest; /* next spot for storing character */
115 *ppSrc = (char const *) pSrc; /* char following closing quote */
118 static token_list_t *
119 alloc_token_list(char const * str)
121 token_list_t * res;
123 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
125 if (str == NULL) goto enoent_res;
128 * Trim leading white space. Use "ENOENT" and a NULL return to indicate
129 * an empty string was passed.
131 str = SPN_WHITESPACE_CHARS(str);
132 if (*str == NUL) goto enoent_res;
135 * Take an approximate count of tokens. If no quoted strings are used,
136 * it will be accurate. If quoted strings are used, it will be a little
137 * high and we'll squander the space for a few extra pointers.
140 char const * pz = str;
142 do {
143 max_token_ct++;
144 pz = BRK_WHITESPACE_CHARS(pz+1);
145 pz = SPN_WHITESPACE_CHARS(pz);
146 } while (*pz != NUL);
148 res = malloc(sizeof(*res) + (pz - str)
149 + (max_token_ct * sizeof(ch_t*)));
152 if (res == NULL)
153 errno = ENOMEM;
154 else res->tkn_list[0] = (ch_t*)(res->tkn_list + (max_token_ct - 1));
156 return res;
158 enoent_res:
160 errno = ENOENT;
161 return NULL;
164 /*=export_func ao_string_tokenize
166 * what: tokenize an input string
168 * arg: + char const* + string + string to be tokenized +
170 * ret_type: token_list_t*
171 * ret_desc: pointer to a structure that lists each token
173 * doc:
175 * This function will convert one input string into a list of strings.
176 * The list of strings is derived by separating the input based on
177 * white space separation. However, if the input contains either single
178 * or double quote characters, then the text after that character up to
179 * a matching quote will become the string in the list.
181 * The returned pointer should be deallocated with @code{free(3C)} when
182 * you are done using the data. The data are placed in a single block of
183 * allocated memory. Do not deallocate individual token/strings.
185 * The structure pointed to will contain at least these two fields:
186 * @table @samp
187 * @item tkn_ct
188 * The number of tokens found in the input string.
189 * @item tkn_list
190 * An array of @code{tkn_ct + 1} pointers to substring tokens, with
191 * the last pointer set to NULL.
192 * @end table
194 * There are two types of quoted strings: single quoted (@code{'}) and
195 * double quoted (@code{"}). Singly quoted strings are fairly raw in that
196 * escape characters (@code{\\}) are simply another character, except when
197 * preceding the following characters:
198 * @example
199 * @code{\\} double backslashes reduce to one
200 * @code{'} incorporates the single quote into the string
201 * @code{\n} suppresses both the backslash and newline character
202 * @end example
204 * Double quote strings are formed according to the rules of string
205 * constants in ANSI-C programs.
207 * example:
208 * @example
209 * #include <stdlib.h>
210 * int ix;
211 * token_list_t* ptl = ao_string_tokenize(some_string);
212 * for (ix = 0; ix < ptl->tkn_ct; ix++)
213 * do_something_with_tkn(ptl->tkn_list[ix]);
214 * free(ptl);
215 * @end example
216 * Note that everything is freed with the one call to @code{free(3C)}.
218 * err:
219 * NULL is returned and @code{errno} will be set to indicate the problem:
220 * @itemize @bullet
221 * @item
222 * @code{EINVAL} - There was an unterminated quoted string.
223 * @item
224 * @code{ENOENT} - The input string was empty.
225 * @item
226 * @code{ENOMEM} - There is not enough memory.
227 * @end itemize
229 token_list_t*
230 ao_string_tokenize(char const* str)
232 token_list_t* res = alloc_token_list(str);
233 ch_t* pzDest;
236 * Now copy each token into the output buffer.
238 if (res == NULL)
239 return res;
241 pzDest = (ch_t*)(res->tkn_list[0]);
242 res->tkn_ct = 0;
244 do {
245 res->tkn_list[ res->tkn_ct++ ] = pzDest;
246 for (;;) {
247 int ch = (ch_t)*str;
248 if (IS_WHITESPACE_CHAR(ch)) {
249 found_white_space:
250 str = SPN_WHITESPACE_CHARS(str+1);
251 break;
254 switch (ch) {
255 case '"':
256 copy_cooked(&pzDest, &str);
257 if (str == NULL) {
258 free(res);
259 errno = EINVAL;
260 return NULL;
262 if (IS_WHITESPACE_CHAR(*str))
263 goto found_white_space;
264 break;
266 case '\'':
267 copy_raw(&pzDest, &str);
268 if (str == NULL) {
269 free(res);
270 errno = EINVAL;
271 return NULL;
273 if (IS_WHITESPACE_CHAR(*str))
274 goto found_white_space;
275 break;
277 case NUL:
278 goto copy_done;
280 default:
281 str++;
282 *(pzDest++) = (unsigned char)ch;
284 } copy_done:;
287 * NUL terminate the last token and see if we have any more tokens.
289 *(pzDest++) = NUL;
290 } while (*str != NUL);
292 res->tkn_list[ res->tkn_ct ] = NULL;
294 return res;
#ifdef TEST
#include <stdio.h>
#include <string.h>

/*
 *  Test driver, built only when TEST is defined: tokenize every command
 *  line argument and print either the resulting tokens or the errno
 *  value that explains the failure.  Exits 1 with a usage message when
 *  called without arguments, 0 otherwise.
 */
int
main(int argc, char** argv)
{
    int arg_ix;

    if (argc == 1) {
        printf("USAGE: %s arg [ ... ]\n", *argv);
        return 1;
    }

    for (arg_ix = 1; arg_ix < argc; arg_ix++) {
        char* text = argv[arg_ix];
        token_list_t* toks = ao_string_tokenize(text);

        if (toks == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   text, errno, strerror(errno));
            continue;
        }

        {
            int ix = 0;

            printf("Parsed string ``%s''\ninto %d tokens:\n",
                   text, toks->tkn_ct);
            do {
                printf(" %3d: ``%s''\n", ix+1, toks->tkn_list[ix]);
            } while (++ix < toks->tkn_ct);

            free(toks);
        }
    }

    return 0;
}
#endif
328 * Local Variables:
329 * mode: C
330 * c-file-style: "stroustrup"
331 * indent-tabs-mode: nil
332 * End:
333 * end of autoopts/tokenize.c */