1 /* $OpenBSD: tokenizer.c,v 1.21 2016/04/11 21:17:29 schwarze Exp $ */
2 /* $NetBSD: tokenizer.c,v 1.28 2016/04/11 18:56:31 christos Exp $ */
5 * Copyright (c) 1992, 1993
6 * The Regents of the University of California. All rights reserved.
8 * This code is derived from software contributed to Berkeley by
9 * Christos Zoulas of Cornell University.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 /* We build this file twice, once as NARROW, once as WIDE. */
40 * tokenizer.c: Bourne shell-like tokenizer
48 Q_none
, Q_single
, Q_double
, Q_one
, Q_doubleone
57 #define IFS STR("\t \n")
61 #define FUN(prefix, rest) prefix ## _ ## rest
62 #define TYPE(type) type
64 #define Strchr(s, c) strchr(s, c)
65 #define tok_strdup(s) strdup(s)
68 #define FUN(prefix, rest) prefix ## _w ## rest
69 #define TYPE(type) type ## W
71 #define Strchr(s, c) wcschr(s, c)
72 #define tok_strdup(s) wcsdup(s)
75 struct TYPE(tokenizer
) {
76 Char
*ifs
; /* In field separator */
77 int argc
, amax
; /* Current and maximum number of args */
78 Char
**argv
; /* Argument list */
79 Char
*wptr
, *wmax
; /* Space and limit on the word buffer */
80 Char
*wstart
; /* Beginning of next word */
81 Char
*wspace
; /* Space of word buffer */
82 quote_t quote
; /* Quoting state */
83 int flags
; /* flags; */
87 static void FUN(tok
,finish
)(TYPE(Tokenizer
) *);
91 * Finish a word in the tokenizer.
94 FUN(tok
,finish
)(TYPE(Tokenizer
) *tok
)
98 if ((tok
->flags
& TOK_KEEP
) || tok
->wptr
!= tok
->wstart
) {
99 tok
->argv
[tok
->argc
++] = tok
->wstart
;
100 tok
->argv
[tok
->argc
] = NULL
;
101 tok
->wstart
= ++tok
->wptr
;
103 tok
->flags
&= ~TOK_KEEP
;
108 * Initialize the tokenizer
111 FUN(tok
,init
)(const Char
*ifs
)
113 TYPE(Tokenizer
) *tok
= malloc(sizeof(TYPE(Tokenizer
)));
117 tok
->ifs
= tok_strdup(ifs
? ifs
: IFS
);
118 if (tok
->ifs
== NULL
) {
124 tok
->argv
= reallocarray(NULL
, tok
->amax
, sizeof(*tok
->argv
));
125 if (tok
->argv
== NULL
) {
131 tok
->wspace
= reallocarray(NULL
, WINCR
, sizeof(*tok
->wspace
));
132 if (tok
->wspace
== NULL
) {
138 tok
->wmax
= tok
->wspace
+ WINCR
;
139 tok
->wstart
= tok
->wspace
;
140 tok
->wptr
= tok
->wspace
;
149 * Reset the tokenizer
152 FUN(tok
,reset
)(TYPE(Tokenizer
) *tok
)
156 tok
->wstart
= tok
->wspace
;
157 tok
->wptr
= tok
->wspace
;
167 FUN(tok
,end
)(TYPE(Tokenizer
) *tok
)
179 * Bourne shell (sh(1)) like tokenizing
181 * tok current tokenizer state (setup with FUN(tok,init)())
186 * 2 Unmatched double quote
187 * 1 Unmatched single quote
189 * Modifies (if return value is 0):
190 * argc number of arguments
191 * argv argument array
192 * cursorc if !NULL, argv element containing cursor
193 * cursoro if !NULL, offset in argv[cursorc] of cursor
196 FUN(tok
,line
)(TYPE(Tokenizer
) *tok
, const TYPE(LineInfo
) *line
,
197 int *argc
, const Char
***argv
, int *cursorc
, int *cursoro
)
204 for (ptr
= line
->buffer
; ;ptr
++) {
205 if (ptr
>= line
->lastchar
)
207 if (ptr
== line
->cursor
) {
209 co
= (int)(tok
->wptr
- tok
->wstart
);
213 tok
->flags
|= TOK_KEEP
;
214 tok
->flags
&= ~TOK_EAT
;
215 switch (tok
->quote
) {
217 tok
->quote
= Q_single
; /* Enter single quote
221 case Q_single
: /* Exit single quote mode */
225 case Q_one
: /* Quote this ' */
230 case Q_double
: /* Stay in double quote mode */
234 case Q_doubleone
: /* Quote this ' */
235 tok
->quote
= Q_double
;
245 tok
->flags
&= ~TOK_EAT
;
246 tok
->flags
|= TOK_KEEP
;
247 switch (tok
->quote
) {
248 case Q_none
: /* Enter double quote mode */
249 tok
->quote
= Q_double
;
252 case Q_double
: /* Exit double quote mode */
256 case Q_one
: /* Quote this " */
261 case Q_single
: /* Stay in single quote mode */
265 case Q_doubleone
: /* Quote this " */
266 tok
->quote
= Q_double
;
276 tok
->flags
|= TOK_KEEP
;
277 tok
->flags
&= ~TOK_EAT
;
278 switch (tok
->quote
) {
279 case Q_none
: /* Quote next character */
283 case Q_double
: /* Quote next character */
284 tok
->quote
= Q_doubleone
;
287 case Q_one
: /* Quote this, restore state */
292 case Q_single
: /* Stay in single quote mode */
296 case Q_doubleone
: /* Quote this \ */
297 tok
->quote
= Q_double
;
307 tok
->flags
&= ~TOK_EAT
;
308 switch (tok
->quote
) {
314 *tok
->wptr
++ = *ptr
; /* Add the return */
317 case Q_doubleone
: /* Back to double, eat the '\n' */
318 tok
->flags
|= TOK_EAT
;
319 tok
->quote
= Q_double
;
322 case Q_one
: /* No quote, more eat the '\n' */
323 tok
->flags
|= TOK_EAT
;
333 switch (tok
->quote
) {
335 /* Finish word and return */
336 if (tok
->flags
& TOK_EAT
) {
337 tok
->flags
&= ~TOK_EAT
;
349 tok
->quote
= Q_double
;
364 tok
->flags
&= ~TOK_EAT
;
365 switch (tok
->quote
) {
367 if (Strchr(tok
->ifs
, *ptr
) != NULL
)
368 FUN(tok
,finish
)(tok
);
381 tok
->quote
= Q_double
;
397 if (tok
->wptr
>= tok
->wmax
- 4) {
398 size_t size
= tok
->wmax
- tok
->wspace
+ WINCR
;
399 Char
*s
= reallocarray(tok
->wspace
, size
, sizeof(*s
));
403 if (s
!= tok
->wspace
) {
405 for (i
= 0; i
< tok
->argc
; i
++) {
407 (tok
->argv
[i
] - tok
->wspace
) + s
;
409 tok
->wptr
= (tok
->wptr
- tok
->wspace
) + s
;
410 tok
->wstart
= (tok
->wstart
- tok
->wspace
) + s
;
413 tok
->wmax
= s
+ size
;
415 if (tok
->argc
>= tok
->amax
- 4) {
418 p
= reallocarray(tok
->argv
, tok
->amax
, sizeof(*p
));
427 if (cc
== -1 && co
== -1) {
429 co
= (int)(tok
->wptr
- tok
->wstart
);
435 FUN(tok
,finish
)(tok
);
436 *argv
= (const Char
**)tok
->argv
;
442 * Simpler version of tok_line, taking a NUL-terminated line
443 * and splitting into words, ignoring cursor state.
446 FUN(tok
,str
)(TYPE(Tokenizer
) *tok
, const Char
*line
, int *argc
,
451 memset(&li
, 0, sizeof(li
));
453 li
.cursor
= li
.lastchar
= Strchr(line
, '\0');
454 return FUN(tok
,line
)(tok
, &li
, argc
, argv
, NULL
, NULL
);