/*
 * Copyright (c) 2011 Martin Sucha
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
38 /* Forward declarations of static functions */
39 static wchar_t tok_get_char(tokenizer_t
*);
40 static wchar_t tok_look_char(tokenizer_t
*);
41 static int tok_push_char(tokenizer_t
*, wchar_t);
42 static int tok_push_token(tokenizer_t
*);
43 static bool tok_pending_chars(tokenizer_t
*);
44 static int tok_finish_string(tokenizer_t
*);
45 static void tok_start_token(tokenizer_t
*, token_type_t
);
47 /** Initialize the token parser
49 * @param tok the tokenizer structure to initialize
50 * @param input the input string to tokenize
51 * @param out_tokens array of strings where to store the result
52 * @param max_tokens number of elements of the out_tokens array
54 int tok_init(tokenizer_t
*tok
, char *input
, token_t
*out_tokens
,
59 tok
->last_in_offset
= 0;
60 tok
->in_char_offset
= 0;
61 tok
->last_in_char_offset
= 0;
63 tok
->outtok
= out_tokens
;
64 tok
->outtok_offset
= 0;
65 tok
->outtok_size
= max_tokens
;
67 /* Prepare a buffer where all the token strings will be stored */
68 size_t len
= str_size(input
) + max_tokens
+ 1;
69 char *tmp
= malloc(len
);
76 tok
->outbuf_offset
= 0;
77 tok
->outbuf_size
= len
;
78 tok
->outbuf_last_start
= 0;
83 /** Finalize the token parser */
84 void tok_fini(tokenizer_t
*tok
)
86 if (tok
->outbuf
!= NULL
) {
91 /** Tokenize the input string into the tokens */
92 int tok_tokenize(tokenizer_t
*tok
, size_t *tokens_length
)
97 /* Read the input line char by char and append tokens */
98 while ((next_char
= tok_look_char(tok
)) != 0) {
99 if (next_char
== ' ') {
100 /* Push the token if there is any.
101 * There may not be any pending char for a token in case
102 * there are several spaces in the input.
104 if (tok_pending_chars(tok
)) {
105 rc
= tok_push_token(tok
);
110 tok_start_token(tok
, TOKTYPE_SPACE
);
111 /* Eat all the spaces */
112 while (tok_look_char(tok
) == ' ') {
113 tok_push_char(tok
, tok_get_char(tok
));
118 else if (next_char
== '|') {
119 /* Pipes are tokens that are delimiters and should be
120 * output as a separate token
122 if (tok_pending_chars(tok
)) {
123 rc
= tok_push_token(tok
);
129 tok_start_token(tok
, TOKTYPE_PIPE
);
131 rc
= tok_push_char(tok
, tok_get_char(tok
));
136 rc
= tok_push_token(tok
);
141 else if (next_char
== '\'') {
142 /* A string starts with a quote (') and ends again with a quote.
143 * A literal quote is written as ''
145 tok_start_token(tok
, TOKTYPE_TEXT
);
148 rc
= tok_finish_string(tok
);
154 if (!tok_pending_chars(tok
)) {
155 tok_start_token(tok
, TOKTYPE_TEXT
);
157 /* If we are handling any other character, just append it to
160 rc
= tok_push_char(tok
, tok_get_char(tok
));
167 /* Push the last token */
168 if (tok_pending_chars(tok
)) {
169 rc
= tok_push_token(tok
);
175 *tokens_length
= tok
->outtok_offset
;
180 /** Finish tokenizing an opened string */
181 int tok_finish_string(tokenizer_t
*tok
)
186 while ((next_char
= tok_look_char(tok
)) != 0) {
187 if (next_char
== '\'') {
190 if (tok_look_char(tok
) == '\'') {
191 /* Encode a single literal quote */
192 rc
= tok_push_char(tok
, '\'');
197 /* Swallow the additional one in the input */
202 return tok_push_token(tok
);
206 rc
= tok_push_char(tok
, tok_get_char(tok
));
213 /* If we are here, the string run to the end without being closed */
217 /** Get a char from input, advancing the input position */
218 wchar_t tok_get_char(tokenizer_t
*tok
)
220 tok
->in_char_offset
++;
221 return str_decode(tok
->in
, &tok
->in_offset
, STR_NO_LIMIT
);
224 /** Get a char from input, while staying on the same input position */
225 wchar_t tok_look_char(tokenizer_t
*tok
)
227 size_t old_offset
= tok
->in_offset
;
228 size_t old_char_offset
= tok
->in_char_offset
;
229 wchar_t ret
= tok_get_char(tok
);
230 tok
->in_offset
= old_offset
;
231 tok
->in_char_offset
= old_char_offset
;
235 /** Append a char to the end of the current token */
236 int tok_push_char(tokenizer_t
*tok
, wchar_t ch
)
238 return chr_encode(ch
, tok
->outbuf
, &tok
->outbuf_offset
, tok
->outbuf_size
);
241 void tok_start_token(tokenizer_t
*tok
, token_type_t type
)
243 tok
->current_type
= type
;
246 /** Push the current token to the output array */
247 int tok_push_token(tokenizer_t
*tok
)
249 if (tok
->outtok_offset
>= tok
->outtok_size
) {
253 if (tok
->outbuf_offset
>= tok
->outbuf_size
) {
257 tok
->outbuf
[tok
->outbuf_offset
++] = 0;
258 token_t
*tokinfo
= &tok
->outtok
[tok
->outtok_offset
++];
259 tokinfo
->type
= tok
->current_type
;
260 tokinfo
->text
= tok
->outbuf
+ tok
->outbuf_last_start
;
261 tokinfo
->byte_start
= tok
->last_in_offset
;
262 tokinfo
->byte_length
= tok
->in_offset
- tok
->last_in_offset
;
263 tokinfo
->char_start
= tok
->last_in_char_offset
;
264 tokinfo
->char_length
= tok
->in_char_offset
- tok
->last_in_char_offset
;
265 tok
->outbuf_last_start
= tok
->outbuf_offset
;
267 /* We have consumed the first char of the next token already */
268 tok
->last_in_offset
= tok
->in_offset
;
269 tok
->last_in_char_offset
= tok
->in_char_offset
;
274 /** Return true, if the current token is not empty */
275 bool tok_pending_chars(tokenizer_t
*tok
)
277 assert(tok
->outbuf_offset
>= tok
->outbuf_last_start
);
278 return (tok
->outbuf_offset
!= tok
->outbuf_last_start
);