Remove unistd.h
[helenos.git] / uspace / app / bdsh / tok.c
blob003311afc6c3f840e1bd5f35cdbd115f266d41ae
1 /*
2 * Copyright (c) 2011 Martin Sucha
3 * All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include <str.h>
30 #include <assert.h>
31 #include <malloc.h>
32 #include <stdlib.h>
33 #include <stddef.h>
34 #include <errno.h>
36 #include "tok.h"
38 /* Forward declarations of static functions */
39 static wchar_t tok_get_char(tokenizer_t *);
40 static wchar_t tok_look_char(tokenizer_t *);
41 static int tok_push_char(tokenizer_t *, wchar_t);
42 static int tok_push_token(tokenizer_t *);
43 static bool tok_pending_chars(tokenizer_t *);
44 static int tok_finish_string(tokenizer_t *);
45 static void tok_start_token(tokenizer_t *, token_type_t);
47 /** Initialize the token parser
49 * @param tok the tokenizer structure to initialize
50 * @param input the input string to tokenize
51 * @param out_tokens array of strings where to store the result
52 * @param max_tokens number of elements of the out_tokens array
54 int tok_init(tokenizer_t *tok, char *input, token_t *out_tokens,
55 size_t max_tokens)
57 tok->in = input;
58 tok->in_offset = 0;
59 tok->last_in_offset = 0;
60 tok->in_char_offset = 0;
61 tok->last_in_char_offset = 0;
63 tok->outtok = out_tokens;
64 tok->outtok_offset = 0;
65 tok->outtok_size = max_tokens;
67 /* Prepare a buffer where all the token strings will be stored */
68 size_t len = str_size(input) + max_tokens + 1;
69 char *tmp = malloc(len);
71 if (tmp == NULL) {
72 return ENOMEM;
75 tok->outbuf = tmp;
76 tok->outbuf_offset = 0;
77 tok->outbuf_size = len;
78 tok->outbuf_last_start = 0;
80 return EOK;
83 /** Finalize the token parser */
84 void tok_fini(tokenizer_t *tok)
86 if (tok->outbuf != NULL) {
87 free(tok->outbuf);
91 /** Tokenize the input string into the tokens */
92 int tok_tokenize(tokenizer_t *tok, size_t *tokens_length)
94 int rc;
95 wchar_t next_char;
97 /* Read the input line char by char and append tokens */
98 while ((next_char = tok_look_char(tok)) != 0) {
99 if (next_char == ' ') {
100 /* Push the token if there is any.
101 * There may not be any pending char for a token in case
102 * there are several spaces in the input.
104 if (tok_pending_chars(tok)) {
105 rc = tok_push_token(tok);
106 if (rc != EOK) {
107 return rc;
110 tok_start_token(tok, TOKTYPE_SPACE);
111 /* Eat all the spaces */
112 while (tok_look_char(tok) == ' ') {
113 tok_push_char(tok, tok_get_char(tok));
115 tok_push_token(tok);
118 else if (next_char == '|') {
119 /* Pipes are tokens that are delimiters and should be
120 * output as a separate token
122 if (tok_pending_chars(tok)) {
123 rc = tok_push_token(tok);
124 if (rc != EOK) {
125 return rc;
129 tok_start_token(tok, TOKTYPE_PIPE);
131 rc = tok_push_char(tok, tok_get_char(tok));
132 if (rc != EOK) {
133 return rc;
136 rc = tok_push_token(tok);
137 if (rc != EOK) {
138 return rc;
141 else if (next_char == '\'') {
142 /* A string starts with a quote (') and ends again with a quote.
143 * A literal quote is written as ''
145 tok_start_token(tok, TOKTYPE_TEXT);
146 /* Eat the quote */
147 tok_get_char(tok);
148 rc = tok_finish_string(tok);
149 if (rc != EOK) {
150 return rc;
153 else {
154 if (!tok_pending_chars(tok)) {
155 tok_start_token(tok, TOKTYPE_TEXT);
157 /* If we are handling any other character, just append it to
158 * the current token.
160 rc = tok_push_char(tok, tok_get_char(tok));
161 if (rc != EOK) {
162 return rc;
167 /* Push the last token */
168 if (tok_pending_chars(tok)) {
169 rc = tok_push_token(tok);
170 if (rc != EOK) {
171 return rc;
175 *tokens_length = tok->outtok_offset;
177 return EOK;
180 /** Finish tokenizing an opened string */
181 int tok_finish_string(tokenizer_t *tok)
183 int rc;
184 wchar_t next_char;
186 while ((next_char = tok_look_char(tok)) != 0) {
187 if (next_char == '\'') {
188 /* Eat the quote */
189 tok_get_char(tok);
190 if (tok_look_char(tok) == '\'') {
191 /* Encode a single literal quote */
192 rc = tok_push_char(tok, '\'');
193 if (rc != EOK) {
194 return rc;
197 /* Swallow the additional one in the input */
198 tok_get_char(tok);
200 else {
201 /* The string end */
202 return tok_push_token(tok);
205 else {
206 rc = tok_push_char(tok, tok_get_char(tok));
207 if (rc != EOK) {
208 return rc;
213 /* If we are here, the string run to the end without being closed */
214 return EINVAL;
217 /** Get a char from input, advancing the input position */
218 wchar_t tok_get_char(tokenizer_t *tok)
220 tok->in_char_offset++;
221 return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
224 /** Get a char from input, while staying on the same input position */
225 wchar_t tok_look_char(tokenizer_t *tok)
227 size_t old_offset = tok->in_offset;
228 size_t old_char_offset = tok->in_char_offset;
229 wchar_t ret = tok_get_char(tok);
230 tok->in_offset = old_offset;
231 tok->in_char_offset = old_char_offset;
232 return ret;
235 /** Append a char to the end of the current token */
236 int tok_push_char(tokenizer_t *tok, wchar_t ch)
238 return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
241 void tok_start_token(tokenizer_t *tok, token_type_t type)
243 tok->current_type = type;
246 /** Push the current token to the output array */
247 int tok_push_token(tokenizer_t *tok)
249 if (tok->outtok_offset >= tok->outtok_size) {
250 return EOVERFLOW;
253 if (tok->outbuf_offset >= tok->outbuf_size) {
254 return EOVERFLOW;
257 tok->outbuf[tok->outbuf_offset++] = 0;
258 token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
259 tokinfo->type = tok->current_type;
260 tokinfo->text = tok->outbuf + tok->outbuf_last_start;
261 tokinfo->byte_start = tok->last_in_offset;
262 tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
263 tokinfo->char_start = tok->last_in_char_offset;
264 tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
265 tok->outbuf_last_start = tok->outbuf_offset;
267 /* We have consumed the first char of the next token already */
268 tok->last_in_offset = tok->in_offset;
269 tok->last_in_char_offset = tok->in_char_offset;
271 return EOK;
274 /** Return true, if the current token is not empty */
275 bool tok_pending_chars(tokenizer_t *tok)
277 assert(tok->outbuf_offset >= tok->outbuf_last_start);
278 return (tok->outbuf_offset != tok->outbuf_last_start);