uspace/app/bdsh/tok.c

   1 /*
   2  * Copyright (c) 2011 Martin Sucha
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * - Redistributions of source code must retain the above copyright
  10  *   notice, this list of conditions and the following disclaimer.
  11  * - Redistributions in binary form must reproduce the above copyright
  12  *   notice, this list of conditions and the following disclaimer in the
  13  *   documentation and/or other materials provided with the distribution.
  14  * - The name of the author may not be used to endorse or promote products
  15  *   derived from this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 #include <str.h>
  30 #include <assert.h>
  31 #include <malloc.h>
  32 #include <stdlib.h>
  33 #include <stddef.h>
  34 #include <errno.h>
  35
  36 #include "tok.h"
  37
  38 /* Forward declarations of static functions */
  39 static wchar_t tok_get_char(tokenizer_t *);
  40 static wchar_t tok_look_char(tokenizer_t *);
  41 static int tok_push_char(tokenizer_t *, wchar_t);
  42 static int tok_push_token(tokenizer_t *);
  43 static bool tok_pending_chars(tokenizer_t *);
  44 static int tok_finish_string(tokenizer_t *);
  45 static void tok_start_token(tokenizer_t *, token_type_t);
  46
  47 /** Initialize the token parser
  48  *
  49  * @param tok the tokenizer structure to initialize
  50  * @param input the input string to tokenize
  51  * @param out_tokens array of strings where to store the result
  52  * @param max_tokens number of elements of the out_tokens array
  53  */
  54 int tok_init(tokenizer_t *tok, char *input, token_t *out_tokens,
  55     size_t max_tokens)
  56 {
  57         tok->in = input;
  58         tok->in_offset = 0;
  59         tok->last_in_offset = 0;
  60         tok->in_char_offset = 0;
  61         tok->last_in_char_offset = 0;
  62
  63         tok->outtok = out_tokens;
  64         tok->outtok_offset = 0;
  65         tok->outtok_size = max_tokens;
  66
  67         /* Prepare a buffer where all the token strings will be stored */
  68         size_t len = str_size(input) + max_tokens + 1;
  69         char *tmp = malloc(len);
  70
  71         if (tmp == NULL) {
  72                 return ENOMEM;
  73         }
  74
  75         tok->outbuf = tmp;
  76         tok->outbuf_offset = 0;
  77         tok->outbuf_size = len;
  78         tok->outbuf_last_start = 0;
  79
  80         return EOK;
  81 }
  82
  83 /** Finalize the token parser */
  84 void tok_fini(tokenizer_t *tok)
  85 {
  86         if (tok->outbuf != NULL) {
  87                 free(tok->outbuf);
  88         }
  89 }
  90
  91 /** Tokenize the input string into the tokens */
  92 int tok_tokenize(tokenizer_t *tok, size_t *tokens_length)
  93 {
  94         int rc;
  95         wchar_t next_char;
  96
  97         /* Read the input line char by char and append tokens */
  98         while ((next_char = tok_look_char(tok)) != 0) {
  99                 if (next_char == ' ') {
 100                         /* Push the token if there is any.
 101                          * There may not be any pending char for a token in case
 102                          * there are several spaces in the input.
 103                          */
 104                         if (tok_pending_chars(tok)) {
 105                                 rc = tok_push_token(tok);
 106                                 if (rc != EOK) {
 107                                         return rc;
 108                                 }
 109                         }
 110                         tok_start_token(tok, TOKTYPE_SPACE);
 111                         /* Eat all the spaces */
 112                         while (tok_look_char(tok) == ' ') {
 113                                 tok_push_char(tok, tok_get_char(tok));
 114                         }
 115                         tok_push_token(tok);
 116
 117                 }
 118                 else if (next_char == '|') {
 119                         /* Pipes are tokens that are delimiters and should be
 120                          * output as a separate token
 121                          */
 122                         if (tok_pending_chars(tok)) {
 123                                 rc = tok_push_token(tok);
 124                                 if (rc != EOK) {
 125                                         return rc;
 126                                 }
 127                         }
 128
 129                         tok_start_token(tok, TOKTYPE_PIPE);
 130
 131                         rc = tok_push_char(tok, tok_get_char(tok));
 132                         if (rc != EOK) {
 133                                 return rc;
 134                         }
 135
 136                         rc = tok_push_token(tok);
 137                         if (rc != EOK) {
 138                                 return rc;
 139                         }
 140                 }
 141                 else if (next_char == '\'') {
 142                         /* A string starts with a quote (') and ends again with a quote.
 143                          * A literal quote is written as ''
 144                          */
 145                         tok_start_token(tok, TOKTYPE_TEXT);
 146                         /* Eat the quote */
 147                         tok_get_char(tok);
 148                         rc = tok_finish_string(tok);
 149                         if (rc != EOK) {
 150                                 return rc;
 151                         }
 152                 }
 153                 else {
 154                         if (!tok_pending_chars(tok)) {
 155                                 tok_start_token(tok, TOKTYPE_TEXT);
 156                         }
 157                         /* If we are handling any other character, just append it to
 158                          * the current token.
 159                          */
 160                         rc = tok_push_char(tok, tok_get_char(tok));
 161                         if (rc != EOK) {
 162                                 return rc;
 163                         }
 164                 }
 165         }
 166
 167         /* Push the last token */
 168         if (tok_pending_chars(tok)) {
 169                 rc = tok_push_token(tok);
 170                 if (rc != EOK) {
 171                         return rc;
 172                 }
 173         }
 174
 175         *tokens_length = tok->outtok_offset;
 176
 177         return EOK;
 178 }
 179
 180 /** Finish tokenizing an opened string */
 181 int tok_finish_string(tokenizer_t *tok)
 182 {
 183         int rc;
 184         wchar_t next_char;
 185
 186         while ((next_char = tok_look_char(tok)) != 0) {
 187                 if (next_char == '\'') {
 188                         /* Eat the quote */
 189                         tok_get_char(tok);
 190                         if (tok_look_char(tok) == '\'') {
 191                                 /* Encode a single literal quote */
 192                                 rc = tok_push_char(tok, '\'');
 193                                 if (rc != EOK) {
 194                                         return rc;
 195                                 }
 196
 197                                 /* Swallow the additional one in the input */
 198                                 tok_get_char(tok);
 199                         }
 200                         else {
 201                                 /* The string end */
 202                                 return tok_push_token(tok);
 203                         }
 204                 }
 205                 else {
 206                         rc = tok_push_char(tok, tok_get_char(tok));
 207                         if (rc != EOK) {
 208                                 return rc;
 209                         }
 210                 }
 211         }
 212
 213         /* If we are here, the string run to the end without being closed */
 214         return EINVAL;
 215 }
 216
 217 /** Get a char from input, advancing the input position */
 218 wchar_t tok_get_char(tokenizer_t *tok)
 219 {
 220         tok->in_char_offset++;
 221         return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
 222 }
 223
 224 /** Get a char from input, while staying on the same input position */
 225 wchar_t tok_look_char(tokenizer_t *tok)
 226 {
 227         size_t old_offset = tok->in_offset;
 228         size_t old_char_offset = tok->in_char_offset;
 229         wchar_t ret = tok_get_char(tok);
 230         tok->in_offset = old_offset;
 231         tok->in_char_offset = old_char_offset;
 232         return ret;
 233 }
 234
 235 /** Append a char to the end of the current token */
 236 int tok_push_char(tokenizer_t *tok, wchar_t ch)
 237 {
 238         return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
 239 }
 240
 241 void tok_start_token(tokenizer_t *tok, token_type_t type)
 242 {
 243         tok->current_type = type;
 244 }
 245
 246 /** Push the current token to the output array */
 247 int tok_push_token(tokenizer_t *tok)
 248 {
 249         if (tok->outtok_offset >= tok->outtok_size) {
 250                 return EOVERFLOW;
 251         }
 252
 253         if (tok->outbuf_offset >= tok->outbuf_size) {
 254                 return EOVERFLOW;
 255         }
 256
 257         tok->outbuf[tok->outbuf_offset++] = 0;
 258         token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
 259         tokinfo->type = tok->current_type;
 260         tokinfo->text = tok->outbuf + tok->outbuf_last_start;
 261         tokinfo->byte_start = tok->last_in_offset;
 262         tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
 263         tokinfo->char_start = tok->last_in_char_offset;
 264         tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
 265         tok->outbuf_last_start = tok->outbuf_offset;
 266
 267         /* We have consumed the first char of the next token already */
 268         tok->last_in_offset = tok->in_offset;
 269         tok->last_in_char_offset = tok->in_char_offset;
 270
 271         return EOK;
 272 }
 273
 274 /** Return true, if the current token is not empty */
 275 bool tok_pending_chars(tokenizer_t *tok)
 276 {
 277         assert(tok->outbuf_offset >= tok->outbuf_last_start);
 278         return (tok->outbuf_offset != tok->outbuf_last_start);
 279 }