glib/gshell.c

   1 /* gshell.c - Shell-related utilities
   2  *
   3  *  Copyright 2000 Red Hat, Inc.
   4  *  g_execvpe implementation based on GNU libc execvp:
   5  *   Copyright 1991, 92, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public License
  18  * along with this library; if not, see <http://www.gnu.org/licenses/>.
  19  */
  20
  21 #include "config.h"
  22
  23 #include <string.h>
  24
  25 #include "gshell.h"
  26
  27 #include "gslist.h"
  28 #include "gstrfuncs.h"
  29 #include "gstring.h"
  30 #include "gtestutils.h"
  31 #include "glibintl.h"
  32 #include "gthread.h"
  33
  34 /**
  35  * SECTION:shell
  36  * @title: Shell-related Utilities
  37  * @short_description: shell-like commandline handling
  38  *
  39  * GLib provides the functions g_shell_quote() and g_shell_unquote()
  40  * to handle shell-like quoting in strings. The function g_shell_parse_argv()
  41  * parses a string similar to the way a POSIX shell (/bin/sh) would.
  42  *
  43  * Note that string handling in shells has many obscure and historical
  44  * corner-cases which these functions do not necessarily reproduce. They
  45  * are good enough in practice, though.
  46  */
  47
  48 /**
  49  * G_SHELL_ERROR:
  50  *
  51  * Error domain for shell functions. Errors in this domain will be from
  52  * the #GShellError enumeration. See #GError for information on error
  53  * domains.
  54  **/
  55
  56 /**
  57  * GShellError:
  58  * @G_SHELL_ERROR_BAD_QUOTING: Mismatched or otherwise mangled quoting.
  59  * @G_SHELL_ERROR_EMPTY_STRING: String to be parsed was empty.
  60  * @G_SHELL_ERROR_FAILED: Some other error.
  61  *
  62  * Error codes returned by shell functions.
  63  **/
/* NOTE(review): presumably expands to the definition of
 * g_shell_error_quark(), the function behind the G_SHELL_ERROR domain
 * that is passed to g_set_error() throughout this file - confirm
 * against the G_DEFINE_QUARK macro definition.
 */
G_DEFINE_QUARK (g-shell-error-quark, g_shell_error)
  65
/* Single quotes preserve the literal string exactly. Escape
 * sequences are not allowed; not even \' - if you want a '
 * in the quoted text, you have to do something like 'foo'\''bar'
 *
 * Double quotes allow $ ` " \ and newline to be escaped with backslash.
 * Otherwise double quotes preserve things literally.
 */
  73
  74 static gboolean
  75 unquote_string_inplace (gchar* str, gchar** end, GError** err)
  76 {
  77   gchar* dest;
  78   gchar* s;
  79   gchar quote_char;
  80
  81   g_return_val_if_fail(end != NULL, FALSE);
  82   g_return_val_if_fail(err == NULL || *err == NULL, FALSE);
  83   g_return_val_if_fail(str != NULL, FALSE);
  84
  85   dest = s = str;
  86
  87   quote_char = *s;
  88
  89   if (!(*s == '"' || *s == '\''))
  90     {
  91       g_set_error_literal (err,
  92                            G_SHELL_ERROR,
  93                            G_SHELL_ERROR_BAD_QUOTING,
  94                            _("Quoted text doesn’t begin with a quotation mark"));
  95       *end = str;
  96       return FALSE;
  97     }
  98
  99   /* Skip the initial quote mark */
 100   ++s;
 101
 102   if (quote_char == '"')
 103     {
 104       while (*s)
 105         {
 106           g_assert(s > dest); /* loop invariant */
 107
 108           switch (*s)
 109             {
 110             case '"':
 111               /* End of the string, return now */
 112               *dest = '\0';
 113               ++s;
 114               *end = s;
 115               return TRUE;
 116               break;
 117
 118             case '\\':
 119               /* Possible escaped quote or \ */
 120               ++s;
 121               switch (*s)
 122                 {
 123                 case '"':
 124                 case '\\':
 125                 case '`':
 126                 case '$':
 127                 case '\n':
 128                   *dest = *s;
 129                   ++s;
 130                   ++dest;
 131                   break;
 132
 133                 default:
 134                   /* not an escaped char */
 135                   *dest = '\\';
 136                   ++dest;
 137                   /* ++s already done. */
 138                   break;
 139                 }
 140               break;
 141
 142             default:
 143               *dest = *s;
 144               ++dest;
 145               ++s;
 146               break;
 147             }
 148
 149           g_assert(s > dest); /* loop invariant */
 150         }
 151     }
 152   else
 153     {
 154       while (*s)
 155         {
 156           g_assert(s > dest); /* loop invariant */
 157
 158           if (*s == '\'')
 159             {
 160               /* End of the string, return now */
 161               *dest = '\0';
 162               ++s;
 163               *end = s;
 164               return TRUE;
 165             }
 166           else
 167             {
 168               *dest = *s;
 169               ++dest;
 170               ++s;
 171             }
 172
 173           g_assert(s > dest); /* loop invariant */
 174         }
 175     }
 176
 177   /* If we reach here this means the close quote was never encountered */
 178
 179   *dest = '\0';
 180
 181   g_set_error_literal (err,
 182                        G_SHELL_ERROR,
 183                        G_SHELL_ERROR_BAD_QUOTING,
 184                        _("Unmatched quotation mark in command line or other shell-quoted text"));
 185   *end = s;
 186   return FALSE;
 187 }
 188
 189 /**
 190  * g_shell_quote:
 191  * @unquoted_string: (type filename): a literal string
 192  *
 193  * Quotes a string so that the shell (/bin/sh) will interpret the
 194  * quoted string to mean @unquoted_string. If you pass a filename to
 195  * the shell, for example, you should first quote it with this
 196  * function.  The return value must be freed with g_free(). The
 197  * quoting style used is undefined (single or double quotes may be
 198  * used).
 199  *
 200  * Returns: (type filename): quoted string
 201  **/
 202 gchar*
 203 g_shell_quote (const gchar *unquoted_string)
 204 {
 205   /* We always use single quotes, because the algorithm is cheesier.
 206    * We could use double if we felt like it, that might be more
 207    * human-readable.
 208    */
 209
 210   const gchar *p;
 211   GString *dest;
 212
 213   g_return_val_if_fail (unquoted_string != NULL, NULL);
 214
 215   dest = g_string_new ("'");
 216
 217   p = unquoted_string;
 218
 219   /* could speed this up a lot by appending chunks of text at a
 220    * time.
 221    */
 222   while (*p)
 223     {
 224       /* Replace literal ' with a close ', a \', and a open ' */
 225       if (*p == '\'')
 226         g_string_append (dest, "'\\''");
 227       else
 228         g_string_append_c (dest, *p);
 229
 230       ++p;
 231     }
 232
 233   /* close the quote */
 234   g_string_append_c (dest, '\'');
 235
 236   return g_string_free (dest, FALSE);
 237 }
 238
 239 /**
 240  * g_shell_unquote:
 241  * @quoted_string: (type filename): shell-quoted string
 242  * @error: error return location or NULL
 243  *
 244  * Unquotes a string as the shell (/bin/sh) would. Only handles
 245  * quotes; if a string contains file globs, arithmetic operators,
 246  * variables, backticks, redirections, or other special-to-the-shell
 247  * features, the result will be different from the result a real shell
 248  * would produce (the variables, backticks, etc. will be passed
 249  * through literally instead of being expanded). This function is
 250  * guaranteed to succeed if applied to the result of
 251  * g_shell_quote(). If it fails, it returns %NULL and sets the
 252  * error. The @quoted_string need not actually contain quoted or
 253  * escaped text; g_shell_unquote() simply goes through the string and
 254  * unquotes/unescapes anything that the shell would. Both single and
 255  * double quotes are handled, as are escapes including escaped
 256  * newlines. The return value must be freed with g_free(). Possible
 257  * errors are in the #G_SHELL_ERROR domain.
 258  *
 259  * Shell quoting rules are a bit strange. Single quotes preserve the
 260  * literal string exactly. escape sequences are not allowed; not even
 261  * \' - if you want a ' in the quoted text, you have to do something
 262  * like 'foo'\''bar'.  Double quotes allow $, `, ", \, and newline to
 263  * be escaped with backslash. Otherwise double quotes preserve things
 264  * literally.
 265  *
 266  * Returns: (type filename): an unquoted string
 267  **/
 268 gchar*
 269 g_shell_unquote (const gchar *quoted_string,
 270                  GError     **error)
 271 {
 272   gchar *unquoted;
 273   gchar *end;
 274   gchar *start;
 275   GString *retval;
 276
 277   g_return_val_if_fail (quoted_string != NULL, NULL);
 278
 279   unquoted = g_strdup (quoted_string);
 280
 281   start = unquoted;
 282   end = unquoted;
 283   retval = g_string_new (NULL);
 284
 285   /* The loop allows cases such as
 286    * "foo"blah blah'bar'woo foo"baz"la la la\'\''foo'
 287    */
 288   while (*start)
 289     {
 290       /* Append all non-quoted chars, honoring backslash escape
 291        */
 292
 293       while (*start && !(*start == '"' || *start == '\''))
 294         {
 295           if (*start == '\\')
 296             {
 297               /* all characters can get escaped by backslash,
 298                * except newline, which is removed if it follows
 299                * a backslash outside of quotes
 300                */
 301
 302               ++start;
 303               if (*start)
 304                 {
 305                   if (*start != '\n')
 306                     g_string_append_c (retval, *start);
 307                   ++start;
 308                 }
 309             }
 310           else
 311             {
 312               g_string_append_c (retval, *start);
 313               ++start;
 314             }
 315         }
 316
 317       if (*start)
 318         {
 319           if (!unquote_string_inplace (start, &end, error))
 320             {
 321               goto error;
 322             }
 323           else
 324             {
 325               g_string_append (retval, start);
 326               start = end;
 327             }
 328         }
 329     }
 330
 331   g_free (unquoted);
 332   return g_string_free (retval, FALSE);
 333
 334  error:
 335   g_assert (error == NULL || *error != NULL);
 336
 337   g_free (unquoted);
 338   g_string_free (retval, TRUE);
 339   return NULL;
 340 }
 341
/* g_shell_parse_argv() does a semi-arbitrary weird subset of the way
 343  * the shell parses a command line. We don't do variable expansion,
 344  * don't understand that operators are tokens, don't do tilde expansion,
 345  * don't do command substitution, no arithmetic expansion, IFS gets ignored,
 346  * don't do filename globs, don't remove redirection stuff, etc.
 347  *
 348  * READ THE UNIX98 SPEC on "Shell Command Language" before changing
 349  * the behavior of this code.
 350  *
 351  * Steps to parsing the argv string:
 352  *
 353  *  - tokenize the string (but since we ignore operators,
 354  *    our tokenization may diverge from what the shell would do)
 355  *    note that tokenization ignores the internals of a quoted
 356  *    word and it always splits on spaces, not on IFS even
 357  *    if we used IFS. We also ignore "end of input indicator"
 358  *    (I guess this is control-D?)
 359  *
 360  *    Tokenization steps, from UNIX98 with operator stuff removed,
 361  *    are:
 362  *
 363  *    1) "If the current character is backslash, single-quote or
 364  *        double-quote (\, ' or ") and it is not quoted, it will affect
 365  *        quoting for subsequent characters up to the end of the quoted
 *        text. The rules for quoting are as described in Quoting.
 *        During token recognition no substitutions will be actually
 368  *        performed, and the result token will contain exactly the
 369  *        characters that appear in the input (except for newline
 370  *        character joining), unmodified, including any embedded or
 371  *        enclosing quotes or substitution operators, between the quote
 372  *        mark and the end of the quoted text. The token will not be
 373  *        delimited by the end of the quoted field."
 374  *
 375  *    2) "If the current character is an unquoted newline character,
 376  *        the current token will be delimited."
 377  *
 378  *    3) "If the current character is an unquoted blank character, any
 379  *        token containing the previous character is delimited and the
 380  *        current character will be discarded."
 381  *
 382  *    4) "If the previous character was part of a word, the current
 383  *        character will be appended to that word."
 384  *
 385  *    5) "If the current character is a "#", it and all subsequent
 386  *        characters up to, but excluding, the next newline character
 387  *        will be discarded as a comment. The newline character that
 388  *        ends the line is not considered part of the comment. The
 389  *        "#" starts a comment only when it is at the beginning of a
 390  *        token. Since the search for the end-of-comment does not
 391  *        consider an escaped newline character specially, a comment
 392  *        cannot be continued to the next line."
 393  *
 394  *    6) "The current character will be used as the start of a new word."
 395  *
 396  *
 397  *  - for each token (word), perform portions of word expansion, namely
 398  *    field splitting (using default whitespace IFS) and quote
 399  *    removal.  Field splitting may increase the number of words.
 400  *    Quote removal does not increase the number of words.
 401  *
 402  *   "If the complete expansion appropriate for a word results in an
 403  *   empty field, that empty field will be deleted from the list of
 404  *   fields that form the completely expanded command, unless the
 405  *   original word contained single-quote or double-quote characters."
 406  *    - UNIX98 spec
 407  *
 408  *
 409  */
 410
 411 static inline void
 412 ensure_token (GString **token)
 413 {
 414   if (*token == NULL)
 415     *token = g_string_new (NULL);
 416 }
 417
 418 static void
 419 delimit_token (GString **token,
 420                GSList **retval)
 421 {
 422   if (*token == NULL)
 423     return;
 424
 425   *retval = g_slist_prepend (*retval, g_string_free (*token, FALSE));
 426
 427   *token = NULL;
 428 }
 429
 430 static GSList*
 431 tokenize_command_line (const gchar *command_line,
 432                        GError **error)
 433 {
 434   gchar current_quote;
 435   const gchar *p;
 436   GString *current_token = NULL;
 437   GSList *retval = NULL;
 438   gboolean quoted;
 439
 440   current_quote = '\0';
 441   quoted = FALSE;
 442   p = command_line;
 443
 444   while (*p)
 445     {
 446       if (current_quote == '\\')
 447         {
 448           if (*p == '\n')
 449             {
 450               /* we append nothing; backslash-newline become nothing */
 451             }
 452           else
 453             {
 454               /* we append the backslash and the current char,
 455                * to be interpreted later after tokenization
 456                */
 457               ensure_token (&current_token);
 458               g_string_append_c (current_token, '\\');
 459               g_string_append_c (current_token, *p);
 460             }
 461
 462           current_quote = '\0';
 463         }
 464       else if (current_quote == '#')
 465         {
 466           /* Discard up to and including next newline */
 467           while (*p && *p != '\n')
 468             ++p;
 469
 470           current_quote = '\0';
 471
 472           if (*p == '\0')
 473             break;
 474         }
 475       else if (current_quote)
 476         {
 477           if (*p == current_quote &&
 478               /* check that it isn't an escaped double quote */
 479               !(current_quote == '"' && quoted))
 480             {
 481               /* close the quote */
 482               current_quote = '\0';
 483             }
 484
 485           /* Everything inside quotes, and the close quote,
 486            * gets appended literally.
 487            */
 488
 489           ensure_token (&current_token);
 490           g_string_append_c (current_token, *p);
 491         }
 492       else
 493         {
 494           switch (*p)
 495             {
 496             case '\n':
 497               delimit_token (&current_token, &retval);
 498               break;
 499
 500             case ' ':
 501             case '\t':
 502               /* If the current token contains the previous char, delimit
 503                * the current token. A nonzero length
 504                * token should always contain the previous char.
 505                */
 506               if (current_token &&
 507                   current_token->len > 0)
 508                 {
 509                   delimit_token (&current_token, &retval);
 510                 }
 511
 512               /* discard all unquoted blanks (don't add them to a token) */
 513               break;
 514
 515
 516               /* single/double quotes are appended to the token,
 517                * escapes are maybe appended next time through the loop,
 518                * comment chars are never appended.
 519                */
 520
 521             case '\'':
 522             case '"':
 523               ensure_token (&current_token);
 524               g_string_append_c (current_token, *p);
 525
 526               /* FALL THRU */
 527             case '\\':
 528               current_quote = *p;
 529               break;
 530
 531             case '#':
 532               if (p == command_line)
 533                 { /* '#' was the first char */
 534                   current_quote = *p;
 535                   break;
 536                 }
 537               switch(*(p-1))
 538                 {
 539                   case ' ':
 540                   case '\n':
 541                   case '\0':
 542                     current_quote = *p;
 543                     break;
 544                   default:
 545                     ensure_token (&current_token);
 546                     g_string_append_c (current_token, *p);
 547                     break;
 548                 }
 549               break;
 550
 551             default:
 552               /* Combines rules 4) and 6) - if we have a token, append to it,
 553                * otherwise create a new token.
 554                */
 555               ensure_token (&current_token);
 556               g_string_append_c (current_token, *p);
 557               break;
 558             }
 559         }
 560
 561       /* We need to count consecutive backslashes mod 2,
 562        * to detect escaped doublequotes.
 563        */
 564       if (*p != '\\')
 565         quoted = FALSE;
 566       else
 567         quoted = !quoted;
 568
 569       ++p;
 570     }
 571
 572   delimit_token (&current_token, &retval);
 573
 574   if (current_quote)
 575     {
 576       if (current_quote == '\\')
 577         g_set_error (error,
 578                      G_SHELL_ERROR,
 579                      G_SHELL_ERROR_BAD_QUOTING,
 580                      _("Text ended just after a “\\” character."
 581                        " (The text was “%s”)"),
 582                      command_line);
 583       else
 584         g_set_error (error,
 585                      G_SHELL_ERROR,
 586                      G_SHELL_ERROR_BAD_QUOTING,
 587                      _("Text ended before matching quote was found for %c."
 588                        " (The text was “%s”)"),
 589                      current_quote, command_line);
 590
 591       goto error;
 592     }
 593
 594   if (retval == NULL)
 595     {
 596       g_set_error_literal (error,
 597                            G_SHELL_ERROR,
 598                            G_SHELL_ERROR_EMPTY_STRING,
 599                            _("Text was empty (or contained only whitespace)"));
 600
 601       goto error;
 602     }
 603
 604   /* we appended backward */
 605   retval = g_slist_reverse (retval);
 606
 607   return retval;
 608
 609  error:
 610   g_assert (error == NULL || *error != NULL);
 611
 612   g_slist_free_full (retval, g_free);
 613
 614   return NULL;
 615 }
 616
 617 /**
 618  * g_shell_parse_argv:
 619  * @command_line: (type filename): command line to parse
 620  * @argcp: (out) (optional): return location for number of args
 621  * @argvp: (out) (optional) (array length=argcp zero-terminated=1) (element-type filename):
 622  *   return location for array of args
 623  * @error: (optional): return location for error
 624  *
 625  * Parses a command line into an argument vector, in much the same way
 626  * the shell would, but without many of the expansions the shell would
 627  * perform (variable expansion, globs, operators, filename expansion,
 628  * etc. are not supported). The results are defined to be the same as
 629  * those you would get from a UNIX98 /bin/sh, as long as the input
 630  * contains none of the unsupported shell expansions. If the input
 631  * does contain such expansions, they are passed through
 632  * literally. Possible errors are those from the #G_SHELL_ERROR
 633  * domain. Free the returned vector with g_strfreev().
 634  *
 635  * Returns: %TRUE on success, %FALSE if error set
 636  **/
 637 gboolean
 638 g_shell_parse_argv (const gchar *command_line,
 639                     gint        *argcp,
 640                     gchar     ***argvp,
 641                     GError     **error)
 642 {
 643   /* Code based on poptParseArgvString() from libpopt */
 644   gint argc = 0;
 645   gchar **argv = NULL;
 646   GSList *tokens = NULL;
 647   gint i;
 648   GSList *tmp_list;
 649
 650   g_return_val_if_fail (command_line != NULL, FALSE);
 651
 652   tokens = tokenize_command_line (command_line, error);
 653   if (tokens == NULL)
 654     return FALSE;
 655
 656   /* Because we can't have introduced any new blank space into the
 657    * tokens (we didn't do any new expansions), we don't need to
 658    * perform field splitting. If we were going to honor IFS or do any
 659    * expansions, we would have to do field splitting on each word
 660    * here. Also, if we were going to do any expansion we would need to
 661    * remove any zero-length words that didn't contain quotes
 662    * originally; but since there's no expansion we know all words have
 663    * nonzero length, unless they contain quotes.
 664    *
 665    * So, we simply remove quotes, and don't do any field splitting or
 666    * empty word removal, since we know there was no way to introduce
 667    * such things.
 668    */
 669
 670   argc = g_slist_length (tokens);
 671   argv = g_new0 (gchar*, argc + 1);
 672   i = 0;
 673   tmp_list = tokens;
 674   while (tmp_list)
 675     {
 676       argv[i] = g_shell_unquote (tmp_list->data, error);
 677
 678       /* Since we already checked that quotes matched up in the
 679        * tokenizer, this shouldn't be possible to reach I guess.
 680        */
 681       if (argv[i] == NULL)
 682         goto failed;
 683
 684       tmp_list = g_slist_next (tmp_list);
 685       ++i;
 686     }
 687
 688   g_slist_free_full (tokens, g_free);
 689
 690   if (argcp)
 691     *argcp = argc;
 692
 693   if (argvp)
 694     *argvp = argv;
 695   else
 696     g_strfreev (argv);
 697
 698   return TRUE;
 699
 700  failed:
 701
 702   g_assert (error == NULL || *error != NULL);
 703   g_strfreev (argv);
 704   g_slist_free_full (tokens, g_free);
 705
 706   return FALSE;
 707 }