src/TortoisePlink/WILDCARD.C

   1 /*\r
   2  * Wildcard matching engine for use with SFTP-based file transfer\r
   3  * programs (PSFTP, new-look PSCP): since SFTP has no notion of\r
   4  * getting the remote side to do globbing (and rightly so) we have\r
   5  * to do it locally, by retrieving all the filenames in a directory\r
   6  * and checking each against the wildcard pattern.\r
   7  */\r
   8 \r
   9 #include <assert.h>\r
  10 #include <stdlib.h>\r
  11 #include <string.h>\r
  12 \r
  13 #include "putty.h"\r
  14 \r
  15 /*\r
  16  * Definition of wildcard syntax:\r
  17  * \r
  18  *  - * matches any sequence of characters, including zero.\r
  19  *  - ? matches exactly one character which can be anything.\r
  20  *  - [abc] matches exactly one character which is a, b or c.\r
  21  *  - [a-f] matches anything from a through f.\r
  22  *  - [^a-f] matches anything _except_ a through f.\r
  23  *  - [-_] matches - or _; [^-_] matches anything else. (The - is\r
  24  *    non-special if it occurs immediately after the opening\r
  25  *    bracket or ^.)\r
  26  *  - [a^] matches an a or a ^. (The ^ is non-special if it does\r
  27  *    _not_ occur immediately after the opening bracket.)\r
  28  *  - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.\r
  29  *  - All other characters are non-special and match themselves.\r
  30  */\r
  31 \r
  32 /*\r
  33  * Some notes on differences from POSIX globs (IEEE Std 1003.1, 2003 ed.):\r
  34  *  - backslashes act as escapes even within [] bracket expressions\r
  35  *  - does not support [!...] for non-matching list (POSIX are weird);\r
  36  *    NB POSIX allows [^...] as well via "A bracket expression starting\r
  37  *    with an unquoted circumflex character produces unspecified\r
  38  *    results". If we wanted to allow [!...] we might want to define\r
  39  *    [^!] as having its literal meaning (match '^' or '!').\r
  40  *  - none of the scary [[:class:]] stuff, etc\r
  41  */\r
  42 \r
  43 /*\r
  44  * The wildcard matching technique we use is very simple and\r
  45  * potentially O(N^2) in running time, but I don't anticipate it\r
  46  * being that bad in reality (particularly since N will be the size\r
  47  * of a filename, which isn't all that much). Perhaps one day, once\r
  48  * PuTTY has grown a regexp matcher for some other reason, I might\r
  49  * come back and reimplement wildcards by translating them into\r
  50  * regexps or directly into NFAs; but for the moment, in the\r
  51  * absence of any other need for the NFA->DFA translation engine,\r
  52  * anything more than the simplest possible wildcard matcher is\r
  53  * vast code-size overkill.\r
  54  * \r
  55  * Essentially, these wildcards are much simpler than regexps in\r
  56  * that they consist of a sequence of rigid fragments (? and [...]\r
  57  * can never match more or less than one character) separated by\r
  58  * asterisks. It is therefore extremely simple to look at a rigid\r
  59  * fragment and determine whether or not it begins at a particular\r
  60  * point in the test string; so we can search along the string\r
  61  * until we find each fragment, then search for the next. As long\r
  62  * as we find each fragment in the _first_ place it occurs, there\r
  63  * will never be a danger of having to backpedal and try to find it\r
  64  * again somewhere else.\r
  65  */\r
  66 \r
  67 enum {\r
  68     WC_TRAILINGBACKSLASH = 1,\r
  69     WC_UNCLOSEDCLASS,\r
  70     WC_INVALIDRANGE\r
  71 };\r
  72 \r
  73 /*\r
  74  * Error reporting is done by returning various negative values\r
  75  * from the wildcard routines. Passing any such value to wc_error\r
  76  * will give a human-readable message.\r
  77  */\r
  78 const char *wc_error(int value)\r
  79 {\r
  80     value = abs(value);\r
  81     switch (value) {\r
  82       case WC_TRAILINGBACKSLASH:\r
  83         return "'\' occurred at end of string (expected another character)";\r
  84       case WC_UNCLOSEDCLASS:\r
  85         return "expected ']' to close character class";\r
  86       case WC_INVALIDRANGE:\r
  87         return "character range was not terminated (']' just after '-')";\r
  88     }\r
  89     return "INTERNAL ERROR: unrecognised wildcard error number";\r
  90 }\r
  91 \r
  92 /*\r
  93  * This is the routine that tests a target string to see if an\r
  94  * initial substring of it matches a fragment. If successful, it\r
  95  * returns 1, and advances both `fragment' and `target' past the\r
  96  * fragment and matching substring respectively. If unsuccessful it\r
  97  * returns zero. If the wildcard fragment suffers a syntax error,\r
  98  * it returns <0 and the precise value indexes into wc_error.\r
  99  */\r
 100 static int wc_match_fragment(const char **fragment, const char **target)\r
 101 {\r
 102     const char *f, *t;\r
 103 \r
 104     f = *fragment;\r
 105     t = *target;\r
 106     /*\r
 107      * The fragment terminates at either the end of the string, or\r
 108      * the first (unescaped) *.\r
 109      */\r
 110     while (*f && *f != '*' && *t) {\r
 111         /*\r
 112          * Extract one character from t, and one character's worth\r
 113          * of pattern from f, and step along both. Return 0 if they\r
 114          * fail to match.\r
 115          */\r
 116         if (*f == '\\') {\r
 117             /*\r
 118              * Backslash, which means f[1] is to be treated as a\r
 119              * literal character no matter what it is. It may not\r
 120              * be the end of the string.\r
 121              */\r
 122             if (!f[1])\r
 123                 return -WC_TRAILINGBACKSLASH;   /* error */\r
 124             if (f[1] != *t)\r
 125                 return 0;              /* failed to match */\r
 126             f += 2;\r
 127         } else if (*f == '?') {\r
 128             /*\r
 129              * Question mark matches anything.\r
 130              */\r
 131             f++;\r
 132         } else if (*f == '[') {\r
 133             int invert = 0;\r
 134             int matched = 0;\r
 135             /*\r
 136              * Open bracket introduces a character class.\r
 137              */\r
 138             f++;\r
 139             if (*f == '^') {\r
 140                 invert = 1;\r
 141                 f++;\r
 142             }\r
 143             while (*f != ']') {\r
 144                 if (*f == '\\')\r
 145                     f++;               /* backslashes still work */\r
 146                 if (!*f)\r
 147                     return -WC_UNCLOSEDCLASS;   /* error again */\r
 148                 if (f[1] == '-') {\r
 149                     int lower, upper, ourchr;\r
 150                     lower = (unsigned char) *f++;\r
 151                     f++;               /* eat the minus */\r
 152                     if (*f == ']')\r
 153                         return -WC_INVALIDRANGE;   /* different error! */\r
 154                     if (*f == '\\')\r
 155                         f++;           /* backslashes _still_ work */\r
 156                     if (!*f)\r
 157                         return -WC_UNCLOSEDCLASS;   /* error again */\r
 158                     upper = (unsigned char) *f++;\r
 159                     ourchr = (unsigned char) *t;\r
 160                     if (lower > upper) {\r
 161                         int t = lower; lower = upper; upper = t;\r
 162                     }\r
 163                     if (ourchr >= lower && ourchr <= upper)\r
 164                         matched = 1;\r
 165                 } else {\r
 166                     matched |= (*t == *f++);\r
 167                 }\r
 168             }\r
 169             if (invert == matched)\r
 170                 return 0;              /* failed to match character class */\r
 171             f++;                       /* eat the ] */\r
 172         } else {\r
 173             /*\r
 174              * Non-special character matches itself.\r
 175              */\r
 176             if (*f != *t)\r
 177                 return 0;\r
 178             f++;\r
 179         }\r
 180         /*\r
 181          * Now we've done that, increment t past the character we\r
 182          * matched.\r
 183          */\r
 184         t++;\r
 185     }\r
 186     if (!*f || *f == '*') {\r
 187         /*\r
 188          * We have reached the end of f without finding a mismatch;\r
 189          * so we're done. Update the caller pointers and return 1.\r
 190          */\r
 191         *fragment = f;\r
 192         *target = t;\r
 193         return 1;\r
 194     }\r
 195     /*\r
 196      * Otherwise, we must have reached the end of t before we\r
 197      * reached the end of f; so we've failed. Return 0. \r
 198      */\r
 199     return 0;\r
 200 }\r
 201 \r
 202 /*\r
 203  * This is the real wildcard matching routine. It returns 1 for a\r
 204  * successful match, 0 for an unsuccessful match, and <0 for a\r
 205  * syntax error in the wildcard.\r
 206  */\r
 207 int wc_match(const char *wildcard, const char *target)\r
 208 {\r
 209     int ret;\r
 210 \r
 211     /*\r
 212      * Every time we see a '*' _followed_ by a fragment, we just\r
 213      * search along the string for a location at which the fragment\r
 214      * matches. The only special case is when we see a fragment\r
 215      * right at the start, in which case we just call the matching\r
 216      * routine once and give up if it fails.\r
 217      */\r
 218     if (*wildcard != '*') {\r
 219         ret = wc_match_fragment(&wildcard, &target);\r
 220         if (ret <= 0)\r
 221             return ret;                /* pass back failure or error alike */\r
 222     }\r
 223 \r
 224     while (*wildcard) {\r
 225         assert(*wildcard == '*');\r
 226         while (*wildcard == '*')\r
 227             wildcard++;\r
 228 \r
 229         /*\r
 230          * It's possible we've just hit the end of the wildcard\r
 231          * after seeing a *, in which case there's no need to\r
 232          * bother searching any more because we've won.\r
 233          */\r
 234         if (!*wildcard)\r
 235             return 1;\r
 236 \r
 237         /*\r
 238          * Now `wildcard' points at the next fragment. So we\r
 239          * attempt to match it against `target', and if that fails\r
 240          * we increment `target' and try again, and so on. When we\r
 241          * find we're about to try matching against the empty\r
 242          * string, we give up and return 0.\r
 243          */\r
 244         ret = 0;\r
 245         while (*target) {\r
 246             const char *save_w = wildcard, *save_t = target;\r
 247 \r
 248             ret = wc_match_fragment(&wildcard, &target);\r
 249 \r
 250             if (ret < 0)\r
 251                 return ret;            /* syntax error */\r
 252 \r
 253             if (ret > 0 && !*wildcard && *target) {\r
 254                 /*\r
 255                  * Final special case - literally.\r
 256                  * \r
 257                  * This situation arises when we are matching a\r
 258                  * _terminal_ fragment of the wildcard (that is,\r
 259                  * there is nothing after it, e.g. "*a"), and it\r
 260                  * has matched _too early_. For example, matching\r
 261                  * "*a" against "parka" will match the "a" fragment\r
 262                  * against the _first_ a, and then (if it weren't\r
 263                  * for this special case) matching would fail\r
 264                  * because we're at the end of the wildcard but not\r
 265                  * at the end of the target string.\r
 266                  * \r
 267                  * In this case what we must do is measure the\r
 268                  * length of the fragment in the target (which is\r
 269                  * why we saved `target'), jump straight to that\r
 270                  * distance from the end of the string using\r
 271                  * strlen, and match the same fragment again there\r
 272                  * (which is why we saved `wildcard'). Then we\r
 273                  * return whatever that operation returns.\r
 274                  */\r
 275                 target = save_t + strlen(save_t) - (target - save_t);\r
 276                 wildcard = save_w;\r
 277                 return wc_match_fragment(&wildcard, &target);\r
 278             }\r
 279 \r
 280             if (ret > 0)\r
 281                 break;\r
 282             target++;\r
 283         }\r
 284         if (ret > 0)\r
 285             continue;\r
 286         return 0;\r
 287     }\r
 288 \r
 289     /*\r
 290      * If we reach here, it must be because we successfully matched\r
 291      * a fragment and then found ourselves right at the end of the\r
 292      * wildcard. Hence, we return 1 if and only if we are also\r
 293      * right at the end of the target.\r
 294      */\r
 295     return (*target ? 0 : 1);\r
 296 }\r
 297 \r
 298 /*\r
 299  * Another utility routine that translates a non-wildcard string\r
 300  * into its raw equivalent by removing any escaping backslashes.\r
 301  * Expects a target string buffer of anything up to the length of\r
 302  * the original wildcard. You can also pass NULL as the output\r
 303  * buffer if you're only interested in the return value.\r
 304  * \r
 305  * Returns 1 on success, or 0 if a wildcard character was\r
 306  * encountered. In the latter case the output string MAY not be\r
 307  * zero-terminated and you should not use it for anything!\r
 308  */\r
 309 int wc_unescape(char *output, const char *wildcard)\r
 310 {\r
 311     while (*wildcard) {\r
 312         if (*wildcard == '\\') {\r
 313             wildcard++;\r
 314             /* We are lenient about trailing backslashes in non-wildcards. */\r
 315             if (*wildcard) {\r
 316                 if (output)\r
 317                     *output++ = *wildcard;\r
 318                 wildcard++;\r
 319             }\r
 320         } else if (*wildcard == '*' || *wildcard == '?' ||\r
 321                    *wildcard == '[' || *wildcard == ']') {\r
 322             return 0;                  /* it's a wildcard! */\r
 323         } else {\r
 324             if (output)\r
 325                 *output++ = *wildcard;\r
 326             wildcard++;\r
 327         }\r
 328     }\r
 329     *output = '\0';\r
 330     return 1;                          /* it's clean */\r
 331 }\r
 332 \r
 333 #ifdef TESTMODE\r
 334 \r
 335 struct test {\r
 336     const char *wildcard;\r
 337     const char *target;\r
 338     int expected_result;\r
 339 };\r
 340 \r
 341 const struct test fragment_tests[] = {\r
 342     /*\r
 343      * We exhaustively unit-test the fragment matching routine\r
 344      * itself, which should save us the need to test all its\r
 345      * intricacies during the full wildcard tests.\r
 346      */\r
 347     {"abc", "abc", 1},\r
 348     {"abc", "abd", 0},\r
 349     {"abc", "abcd", 1},\r
 350     {"abcd", "abc", 0},\r
 351     {"ab[cd]", "abc", 1},\r
 352     {"ab[cd]", "abd", 1},\r
 353     {"ab[cd]", "abe", 0},\r
 354     {"ab[^cd]", "abc", 0},\r
 355     {"ab[^cd]", "abd", 0},\r
 356     {"ab[^cd]", "abe", 1},\r
 357     {"ab\\", "abc", -WC_TRAILINGBACKSLASH},\r
 358     {"ab\\*", "ab*", 1},\r
 359     {"ab\\?", "ab*", 0},\r
 360     {"ab?", "abc", 1},\r
 361     {"ab?", "ab", 0},\r
 362     {"ab[", "abc", -WC_UNCLOSEDCLASS},\r
 363     {"ab[c-", "abb", -WC_UNCLOSEDCLASS},\r
 364     {"ab[c-]", "abb", -WC_INVALIDRANGE},\r
 365     {"ab[c-e]", "abb", 0},\r
 366     {"ab[c-e]", "abc", 1},\r
 367     {"ab[c-e]", "abd", 1},\r
 368     {"ab[c-e]", "abe", 1},\r
 369     {"ab[c-e]", "abf", 0},\r
 370     {"ab[e-c]", "abb", 0},\r
 371     {"ab[e-c]", "abc", 1},\r
 372     {"ab[e-c]", "abd", 1},\r
 373     {"ab[e-c]", "abe", 1},\r
 374     {"ab[e-c]", "abf", 0},\r
 375     {"ab[^c-e]", "abb", 1},\r
 376     {"ab[^c-e]", "abc", 0},\r
 377     {"ab[^c-e]", "abd", 0},\r
 378     {"ab[^c-e]", "abe", 0},\r
 379     {"ab[^c-e]", "abf", 1},\r
 380     {"ab[^e-c]", "abb", 1},\r
 381     {"ab[^e-c]", "abc", 0},\r
 382     {"ab[^e-c]", "abd", 0},\r
 383     {"ab[^e-c]", "abe", 0},\r
 384     {"ab[^e-c]", "abf", 1},\r
 385     {"ab[a^]", "aba", 1},\r
 386     {"ab[a^]", "ab^", 1},\r
 387     {"ab[a^]", "abb", 0},\r
 388     {"ab[^a^]", "aba", 0},\r
 389     {"ab[^a^]", "ab^", 0},\r
 390     {"ab[^a^]", "abb", 1},\r
 391     {"ab[-c]", "ab-", 1},\r
 392     {"ab[-c]", "abc", 1},\r
 393     {"ab[-c]", "abd", 0},\r
 394     {"ab[^-c]", "ab-", 0},\r
 395     {"ab[^-c]", "abc", 0},\r
 396     {"ab[^-c]", "abd", 1},\r
 397     {"ab[\\[-\\]]", "abZ", 0},\r
 398     {"ab[\\[-\\]]", "ab[", 1},\r
 399     {"ab[\\[-\\]]", "ab\\", 1},\r
 400     {"ab[\\[-\\]]", "ab]", 1},\r
 401     {"ab[\\[-\\]]", "ab^", 0},\r
 402     {"ab[^\\[-\\]]", "abZ", 1},\r
 403     {"ab[^\\[-\\]]", "ab[", 0},\r
 404     {"ab[^\\[-\\]]", "ab\\", 0},\r
 405     {"ab[^\\[-\\]]", "ab]", 0},\r
 406     {"ab[^\\[-\\]]", "ab^", 1},\r
 407     {"ab[a-fA-F]", "aba", 1},\r
 408     {"ab[a-fA-F]", "abF", 1},\r
 409     {"ab[a-fA-F]", "abZ", 0},\r
 410 };\r
 411 \r
 412 const struct test full_tests[] = {\r
 413     {"a", "argh", 0},\r
 414     {"a", "ba", 0},\r
 415     {"a", "a", 1},\r
 416     {"a*", "aardvark", 1},\r
 417     {"a*", "badger", 0},\r
 418     {"*a", "park", 0},\r
 419     {"*a", "pArka", 1},\r
 420     {"*a", "parka", 1},\r
 421     {"*a*", "park", 1},\r
 422     {"*a*", "perk", 0},\r
 423     {"?b*r?", "abracadabra", 1},\r
 424     {"?b*r?", "abracadabr", 0},\r
 425     {"?b*r?", "abracadabzr", 0},\r
 426 };\r
 427 \r
 428 int main(void)\r
 429 {\r
 430     int i;\r
 431     int fails, passes;\r
 432 \r
 433     fails = passes = 0;\r
 434 \r
 435     for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {\r
 436         const char *f, *t;\r
 437         int eret, aret;\r
 438         f = fragment_tests[i].wildcard;\r
 439         t = fragment_tests[i].target;\r
 440         eret = fragment_tests[i].expected_result;\r
 441         aret = wc_match_fragment(&f, &t);\r
 442         if (aret != eret) {\r
 443             printf("failed test: /%s/ against /%s/ returned %d not %d\n",\r
 444                    fragment_tests[i].wildcard, fragment_tests[i].target,\r
 445                    aret, eret);\r
 446             fails++;\r
 447         } else\r
 448             passes++;\r
 449     }\r
 450 \r
 451     for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {\r
 452         const char *f, *t;\r
 453         int eret, aret;\r
 454         f = full_tests[i].wildcard;\r
 455         t = full_tests[i].target;\r
 456         eret = full_tests[i].expected_result;\r
 457         aret = wc_match(f, t);\r
 458         if (aret != eret) {\r
 459             printf("failed test: /%s/ against /%s/ returned %d not %d\n",\r
 460                    full_tests[i].wildcard, full_tests[i].target,\r
 461                    aret, eret);\r
 462             fails++;\r
 463         } else\r
 464             passes++;\r
 465     }\r
 466 \r
 467     printf("passed %d, failed %d\n", passes, fails);\r
 468 \r
 469     return 0;\r
 470 }\r
 471 \r
 472 #endif\r