Added Bas van Gompel.
[findutils.git] / locate / locate.c
blob2d9bb50a7846f3d60999001b5ffae672150009cb
1 /* locate -- search databases for filenames that match patterns
2 Copyright (C) 1994, 96, 98, 99, 2000, 2003 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
17 USA.
20 /* Usage: locate [options] pattern...
22 Scan a pathname list for the full pathname of a file, given only
23 a piece of the name (possibly containing shell globbing metacharacters).
24 The list has been processed with front-compression, which reduces
25 the list size by a factor of 4-5.
26 Recognizes two database formats, old and new. The old format is
27 bigram coded, which reduces space by a further 20-25% and uses the
28 following encoding of the database bytes:
30 0-28 likeliest differential counts + offset (14) to make nonnegative
31 30 escape code for out-of-range count to follow in next halfword
32 128-255 bigram codes (the 128 most common, as determined by `updatedb')
33 32-127 single character (printable) ASCII remainder
35 Uses a novel two-tiered string search technique:
37 First, match a metacharacter-free subpattern and a partial pathname
38 BACKWARDS to avoid full expansion of the pathname list.
39 The time savings is 40-50% over forward matching, which cannot efficiently
40 handle overlapped search patterns and compressed path remainders.
42 Then, match the actual shell glob-style regular expression (if in this form)
43 against the candidate pathnames using the slower shell filename
44 matching routines.
46 Described more fully in Usenix ;login:, Vol 8, No 1,
47 February/March, 1983, p. 8.
49 Written by James A. Woods <jwoods@adobe.com>.
50 Modified by David MacKenzie <djm@gnu.org>. */
52 #include <config.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <sys/types.h>
56 #include <sys/stat.h>
57 #include <time.h>
58 #include <fnmatch.h>
59 #include <getopt.h>
60 #include <xstrtol.h>
62 #define NDEBUG
63 #include <assert.h>
65 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
66 #include <string.h>
67 #else
68 #include <strings.h>
69 #define strchr index
70 #endif
72 #ifdef STDC_HEADERS
73 #include <stdlib.h>
74 #endif
76 #ifdef HAVE_ERRNO_H
77 #include <errno.h>
78 #else
79 extern int errno;
80 #endif
82 #ifdef HAVE_LOCALE_H
83 #include <locale.h>
84 #endif
86 #if ENABLE_NLS
87 # include <libintl.h>
88 # define _(Text) gettext (Text)
89 #else
90 # define _(Text) Text
91 #define textdomain(Domain)
92 #define bindtextdomain(Package, Directory)
93 #endif
94 #ifdef gettext_noop
95 # define N_(String) gettext_noop (String)
96 #else
97 /* We used to use (String) instead of just String, but apparentl;y ISO C
98 * doesn't allow this (at least, that's what HP said when someone reported
99 * this as a compiler bug). This is HP case number 1205608192. See
100 * also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11250 (which references
101 * ANSI 3.5.7p14-15). The Intel icc compiler also rejects constructs
102 * like: static const char buf[] = ("string");
104 # define N_(String) String
105 #endif
107 #include "locatedb.h"
108 #include <getline.h>
109 #include "../gnulib/lib/xalloc.h"
110 #include "../gnulib/lib/error.h"
111 #include "../gnulib/lib/human.h"
112 #include "dirname.h"
114 /* Note that this evaluates C many times. */
115 #ifdef _LIBC
116 # define TOUPPER(Ch) toupper (Ch)
117 # define TOLOWER(Ch) tolower (Ch)
118 #else
119 # define TOUPPER(Ch) (islower (Ch) ? toupper (Ch) : (Ch))
120 # define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
121 #endif
123 /* typedef enum {false, true} boolean; */
125 /* Warn if a database is older than this. 8 days allows for a weekly
126 update that takes up to a day to perform. */
127 #define WARN_NUMBER_UNITS (8)
128 /* Printable name of units used in WARN_SECONDS */
129 static const char warn_name_units[] = N_("days");
130 #define SECONDS_PER_UNIT (60 * 60 * 24)
132 #define WARN_SECONDS ((SECONDS_PER_UNIT) * (WARN_NUMBER_UNITS))
134 /* Check for existence of files before printing them out? */
135 int check_existence = 0;
137 /* What to separate the results with. */
138 static int separator = '\n';
140 char *next_element ();
143 /* Read in a 16-bit int, high byte first (network byte order). */
145 static short
146 get_short (fp)
147 FILE *fp;
150 register short x;
152 x = (signed char) fgetc (fp) << 8;
153 x |= (fgetc (fp) & 0xff);
154 return x;
157 const char * const metacharacters = "*?[]\\";
159 /* Return nonzero if S contains any shell glob characters.
161 static int
162 contains_metacharacter(const char *s)
164 if (NULL == strpbrk(s, metacharacters))
165 return 0;
166 else
167 return 1;
170 /* locate_read_str()
172 * Read bytes from FP into the buffer at offset OFFSET in (*BUF),
173 * until we reach DELIMITER or end-of-file. We reallocate the buffer
174 * as necessary, altering (*BUF) and (*SIZ) as appropriate. No assumption
175 * is made regarding the content of the data (i.e. the implementation is
176 * 8-bit clean, the only delimiter is DELIMITER).
178 * Written Fri May 23 18:41:16 2003 by James Youngman, because getstr()
179 * has been removed from gnulib.
181 * We call the function locate_read_str() to avoid a name clash with the curses
182 * function getstr().
184 static int
185 locate_read_str(char **buf, size_t *siz, FILE *fp, int delimiter, int offs)
187 char * p = NULL;
188 size_t sz = 0;
189 int needed, nread;
191 nread = getdelim(&p, &sz, delimiter, fp);
192 if (nread >= 0)
194 assert(p != NULL);
196 needed = offs + nread + 1;
197 if (needed > (*siz))
199 char *pnew = realloc(*buf, needed);
200 if (NULL == pnew)
202 return -1; /* FAIL */
204 else
206 *siz = needed;
207 *buf = pnew;
210 memcpy((*buf)+offs, p, nread);
211 free(p);
213 return nread;
217 static void
218 lc_strcpy(char *dest, const char *src)
220 while (*src)
222 *dest++ = TOLOWER(*src);
223 ++src;
225 *dest = 0;
228 enum visit_result
230 VISIT_CONTINUE = 1, /* please call the next visitor */
231 VISIT_ACCEPTED = 2, /* accepted, call no futher callbacks for this file */
232 VISIT_REJECTED = 4, /* rejected, process next file. */
233 VISIT_ABORT = 8 /* rejected, process no more files. */
237 struct locate_stats
239 uintmax_t compressed_bytes;
240 uintmax_t total_filename_count;
241 uintmax_t total_filename_length;
242 uintmax_t whitespace_count;
243 uintmax_t newline_count;
244 uintmax_t highbit_filename_count;
246 static struct locate_stats statistics;
249 struct casefolder
251 const char *pattern;
252 char *buffer;
253 size_t buffersize;
258 typedef int (*visitfunc)(const char *tested_filename,
259 const char *printed_filename,
260 void *context);
262 struct visitor
264 visitfunc inspector;
265 void * context;
266 struct visitor *next;
270 static struct visitor *inspectors = NULL;
271 static struct visitor *lastinspector = NULL;
273 static int
274 process_filename(const char *tested_filename, const char *printed_filename)
276 int result = VISIT_CONTINUE;
277 const struct visitor *p = inspectors;
279 while ( (VISIT_CONTINUE == result) && (NULL != p) )
281 result = (p->inspector)(tested_filename, printed_filename, p->context);
282 p = p->next;
285 if (VISIT_CONTINUE == result)
286 return VISIT_ACCEPTED;
287 else
288 return result;
291 static void
292 add_visitor(visitfunc fn, void *context)
294 struct visitor *p = xmalloc(sizeof(struct visitor));
295 p->inspector = fn;
296 p->context = context;
297 p->next = NULL;
299 if (NULL == lastinspector)
301 lastinspector = inspectors = p;
303 else
305 lastinspector->next = p;
306 lastinspector = p;
312 static int
313 visit_justprint(const char *testname, const char *printname, void *context)
315 (void) context;
316 (void) testname;
317 fputs(printname, stdout);
318 putchar(separator);
319 return VISIT_CONTINUE;
322 static int
323 visit_exists(const char *testname, const char *printname, void *context)
325 struct stat st;
326 (void) context;
327 (void) testname;
329 /* testname has been converted in some way (to lower case,
330 * or is just the basename of the file), and printname has not.
331 * Hence only printname is still actually the name of the file
332 * whose existence we would need to check.
334 if (stat(printname, &st) != 0)
336 return VISIT_REJECTED;
338 else
340 return VISIT_CONTINUE;
344 static int
345 visit_substring_match_nocasefold(const char *testname, const char *printname, void *context)
347 const char *pattern = context;
348 (void) printname;
350 if (NULL != strstr(testname, pattern))
351 return VISIT_CONTINUE;
352 else
353 return VISIT_REJECTED;
356 static int
357 visit_substring_match_casefold(const char *testname, const char *printname, void *context)
359 struct casefolder * p = context;
360 size_t len = strlen(testname);
362 (void) printname;
363 if (len+1 > p->buffersize)
365 p->buffer = xrealloc(p->buffer, len+1); /* XXX: consider using extendbuf(). */
366 p->buffersize = len+1;
368 lc_strcpy(p->buffer, testname);
371 if (NULL != strstr(p->buffer, p->pattern))
372 return VISIT_CONTINUE;
373 else
374 return VISIT_REJECTED;
378 static int
379 visit_globmatch_nofold(const char *testname, const char *printname, void *context)
381 const char *glob = context;
382 (void) printname;
383 if (fnmatch(glob, testname, 0) != 0)
384 return VISIT_REJECTED;
385 else
386 return VISIT_CONTINUE;
390 static int
391 visit_globmatch_casefold(const char *testname, const char *printname, void *context)
393 const char *glob = context;
394 (void) printname;
395 if (fnmatch(glob, testname, FNM_CASEFOLD) != 0)
396 return VISIT_REJECTED;
397 else
398 return VISIT_CONTINUE;
401 static int
402 visit_stats(const char *testname, const char *printname, void *context)
404 struct locate_stats *p = context;
405 size_t len = strlen(printname);
406 const char *s;
407 int highbit, whitespace, newline;
409 ++(p->total_filename_count);
410 p->total_filename_length += len;
412 highbit = whitespace = newline = 0;
413 for (s=printname; *s; ++s)
415 if ( (int)(*s) & 128 )
416 highbit = 1;
417 if ('\n' == *s)
419 newline = whitespace = 1;
421 else if (isspace((unsigned char)*s))
423 whitespace = 1;
427 if (highbit)
428 ++(p->highbit_filename_count);
429 if (whitespace)
430 ++(p->whitespace_count);
431 if (newline)
432 ++(p->newline_count);
434 return VISIT_CONTINUE;
439 /* Print the entries in DBFILE that match shell globbing pattern PATHPART.
440 Return the number of entries printed. */
442 static unsigned long
443 new_locate (char *pathpart,
444 char *dbfile,
445 int ignore_case,
446 int enable_print,
447 int basename_only,
448 int use_limit,
449 uintmax_t limit,
450 int stats)
452 char hbuf[LONGEST_HUMAN_READABLE + 1];
453 FILE *fp; /* The pathname database. */
454 int c; /* An input byte. */
455 int nread; /* number of bytes read from an entry. */
456 char *path; /* The current input database entry. */
457 const char *testpath;
458 size_t pathsize; /* Amount allocated for it. */
459 int count = 0; /* The length of the prefix shared with the previous database entry. */
461 int old_format = 0; /* true if reading a bigram-encoded database. */
465 /* for the old database format,
466 the first and second characters of the most common bigrams. */
467 char bigram1[128], bigram2[128];
469 /* number of items accepted (i.e. printed) */
470 unsigned long int items_accepted = 0uL;
472 /* To check the age of the database. */
473 struct stat st;
474 time_t now;
476 /* Set up the inspection regime */
477 inspectors = NULL;
478 lastinspector = NULL;
480 if (stats)
482 assert(!use_limit);
483 add_visitor(visit_stats, &statistics);
485 else
487 if (contains_metacharacter(pathpart))
489 if (ignore_case)
490 add_visitor(visit_globmatch_casefold, pathpart);
491 else
492 add_visitor(visit_globmatch_nofold, pathpart);
494 else
496 /* No glob characters used. Hence we match on
497 * _any part_ of the filename, not just the
498 * basename. This seems odd to me, but it is the
499 * traditional behaviour.
500 * James Youngman <jay@gnu.org>
502 if (ignore_case)
504 struct casefolder * cf = xmalloc(sizeof(*cf));
505 cf->pattern = pathpart;
506 cf->buffer = NULL;
507 cf->buffersize = 0;
508 add_visitor(visit_substring_match_casefold, cf);
510 else
512 add_visitor(visit_substring_match_nocasefold, pathpart);
516 /* We add visit_exists() as late as possible to reduce the
517 * number of stat() calls. (Idea by Bas van Gompel).
519 if (check_existence)
520 add_visitor(visit_exists, NULL);
522 if (enable_print)
523 add_visitor(visit_justprint, NULL);
527 if (stat (dbfile, &st) || (fp = fopen (dbfile, "r")) == NULL)
529 error (0, errno, "%s", dbfile);
530 return 0;
532 time(&now);
533 if (now - st.st_mtime > WARN_SECONDS)
535 /* For example:
536 warning: database `fred' is more than 8 days old */
537 error (0, 0, _("warning: database `%s' is more than %d %s old"),
538 dbfile, WARN_NUMBER_UNITS, _(warn_name_units));
541 pathsize = 1026; /* Increased as necessary by locate_read_str. */
542 path = xmalloc (pathsize);
544 nread = fread (path, 1, sizeof (LOCATEDB_MAGIC), fp);
545 if (nread != sizeof (LOCATEDB_MAGIC)
546 || memcmp (path, LOCATEDB_MAGIC, sizeof (LOCATEDB_MAGIC)))
548 int i;
549 /* Read the list of the most common bigrams in the database. */
550 fseek (fp, 0, 0);
551 for (i = 0; i < 128; i++)
553 bigram1[i] = getc (fp);
554 bigram2[i] = getc (fp);
556 old_format = 1;
559 if (stats)
561 printf(_("Database %s is in the %s format.\n"),
562 dbfile,
563 old_format ? _("old") : "LOCATE02");
566 /* If we ignore case, convert it to lower first so we don't have to
567 * do it every time
569 if (!stats && ignore_case)
571 lc_strcpy(pathpart, pathpart);
574 items_accepted = 0;
576 c = getc (fp);
577 while ( (c != EOF) && (!use_limit || (limit > 0)) )
579 register char *s; /* Scan the path we read in. */
581 if (old_format)
583 /* Get the offset in the path where this path info starts. */
584 if (c == LOCATEDB_OLD_ESCAPE)
585 count += getw (fp) - LOCATEDB_OLD_OFFSET;
586 else
587 count += c - LOCATEDB_OLD_OFFSET;
589 /* Overlay the old path with the remainder of the new. */
590 for (s = path + count; (c = getc (fp)) > LOCATEDB_OLD_ESCAPE;)
591 if (c < 0200)
592 *s++ = c; /* An ordinary character. */
593 else
595 /* Bigram markers have the high bit set. */
596 c &= 0177;
597 *s++ = bigram1[c];
598 *s++ = bigram2[c];
600 *s-- = '\0';
602 else
604 if (c == LOCATEDB_ESCAPE)
605 count += (short)get_short (fp);
606 else if (c > 127)
607 count += c - 256;
608 else
609 count += c;
611 if (count > strlen(path))
613 /* This should not happen generally , but since we're
614 * reading in data which is outside our control, we
615 * cannot prevent it.
617 error(1, 0, _("locate database `%s' is corrupt or invalid"), dbfile);
620 /* Overlay the old path with the remainder of the new. */
621 nread = locate_read_str (&path, &pathsize, fp, 0, count);
622 if (nread < 0)
623 break;
624 c = getc (fp);
625 s = path + count + nread - 1; /* Move to the last char in path. */
626 assert (s[0] != '\0');
627 assert (s[1] == '\0'); /* Our terminator. */
628 assert (s[2] == '\0'); /* Added by locate_read_str. */
631 testpath = basename_only ? base_name(path) : path;
632 if (VISIT_ACCEPTED == process_filename(testpath, path))
634 if ((++items_accepted >= limit) && use_limit)
636 break;
642 if (stats)
644 printf(_("Locate database size: %s bytes\n"),
645 human_readable ((uintmax_t) st.st_size,
646 hbuf, human_ceiling, 1, 1));
648 printf(_("Filenames: %s "),
649 human_readable (statistics.total_filename_count,
650 hbuf, human_ceiling, 1, 1));
651 printf(_("with a cumulative length of %s bytes"),
652 human_readable (statistics.total_filename_length,
653 hbuf, human_ceiling, 1, 1));
655 printf(_("\n\tof which %s contain whitespace, "),
656 human_readable (statistics.whitespace_count,
657 hbuf, human_ceiling, 1, 1));
658 printf(_("\n\t%s contain newline characters, "),
659 human_readable (statistics.newline_count,
660 hbuf, human_ceiling, 1, 1));
661 printf(_("\n\tand %s contain characters with the high bit set.\n"),
662 human_readable (statistics.highbit_filename_count,
663 hbuf, human_ceiling, 1, 1));
665 printf(_("Compression ratio %4.2f%%\n"),
666 100.0 * ((double)statistics.total_filename_length
667 - (double) st.st_size)
668 / (double) statistics.total_filename_length);
669 printf("\n");
672 if (ferror (fp))
674 error (0, errno, "%s", dbfile);
675 return 0;
677 if (fclose (fp) == EOF)
679 error (0, errno, "%s", dbfile);
680 return 0;
683 return items_accepted;
689 extern char *version_string;
691 /* The name this program was run with. */
692 char *program_name;
694 static void
695 usage (stream)
696 FILE *stream;
698 fprintf (stream, _("\
699 Usage: %s [-d path | --database=path] [-e | --existing]\n\
700 [-i | --ignore-case] [--wholepath] [--basename] [--limit=N | -l N]\n\
701 [--version] [--help] pattern...\n"),
702 program_name);
703 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
706 static void
707 sanity_check_dbpath(const char *path)
709 size_t len;
711 if (':' == path[0])
713 error(0, 0, _("warning: locate database path `%s' contains a leading colon, which is not a valid database name"), path);
716 len = strlen(path);
717 if (len > 0u)
719 if (':' == path[len-1u])
721 error(0, 0, _("warning: locate database path `%s' contains a trailing colon, which is not a valid database name"), path);
727 static struct option const longopts[] =
729 {"database", required_argument, NULL, 'd'},
730 {"existing", no_argument, NULL, 'e'},
731 {"ignore-case", no_argument, NULL, 'i'},
732 {"help", no_argument, NULL, 'h'},
733 {"version", no_argument, NULL, 'v'},
734 {"null", no_argument, NULL, '0'},
735 {"count", no_argument, NULL, 'c'},
736 {"wholename", no_argument, NULL, 'w'},
737 {"basename", no_argument, NULL, 'b'},
738 {"stdio", no_argument, NULL, 's'},
739 {"mmap", no_argument, NULL, 'm'},
740 {"limit", required_argument, NULL, 'l'},
741 {"statistics", no_argument, NULL, 'S'},
742 {NULL, no_argument, NULL, 0}
746 main (argc, argv)
747 int argc;
748 char **argv;
750 char *dbpath;
751 unsigned long int found = 0uL;
752 int optc;
753 int ignore_case = 0;
754 int print = 1;
755 int just_count = 0;
756 int basename_only = 0;
757 uintmax_t limit = 0;
758 int use_limit = 0;
759 int stats = 0;
761 program_name = argv[0];
763 #ifdef HAVE_SETLOCALE
764 setlocale (LC_ALL, "");
765 #endif
766 bindtextdomain (PACKAGE, LOCALEDIR);
767 textdomain (PACKAGE);
769 dbpath = getenv ("LOCATE_PATH");
770 if (dbpath == NULL)
771 dbpath = LOCATE_DB;
773 check_existence = 0;
775 while ((optc = getopt_long (argc, argv, "cd:eil:sm0S", longopts, (int *) 0)) != -1)
776 switch (optc)
778 case '0':
779 separator = 0;
780 break;
782 case 'b':
783 basename_only = 1;
784 break;
786 case 'c':
787 just_count = 1;
788 print = 0;
789 break;
791 case 'd':
792 dbpath = optarg;
793 break;
795 case 'e':
796 check_existence = 1;
797 break;
799 case 'i':
800 ignore_case = 1;
801 break;
803 case 'h':
804 usage (stdout);
805 return 0;
807 case 'v':
808 printf (_("GNU locate version %s\n"), version_string);
809 return 0;
811 case 'w':
812 basename_only = 0;
813 break;
815 case 'S':
816 stats = 1;
817 break;
819 case 'l':
821 char *end = optarg;
822 strtol_error err = xstrtoumax(optarg, &end, 10, &limit, NULL);
823 if (LONGINT_OK != err)
825 STRTOL_FATAL_ERROR(optarg, _("argument to --limit"), err);
827 use_limit = 1;
829 break;
831 case 's': /* use stdio */
832 case 'm': /* use mmap */
833 /* These options are implemented simply for
834 * compatibility with FreeBSD
836 break;
838 default:
839 usage (stderr);
840 return 1;
843 if (stats)
845 use_limit = 0;
846 print = 0;
848 else
850 if (optind == argc)
852 usage (stderr);
853 return 1;
857 sanity_check_dbpath(dbpath);
859 for (; stats || optind < argc; optind++)
861 char *e;
862 const char *needle;
863 next_element (dbpath); /* Initialize. */
864 needle = stats ? NULL : argv[optind];
865 while ((e = next_element ((char *) NULL)) != NULL)
867 statistics.compressed_bytes =
868 statistics.total_filename_count =
869 statistics.total_filename_length =
870 statistics.whitespace_count =
871 statistics.newline_count =
872 statistics.highbit_filename_count = 0u;
874 found += new_locate (needle, e, ignore_case, print, basename_only, use_limit, limit, stats);
876 if (stats)
877 break;
880 if (just_count)
882 printf("%ld\n", found);
885 if (found || (use_limit && (limit==0)) || stats )
886 return 0;
887 else
888 return 1;