1 /* locate -- search databases for filenames that match patterns
2 Copyright (C) 1994, 96, 98, 99, 2000, 2003 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 /* Usage: locate [options] pattern...
22 Scan a pathname list for the full pathname of a file, given only
23 a piece of the name (possibly containing shell globbing metacharacters).
24 The list has been processed with front-compression, which reduces
25 the list size by a factor of 4-5.
26 Recognizes two database formats, old and new. The old format is
27 bigram coded, which reduces space by a further 20-25% and uses the
28 following encoding of the database bytes:
30 0-28 likeliest differential counts + offset (14) to make nonnegative
31 30 escape code for out-of-range count to follow in next halfword
32 128-255 bigram codes (the 128 most common, as determined by `updatedb')
33 32-127 single character (printable) ASCII remainder
35 Uses a novel two-tiered string search technique:
37 First, match a metacharacter-free subpattern and a partial pathname
38 BACKWARDS to avoid full expansion of the pathname list.
39 The time savings is 40-50% over forward matching, which cannot efficiently
40 handle overlapped search patterns and compressed path remainders.
42 Then, match the actual shell glob-style regular expression (if in this form)
43 against the candidate pathnames using the slower shell filename
46 Described more fully in Usenix ;login:, Vol 8, No 1,
47 February/March, 1983, p. 8.
49 Written by James A. Woods <jwoods@adobe.com>.
50 Modified by David MacKenzie <djm@gnu.org>. */
55 #include <sys/types.h>
65 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
88 # define _(Text) gettext (Text)
91 #define textdomain(Domain)
92 #define bindtextdomain(Package, Directory)
95 # define N_(String) gettext_noop (String)
97 /* We used to use (String) instead of just String, but apparentl;y ISO C
98 * doesn't allow this (at least, that's what HP said when someone reported
99 * this as a compiler bug). This is HP case number 1205608192. See
100 * also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11250 (which references
101 * ANSI 3.5.7p14-15). The Intel icc compiler also rejects constructs
102 * like: static const char buf[] = ("string");
104 # define N_(String) String
107 #include "locatedb.h"
109 #include "../gnulib/lib/xalloc.h"
110 #include "../gnulib/lib/error.h"
111 #include "../gnulib/lib/human.h"
114 /* Note that this evaluates C many times. */
116 # define TOUPPER(Ch) toupper (Ch)
117 # define TOLOWER(Ch) tolower (Ch)
119 # define TOUPPER(Ch) (islower (Ch) ? toupper (Ch) : (Ch))
120 # define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
123 /* typedef enum {false, true} boolean; */
125 /* Warn if a database is older than this. 8 days allows for a weekly
126 update that takes up to a day to perform. */
127 #define WARN_NUMBER_UNITS (8)
128 /* Printable name of units used in WARN_SECONDS */
129 static const char warn_name_units
[] = N_("days");
130 #define SECONDS_PER_UNIT (60 * 60 * 24)
132 #define WARN_SECONDS ((SECONDS_PER_UNIT) * (WARN_NUMBER_UNITS))
134 /* Check for existence of files before printing them out? */
135 int check_existence
= 0;
137 /* What to separate the results with. */
138 static int separator
= '\n';
140 char *next_element ();
143 /* Read in a 16-bit int, high byte first (network byte order). */
152 x
= (signed char) fgetc (fp
) << 8;
153 x
|= (fgetc (fp
) & 0xff);
157 const char * const metacharacters
= "*?[]\\";
159 /* Return nonzero if S contains any shell glob characters.
162 contains_metacharacter(const char *s
)
164 if (NULL
== strpbrk(s
, metacharacters
))
172 * Read bytes from FP into the buffer at offset OFFSET in (*BUF),
173 * until we reach DELIMITER or end-of-file. We reallocate the buffer
174 * as necessary, altering (*BUF) and (*SIZ) as appropriate. No assumption
175 * is made regarding the content of the data (i.e. the implementation is
176 * 8-bit clean, the only delimiter is DELIMITER).
178 * Written Fri May 23 18:41:16 2003 by James Youngman, because getstr()
179 * has been removed from gnulib.
181 * We call the function locate_read_str() to avoid a name clash with the curses
185 locate_read_str(char **buf
, size_t *siz
, FILE *fp
, int delimiter
, int offs
)
191 nread
= getdelim(&p
, &sz
, delimiter
, fp
);
196 needed
= offs
+ nread
+ 1;
199 char *pnew
= realloc(*buf
, needed
);
202 return -1; /* FAIL */
210 memcpy((*buf
)+offs
, p
, nread
);
218 lc_strcpy(char *dest
, const char *src
)
222 *dest
++ = TOLOWER(*src
);
230 VISIT_CONTINUE
= 1, /* please call the next visitor */
231 VISIT_ACCEPTED
= 2, /* accepted, call no futher callbacks for this file */
232 VISIT_REJECTED
= 4, /* rejected, process next file. */
233 VISIT_ABORT
= 8 /* rejected, process no more files. */
239 uintmax_t compressed_bytes
;
240 uintmax_t total_filename_count
;
241 uintmax_t total_filename_length
;
242 uintmax_t whitespace_count
;
243 uintmax_t newline_count
;
244 uintmax_t highbit_filename_count
;
246 static struct locate_stats statistics
;
258 typedef int (*visitfunc
)(const char *tested_filename
,
259 const char *printed_filename
,
266 struct visitor
*next
;
270 static struct visitor
*inspectors
= NULL
;
271 static struct visitor
*lastinspector
= NULL
;
274 process_filename(const char *tested_filename
, const char *printed_filename
)
276 int result
= VISIT_CONTINUE
;
277 const struct visitor
*p
= inspectors
;
279 while ( (VISIT_CONTINUE
== result
) && (NULL
!= p
) )
281 result
= (p
->inspector
)(tested_filename
, printed_filename
, p
->context
);
285 if (VISIT_CONTINUE
== result
)
286 return VISIT_ACCEPTED
;
292 add_visitor(visitfunc fn
, void *context
)
294 struct visitor
*p
= xmalloc(sizeof(struct visitor
));
296 p
->context
= context
;
299 if (NULL
== lastinspector
)
301 lastinspector
= inspectors
= p
;
305 lastinspector
->next
= p
;
313 visit_justprint(const char *testname
, const char *printname
, void *context
)
317 fputs(printname
, stdout
);
319 return VISIT_CONTINUE
;
323 visit_exists(const char *testname
, const char *printname
, void *context
)
329 /* testname has been converted in some way (to lower case,
330 * or is just the basename of the file), and printname has not.
331 * Hence only printname is still actually the name of the file
332 * whose existence we would need to check.
334 if (stat(printname
, &st
) != 0)
336 return VISIT_REJECTED
;
340 return VISIT_CONTINUE
;
345 visit_substring_match_nocasefold(const char *testname
, const char *printname
, void *context
)
347 const char *pattern
= context
;
350 if (NULL
!= strstr(testname
, pattern
))
351 return VISIT_CONTINUE
;
353 return VISIT_REJECTED
;
357 visit_substring_match_casefold(const char *testname
, const char *printname
, void *context
)
359 struct casefolder
* p
= context
;
360 size_t len
= strlen(testname
);
363 if (len
+1 > p
->buffersize
)
365 p
->buffer
= xrealloc(p
->buffer
, len
+1); /* XXX: consider using extendbuf(). */
366 p
->buffersize
= len
+1;
368 lc_strcpy(p
->buffer
, testname
);
371 if (NULL
!= strstr(p
->buffer
, p
->pattern
))
372 return VISIT_CONTINUE
;
374 return VISIT_REJECTED
;
379 visit_globmatch_nofold(const char *testname
, const char *printname
, void *context
)
381 const char *glob
= context
;
383 if (fnmatch(glob
, testname
, 0) != 0)
384 return VISIT_REJECTED
;
386 return VISIT_CONTINUE
;
391 visit_globmatch_casefold(const char *testname
, const char *printname
, void *context
)
393 const char *glob
= context
;
395 if (fnmatch(glob
, testname
, FNM_CASEFOLD
) != 0)
396 return VISIT_REJECTED
;
398 return VISIT_CONTINUE
;
402 visit_stats(const char *testname
, const char *printname
, void *context
)
404 struct locate_stats
*p
= context
;
405 size_t len
= strlen(printname
);
407 int highbit
, whitespace
, newline
;
409 ++(p
->total_filename_count
);
410 p
->total_filename_length
+= len
;
412 highbit
= whitespace
= newline
= 0;
413 for (s
=printname
; *s
; ++s
)
415 if ( (int)(*s
) & 128 )
419 newline
= whitespace
= 1;
421 else if (isspace((unsigned char)*s
))
428 ++(p
->highbit_filename_count
);
430 ++(p
->whitespace_count
);
432 ++(p
->newline_count
);
434 return VISIT_CONTINUE
;
439 /* Print the entries in DBFILE that match shell globbing pattern PATHPART.
440 Return the number of entries printed. */
443 new_locate (char *pathpart
,
452 char hbuf
[LONGEST_HUMAN_READABLE
+ 1];
453 FILE *fp
; /* The pathname database. */
454 int c
; /* An input byte. */
455 int nread
; /* number of bytes read from an entry. */
456 char *path
; /* The current input database entry. */
457 const char *testpath
;
458 size_t pathsize
; /* Amount allocated for it. */
459 int count
= 0; /* The length of the prefix shared with the previous database entry. */
461 int old_format
= 0; /* true if reading a bigram-encoded database. */
465 /* for the old database format,
466 the first and second characters of the most common bigrams. */
467 char bigram1
[128], bigram2
[128];
469 /* number of items accepted (i.e. printed) */
470 unsigned long int items_accepted
= 0uL;
472 /* To check the age of the database. */
476 /* Set up the inspection regime */
478 lastinspector
= NULL
;
483 add_visitor(visit_stats
, &statistics
);
487 if (contains_metacharacter(pathpart
))
490 add_visitor(visit_globmatch_casefold
, pathpart
);
492 add_visitor(visit_globmatch_nofold
, pathpart
);
496 /* No glob characters used. Hence we match on
497 * _any part_ of the filename, not just the
498 * basename. This seems odd to me, but it is the
499 * traditional behaviour.
500 * James Youngman <jay@gnu.org>
504 struct casefolder
* cf
= xmalloc(sizeof(*cf
));
505 cf
->pattern
= pathpart
;
508 add_visitor(visit_substring_match_casefold
, cf
);
512 add_visitor(visit_substring_match_nocasefold
, pathpart
);
516 /* We add visit_exists() as late as possible to reduce the
517 * number of stat() calls. (Idea by Bas van Gompel).
520 add_visitor(visit_exists
, NULL
);
523 add_visitor(visit_justprint
, NULL
);
527 if (stat (dbfile
, &st
) || (fp
= fopen (dbfile
, "r")) == NULL
)
529 error (0, errno
, "%s", dbfile
);
533 if (now
- st
.st_mtime
> WARN_SECONDS
)
536 warning: database `fred' is more than 8 days old */
537 error (0, 0, _("warning: database `%s' is more than %d %s old"),
538 dbfile
, WARN_NUMBER_UNITS
, _(warn_name_units
));
541 pathsize
= 1026; /* Increased as necessary by locate_read_str. */
542 path
= xmalloc (pathsize
);
544 nread
= fread (path
, 1, sizeof (LOCATEDB_MAGIC
), fp
);
545 if (nread
!= sizeof (LOCATEDB_MAGIC
)
546 || memcmp (path
, LOCATEDB_MAGIC
, sizeof (LOCATEDB_MAGIC
)))
549 /* Read the list of the most common bigrams in the database. */
551 for (i
= 0; i
< 128; i
++)
553 bigram1
[i
] = getc (fp
);
554 bigram2
[i
] = getc (fp
);
561 printf(_("Database %s is in the %s format.\n"),
563 old_format
? _("old") : "LOCATE02");
566 /* If we ignore case, convert it to lower first so we don't have to
569 if (!stats
&& ignore_case
)
571 lc_strcpy(pathpart
, pathpart
);
577 while ( (c
!= EOF
) && (!use_limit
|| (limit
> 0)) )
579 register char *s
; /* Scan the path we read in. */
583 /* Get the offset in the path where this path info starts. */
584 if (c
== LOCATEDB_OLD_ESCAPE
)
585 count
+= getw (fp
) - LOCATEDB_OLD_OFFSET
;
587 count
+= c
- LOCATEDB_OLD_OFFSET
;
589 /* Overlay the old path with the remainder of the new. */
590 for (s
= path
+ count
; (c
= getc (fp
)) > LOCATEDB_OLD_ESCAPE
;)
592 *s
++ = c
; /* An ordinary character. */
595 /* Bigram markers have the high bit set. */
604 if (c
== LOCATEDB_ESCAPE
)
605 count
+= (short)get_short (fp
);
611 if (count
> strlen(path
))
613 /* This should not happen generally , but since we're
614 * reading in data which is outside our control, we
617 error(1, 0, _("locate database `%s' is corrupt or invalid"), dbfile
);
620 /* Overlay the old path with the remainder of the new. */
621 nread
= locate_read_str (&path
, &pathsize
, fp
, 0, count
);
625 s
= path
+ count
+ nread
- 1; /* Move to the last char in path. */
626 assert (s
[0] != '\0');
627 assert (s
[1] == '\0'); /* Our terminator. */
628 assert (s
[2] == '\0'); /* Added by locate_read_str. */
631 testpath
= basename_only
? base_name(path
) : path
;
632 if (VISIT_ACCEPTED
== process_filename(testpath
, path
))
634 if ((++items_accepted
>= limit
) && use_limit
)
644 printf(_("Locate database size: %s bytes\n"),
645 human_readable ((uintmax_t) st
.st_size
,
646 hbuf
, human_ceiling
, 1, 1));
648 printf(_("Filenames: %s "),
649 human_readable (statistics
.total_filename_count
,
650 hbuf
, human_ceiling
, 1, 1));
651 printf(_("with a cumulative length of %s bytes"),
652 human_readable (statistics
.total_filename_length
,
653 hbuf
, human_ceiling
, 1, 1));
655 printf(_("\n\tof which %s contain whitespace, "),
656 human_readable (statistics
.whitespace_count
,
657 hbuf
, human_ceiling
, 1, 1));
658 printf(_("\n\t%s contain newline characters, "),
659 human_readable (statistics
.newline_count
,
660 hbuf
, human_ceiling
, 1, 1));
661 printf(_("\n\tand %s contain characters with the high bit set.\n"),
662 human_readable (statistics
.highbit_filename_count
,
663 hbuf
, human_ceiling
, 1, 1));
665 printf(_("Compression ratio %4.2f%%\n"),
666 100.0 * ((double)statistics
.total_filename_length
667 - (double) st
.st_size
)
668 / (double) statistics
.total_filename_length
);
674 error (0, errno
, "%s", dbfile
);
677 if (fclose (fp
) == EOF
)
679 error (0, errno
, "%s", dbfile
);
683 return items_accepted
;
689 extern char *version_string
;
691 /* The name this program was run with. */
698 fprintf (stream
, _("\
699 Usage: %s [-d path | --database=path] [-e | --existing]\n\
700 [-i | --ignore-case] [--wholepath] [--basename] [--limit=N | -l N]\n\
701 [--version] [--help] pattern...\n"),
703 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream
);
707 sanity_check_dbpath(const char *path
)
713 error(0, 0, _("warning: locate database path `%s' contains a leading colon, which is not a valid database name"), path
);
719 if (':' == path
[len
-1u])
721 error(0, 0, _("warning: locate database path `%s' contains a trailing colon, which is not a valid database name"), path
);
727 static struct option
const longopts
[] =
729 {"database", required_argument
, NULL
, 'd'},
730 {"existing", no_argument
, NULL
, 'e'},
731 {"ignore-case", no_argument
, NULL
, 'i'},
732 {"help", no_argument
, NULL
, 'h'},
733 {"version", no_argument
, NULL
, 'v'},
734 {"null", no_argument
, NULL
, '0'},
735 {"count", no_argument
, NULL
, 'c'},
736 {"wholename", no_argument
, NULL
, 'w'},
737 {"basename", no_argument
, NULL
, 'b'},
738 {"stdio", no_argument
, NULL
, 's'},
739 {"mmap", no_argument
, NULL
, 'm'},
740 {"limit", required_argument
, NULL
, 'l'},
741 {"statistics", no_argument
, NULL
, 'S'},
742 {NULL
, no_argument
, NULL
, 0}
751 unsigned long int found
= 0uL;
756 int basename_only
= 0;
761 program_name
= argv
[0];
763 #ifdef HAVE_SETLOCALE
764 setlocale (LC_ALL
, "");
766 bindtextdomain (PACKAGE
, LOCALEDIR
);
767 textdomain (PACKAGE
);
769 dbpath
= getenv ("LOCATE_PATH");
775 while ((optc
= getopt_long (argc
, argv
, "cd:eil:sm0S", longopts
, (int *) 0)) != -1)
808 printf (_("GNU locate version %s\n"), version_string
);
822 strtol_error err
= xstrtoumax(optarg
, &end
, 10, &limit
, NULL
);
823 if (LONGINT_OK
!= err
)
825 STRTOL_FATAL_ERROR(optarg
, _("argument to --limit"), err
);
831 case 's': /* use stdio */
832 case 'm': /* use mmap */
833 /* These options are implemented simply for
834 * compatibility with FreeBSD
857 sanity_check_dbpath(dbpath
);
859 for (; stats
|| optind
< argc
; optind
++)
863 next_element (dbpath
); /* Initialize. */
864 needle
= stats
? NULL
: argv
[optind
];
865 while ((e
= next_element ((char *) NULL
)) != NULL
)
867 statistics
.compressed_bytes
=
868 statistics
.total_filename_count
=
869 statistics
.total_filename_length
=
870 statistics
.whitespace_count
=
871 statistics
.newline_count
=
872 statistics
.highbit_filename_count
= 0u;
874 found
+= new_locate (needle
, e
, ignore_case
, print
, basename_only
, use_limit
, limit
, stats
);
882 printf("%ld\n", found
);
885 if (found
|| (use_limit
&& (limit
==0)) || stats
)