Migrated from GPL version 2 to GPL version 3
[findutils.git] / locate / code.c
blob800e5e90fe47e831843e0728ac1ac8aeacc01b34
1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994, 2005, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 /* Compress a sorted list.
19 Works with `find' to encode a filename database to save space
20 and search time.
22 Usage:
24 bigram < file_list > bigrams
25 process-bigrams > most_common_bigrams
26 code most_common_bigrams < file_list > squeezed_list
28 Uses `front compression' (see ";login:", March 1983, p. 8).
29 The output begins with the 128 most common bigrams.
30 After that, the output format is, for each line,
31 an offset (from the previous line) differential count byte
32 followed by a (partially bigram-encoded) ASCII remainder.
33 The output lines have no terminating byte; the start of the next line
34 is indicated by its first byte having a value <= 30.
36 The encoding of the output bytes is:
38 0-28 likeliest differential counts + offset (14) to make nonnegative
39 30 escape code for out-of-range count to follow in next halfword
40 128-255 bigram codes (the 128 most common, as determined by `updatedb')
41 32-127 single character (printable) ASCII remainder
43 Written by James A. Woods <jwoods@adobe.com>.
44 Modified by David MacKenzie <djm@gnu.org>. */
46 #include <config.h>
47 #include <stdio.h>
48 #include <sys/types.h>
49 #include <string.h>
50 #include <errno.h>
51 #include <stdbool.h>
54 #ifdef STDC_HEADERS
55 #include <stdlib.h>
56 #endif
58 #if ENABLE_NLS
59 # include <libintl.h>
60 # define _(Text) gettext (Text)
61 #else
62 # define _(Text) Text
63 #define textdomain(Domain)
64 #define bindtextdomain(Package, Directory)
65 #endif
66 #ifdef gettext_noop
67 # define N_(String) gettext_noop (String)
68 #else
69 /* See locate.c for explanation as to why not use (String) */
70 # define N_(String) String
71 #endif
73 #include "locatedb.h"
74 #include <getline.h>
75 #include "closeout.h"
76 #include "xalloc.h"
77 #include "gnulib-version.h"
78 #include "progname.h"
79 #include "error.h"
81 #ifndef ATTRIBUTE_NORETURN
82 # define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
83 #endif
86 /* The name this program was run with. */
87 const char *program_name;
89 /* The 128 most common bigrams in the file list, padded with NULs
90 if there are fewer. */
91 static char bigrams[257] = {0};
93 /* Return the offset of PATTERN in STRING, or -1 if not found. */
95 static int
96 strindex (char *string, char *pattern)
98 register char *s;
100 for (s = string; *s != '\0'; s++)
101 /* Fast first char check. */
102 if (*s == *pattern)
104 register char *p2 = pattern + 1, *s2 = s + 1;
105 while (*p2 != '\0' && *p2 == *s2)
106 p2++, s2++;
107 if (*p2 == '\0')
108 return s2 - strlen (pattern) - string;
110 return -1;
113 /* Return the length of the longest common prefix of strings S1 and S2. */
115 static int
116 prefix_length (char *s1, char *s2)
118 register char *start;
120 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
122 return s1 - start;
125 extern char *version_string;
127 static void
128 usage (FILE *stream)
130 fprintf (stream, _("\
131 Usage: %s [--version | --help]\n\
132 or %s most_common_bigrams < file-list > locate-database\n"),
133 program_name, program_name);
134 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
138 static void inerr (const char *filename) ATTRIBUTE_NORETURN;
139 static void outerr(void) ATTRIBUTE_NORETURN;
141 static void
142 inerr(const char *filename)
144 error(1, errno, "%s", filename);
145 /*NOTREACHED*/
146 abort();
149 static void
150 outerr(void)
152 error(1, errno, _("write error"));
153 /*NOTREACHED*/
154 abort();
159 main (int argc, char **argv)
161 char *path; /* The current input entry. */
162 char *oldpath; /* The previous input entry. */
163 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
164 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
165 char bigram[3]; /* Bigram to search for in table. */
166 int code; /* Index of `bigram' in bigrams table. */
167 FILE *fp; /* Most common bigrams file. */
168 int line_len; /* Length of input line. */
170 set_program_name(argv[0]);
171 atexit (close_stdout);
173 bigram[2] = '\0';
175 if (argc != 2)
177 usage(stderr);
178 return 2;
181 if (0 == strcmp(argv[1], "--help"))
183 usage(stdout);
184 return 0;
186 else if (0 == strcmp(argv[1], "--version"))
188 printf (_("GNU findutils version %s\n"), version_string);
189 printf (_("Built using GNU gnulib version %s\n"), gnulib_version);
190 return 0;
193 fp = fopen (argv[1], "r");
194 if (fp == NULL)
196 fprintf (stderr, "%s: ", argv[0]);
197 perror (argv[1]);
198 return 1;
201 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
202 path = xmalloc (pathsize);
203 oldpath = xmalloc (oldpathsize);
205 /* Set to empty string, to force the first prefix count to 0. */
206 oldpath[0] = '\0';
207 oldcount = 0;
209 /* Copy the list of most common bigrams to the output,
210 padding with NULs if there are <128 of them. */
211 if (NULL == fgets (bigrams, 257, fp))
212 inerr(argv[1]);
214 if (256 != fwrite (bigrams, 1, 256, stdout))
215 outerr();
217 if (EOF == fclose (fp))
218 inerr(argv[1]);
220 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
222 char *pp;
224 path[line_len - 1] = '\0'; /* Remove newline. */
226 /* Squelch unprintable chars in path so as not to botch decoding. */
227 for (pp = path; *pp != '\0'; pp++)
229 if (!(*pp >= 040 && *pp < 0177))
230 *pp = '?';
233 count = prefix_length (oldpath, path);
234 diffcount = count - oldcount;
235 oldcount = count;
236 /* If the difference is small, it fits in one byte;
237 otherwise, two bytes plus a marker noting that fact. */
238 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
240 if (EOF ==- putc (LOCATEDB_OLD_ESCAPE, stdout))
241 outerr ();
243 if (!putword (stdout,
244 diffcount+LOCATEDB_OLD_OFFSET,
245 GetwordEndianStateNative))
246 outerr ();
248 else
250 if (EOF == putc (diffcount + LOCATEDB_OLD_OFFSET, stdout))
251 outerr ();
254 /* Look for bigrams in the remainder of the path. */
255 for (pp = path + count; *pp != '\0'; pp += 2)
257 if (pp[1] == '\0')
259 /* No bigram is possible; only one char is left. */
260 putchar (*pp);
261 break;
263 bigram[0] = *pp;
264 bigram[1] = pp[1];
265 /* Linear search for specific bigram in string table. */
266 code = strindex (bigrams, bigram);
267 if (code % 2 == 0)
268 putchar ((code / 2) | 0200); /* It's a common bigram. */
269 else
270 fputs (bigram, stdout); /* Write the text as printable ASCII. */
274 /* Swap path and oldpath and their sizes. */
275 char *tmppath = oldpath;
276 size_t tmppathsize = oldpathsize;
277 oldpath = path;
278 oldpathsize = pathsize;
279 path = tmppath;
280 pathsize = tmppathsize;
284 free (path);
285 free (oldpath);
287 return 0;