Bugfixes for Savannah bugs #19768 and #19766
[findutils.git] / locate / code.c
blobaab41392a70a50e9a695051068af3232462001ae
1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
17 USA.
20 /* Compress a sorted list.
21 Works with `find' to encode a filename database to save space
22 and search time.
24 Usage:
26 bigram < file_list > bigrams
27 process-bigrams > most_common_bigrams
28 code most_common_bigrams < file_list > squeezed_list
30 Uses `front compression' (see ";login:", March 1983, p. 8).
31 The output begins with the 128 most common bigrams.
32 After that, the output format is, for each line,
33 an offset (from the previous line) differential count byte
34 followed by a (partially bigram-encoded) ASCII remainder.
35 The output lines have no terminating byte; the start of the next line
36 is indicated by its first byte having a value <= 30.
38 The encoding of the output bytes is:
40 0-28 likeliest differential counts + offset (14) to make nonnegative
41 30 escape code for out-of-range count to follow in next halfword
42 128-255 bigram codes (the 128 most common, as determined by `updatedb')
43 32-127 single character (printable) ASCII remainder
45 Written by James A. Woods <jwoods@adobe.com>.
46 Modified by David MacKenzie <djm@gnu.org>. */
48 #include <config.h>
49 #include <stdio.h>
50 #include <sys/types.h>
51 #include <string.h>
52 #include <errno.h>
55 #ifdef STDC_HEADERS
56 #include <stdlib.h>
57 #endif
59 #if ENABLE_NLS
60 # include <libintl.h>
61 # define _(Text) gettext (Text)
62 #else
63 # define _(Text) Text
64 #define textdomain(Domain)
65 #define bindtextdomain(Package, Directory)
66 #endif
67 #ifdef gettext_noop
68 # define N_(String) gettext_noop (String)
69 #else
70 /* See locate.c for explanation as to why not use (String) */
71 # define N_(String) String
72 #endif
74 #include "locatedb.h"
75 #include <getline.h>
76 #include "closeout.h"
77 #include "xalloc.h"
78 #include "gnulib-version.h"
79 #include "progname.h"
80 #include "error.h"
82 #ifndef ATTRIBUTE_NORETURN
83 # define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
84 #endif
87 /* The name this program was run with. */
88 const char *program_name;
90 /* The 128 most common bigrams in the file list, padded with NULs
91 if there are fewer. */
92 static char bigrams[257] = {0};
94 /* Return the offset of PATTERN in STRING, or -1 if not found. */
96 static int
97 strindex (char *string, char *pattern)
99 register char *s;
101 for (s = string; *s != '\0'; s++)
102 /* Fast first char check. */
103 if (*s == *pattern)
105 register char *p2 = pattern + 1, *s2 = s + 1;
106 while (*p2 != '\0' && *p2 == *s2)
107 p2++, s2++;
108 if (*p2 == '\0')
109 return s2 - strlen (pattern) - string;
111 return -1;
114 /* Return the length of the longest common prefix of strings S1 and S2. */
116 static int
117 prefix_length (char *s1, char *s2)
119 register char *start;
121 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
123 return s1 - start;
126 extern char *version_string;
128 static void
129 usage (FILE *stream)
131 fprintf (stream, _("\
132 Usage: %s [--version | --help]\n\
133 or %s most_common_bigrams < file-list > locate-database\n"),
134 program_name, program_name);
135 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
139 static void inerr (const char *filename) ATTRIBUTE_NORETURN;
140 static void outerr(void) ATTRIBUTE_NORETURN;
142 static void
143 inerr(const char *filename)
145 error(1, errno, "%s", filename);
146 /*NOTREACHED*/
147 abort();
150 static void
151 outerr(void)
153 error(1, errno, _("write error"));
154 /*NOTREACHED*/
155 abort();
160 main (int argc, char **argv)
162 char *path; /* The current input entry. */
163 char *oldpath; /* The previous input entry. */
164 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
165 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
166 char bigram[3]; /* Bigram to search for in table. */
167 int code; /* Index of `bigram' in bigrams table. */
168 FILE *fp; /* Most common bigrams file. */
169 int line_len; /* Length of input line. */
171 set_program_name(argv[0]);
172 atexit (close_stdout);
174 bigram[2] = '\0';
176 if (argc != 2)
178 usage(stderr);
179 return 2;
182 if (0 == strcmp(argv[1], "--help"))
184 usage(stdout);
185 return 0;
187 else if (0 == strcmp(argv[1], "--version"))
189 printf (_("GNU findutils version %s\n"), version_string);
190 printf (_("Built using GNU gnulib version %s\n"), gnulib_version);
191 return 0;
194 fp = fopen (argv[1], "r");
195 if (fp == NULL)
197 fprintf (stderr, "%s: ", argv[0]);
198 perror (argv[1]);
199 return 1;
202 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
203 path = xmalloc (pathsize);
204 oldpath = xmalloc (oldpathsize);
206 /* Set to empty string, to force the first prefix count to 0. */
207 oldpath[0] = '\0';
208 oldcount = 0;
210 /* Copy the list of most common bigrams to the output,
211 padding with NULs if there are <128 of them. */
212 if (NULL == fgets (bigrams, 257, fp))
213 inerr(argv[1]);
215 if (256 != fwrite (bigrams, 1, 256, stdout))
216 outerr();
218 if (EOF == fclose (fp))
219 inerr(argv[1]);
221 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
223 char *pp;
225 path[line_len - 1] = '\0'; /* Remove newline. */
227 /* Squelch unprintable chars in path so as not to botch decoding. */
228 for (pp = path; *pp != '\0'; pp++)
230 if (!(*pp >= 040 && *pp < 0177))
231 *pp = '?';
234 count = prefix_length (oldpath, path);
235 diffcount = count - oldcount;
236 oldcount = count;
237 /* If the difference is small, it fits in one byte;
238 otherwise, two bytes plus a marker noting that fact. */
239 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
241 if (EOF ==- putc (LOCATEDB_OLD_ESCAPE, stdout))
242 outerr();
244 if (EOF == putw (diffcount + LOCATEDB_OLD_OFFSET, stdout))
245 outerr();
247 else
249 if (EOF == putc (diffcount + LOCATEDB_OLD_OFFSET, stdout))
250 outerr();
253 /* Look for bigrams in the remainder of the path. */
254 for (pp = path + count; *pp != '\0'; pp += 2)
256 if (pp[1] == '\0')
258 /* No bigram is possible; only one char is left. */
259 putchar (*pp);
260 break;
262 bigram[0] = *pp;
263 bigram[1] = pp[1];
264 /* Linear search for specific bigram in string table. */
265 code = strindex (bigrams, bigram);
266 if (code % 2 == 0)
267 putchar ((code / 2) | 0200); /* It's a common bigram. */
268 else
269 fputs (bigram, stdout); /* Write the text as printable ASCII. */
273 /* Swap path and oldpath and their sizes. */
274 char *tmppath = oldpath;
275 size_t tmppathsize = oldpathsize;
276 oldpath = path;
277 oldpathsize = pathsize;
278 path = tmppath;
279 pathsize = tmppathsize;
283 free (path);
284 free (oldpath);
286 return 0;