Ignore the usually-ignored files in git
[findutils.git] / locate / code.c
blobe384c9461d79853bb2415c520f0d1ad74f521c18
1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994, 2005, 2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
17 USA.
20 /* Compress a sorted list.
21 Works with `find' to encode a filename database to save space
22 and search time.
24 Usage:
26 bigram < file_list > bigrams
27 process-bigrams > most_common_bigrams
28 code most_common_bigrams < file_list > squeezed_list
30 Uses `front compression' (see ";login:", March 1983, p. 8).
31 The output begins with the 128 most common bigrams.
32 After that, the output format is, for each line,
33 an offset (from the previous line) differential count byte
34 followed by a (partially bigram-encoded) ASCII remainder.
35 The output lines have no terminating byte; the start of the next line
36 is indicated by its first byte having a value <= 30.
38 The encoding of the output bytes is:
40 0-28 likeliest differential counts + offset (14) to make nonnegative
41 30 escape code for out-of-range count to follow in next halfword
42 128-255 bigram codes (the 128 most common, as determined by `updatedb')
43 32-127 single character (printable) ASCII remainder
45 Written by James A. Woods <jwoods@adobe.com>.
46 Modified by David MacKenzie <djm@gnu.org>. */
48 #include <config.h>
49 #include <stdio.h>
50 #include <sys/types.h>
51 #include <string.h>
52 #include <errno.h>
53 #include <stdbool.h>
56 #ifdef STDC_HEADERS
57 #include <stdlib.h>
58 #endif
60 #if ENABLE_NLS
61 # include <libintl.h>
62 # define _(Text) gettext (Text)
63 #else
64 # define _(Text) Text
65 #define textdomain(Domain)
66 #define bindtextdomain(Package, Directory)
67 #endif
68 #ifdef gettext_noop
69 # define N_(String) gettext_noop (String)
70 #else
71 /* See locate.c for explanation as to why not use (String) */
72 # define N_(String) String
73 #endif
75 #include "locatedb.h"
76 #include <getline.h>
77 #include "closeout.h"
78 #include "xalloc.h"
79 #include "gnulib-version.h"
80 #include "progname.h"
81 #include "error.h"
83 #ifndef ATTRIBUTE_NORETURN
84 # define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
85 #endif
88 /* The name this program was run with. */
89 const char *program_name;
91 /* The 128 most common bigrams in the file list, padded with NULs
92 if there are fewer. */
93 static char bigrams[257] = {0};
95 /* Return the offset of PATTERN in STRING, or -1 if not found. */
97 static int
98 strindex (char *string, char *pattern)
100 register char *s;
102 for (s = string; *s != '\0'; s++)
103 /* Fast first char check. */
104 if (*s == *pattern)
106 register char *p2 = pattern + 1, *s2 = s + 1;
107 while (*p2 != '\0' && *p2 == *s2)
108 p2++, s2++;
109 if (*p2 == '\0')
110 return s2 - strlen (pattern) - string;
112 return -1;
115 /* Return the length of the longest common prefix of strings S1 and S2. */
117 static int
118 prefix_length (char *s1, char *s2)
120 register char *start;
122 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
124 return s1 - start;
127 extern char *version_string;
129 static void
130 usage (FILE *stream)
132 fprintf (stream, _("\
133 Usage: %s [--version | --help]\n\
134 or %s most_common_bigrams < file-list > locate-database\n"),
135 program_name, program_name);
136 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
140 static void inerr (const char *filename) ATTRIBUTE_NORETURN;
141 static void outerr(void) ATTRIBUTE_NORETURN;
143 static void
144 inerr(const char *filename)
146 error(1, errno, "%s", filename);
147 /*NOTREACHED*/
148 abort();
151 static void
152 outerr(void)
154 error(1, errno, _("write error"));
155 /*NOTREACHED*/
156 abort();
161 main (int argc, char **argv)
163 char *path; /* The current input entry. */
164 char *oldpath; /* The previous input entry. */
165 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
166 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
167 char bigram[3]; /* Bigram to search for in table. */
168 int code; /* Index of `bigram' in bigrams table. */
169 FILE *fp; /* Most common bigrams file. */
170 int line_len; /* Length of input line. */
172 set_program_name(argv[0]);
173 atexit (close_stdout);
175 bigram[2] = '\0';
177 if (argc != 2)
179 usage(stderr);
180 return 2;
183 if (0 == strcmp(argv[1], "--help"))
185 usage(stdout);
186 return 0;
188 else if (0 == strcmp(argv[1], "--version"))
190 printf (_("GNU findutils version %s\n"), version_string);
191 printf (_("Built using GNU gnulib version %s\n"), gnulib_version);
192 return 0;
195 fp = fopen (argv[1], "r");
196 if (fp == NULL)
198 fprintf (stderr, "%s: ", argv[0]);
199 perror (argv[1]);
200 return 1;
203 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
204 path = xmalloc (pathsize);
205 oldpath = xmalloc (oldpathsize);
207 /* Set to empty string, to force the first prefix count to 0. */
208 oldpath[0] = '\0';
209 oldcount = 0;
211 /* Copy the list of most common bigrams to the output,
212 padding with NULs if there are <128 of them. */
213 if (NULL == fgets (bigrams, 257, fp))
214 inerr(argv[1]);
216 if (256 != fwrite (bigrams, 1, 256, stdout))
217 outerr();
219 if (EOF == fclose (fp))
220 inerr(argv[1]);
222 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
224 char *pp;
226 path[line_len - 1] = '\0'; /* Remove newline. */
228 /* Squelch unprintable chars in path so as not to botch decoding. */
229 for (pp = path; *pp != '\0'; pp++)
231 if (!(*pp >= 040 && *pp < 0177))
232 *pp = '?';
235 count = prefix_length (oldpath, path);
236 diffcount = count - oldcount;
237 oldcount = count;
238 /* If the difference is small, it fits in one byte;
239 otherwise, two bytes plus a marker noting that fact. */
240 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
242 if (EOF ==- putc (LOCATEDB_OLD_ESCAPE, stdout))
243 outerr ();
245 if (!putword (stdout,
246 diffcount+LOCATEDB_OLD_OFFSET,
247 GetwordEndianStateNative))
248 outerr ();
250 else
252 if (EOF == putc (diffcount + LOCATEDB_OLD_OFFSET, stdout))
253 outerr ();
256 /* Look for bigrams in the remainder of the path. */
257 for (pp = path + count; *pp != '\0'; pp += 2)
259 if (pp[1] == '\0')
261 /* No bigram is possible; only one char is left. */
262 putchar (*pp);
263 break;
265 bigram[0] = *pp;
266 bigram[1] = pp[1];
267 /* Linear search for specific bigram in string table. */
268 code = strindex (bigrams, bigram);
269 if (code % 2 == 0)
270 putchar ((code / 2) | 0200); /* It's a common bigram. */
271 else
272 fputs (bigram, stdout); /* Write the text as printable ASCII. */
276 /* Swap path and oldpath and their sizes. */
277 char *tmppath = oldpath;
278 size_t tmppathsize = oldpathsize;
279 oldpath = path;
280 oldpathsize = pathsize;
281 path = tmppath;
282 pathsize = tmppathsize;
286 free (path);
287 free (oldpath);
289 return 0;