Support the generation of slocate-format databases
[findutils.git] / locate / code.c
blob5cebfeef482364197bbdc707343120f5acffdd8b
1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
17 USA.
20 /* Compress a sorted list.
21 Works with `find' to encode a filename database to save space
22 and search time.
24 Usage:
26 bigram < file_list > bigrams
27 process-bigrams > most_common_bigrams
28 code most_common_bigrams < file_list > squeezed_list
30 Uses `front compression' (see ";login:", March 1983, p. 8).
31 The output begins with the 128 most common bigrams.
32 After that, the output format is, for each line,
33 an offset (from the previous line) differential count byte
34 followed by a (partially bigram-encoded) ASCII remainder.
35 The output lines have no terminating byte; the start of the next line
36 is indicated by its first byte having a value <= 30.
38 The encoding of the output bytes is:
40 0-28 likeliest differential counts + offset (14) to make nonnegative
41 30 escape code for out-of-range count to follow in next halfword
42 128-255 bigram codes (the 128 most common, as determined by `updatedb')
43 32-127 single character (printable) ASCII remainder
45 Written by James A. Woods <jwoods@adobe.com>.
46 Modified by David MacKenzie <djm@gnu.org>. */
48 #include <config.h>
49 #include <stdio.h>
50 #include <sys/types.h>
52 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
53 #include <string.h>
54 #else
55 #include <strings.h>
56 #endif
58 #ifdef STDC_HEADERS
59 #include <stdlib.h>
60 #endif
62 #if ENABLE_NLS
63 # include <libintl.h>
64 # define _(Text) gettext (Text)
65 #else
66 # define _(Text) Text
67 #define textdomain(Domain)
68 #define bindtextdomain(Package, Directory)
69 #endif
70 #ifdef gettext_noop
71 # define N_(String) gettext_noop (String)
72 #else
73 /* See locate.c for explanation as to why not use (String) */
74 # define N_(String) String
75 #endif
77 #include "locatedb.h"
78 #include <getline.h>
79 #include "closeout.h"
80 #include "gnulib-version.h"
82 char *xmalloc PARAMS((size_t));
84 /* The name this program was run with. */
85 char *program_name;
87 /* The 128 most common bigrams in the file list, padded with NULs
88 if there are fewer. */
89 static char bigrams[257] = {0};
91 /* Return the offset of PATTERN in STRING, or -1 if not found. */
93 static int
94 strindex (char *string, char *pattern)
96 register char *s;
98 for (s = string; *s != '\0'; s++)
99 /* Fast first char check. */
100 if (*s == *pattern)
102 register char *p2 = pattern + 1, *s2 = s + 1;
103 while (*p2 != '\0' && *p2 == *s2)
104 p2++, s2++;
105 if (*p2 == '\0')
106 return s2 - strlen (pattern) - string;
108 return -1;
111 /* Return the length of the longest common prefix of strings S1 and S2. */
113 static int
114 prefix_length (char *s1, char *s2)
116 register char *start;
118 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
120 return s1 - start;
123 extern char *version_string;
125 static void
126 usage (FILE *stream)
128 fprintf (stream, _("\
129 Usage: %s [--version | --help]\n\
130 or %s most_common_bigrams < file-list > locate-database\n"),
131 program_name, program_name);
132 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
137 main (int argc, char **argv)
139 char *path; /* The current input entry. */
140 char *oldpath; /* The previous input entry. */
141 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
142 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
143 char bigram[3]; /* Bigram to search for in table. */
144 int code; /* Index of `bigram' in bigrams table. */
145 FILE *fp; /* Most common bigrams file. */
146 int line_len; /* Length of input line. */
148 program_name = argv[0];
149 atexit (close_stdout);
151 bigram[2] = '\0';
153 if (argc != 2)
155 usage(stderr);
156 return 2;
159 if (0 == strcmp(argv[1], "--help"))
161 usage(stdout);
162 return 0;
164 else if (0 == strcmp(argv[1], "--version"))
166 printf (_("GNU findutils version %s\n"), version_string);
167 printf (_("Built using GNU gnulib version %s\n"), gnulib_version);
168 return 0;
171 fp = fopen (argv[1], "r");
172 if (fp == NULL)
174 fprintf (stderr, "%s: ", argv[0]);
175 perror (argv[1]);
176 return 1;
179 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
180 path = xmalloc (pathsize);
181 oldpath = xmalloc (oldpathsize);
183 /* Set to empty string, to force the first prefix count to 0. */
184 oldpath[0] = '\0';
185 oldcount = 0;
187 /* Copy the list of most common bigrams to the output,
188 padding with NULs if there are <128 of them. */
189 fgets (bigrams, 257, fp);
190 fwrite (bigrams, 1, 256, stdout);
191 fclose (fp);
193 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
195 char *pp;
197 path[line_len - 1] = '\0'; /* Remove newline. */
199 /* Squelch unprintable chars in path so as not to botch decoding. */
200 for (pp = path; *pp != '\0'; pp++)
202 if (!(*pp >= 040 && *pp < 0177))
203 *pp = '?';
206 count = prefix_length (oldpath, path);
207 diffcount = count - oldcount;
208 oldcount = count;
209 /* If the difference is small, it fits in one byte;
210 otherwise, two bytes plus a marker noting that fact. */
211 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
213 putc (LOCATEDB_OLD_ESCAPE, stdout);
214 putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
216 else
217 putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
219 /* Look for bigrams in the remainder of the path. */
220 for (pp = path + count; *pp != '\0'; pp += 2)
222 if (pp[1] == '\0')
224 /* No bigram is possible; only one char is left. */
225 putchar (*pp);
226 break;
228 bigram[0] = *pp;
229 bigram[1] = pp[1];
230 /* Linear search for specific bigram in string table. */
231 code = strindex (bigrams, bigram);
232 if (code % 2 == 0)
233 putchar ((code / 2) | 0200); /* It's a common bigram. */
234 else
235 fputs (bigram, stdout); /* Write the text as printable ASCII. */
239 /* Swap path and oldpath and their sizes. */
240 char *tmppath = oldpath;
241 size_t tmppathsize = oldpathsize;
242 oldpath = path;
243 oldpathsize = pathsize;
244 path = tmppath;
245 pathsize = tmppathsize;
249 free (path);
250 free (oldpath);
252 return 0;