Added Bas van Gompel.
[findutils.git] / locate / code.c
blobd78287fc2deda52909d72c1f8b7175383c52089b
1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
17 USA.
20 /* Compress a sorted list.
21 Works with `find' to encode a filename database to save space
22 and search time.
24 Usage:
26 bigram < file_list > bigrams
27 process-bigrams > most_common_bigrams
28 code most_common_bigrams < file_list > squeezed_list
30 Uses `front compression' (see ";login:", March 1983, p. 8).
31 The output begins with the 128 most common bigrams.
32 After that, the output format is, for each line,
33 an offset (from the previous line) differential count byte
34 followed by a (partially bigram-encoded) ASCII remainder.
35 The output lines have no terminating byte; the start of the next line
36 is indicated by its first byte having a value <= 30.
38 The encoding of the output bytes is:
40 0-28 likeliest differential counts + offset (14) to make nonnegative
41 30 escape code for out-of-range count to follow in next halfword
42 128-255 bigram codes (the 128 most common, as determined by `updatedb')
43 32-127 single character (printable) ASCII remainder
45 Written by James A. Woods <jwoods@adobe.com>.
46 Modified by David MacKenzie <djm@gnu.ai.mit.edu>. */
48 #include <config.h>
49 #include <stdio.h>
50 #include <sys/types.h>
52 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
53 #include <string.h>
54 #else
55 #include <strings.h>
56 #endif
58 #ifdef STDC_HEADERS
59 #include <stdlib.h>
60 #endif
62 #if ENABLE_NLS
63 # include <libintl.h>
64 # define _(Text) gettext (Text)
65 #else
66 # define _(Text) Text
67 #define textdomain(Domain)
68 #define bindtextdomain(Package, Directory)
69 #endif
70 #ifdef gettext_noop
71 # define N_(String) gettext_noop (String)
72 #else
73 /* See locate.c for explanation as to why not use (String) */
74 # define N_(String) String
75 #endif
77 #include "locatedb.h"
78 #include <getline.h>
80 char *xmalloc PARAMS((size_t));
82 /* The name this program was run with. */
83 char *program_name;
85 /* The 128 most common bigrams in the file list, padded with NULs
86 if there are fewer. */
87 static char bigrams[257] = {0};
89 /* Return the offset of PATTERN in STRING, or -1 if not found. */
91 static int
92 strindex (char *string, char *pattern)
94 register char *s;
96 for (s = string; *s != '\0'; s++)
97 /* Fast first char check. */
98 if (*s == *pattern)
100 register char *p2 = pattern + 1, *s2 = s + 1;
101 while (*p2 != '\0' && *p2 == *s2)
102 p2++, s2++;
103 if (*p2 == '\0')
104 return s2 - strlen (pattern) - string;
106 return -1;
109 /* Return the length of the longest common prefix of strings S1 and S2. */
111 static int
112 prefix_length (char *s1, char *s2)
114 register char *start;
116 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
118 return s1 - start;
121 extern char *version_string;
123 static void
124 usage (stream)
125 FILE *stream;
127 fprintf (stream, _("\
128 Usage: %s [--version | --help]\n\
129 or %s most_common_bigrams < file-list > locate-database\n"),
130 program_name, program_name);
131 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
136 main (int argc, char **argv)
138 char *path; /* The current input entry. */
139 char *oldpath; /* The previous input entry. */
140 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
141 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
142 char bigram[3]; /* Bigram to search for in table. */
143 int code; /* Index of `bigram' in bigrams table. */
144 FILE *fp; /* Most common bigrams file. */
145 int line_len; /* Length of input line. */
147 program_name = argv[0];
149 bigram[2] = '\0';
151 if (argc != 2)
153 usage(stderr);
154 return 2;
157 if (0 == strcmp(argv[1], "--help"))
159 usage(stdout);
160 return 0;
162 else if (0 == strcmp(argv[1], "--version"))
164 printf (_("GNU findutils version %s\n"), version_string);
165 return 0;
168 fp = fopen (argv[1], "r");
169 if (fp == NULL)
171 fprintf (stderr, "%s: ", argv[0]);
172 perror (argv[1]);
173 return 1;
176 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
177 path = xmalloc (pathsize);
178 oldpath = xmalloc (oldpathsize);
180 /* Set to anything not starting with a slash, to force the first
181 prefix count to 0. */
182 strcpy (oldpath, " ");
183 oldcount = 0;
185 /* Copy the list of most common bigrams to the output,
186 padding with NULs if there are <128 of them. */
187 fgets (bigrams, 257, fp);
188 fwrite (bigrams, 1, 256, stdout);
189 fclose (fp);
191 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
193 char *pp;
195 path[line_len - 1] = '\0'; /* Remove newline. */
197 /* Squelch unprintable chars in path so as not to botch decoding. */
198 for (pp = path; *pp != '\0'; pp++)
200 if (!(*pp >= 040 && *pp < 0177))
201 *pp = '?';
204 count = prefix_length (oldpath, path);
205 diffcount = count - oldcount;
206 oldcount = count;
207 /* If the difference is small, it fits in one byte;
208 otherwise, two bytes plus a marker noting that fact. */
209 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
211 putc (LOCATEDB_OLD_ESCAPE, stdout);
212 putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
214 else
215 putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
217 /* Look for bigrams in the remainder of the path. */
218 for (pp = path + count; *pp != '\0'; pp += 2)
220 if (pp[1] == '\0')
222 /* No bigram is possible; only one char is left. */
223 putchar (*pp);
224 break;
226 bigram[0] = *pp;
227 bigram[1] = pp[1];
228 /* Linear search for specific bigram in string table. */
229 code = strindex (bigrams, bigram);
230 if (code % 2 == 0)
231 putchar ((code / 2) | 0200); /* It's a common bigram. */
232 else
233 fputs (bigram, stdout); /* Write the text as printable ASCII. */
237 /* Swap path and oldpath and their sizes. */
238 char *tmppath = oldpath;
239 size_t tmppathsize = oldpathsize;
240 oldpath = path;
241 oldpathsize = pathsize;
242 path = tmppath;
243 pathsize = tmppathsize;
247 free (path);
248 free (oldpath);
250 return 0;