Merged changes made for version 4.1.20 onto the trunk
[findutils.git] / locate / code.c
blobaa9b181f3a34a548e964b92a359df7a199b2c87a
1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 9 Temple Place - Suite 330, Boston, MA 02111-1307,
17 USA.
20 /* Compress a sorted list.
21 Works with `find' to encode a filename database to save space
22 and search time.
24 Usage:
26 bigram < file_list > bigrams
27 process-bigrams > most_common_bigrams
28 code most_common_bigrams < file_list > squeezed_list
30 Uses `front compression' (see ";login:", March 1983, p. 8).
31 The output begins with the 128 most common bigrams.
32 After that, the output format is, for each line,
33 an offset (from the previous line) differential count byte
34 followed by a (partially bigram-encoded) ASCII remainder.
35 The output lines have no terminating byte; the start of the next line
36 is indicated by its first byte having a value <= 30.
38 The encoding of the output bytes is:
40 0-28 likeliest differential counts + offset (14) to make nonnegative
41 30 escape code for out-of-range count to follow in next halfword
42 128-255 bigram codes (the 128 most common, as determined by `updatedb')
43 32-127 single character (printable) ASCII remainder
45 Written by James A. Woods <jwoods@adobe.com>.
46 Modified by David MacKenzie <djm@gnu.ai.mit.edu>. */
48 #include <gnulib/config.h>
49 #undef VERSION
50 #undef PACKAGE_VERSION
51 #undef PACKAGE_TARNAME
52 #undef PACKAGE_STRING
53 #undef PACKAGE_NAME
54 #undef PACKAGE
55 #include <config.h>
56 #include <stdio.h>
57 #include <sys/types.h>
59 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
60 #include <string.h>
61 #else
62 #include <strings.h>
63 #endif
65 #ifdef STDC_HEADERS
66 #include <stdlib.h>
67 #endif
69 #if ENABLE_NLS
70 # include <libintl.h>
71 # define _(Text) gettext (Text)
72 #else
73 # define _(Text) Text
74 #define textdomain(Domain)
75 #define bindtextdomain(Package, Directory)
76 #endif
77 #ifdef gettext_noop
78 # define N_(String) gettext_noop (String)
79 #else
80 # define N_(String) (String)
81 #endif
83 #include "locatedb.h"
84 #include <getline.h>
86 char *xmalloc PARAMS((size_t));
88 /* The name this program was run with. */
89 char *program_name;
91 /* The 128 most common bigrams in the file list, padded with NULs
92 if there are fewer. */
93 static char bigrams[257] = {0};
95 /* Return the offset of PATTERN in STRING, or -1 if not found. */
97 static int
98 strindex (char *string, char *pattern)
100 register char *s;
102 for (s = string; *s != '\0'; s++)
103 /* Fast first char check. */
104 if (*s == *pattern)
106 register char *p2 = pattern + 1, *s2 = s + 1;
107 while (*p2 != '\0' && *p2 == *s2)
108 p2++, s2++;
109 if (*p2 == '\0')
110 return s2 - strlen (pattern) - string;
112 return -1;
115 /* Return the length of the longest common prefix of strings S1 and S2. */
117 static int
118 prefix_length (char *s1, char *s2)
120 register char *start;
122 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
124 return s1 - start;
128 main (int argc, char **argv)
130 char *path; /* The current input entry. */
131 char *oldpath; /* The previous input entry. */
132 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
133 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
134 char bigram[3]; /* Bigram to search for in table. */
135 int code; /* Index of `bigram' in bigrams table. */
136 FILE *fp; /* Most common bigrams file. */
137 int line_len; /* Length of input line. */
139 program_name = argv[0];
141 bigram[2] = '\0';
143 if (argc != 2)
145 fprintf (stderr, _("Usage: %s most_common_bigrams < list > coded_list\n"),
146 argv[0]);
147 exit (2);
150 fp = fopen (argv[1], "r");
151 if (fp == NULL)
153 fprintf (stderr, "%s: ", argv[0]);
154 perror (argv[1]);
155 exit (1);
158 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
159 path = xmalloc (pathsize);
160 oldpath = xmalloc (oldpathsize);
162 /* Set to anything not starting with a slash, to force the first
163 prefix count to 0. */
164 strcpy (oldpath, " ");
165 oldcount = 0;
167 /* Copy the list of most common bigrams to the output,
168 padding with NULs if there are <128 of them. */
169 fgets (bigrams, 257, fp);
170 fwrite (bigrams, 1, 256, stdout);
171 fclose (fp);
173 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
175 char *pp;
177 path[line_len - 1] = '\0'; /* Remove newline. */
179 /* Squelch unprintable chars in path so as not to botch decoding. */
180 for (pp = path; *pp != '\0'; pp++)
182 if (!(*pp >= 040 && *pp < 0177))
183 *pp = '?';
186 count = prefix_length (oldpath, path);
187 diffcount = count - oldcount;
188 oldcount = count;
189 /* If the difference is small, it fits in one byte;
190 otherwise, two bytes plus a marker noting that fact. */
191 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
193 putc (LOCATEDB_OLD_ESCAPE, stdout);
194 putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
196 else
197 putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
199 /* Look for bigrams in the remainder of the path. */
200 for (pp = path + count; *pp != '\0'; pp += 2)
202 if (pp[1] == '\0')
204 /* No bigram is possible; only one char is left. */
205 putchar (*pp);
206 break;
208 bigram[0] = *pp;
209 bigram[1] = pp[1];
210 /* Linear search for specific bigram in string table. */
211 code = strindex (bigrams, bigram);
212 if (code % 2 == 0)
213 putchar ((code / 2) | 0200); /* It's a common bigram. */
214 else
215 fputs (bigram, stdout); /* Write the text as printable ASCII. */
219 /* Swap path and oldpath and their sizes. */
220 char *tmppath = oldpath;
221 size_t tmppathsize = oldpathsize;
222 oldpath = path;
223 oldpathsize = pathsize;
224 path = tmppath;
225 pathsize = tmppathsize;
229 free (path);
230 free (oldpath);
232 exit (0);