*** empty log message ***
[findutils.git] / locate / code.c
blob9dd25e87db5b487fcab1fc2729e722e049311459
1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* Compress a sorted list.
19 Works with `find' to encode a filename database to save space
20 and search time.
22 Usage:
24 bigram < file_list > bigrams
25 process-bigrams > most_common_bigrams
26 code most_common_bigrams < file_list > squeezed_list
28 Uses `front compression' (see ";login:", March 1983, p. 8).
29 The output begins with the 128 most common bigrams.
30 After that, the output format is, for each line,
31 an offset (from the previous line) differential count byte
32 followed by a (partially bigram-encoded) ASCII remainder.
33 The output lines have no terminating byte; the start of the next line
34 is indicated by its first byte having a value <= 30.
36 The encoding of the output bytes is:
38 0-28 likeliest differential counts + offset (14) to make nonnegative
39 30 escape code for out-of-range count to follow in next halfword
40 128-255 bigram codes (the 128 most common, as determined by `updatedb')
41 32-127 single character (printable) ASCII remainder
43 Written by James A. Woods <jwoods@adobe.com>.
44 Modified by David MacKenzie <djm@gnu.ai.mit.edu>. */
46 #include <config.h>
47 #include <stdio.h>
48 #include <sys/types.h>
50 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
51 #include <string.h>
52 #else
53 #include <strings.h>
54 #endif
56 #ifdef STDC_HEADERS
57 #include <stdlib.h>
58 #endif
60 #if ENABLE_NLS
61 # include <libintl.h>
62 # define _(Text) gettext (Text)
63 #else
64 # define _(Text) Text
65 #define textdomain(Domain)
66 #define bindtextdomain(Package, Directory)
67 #endif
68 #ifdef gettext_noop
69 # define N_(String) gettext_noop (String)
70 #else
71 # define N_(String) (String)
72 #endif
74 #include "locatedb.h"
75 #include <getline.h>
77 char *xmalloc PARAMS((size_t));
79 /* The name this program was run with. */
80 char *program_name;
82 /* The 128 most common bigrams in the file list, padded with NULs
83 if there are fewer. */
84 static char bigrams[257] = {0};
86 /* Return the offset of PATTERN in STRING, or -1 if not found. */
88 static int
89 strindex (char *string, char *pattern)
91 register char *s;
93 for (s = string; *s != '\0'; s++)
94 /* Fast first char check. */
95 if (*s == *pattern)
97 register char *p2 = pattern + 1, *s2 = s + 1;
98 while (*p2 != '\0' && *p2 == *s2)
99 p2++, s2++;
100 if (*p2 == '\0')
101 return s2 - strlen (pattern) - string;
103 return -1;
106 /* Return the length of the longest common prefix of strings S1 and S2. */
108 static int
109 prefix_length (char *s1, char *s2)
111 register char *start;
113 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
115 return s1 - start;
119 main (int argc, char **argv)
121 char *path; /* The current input entry. */
122 char *oldpath; /* The previous input entry. */
123 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
124 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
125 char bigram[3]; /* Bigram to search for in table. */
126 int code; /* Index of `bigram' in bigrams table. */
127 FILE *fp; /* Most common bigrams file. */
128 int line_len; /* Length of input line. */
130 program_name = argv[0];
132 bigram[2] = '\0';
134 if (argc != 2)
136 fprintf (stderr, _("Usage: %s most_common_bigrams < list > coded_list\n"),
137 argv[0]);
138 exit (2);
141 fp = fopen (argv[1], "r");
142 if (fp == NULL)
144 fprintf (stderr, "%s: ", argv[0]);
145 perror (argv[1]);
146 exit (1);
149 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
150 path = xmalloc (pathsize);
151 oldpath = xmalloc (oldpathsize);
153 /* Set to anything not starting with a slash, to force the first
154 prefix count to 0. */
155 strcpy (oldpath, " ");
156 oldcount = 0;
158 /* Copy the list of most common bigrams to the output,
159 padding with NULs if there are <128 of them. */
160 fgets (bigrams, 257, fp);
161 fwrite (bigrams, 1, 256, stdout);
162 fclose (fp);
164 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
166 char *pp;
168 path[line_len - 1] = '\0'; /* Remove newline. */
170 /* Squelch unprintable chars in path so as not to botch decoding. */
171 for (pp = path; *pp != '\0'; pp++)
173 *pp &= 0177;
174 if (*pp < 040 || *pp == 0177)
175 *pp = '?';
178 count = prefix_length (oldpath, path);
179 diffcount = count - oldcount;
180 oldcount = count;
181 /* If the difference is small, it fits in one byte;
182 otherwise, two bytes plus a marker noting that fact. */
183 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
185 putc (LOCATEDB_OLD_ESCAPE, stdout);
186 putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
188 else
189 putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
191 /* Look for bigrams in the remainder of the path. */
192 for (pp = path + count; *pp != '\0'; pp += 2)
194 if (pp[1] == '\0')
196 /* No bigram is possible; only one char is left. */
197 putchar (*pp);
198 break;
200 bigram[0] = *pp;
201 bigram[1] = pp[1];
202 /* Linear search for specific bigram in string table. */
203 code = strindex (bigrams, bigram);
204 if (code % 2 == 0)
205 putchar ((code / 2) | 0200); /* It's a common bigram. */
206 else
207 fputs (bigram, stdout); /* Write the text as printable ASCII. */
211 /* Swap path and oldpath and their sizes. */
212 char *tmppath = oldpath;
213 size_t tmppathsize = oldpathsize;
214 oldpath = path;
215 oldpathsize = pathsize;
216 path = tmppath;
217 pathsize = tmppathsize;
221 free (path);
222 free (oldpath);
224 exit (0);