1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994, 2005, 2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
20 /* Compress a sorted list.
21 Works with `find' to encode a filename database to save space
26 bigram < file_list > bigrams
27 process-bigrams > most_common_bigrams
28 code most_common_bigrams < file_list > squeezed_list
30 Uses `front compression' (see ";login:", March 1983, p. 8).
31 The output begins with the 128 most common bigrams.
32 After that, the output format is, for each line,
33 an offset (from the previous line) differential count byte
34 followed by a (partially bigram-encoded) ASCII remainder.
35 The output lines have no terminating byte; the start of the next line
36 is indicated by its first byte having a value <= 30.
38 The encoding of the output bytes is:
40 0-28 likeliest differential counts + offset (14) to make nonnegative
41 30 escape code for out-of-range count to follow in next halfword
42 128-255 bigram codes (the 128 most common, as determined by `updatedb')
43 32-127 single character (printable) ASCII remainder
45 Written by James A. Woods <jwoods@adobe.com>.
46 Modified by David MacKenzie <djm@gnu.org>. */
50 #include <sys/types.h>
62 # define _(Text) gettext (Text)
65 #define textdomain(Domain)
66 #define bindtextdomain(Package, Directory)
69 # define N_(String) gettext_noop (String)
71 /* See locate.c for explanation as to why not use (String) */
72 # define N_(String) String
79 #include "gnulib-version.h"
83 #ifndef ATTRIBUTE_NORETURN
84 # define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
88 /* The name this program was run with. */
89 const char *program_name
;
91 /* The 128 most common bigrams in the file list, padded with NULs
92 if there are fewer. */
93 static char bigrams
[257] = {0};
95 /* Return the offset of PATTERN in STRING, or -1 if not found. */
98 strindex (char *string
, char *pattern
)
102 for (s
= string
; *s
!= '\0'; s
++)
103 /* Fast first char check. */
106 register char *p2
= pattern
+ 1, *s2
= s
+ 1;
107 while (*p2
!= '\0' && *p2
== *s2
)
110 return s2
- strlen (pattern
) - string
;
115 /* Return the length of the longest common prefix of strings S1 and S2. */
118 prefix_length (char *s1
, char *s2
)
120 register char *start
;
122 for (start
= s1
; *s1
== *s2
&& *s1
!= '\0'; s1
++, s2
++)
127 extern char *version_string
;
132 fprintf (stream
, _("\
133 Usage: %s [--version | --help]\n\
134 or %s most_common_bigrams < file-list > locate-database\n"),
135 program_name
, program_name
);
136 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream
);
140 static void inerr (const char *filename
) ATTRIBUTE_NORETURN
;
141 static void outerr(void) ATTRIBUTE_NORETURN
;
144 inerr(const char *filename
)
146 error(1, errno
, "%s", filename
);
154 error(1, errno
, _("write error"));
161 main (int argc
, char **argv
)
163 char *path
; /* The current input entry. */
164 char *oldpath
; /* The previous input entry. */
165 size_t pathsize
, oldpathsize
; /* Amounts allocated for them. */
166 int count
, oldcount
, diffcount
; /* Their prefix lengths & the difference. */
167 char bigram
[3]; /* Bigram to search for in table. */
168 int code
; /* Index of `bigram' in bigrams table. */
169 FILE *fp
; /* Most common bigrams file. */
170 int line_len
; /* Length of input line. */
172 set_program_name(argv
[0]);
173 atexit (close_stdout
);
183 if (0 == strcmp(argv
[1], "--help"))
188 else if (0 == strcmp(argv
[1], "--version"))
190 printf (_("GNU findutils version %s\n"), version_string
);
191 printf (_("Built using GNU gnulib version %s\n"), gnulib_version
);
195 fp
= fopen (argv
[1], "r");
198 fprintf (stderr
, "%s: ", argv
[0]);
203 pathsize
= oldpathsize
= 1026; /* Increased as necessary by getline. */
204 path
= xmalloc (pathsize
);
205 oldpath
= xmalloc (oldpathsize
);
207 /* Set to empty string, to force the first prefix count to 0. */
211 /* Copy the list of most common bigrams to the output,
212 padding with NULs if there are <128 of them. */
213 if (NULL
== fgets (bigrams
, 257, fp
))
216 if (256 != fwrite (bigrams
, 1, 256, stdout
))
219 if (EOF
== fclose (fp
))
222 while ((line_len
= getline (&path
, &pathsize
, stdin
)) > 0)
226 path
[line_len
- 1] = '\0'; /* Remove newline. */
228 /* Squelch unprintable chars in path so as not to botch decoding. */
229 for (pp
= path
; *pp
!= '\0'; pp
++)
231 if (!(*pp
>= 040 && *pp
< 0177))
235 count
= prefix_length (oldpath
, path
);
236 diffcount
= count
- oldcount
;
238 /* If the difference is small, it fits in one byte;
239 otherwise, two bytes plus a marker noting that fact. */
240 if (diffcount
< -LOCATEDB_OLD_OFFSET
|| diffcount
> LOCATEDB_OLD_OFFSET
)
242 if (EOF
==- putc (LOCATEDB_OLD_ESCAPE
, stdout
))
245 if (!putword (stdout
,
246 diffcount
+LOCATEDB_OLD_OFFSET
,
247 GetwordEndianStateNative
))
252 if (EOF
== putc (diffcount
+ LOCATEDB_OLD_OFFSET
, stdout
))
256 /* Look for bigrams in the remainder of the path. */
257 for (pp
= path
+ count
; *pp
!= '\0'; pp
+= 2)
261 /* No bigram is possible; only one char is left. */
267 /* Linear search for specific bigram in string table. */
268 code
= strindex (bigrams
, bigram
);
270 putchar ((code
/ 2) | 0200); /* It's a common bigram. */
272 fputs (bigram
, stdout
); /* Write the text as printable ASCII. */
276 /* Swap path and oldpath and their sizes. */
277 char *tmppath
= oldpath
;
278 size_t tmppathsize
= oldpathsize
;
280 oldpathsize
= pathsize
;
282 pathsize
= tmppathsize
;