1 /* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* Compress a sorted list.
19 Works with `find' to encode a filename database to save space
24 bigram < file_list > bigrams
25 process-bigrams > most_common_bigrams
26 code most_common_bigrams < file_list > squeezed_list
28 Uses `front compression' (see ";login:", March 1983, p. 8).
29 The output begins with the 128 most common bigrams.
30 After that, the output format is, for each line,
31 an offset (from the previous line) differential count byte
32 followed by a (partially bigram-encoded) ASCII remainder.
33 The output lines have no terminating byte; the start of the next line
34 is indicated by its first byte having a value <= 30.
36 The encoding of the output bytes is:
38 0-28 likeliest differential counts + offset (14) to make nonnegative
39 30 escape code for out-of-range count to follow in next halfword
40 128-255 bigram codes (the 128 most common, as determined by `updatedb')
41 32-127 single character (printable) ASCII remainder
43 Written by James A. Woods <jwoods@adobe.com>.
44 Modified by David MacKenzie <djm@gnu.ai.mit.edu>. */
48 #include <sys/types.h>
50 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
62 # define _(Text) gettext (Text)
65 #define textdomain(Domain)
66 #define bindtextdomain(Package, Directory)
69 # define N_(String) gettext_noop (String)
71 # define N_(String) (String)
79 /* The name this program was run with. */
82 /* The 128 most common bigrams in the file list, padded with NULs
83 if there are fewer. */
84 static char bigrams
[257] = {0};
86 /* Return the offset of PATTERN in STRING, or -1 if not found. */
89 strindex (string
, pattern
)
90 char *string
, *pattern
;
94 for (s
= string
; *s
!= '\0'; s
++)
95 /* Fast first char check. */
98 register char *p2
= pattern
+ 1, *s2
= s
+ 1;
99 while (*p2
!= '\0' && *p2
== *s2
)
102 return s2
- strlen (pattern
) - string
;
107 /* Return the length of the longest common prefix of strings S1 and S2. */
110 prefix_length (s1
, s2
)
113 register char *start
;
115 for (start
= s1
; *s1
== *s2
&& *s1
!= '\0'; s1
++, s2
++)
125 char *path
; /* The current input entry. */
126 char *oldpath
; /* The previous input entry. */
127 size_t pathsize
, oldpathsize
; /* Amounts allocated for them. */
128 int count
, oldcount
, diffcount
; /* Their prefix lengths & the difference. */
129 char bigram
[3]; /* Bigram to search for in table. */
130 int code
; /* Index of `bigram' in bigrams table. */
131 FILE *fp
; /* Most common bigrams file. */
132 int line_len
; /* Length of input line. */
134 program_name
= argv
[0];
140 fprintf (stderr
, _("Usage: %s most_common_bigrams < list > coded_list\n"),
145 fp
= fopen (argv
[1], "r");
148 fprintf (stderr
, "%s: ", argv
[0]);
153 pathsize
= oldpathsize
= 1026; /* Increased as necessary by getline. */
154 path
= xmalloc (pathsize
);
155 oldpath
= xmalloc (oldpathsize
);
157 /* Set to anything not starting with a slash, to force the first
158 prefix count to 0. */
159 strcpy (oldpath
, " ");
162 /* Copy the list of most common bigrams to the output,
163 padding with NULs if there are <128 of them. */
164 fgets (bigrams
, 257, fp
);
165 fwrite (bigrams
, 1, 256, stdout
);
168 while ((line_len
= getline (&path
, &pathsize
, stdin
)) > 0)
172 path
[line_len
- 1] = '\0'; /* Remove newline. */
174 /* Squelch unprintable chars in path so as not to botch decoding. */
175 for (pp
= path
; *pp
!= '\0'; pp
++)
178 if (*pp
< 040 || *pp
== 0177)
182 count
= prefix_length (oldpath
, path
);
183 diffcount
= count
- oldcount
;
185 /* If the difference is small, it fits in one byte;
186 otherwise, two bytes plus a marker noting that fact. */
187 if (diffcount
< -LOCATEDB_OLD_OFFSET
|| diffcount
> LOCATEDB_OLD_OFFSET
)
189 putc (LOCATEDB_OLD_ESCAPE
, stdout
);
190 putw (diffcount
+ LOCATEDB_OLD_OFFSET
, stdout
);
193 putc (diffcount
+ LOCATEDB_OLD_OFFSET
, stdout
);
195 /* Look for bigrams in the remainder of the path. */
196 for (pp
= path
+ count
; *pp
!= '\0'; pp
+= 2)
200 /* No bigram is possible; only one char is left. */
206 /* Linear search for specific bigram in string table. */
207 code
= strindex (bigrams
, bigram
);
209 putchar ((code
/ 2) | 0200); /* It's a common bigram. */
211 fputs (bigram
, stdout
); /* Write the text as printable ASCII. */
215 /* Swap path and oldpath and their sizes. */
216 char *tmppath
= oldpath
;
217 size_t tmppathsize
= oldpathsize
;
219 oldpathsize
= pathsize
;
221 pathsize
= tmppathsize
;