groff before CVS: release 1.06
[s-roff.git] / libbib / linear.cc
blob0798008210e4003bae2349809262be22acbd4ee1
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License along
18 with groff; see the file COPYING. If not, write to the Free Software
19 Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
22 #include <string.h>
23 #include <stdlib.h>
24 #include <assert.h>
25 #include <errno.h>
27 #include "posix.h"
28 #include "lib.h"
29 #include "errarg.h"
30 #include "error.h"
31 #include "cset.h"
32 #include "cmap.h"
34 #include "refid.h"
35 #include "search.h"
37 class file_buffer {
38 char *buffer;
39 char *bufend;
40 public:
41 file_buffer();
42 ~file_buffer();
43 int load(int fd, const char *filename);
44 const char *get_start() const;
45 const char *get_end() const;
48 typedef unsigned char uchar;
50 static uchar map[256];
51 static uchar inv_map[256][3];
53 struct map_init {
54 map_init();
57 static map_init the_map_init;
59 map_init::map_init()
61 for (int i = 0; i < 256; i++)
62 map[i] = csalnum(i) ? cmlower(i) : '\0';
63 for (i = 0; i < 256; i++) {
64 if (cslower(i)) {
65 inv_map[i][0] = i;
66 inv_map[i][1] = cmupper(i);
67 inv_map[i][2] = '\0';
69 else if (csdigit(i)) {
70 inv_map[i][0] = i;
71 inv_map[i][1] = 0;
73 else
74 inv_map[i][0] = '\0';
79 class bmpattern {
80 char *pat;
81 int len;
82 int delta[256];
83 public:
84 bmpattern(const char *pattern, int pattern_length);
85 ~bmpattern();
86 const char *search(const char *p, const char *end) const;
87 int length() const;
90 bmpattern::bmpattern(const char *pattern, int pattern_length)
91 : len(pattern_length)
93 pat = new char[len];
94 int i;
95 for (i = 0; i < len; i++)
96 pat[i] = map[uchar(pattern[i])];
97 for (i = 0; i < 256; i++)
98 delta[i] = len;
99 for (i = 0; i < len; i++)
100 for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
101 delta[*inv] = len - i - 1;
104 const char *bmpattern::search(const char *buf, const char *end) const
106 int buflen = end - buf;
107 if (len > buflen)
108 return 0;
109 const char *strend;
110 if (buflen > len*4)
111 strend = end - len*4;
112 else
113 strend = buf;
114 const char *k = buf + len - 1;
115 const int *del = delta;
116 const char *pattern = pat;
117 for (;;) {
118 while (k < strend) {
119 int t = del[uchar(*k)];
120 if (!t)
121 break;
122 k += t;
123 k += del[uchar(*k)];
124 k += del[uchar(*k)];
126 while (k < end && del[uchar(*k)] != 0)
127 k++;
128 if (k == end)
129 break;
130 int j = len - 1;
131 const char *s = k;
132 for (;;) {
133 if (j == 0)
134 return s;
135 if (map[uchar(*--s)] != pattern[--j])
136 break;
138 k++;
140 return 0;
143 bmpattern::~bmpattern()
145 a_delete pat;
148 inline int bmpattern::length() const
150 return len;
154 static const char *find_end(const char *bufend, const char *p);
156 const char *linear_searcher::search_and_check(const bmpattern *key,
157 const char *buf, const char *bufend, const char **start) const
159 assert(buf[-1] == '\n');
160 assert(bufend[-1] == '\n');
161 const char *ptr = buf;
162 for (;;) {
163 const char *found = key->search(ptr, bufend);
164 if (!found)
165 break;
166 if (check_match(buf, bufend, found, key->length(), &ptr, start))
167 return found;
169 return 0;
172 static const char *skip_field(const char *end, const char *p)
174 for (;;)
175 if (*p++ == '\n') {
176 if (p == end || *p == '%')
177 break;
178 for (const char *q = p; *q == ' ' || *q == '\t'; q++)
180 if (*q == '\n')
181 break;
182 p = q + 1;
184 return p;
187 static const char *find_end(const char *bufend, const char *p)
189 for (;;)
190 if (*p++ == '\n') {
191 if (p == bufend)
192 break;
193 for (const char *q = p; *q == ' ' || *q == '\t'; q++)
195 if (*q == '\n')
196 break;
197 p = q + 1;
199 return p;
203 int linear_searcher::check_match(const char *buf, const char *bufend,
204 const char *match, int matchlen,
205 const char **cont, const char **start) const
207 *cont = match + 1;
208 // The user is required to supply only the first truncate_len characters
209 // of the key. If truncate_len <= 0, he must supply all the key.
210 if ((truncate_len <= 0 || matchlen < truncate_len)
211 && map[uchar(match[matchlen])] != '\0')
212 return 0;
214 // The character before the match must not be an alphanumeric
215 // character (unless the alphanumeric character follows one or two
216 // percent characters at the beginning of the line), nor must it be
217 // a percent character at the beginning of a line, nor a percent
218 // character following a percent character at the beginning of a
219 // line.
221 switch (match - buf) {
222 case 0:
223 break;
224 case 1:
225 if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
226 return 0;
227 break;
228 case 2:
229 if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
230 return 0;
231 if (match[-1] == '%'
232 && (match[-2] == '\n' || match[-2] == '%'))
233 return 0;
234 break;
235 default:
236 if (map[uchar(match[-1])] != '\0'
237 && !(match[-2] == '%'
238 && (match[-3] == '\n'
239 || (match[-3] == '%' && match[-4] == '\n'))))
240 return 0;
241 if (match[-1] == '%'
242 && (match[-2] == '\n'
243 || (match[-2] == '%' && match[-3] == '\n')))
244 return 0;
247 const char *p = match;
248 int had_percent = 0;
249 for (;;) {
250 if (*p == '\n') {
251 if (!had_percent && p[1] == '%') {
252 if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
253 *cont = skip_field(bufend, match + matchlen);
254 return 0;
256 if (!start)
257 break;
258 had_percent = 1;
260 if (p <= buf) {
261 if (start)
262 *start = p + 1;
263 return 1;
265 for (const char *q = p - 1; *q == ' ' || *q == '\t'; q--)
267 if (*q == '\n') {
268 if (start)
269 *start = p + 1;
270 break;
272 p = q;
274 p--;
276 return 1;
279 file_buffer::file_buffer()
280 : buffer(0), bufend(0)
284 file_buffer::~file_buffer()
286 a_delete buffer;
289 const char *file_buffer::get_start() const
291 return buffer ? buffer + 4 : 0;
294 const char *file_buffer::get_end() const
296 return bufend;
299 int file_buffer::load(int fd, const char *filename)
301 struct stat sb;
302 if (fstat(fd, &sb) < 0)
303 error("can't fstat `%1': %2", filename, strerror(errno));
304 else if ((sb.st_mode & S_IFMT) != S_IFREG)
305 error("`%1' is not a regular file", filename);
306 else {
307 // We need one character extra at the beginning for an additional newline
308 // used as a sentinel. We get 4 instead so that the read buffer will be
309 // word-aligned. This seems to make the read slightly faster. We also
310 // need one character at the end also for an addional newline used as a
311 // sentinel.
312 int size = int(sb.st_size);
313 buffer = new char[size + 4 + 1];
314 int nread = read(fd, buffer + 4, size);
315 if (nread < 0)
316 error("error reading `%1': %2", filename, strerror(errno));
317 else if (nread != size)
318 error("size of `%1' decreased", filename);
319 else {
320 char c;
321 nread = read(fd, &c, 1);
322 if (nread != 0)
323 error("size of `%1' increased", filename);
324 else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
325 error("database `%1' is a binary file", filename);
326 else {
327 close(fd);
328 buffer[3] = '\n';
329 bufend = buffer + 4 + size;
330 if (bufend[-1] != '\n')
331 *bufend++ = '\n';
332 return 1;
335 a_delete buffer;
336 buffer = 0;
338 close(fd);
339 return 0;
342 linear_searcher::linear_searcher(const char *query, int query_len,
343 const char *ign, int trunc)
344 : keys(0), nkeys(0), truncate_len(trunc), ignore_fields(ign)
346 const char *query_end = query + query_len;
347 int nk = 0;
348 const char *p;
349 for (p = query; p < query_end; p++)
350 if (map[uchar(*p)] != '\0'
351 && (p[1] == '\0' || map[uchar(p[1])] == '\0'))
352 nk++;
353 if (nk == 0)
354 return;
355 keys = new bmpattern*[nk];
356 p = query;
357 for (;;) {
358 while (p < query_end && map[uchar(*p)] == '\0')
359 p++;
360 if (p == query_end)
361 break;
362 const char *start = p;
363 while (p < query_end && map[uchar(*p)] != '\0')
364 p++;
365 keys[nkeys++] = new bmpattern(start, p - start);
367 assert(nkeys <= nk);
368 if (nkeys == 0) {
369 a_delete keys;
370 keys = 0;
374 linear_searcher::~linear_searcher()
376 for (int i = 0; i < nkeys; i++)
377 delete keys[i];
378 a_delete keys;
381 int linear_searcher::search(const char *buffer, const char *bufend,
382 const char **startp, int *lengthp) const
384 assert(bufend - buffer > 0);
385 assert(buffer[-1] == '\n');
386 assert(bufend[-1] == '\n');
387 if (nkeys == 0)
388 return 0;
389 for (;;) {
390 const char *refstart;
391 const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
392 if (!found)
393 break;
394 const char *refend = find_end(bufend, found + keys[0]->length());
395 for (int i = 1; i < nkeys; i++)
396 if (!search_and_check(keys[i], refstart, refend))
397 break;
398 if (i >= nkeys) {
399 *startp = refstart;
400 *lengthp = refend - refstart;
401 return 1;
403 buffer = refend;
405 return 0;
408 class linear_search_item : public search_item {
409 file_buffer fbuf;
410 public:
411 linear_search_item(const char *filename, int fid);
412 ~linear_search_item();
413 int load(int fd);
414 search_item_iterator *make_search_item_iterator(const char *);
415 friend class linear_search_item_iterator;
418 class linear_search_item_iterator : public search_item_iterator {
419 linear_search_item *lsi;
420 int pos;
421 public:
422 linear_search_item_iterator(linear_search_item *, const char *query);
423 ~linear_search_item_iterator();
424 int next(const linear_searcher &, const char **ptr, int *lenp,
425 reference_id *ridp);
428 search_item *make_linear_search_item(int fd, const char *filename, int fid)
430 linear_search_item *item = new linear_search_item(filename, fid);
431 if (!item->load(fd)) {
432 delete item;
433 return 0;
435 else
436 return item;
439 linear_search_item::linear_search_item(const char *filename, int fid)
440 : search_item(filename, fid)
444 linear_search_item::~linear_search_item()
448 int linear_search_item::load(int fd)
450 return fbuf.load(fd, name);
453 search_item_iterator *linear_search_item::make_search_item_iterator(
454 const char *query)
456 return new linear_search_item_iterator(this, query);
459 linear_search_item_iterator::linear_search_item_iterator(
460 linear_search_item *p, const char *)
461 : lsi(p), pos(0)
465 linear_search_item_iterator::~linear_search_item_iterator()
469 int linear_search_item_iterator::next(const linear_searcher &searcher,
470 const char **startp, int *lengthp,
471 reference_id *ridp)
473 const char *bufstart = lsi->fbuf.get_start();
474 const char *bufend = lsi->fbuf.get_end();
475 const char *ptr = bufstart + pos;
476 if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
477 pos = *startp + *lengthp - bufstart;
478 if (ridp)
479 *ridp = reference_id(lsi->filename_id, *startp - bufstart);
480 return 1;
482 else
483 return 0;