* All affected files: Update postal address of FSF.
[s-roff.git] / src / libs / libbib / linear.cpp
blobc1919ab7d63a835e7fac4288965514f542718909
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
6 This file is part of groff.
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING. If not, write to the Free Software
20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "lib.h"
24 #include <stdlib.h>
25 #include <assert.h>
26 #include <errno.h>
28 #include "posix.h"
29 #include "errarg.h"
30 #include "error.h"
31 #include "cset.h"
32 #include "cmap.h"
33 #include "nonposix.h"
35 #include "refid.h"
36 #include "search.h"
38 class file_buffer {
39 char *buffer;
40 char *bufend;
41 public:
42 file_buffer();
43 ~file_buffer();
44 int load(int fd, const char *filename);
45 const char *get_start() const;
46 const char *get_end() const;
49 typedef unsigned char uchar;
51 static uchar map[256];
52 static uchar inv_map[256][3];
54 struct map_init {
55 map_init();
58 static map_init the_map_init;
60 map_init::map_init()
62 int i;
63 for (i = 0; i < 256; i++)
64 map[i] = csalnum(i) ? cmlower(i) : '\0';
65 for (i = 0; i < 256; i++) {
66 if (cslower(i)) {
67 inv_map[i][0] = i;
68 inv_map[i][1] = cmupper(i);
69 inv_map[i][2] = '\0';
71 else if (csdigit(i)) {
72 inv_map[i][0] = i;
73 inv_map[i][1] = 0;
75 else
76 inv_map[i][0] = '\0';
81 class bmpattern {
82 char *pat;
83 int len;
84 int delta[256];
85 public:
86 bmpattern(const char *pattern, int pattern_length);
87 ~bmpattern();
88 const char *search(const char *p, const char *end) const;
89 int length() const;
92 bmpattern::bmpattern(const char *pattern, int pattern_length)
93 : len(pattern_length)
95 pat = new char[len];
96 int i;
97 for (i = 0; i < len; i++)
98 pat[i] = map[uchar(pattern[i])];
99 for (i = 0; i < 256; i++)
100 delta[i] = len;
101 for (i = 0; i < len; i++)
102 for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
103 delta[*inv] = len - i - 1;
106 const char *bmpattern::search(const char *buf, const char *end) const
108 int buflen = end - buf;
109 if (len > buflen)
110 return 0;
111 const char *strend;
112 if (buflen > len*4)
113 strend = end - len*4;
114 else
115 strend = buf;
116 const char *k = buf + len - 1;
117 const int *del = delta;
118 const char *pattern = pat;
119 for (;;) {
120 while (k < strend) {
121 int t = del[uchar(*k)];
122 if (!t)
123 break;
124 k += t;
125 k += del[uchar(*k)];
126 k += del[uchar(*k)];
128 while (k < end && del[uchar(*k)] != 0)
129 k++;
130 if (k == end)
131 break;
132 int j = len - 1;
133 const char *s = k;
134 for (;;) {
135 if (j == 0)
136 return s;
137 if (map[uchar(*--s)] != uchar(pattern[--j]))
138 break;
140 k++;
142 return 0;
145 bmpattern::~bmpattern()
147 a_delete pat;
150 inline int bmpattern::length() const
152 return len;
156 static const char *find_end(const char *bufend, const char *p);
158 const char *linear_searcher::search_and_check(const bmpattern *key,
159 const char *buf, const char *bufend, const char **start) const
161 assert(buf[-1] == '\n');
162 assert(bufend[-1] == '\n');
163 const char *ptr = buf;
164 for (;;) {
165 const char *found = key->search(ptr, bufend);
166 if (!found)
167 break;
168 if (check_match(buf, bufend, found, key->length(), &ptr, start))
169 return found;
171 return 0;
174 static const char *skip_field(const char *end, const char *p)
176 for (;;)
177 if (*p++ == '\n') {
178 if (p == end || *p == '%')
179 break;
180 const char *q;
181 for (q = p; *q == ' ' || *q == '\t'; q++)
183 if (*q == '\n')
184 break;
185 p = q + 1;
187 return p;
190 static const char *find_end(const char *bufend, const char *p)
192 for (;;)
193 if (*p++ == '\n') {
194 if (p == bufend)
195 break;
196 const char *q;
197 for (q = p; *q == ' ' || *q == '\t'; q++)
199 if (*q == '\n')
200 break;
201 p = q + 1;
203 return p;
207 int linear_searcher::check_match(const char *buf, const char *bufend,
208 const char *match, int matchlen,
209 const char **cont, const char **start) const
211 *cont = match + 1;
212 // The user is required to supply only the first truncate_len characters
213 // of the key. If truncate_len <= 0, he must supply all the key.
214 if ((truncate_len <= 0 || matchlen < truncate_len)
215 && map[uchar(match[matchlen])] != '\0')
216 return 0;
218 // The character before the match must not be an alphanumeric
219 // character (unless the alphanumeric character follows one or two
220 // percent characters at the beginning of the line), nor must it be
221 // a percent character at the beginning of a line, nor a percent
222 // character following a percent character at the beginning of a
223 // line.
225 switch (match - buf) {
226 case 0:
227 break;
228 case 1:
229 if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
230 return 0;
231 break;
232 case 2:
233 if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
234 return 0;
235 if (match[-1] == '%'
236 && (match[-2] == '\n' || match[-2] == '%'))
237 return 0;
238 break;
239 default:
240 if (map[uchar(match[-1])] != '\0'
241 && !(match[-2] == '%'
242 && (match[-3] == '\n'
243 || (match[-3] == '%' && match[-4] == '\n'))))
244 return 0;
245 if (match[-1] == '%'
246 && (match[-2] == '\n'
247 || (match[-2] == '%' && match[-3] == '\n')))
248 return 0;
251 const char *p = match;
252 int had_percent = 0;
253 for (;;) {
254 if (*p == '\n') {
255 if (!had_percent && p[1] == '%') {
256 if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
257 *cont = skip_field(bufend, match + matchlen);
258 return 0;
260 if (!start)
261 break;
262 had_percent = 1;
264 if (p <= buf) {
265 if (start)
266 *start = p + 1;
267 return 1;
269 const char *q;
270 for (q = p - 1; *q == ' ' || *q == '\t'; q--)
272 if (*q == '\n') {
273 if (start)
274 *start = p + 1;
275 break;
277 p = q;
279 p--;
281 return 1;
284 file_buffer::file_buffer()
285 : buffer(0), bufend(0)
289 file_buffer::~file_buffer()
291 a_delete buffer;
294 const char *file_buffer::get_start() const
296 return buffer ? buffer + 4 : 0;
299 const char *file_buffer::get_end() const
301 return bufend;
304 int file_buffer::load(int fd, const char *filename)
306 struct stat sb;
307 if (fstat(fd, &sb) < 0)
308 error("can't fstat `%1': %2", filename, strerror(errno));
309 else if (!S_ISREG(sb.st_mode))
310 error("`%1' is not a regular file", filename);
311 else {
312 // We need one character extra at the beginning for an additional newline
313 // used as a sentinel. We get 4 instead so that the read buffer will be
314 // word-aligned. This seems to make the read slightly faster. We also
315 // need one character at the end also for an additional newline used as a
316 // sentinel.
317 int size = int(sb.st_size);
318 buffer = new char[size + 4 + 1];
319 int nread = read(fd, buffer + 4, size);
320 if (nread < 0)
321 error("error reading `%1': %2", filename, strerror(errno));
322 else if (nread != size)
323 error("size of `%1' decreased", filename);
324 else {
325 char c;
326 nread = read(fd, &c, 1);
327 if (nread != 0)
328 error("size of `%1' increased", filename);
329 else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
330 error("database `%1' is a binary file", filename);
331 else {
332 close(fd);
333 buffer[3] = '\n';
334 int sidx = 4, didx = 4;
335 for ( ; sidx < size + 4; sidx++, didx++)
337 if (buffer[sidx] == '\r')
339 if (buffer[++sidx] != '\n')
340 buffer[didx++] = '\r';
341 else
342 size--;
344 if (sidx != didx)
345 buffer[didx] = buffer[sidx];
347 bufend = buffer + 4 + size;
348 if (bufend[-1] != '\n')
349 *bufend++ = '\n';
350 return 1;
353 a_delete buffer;
354 buffer = 0;
356 close(fd);
357 return 0;
360 linear_searcher::linear_searcher(const char *query, int query_len,
361 const char *ign, int trunc)
362 : ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
364 const char *query_end = query + query_len;
365 int nk = 0;
366 const char *p;
367 for (p = query; p < query_end; p++)
368 if (map[uchar(*p)] != '\0'
369 && (p[1] == '\0' || map[uchar(p[1])] == '\0'))
370 nk++;
371 if (nk == 0)
372 return;
373 keys = new bmpattern*[nk];
374 p = query;
375 for (;;) {
376 while (p < query_end && map[uchar(*p)] == '\0')
377 p++;
378 if (p == query_end)
379 break;
380 const char *start = p;
381 while (p < query_end && map[uchar(*p)] != '\0')
382 p++;
383 keys[nkeys++] = new bmpattern(start, p - start);
385 assert(nkeys <= nk);
386 if (nkeys == 0) {
387 a_delete keys;
388 keys = 0;
392 linear_searcher::~linear_searcher()
394 for (int i = 0; i < nkeys; i++)
395 delete keys[i];
396 a_delete keys;
399 int linear_searcher::search(const char *buffer, const char *bufend,
400 const char **startp, int *lengthp) const
402 assert(bufend - buffer > 0);
403 assert(buffer[-1] == '\n');
404 assert(bufend[-1] == '\n');
405 if (nkeys == 0)
406 return 0;
407 for (;;) {
408 const char *refstart;
409 const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
410 if (!found)
411 break;
412 const char *refend = find_end(bufend, found + keys[0]->length());
413 int i;
414 for (i = 1; i < nkeys; i++)
415 if (!search_and_check(keys[i], refstart, refend))
416 break;
417 if (i >= nkeys) {
418 *startp = refstart;
419 *lengthp = refend - refstart;
420 return 1;
422 buffer = refend;
424 return 0;
427 class linear_search_item : public search_item {
428 file_buffer fbuf;
429 public:
430 linear_search_item(const char *filename, int fid);
431 ~linear_search_item();
432 int load(int fd);
433 search_item_iterator *make_search_item_iterator(const char *);
434 friend class linear_search_item_iterator;
437 class linear_search_item_iterator : public search_item_iterator {
438 linear_search_item *lsi;
439 int pos;
440 public:
441 linear_search_item_iterator(linear_search_item *, const char *query);
442 ~linear_search_item_iterator();
443 int next(const linear_searcher &, const char **ptr, int *lenp,
444 reference_id *ridp);
447 search_item *make_linear_search_item(int fd, const char *filename, int fid)
449 linear_search_item *item = new linear_search_item(filename, fid);
450 if (!item->load(fd)) {
451 delete item;
452 return 0;
454 else
455 return item;
458 linear_search_item::linear_search_item(const char *filename, int fid)
459 : search_item(filename, fid)
463 linear_search_item::~linear_search_item()
467 int linear_search_item::load(int fd)
469 return fbuf.load(fd, name);
472 search_item_iterator *linear_search_item::make_search_item_iterator(
473 const char *query)
475 return new linear_search_item_iterator(this, query);
478 linear_search_item_iterator::linear_search_item_iterator(
479 linear_search_item *p, const char *)
480 : lsi(p), pos(0)
484 linear_search_item_iterator::~linear_search_item_iterator()
488 int linear_search_item_iterator::next(const linear_searcher &searcher,
489 const char **startp, int *lengthp,
490 reference_id *ridp)
492 const char *bufstart = lsi->fbuf.get_start();
493 const char *bufend = lsi->fbuf.get_end();
494 const char *ptr = bufstart + pos;
495 if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
496 pos = *startp + *lengthp - bufstart;
497 if (ridp)
498 *ridp = reference_id(lsi->filename_id, *startp - bufstart);
499 return 1;
501 else
502 return 0;