Adapt src/lib-bib (src/libs/libbib)
[s-roff.git] / src / lib-bib / linear.cpp
blob6123ae0661d016618c847175081e9a2ac16ab300
1 /*@
2 * Copyright (c) 2014 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
4 * Copyright (C) 1989 - 1992, 2000, 2001
5 * Free Software Foundation, Inc.
6 * Written by James Clark (jjc@jclark.com)
8 * groff is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2, or (at your option) any later
11 * version.
13 * groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 * for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with groff; see the file COPYING. If not, write to the Free Software
20 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
23 #include "config.h"
24 #include "lib.h"
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdlib.h>
30 #include "cset.h"
31 #include "cmap.h"
32 #include "errarg.h"
33 #include "error.h"
34 #include "nonposix.h"
35 #include "posix.h"
37 #include "refid.h"
38 #include "search.h"
40 class file_buffer
42 char *buffer;
43 char *bufend;
45 public:
46 file_buffer();
47 ~file_buffer();
48 int load(int fd, const char *filename);
49 const char *get_start() const;
50 const char *get_end() const;
53 typedef unsigned char uchar;
55 static uchar map[256];
56 static uchar inv_map[256][3];
58 class map_init // FIXME static initializer ctor
60 public:
61 map_init();
64 static map_init the_map_init;
66 map_init::map_init()
68 int i;
69 for (i = 0; i < 256; i++)
70 map[i] = csalnum(i) ? cmlower(i) : '\0';
71 for (i = 0; i < 256; i++) {
72 if (cslower(i)) {
73 inv_map[i][0] = i;
74 inv_map[i][1] = cmupper(i);
75 inv_map[i][2] = '\0';
77 else if (csdigit(i)) {
78 inv_map[i][0] = i;
79 inv_map[i][1] = 0;
81 else
82 inv_map[i][0] = '\0';
86 class bmpattern
88 char *pat;
89 int len;
90 int delta[256];
92 public:
93 bmpattern(const char *pattern, int pattern_length);
94 ~bmpattern();
95 const char *search(const char *p, const char *end) const;
96 int length() const;
99 bmpattern::bmpattern(const char *pattern, int pattern_length)
100 : len(pattern_length)
102 pat = new char[len];
103 int i;
104 for (i = 0; i < len; i++)
105 pat[i] = map[uchar(pattern[i])];
106 for (i = 0; i < 256; i++)
107 delta[i] = len;
108 for (i = 0; i < len; i++)
109 for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
110 delta[*inv] = len - i - 1;
113 const char *bmpattern::search(const char *buf, const char *end) const
115 int buflen = end - buf;
116 if (len > buflen)
117 return 0;
118 const char *strend;
119 if (buflen > len*4)
120 strend = end - len*4;
121 else
122 strend = buf;
123 const char *k = buf + len - 1;
124 const int *del = delta;
125 const char *pattern = pat;
126 for (;;) {
127 while (k < strend) {
128 int t = del[uchar(*k)];
129 if (!t)
130 break;
131 k += t;
132 k += del[uchar(*k)];
133 k += del[uchar(*k)];
135 while (k < end && del[uchar(*k)] != 0)
136 k++;
137 if (k == end)
138 break;
139 int j = len - 1;
140 const char *s = k;
141 for (;;) {
142 if (j == 0)
143 return s;
144 if (map[uchar(*--s)] != uchar(pattern[--j]))
145 break;
147 k++;
149 return 0;
152 bmpattern::~bmpattern()
154 a_delete pat;
157 inline int bmpattern::length() const
159 return len;
162 static const char *find_end(const char *bufend, const char *p);
164 const char *linear_searcher::search_and_check(const bmpattern *key,
165 const char *buf, const char *bufend, const char **start) const
167 assert(buf[-1] == '\n');
168 assert(bufend[-1] == '\n');
169 const char *ptr = buf;
170 for (;;) {
171 const char *found = key->search(ptr, bufend);
172 if (!found)
173 break;
174 if (check_match(buf, bufend, found, key->length(), &ptr, start))
175 return found;
177 return 0;
180 static const char *skip_field(const char *end, const char *p)
182 for (;;)
183 if (*p++ == '\n') {
184 if (p == end || *p == '%')
185 break;
186 const char *q;
187 for (q = p; *q == ' ' || *q == '\t'; q++)
189 if (*q == '\n')
190 break;
191 p = q + 1;
193 return p;
196 static const char *find_end(const char *bufend, const char *p)
198 for (;;)
199 if (*p++ == '\n') {
200 if (p == bufend)
201 break;
202 const char *q;
203 for (q = p; *q == ' ' || *q == '\t'; q++)
205 if (*q == '\n')
206 break;
207 p = q + 1;
209 return p;
212 int linear_searcher::check_match(const char *buf, const char *bufend,
213 const char *match, int matchlen,
214 const char **cont, const char **start) const
216 *cont = match + 1;
217 // The user is required to supply only the first truncate_len characters
218 // of the key. If truncate_len <= 0, he must supply all the key.
219 if ((truncate_len <= 0 || matchlen < truncate_len)
220 && map[uchar(match[matchlen])] != '\0')
221 return 0;
223 // The character before the match must not be an alphanumeric
224 // character (unless the alphanumeric character follows one or two
225 // percent characters at the beginning of the line), nor must it be
226 // a percent character at the beginning of a line, nor a percent
227 // character following a percent character at the beginning of a
228 // line.
230 switch (match - buf) {
231 case 0:
232 break;
233 case 1:
234 if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
235 return 0;
236 break;
237 case 2:
238 if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
239 return 0;
240 if (match[-1] == '%'
241 && (match[-2] == '\n' || match[-2] == '%'))
242 return 0;
243 break;
244 default:
245 if (map[uchar(match[-1])] != '\0'
246 && !(match[-2] == '%'
247 && (match[-3] == '\n'
248 || (match[-3] == '%' && match[-4] == '\n'))))
249 return 0;
250 if (match[-1] == '%'
251 && (match[-2] == '\n'
252 || (match[-2] == '%' && match[-3] == '\n')))
253 return 0;
256 const char *p = match;
257 int had_percent = 0;
258 for (;;) {
259 if (*p == '\n') {
260 if (!had_percent && p[1] == '%') {
261 if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
262 *cont = skip_field(bufend, match + matchlen);
263 return 0;
265 if (!start)
266 break;
267 had_percent = 1;
269 if (p <= buf) {
270 if (start)
271 *start = p + 1;
272 return 1;
274 const char *q;
275 for (q = p - 1; *q == ' ' || *q == '\t'; q--)
277 if (*q == '\n') {
278 if (start)
279 *start = p + 1;
280 break;
282 p = q;
284 p--;
286 return 1;
289 file_buffer::file_buffer()
290 : buffer(0), bufend(0)
294 file_buffer::~file_buffer()
296 a_delete buffer;
299 const char *file_buffer::get_start() const
301 return buffer ? buffer + 4 : 0;
304 const char *file_buffer::get_end() const
306 return bufend;
309 int file_buffer::load(int fd, const char *filename)
311 struct stat sb;
312 if (fstat(fd, &sb) < 0)
313 error("can't fstat `%1': %2", filename, strerror(errno));
314 else if (!S_ISREG(sb.st_mode))
315 error("`%1' is not a regular file", filename);
316 else {
317 // We need one character extra at the beginning for an additional newline
318 // used as a sentinel. We get 4 instead so that the read buffer will be
319 // word-aligned. This seems to make the read slightly faster. We also
320 // need one character at the end also for an additional newline used as a
321 // sentinel.
322 int size = int(sb.st_size);
323 buffer = new char[size + 4 + 1];
324 int nread = read(fd, buffer + 4, size);
325 if (nread < 0)
326 error("error reading `%1': %2", filename, strerror(errno));
327 else if (nread != size)
328 error("size of `%1' decreased", filename);
329 else {
330 char c;
331 nread = read(fd, &c, 1);
332 if (nread != 0)
333 error("size of `%1' increased", filename);
334 else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
335 error("database `%1' is a binary file", filename);
336 else {
337 close(fd);
338 buffer[3] = '\n';
339 int sidx = 4, didx = 4;
340 for ( ; sidx < size + 4; sidx++, didx++)
342 if (buffer[sidx] == '\r')
344 if (buffer[++sidx] != '\n')
345 buffer[didx++] = '\r';
346 else
347 size--;
349 if (sidx != didx)
350 buffer[didx] = buffer[sidx];
352 bufend = buffer + 4 + size;
353 if (bufend[-1] != '\n')
354 *bufend++ = '\n';
355 return 1;
358 a_delete buffer;
359 buffer = 0;
361 close(fd);
362 return 0;
365 linear_searcher::linear_searcher(const char *query, int query_len,
366 const char *ign, int trunc)
367 : ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
369 const char *query_end = query + query_len;
370 int nk = 0;
371 const char *p;
372 for (p = query; p < query_end; p++)
373 if (map[uchar(*p)] != '\0'
374 && (p[1] == '\0' || map[uchar(p[1])] == '\0'))
375 nk++;
376 if (nk == 0)
377 return;
378 keys = new bmpattern*[nk];
379 p = query;
380 for (;;) {
381 while (p < query_end && map[uchar(*p)] == '\0')
382 p++;
383 if (p == query_end)
384 break;
385 const char *start = p;
386 while (p < query_end && map[uchar(*p)] != '\0')
387 p++;
388 keys[nkeys++] = new bmpattern(start, p - start);
390 assert(nkeys <= nk);
391 if (nkeys == 0) {
392 a_delete keys;
393 keys = 0;
397 linear_searcher::~linear_searcher()
399 for (int i = 0; i < nkeys; i++)
400 delete keys[i];
401 a_delete keys;
404 int linear_searcher::search(const char *buffer, const char *bufend,
405 const char **startp, int *lengthp) const
407 assert(bufend - buffer > 0);
408 assert(buffer[-1] == '\n');
409 assert(bufend[-1] == '\n');
410 if (nkeys == 0)
411 return 0;
412 for (;;) {
413 const char *refstart;
414 const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
415 if (!found)
416 break;
417 const char *refend = find_end(bufend, found + keys[0]->length());
418 int i;
419 for (i = 1; i < nkeys; i++)
420 if (!search_and_check(keys[i], refstart, refend))
421 break;
422 if (i >= nkeys) {
423 *startp = refstart;
424 *lengthp = refend - refstart;
425 return 1;
427 buffer = refend;
429 return 0;
432 class linear_search_item
433 : public search_item
435 file_buffer fbuf;
437 public:
438 linear_search_item(const char *filename, int fid);
439 ~linear_search_item();
440 int load(int fd);
441 search_item_iterator *make_search_item_iterator(const char *);
442 friend class linear_search_item_iterator;
445 class linear_search_item_iterator
446 : public search_item_iterator
448 linear_search_item *lsi;
449 int pos;
451 public:
452 linear_search_item_iterator(linear_search_item *, const char *query);
453 ~linear_search_item_iterator();
454 int next(const linear_searcher &, const char **ptr, int *lenp,
455 reference_id *ridp);
458 search_item *make_linear_search_item(int fd, const char *filename, int fid)
460 linear_search_item *item = new linear_search_item(filename, fid);
461 if (!item->load(fd)) {
462 delete item;
463 return 0;
465 else
466 return item;
469 linear_search_item::linear_search_item(const char *filename, int fid)
470 : search_item(filename, fid)
474 linear_search_item::~linear_search_item()
478 int linear_search_item::load(int fd)
480 return fbuf.load(fd, name);
483 search_item_iterator *linear_search_item::make_search_item_iterator(
484 const char *query)
486 return new linear_search_item_iterator(this, query);
489 linear_search_item_iterator::linear_search_item_iterator(
490 linear_search_item *p, const char *)
491 : lsi(p), pos(0)
495 linear_search_item_iterator::~linear_search_item_iterator()
499 int linear_search_item_iterator::next(const linear_searcher &searcher,
500 const char **startp, int *lengthp,
501 reference_id *ridp)
503 const char *bufstart = lsi->fbuf.get_start();
504 const char *bufend = lsi->fbuf.get_end();
505 const char *ptr = bufstart + pos;
506 if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
507 pos = *startp + *lengthp - bufstart;
508 if (ridp)
509 *ridp = reference_id(lsi->filename_id, *startp - bufstart);
510 return 1;
512 else
513 return 0;
516 // s-it2-mode