groff before CVS: release 1.06
[s-roff.git] / refer / token.cc
blob8847081bd482bee76d1addfb5032811275159f6f
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License along
18 with groff; see the file COPYING. If not, write to the Free Software
19 Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
21 #include "refer.h"
22 #include "token.h"
// Number of slots in the token hash table (1009 is prime).
#define TOKEN_TABLE_SIZE 1009

// Sort key used for the letter thorn.  The author believes that thorn
// sorts after z in Icelandic; "{" is the ASCII character that
// immediately follows 'z'.
#define THORN_SORT_KEY "{"
28 struct token_table_entry {
29 const char *tok;
30 token_info ti;
31 token_table_entry();
34 token_table_entry token_table[TOKEN_TABLE_SIZE];
35 int ntokens = 0;
// Advance *ptr over a name starting at *ptr, never moving past end.
// Three spellings are recognised by their first character:
//   (xx      an opening parenthesis plus up to two further characters;
//   [name]   everything up to and including the first `]';
//   x        any other single character.
// Does nothing when *ptr is already at end.
static void skip_name(const char **ptr, const char *end)
{
  if (*ptr >= end)
    return;
  const char *s = *ptr;
  char first = *s++;
  if (first == '(') {
    // Take as many of the two name characters as are available.
    for (int i = 0; i < 2 && s < end; i++)
      s++;
  }
  else if (first == '[') {
    // Consume through the closing bracket, or to end if there is none.
    while (s < end && *s++ != ']')
      ;
  }
  *ptr = s;
}
57 int get_token(const char **ptr, const char *end)
59 if (*ptr >= end)
60 return 0;
61 char c = *(*ptr)++;
62 if (c == '\\' && *ptr < end) {
63 switch (**ptr) {
64 default:
65 *ptr += 1;
66 break;
67 case '(':
68 case '[':
69 skip_name(ptr, end);
70 break;
71 case '*':
72 case 'f':
73 *ptr += 1;
74 skip_name(ptr, end);
75 break;
78 return 1;
81 token_info::token_info()
82 : type(TOKEN_OTHER), sort_key(0), other_case(0)
86 void token_info::set(token_type t, const char *sk, const char *oc)
88 assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
89 type = t;
90 sort_key = sk;
91 other_case = oc;
94 void token_info::sortify(const char *start, const char *end, string &result)
95 const
97 if (sort_key)
98 result += sort_key;
99 else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
100 for (; start < end; start++)
101 if (csalpha(*start))
102 result += cmlower(*start);
106 int token_info::sortify_non_empty(const char *start, const char *end) const
108 if (sort_key)
109 return *sort_key != '\0';
110 if (type != TOKEN_UPPER && type != TOKEN_LOWER)
111 return 0;
112 for (; start < end; start++)
113 if (csalpha(*start))
114 return 1;
115 return 0;
119 void token_info::lower_case(const char *start, const char *end,
120 string &result) const
122 if (type != TOKEN_UPPER) {
123 while (start < end)
124 result += *start++;
126 else if (other_case)
127 result += other_case;
128 else {
129 while (start < end)
130 result += cmlower(*start++);
134 void token_info::upper_case(const char *start, const char *end,
135 string &result) const
137 if (type != TOKEN_LOWER) {
138 while (start < end)
139 result += *start++;
141 else if (other_case)
142 result += other_case;
143 else {
144 while (start < end)
145 result += cmupper(*start++);
149 token_table_entry::token_table_entry()
150 : tok(0)
154 static void store_token(const char *tok, token_type typ,
155 const char *sk = 0, const char *oc = 0)
157 unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
158 while (n >= 0) {
159 if (token_table[n].tok == 0) {
160 if (++ntokens == TOKEN_TABLE_SIZE)
161 assert(0);
162 token_table[n].tok = tok;
163 break;
165 if (strcmp(tok, token_table[n].tok) == 0)
166 break;
167 if (--n < 0)
168 n = TOKEN_TABLE_SIZE - 1;
170 token_table[n].ti.set(typ, sk, oc);
174 token_info default_token_info;
176 const token_info *lookup_token(const char *start, const char *end)
178 unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
179 while (n >= 0) {
180 if (token_table[n].tok == 0)
181 break;
182 if (strlen(token_table[n].tok) == end - start
183 && memcmp(token_table[n].tok, start, end - start) == 0)
184 return &(token_table[n].ti);
185 if (--n < 0)
186 n = TOKEN_TABLE_SIZE - 1;
188 return &default_token_info;
191 static void init_ascii()
193 for (const char *p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
194 char buf[2];
195 buf[0] = *p;
196 buf[1] = '\0';
197 store_token(strsave(buf), TOKEN_LOWER);
198 buf[0] = cmupper(buf[0]);
199 store_token(strsave(buf), TOKEN_UPPER);
201 for (p = "0123456789"; *p; p++) {
202 char buf[2];
203 buf[0] = *p;
204 buf[1] = '\0';
205 const char *s = strsave(buf);
206 store_token(s, TOKEN_OTHER, s);
208 for (p = ".,:;?!"; *p; p++) {
209 char buf[2];
210 buf[0] = *p;
211 buf[1] = '\0';
212 store_token(strsave(buf), TOKEN_PUNCT);
214 store_token("-", TOKEN_HYPHEN);
217 static void store_letter(const char *lower, const char *upper,
218 const char *sort_key = 0)
220 store_token(lower, TOKEN_LOWER, sort_key, upper);
221 store_token(upper, TOKEN_UPPER, sort_key, lower);
224 static void init_letter(unsigned char uc_code, unsigned char lc_code,
225 const char *sort_key)
227 char lbuf[2];
228 lbuf[0] = lc_code;
229 lbuf[1] = 0;
230 char ubuf[2];
231 ubuf[0] = uc_code;
232 ubuf[1] = 0;
233 store_letter(strsave(lbuf), strsave(ubuf), sort_key);
236 static void init_latin1()
238 init_letter(0xc0, 0xe0, "a");
239 init_letter(0xc1, 0xe1, "a");
240 init_letter(0xc2, 0xe2, "a");
241 init_letter(0xc3, 0xe3, "a");
242 init_letter(0xc4, 0xe4, "a");
243 init_letter(0xc5, 0xe5, "a");
244 init_letter(0xc6, 0xe6, "ae");
245 init_letter(0xc7, 0xe7, "c");
246 init_letter(0xc8, 0xe8, "e");
247 init_letter(0xc9, 0xe9, "e");
248 init_letter(0xca, 0xea, "e");
249 init_letter(0xcb, 0xeb, "e");
250 init_letter(0xcc, 0xec, "i");
251 init_letter(0xcd, 0xed, "i");
252 init_letter(0xce, 0xee, "i");
253 init_letter(0xcf, 0xef, "i");
255 init_letter(0xd0, 0xf0, "d");
256 init_letter(0xd1, 0xf1, "n");
257 init_letter(0xd2, 0xf2, "o");
258 init_letter(0xd3, 0xf3, "o");
259 init_letter(0xd4, 0xf4, "o");
260 init_letter(0xd5, 0xf5, "o");
261 init_letter(0xd6, 0xf6, "o");
262 init_letter(0xd8, 0xf8, "o");
263 init_letter(0xd9, 0xf9, "u");
264 init_letter(0xda, 0xfa, "u");
265 init_letter(0xdb, 0xfb, "u");
266 init_letter(0xdc, 0xfc, "u");
267 init_letter(0xdd, 0xfd, "y");
268 init_letter(0xde, 0xfe, THORN_SORT_KEY);
270 store_token("\337", TOKEN_LOWER, "ss", "SS");
271 store_token("\377", TOKEN_LOWER, "y", "Y");
274 static void init_two_char_letter(char l1, char l2, char u1, char u2,
275 const char *sk = 0)
277 char buf[6];
278 buf[0] = '\\';
279 buf[1] = '(';
280 buf[2] = l1;
281 buf[3] = l2;
282 buf[4] = '\0';
283 const char *p = strsave(buf);
284 buf[2] = u1;
285 buf[3] = u2;
286 store_letter(p, strsave(buf), sk);
287 buf[1] = '[';
288 buf[4] = ']';
289 buf[5] = '\0';
290 p = strsave(buf);
291 buf[2] = l1;
292 buf[3] = l2;
293 store_letter(strsave(buf), p, sk);
297 static void init_special_chars()
299 for (const char *p = "':^`~"; *p; p++)
300 for (const char *q = "aeiouy"; *q; q++) {
301 // Use a variable to work around bug in gcc 2.0
302 char c = cmupper(*q);
303 init_two_char_letter(*p, *q, *p, c);
305 for (p = "/l/o~n,coeaeij"; *p; p += 2) {
306 // Use variables to work around bug in gcc 2.0
307 char c0 = cmupper(p[0]);
308 char c1 = cmupper(p[1]);
309 init_two_char_letter(p[0], p[1], c0, c1);
311 init_two_char_letter('v', 's', 'v', 'S', "s");
312 init_two_char_letter('v', 'z', 'v', 'Z', "z");
313 init_two_char_letter('o', 'a', 'o', 'A', "a");
314 init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
315 init_two_char_letter('-', 'd', '-', 'D');
317 store_token("\\(ss", TOKEN_LOWER, 0, "SS");
318 store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
320 store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
321 store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
322 store_token("\\(hy", TOKEN_HYPHEN);
323 store_token("\\[hy]", TOKEN_HYPHEN);
326 static void init_strings()
328 char buf[6];
329 buf[0] = '\\';
330 buf[1] = '*';
331 for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
332 buf[2] = *p;
333 buf[3] = '\0';
334 store_token(strsave(buf), TOKEN_ACCENT);
335 buf[2] = '[';
336 buf[3] = *p;
337 buf[4] = ']';
338 buf[5] = '\0';
339 store_token(strsave(buf), TOKEN_ACCENT);
342 // -ms special letters
343 store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
344 store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
345 store_letter("\\*(d-", "\\*(D-");
346 store_letter("\\*[d-]", "\\*[D-]");
347 store_letter("\\*(ae", "\\*(Ae", "ae");
348 store_letter("\\*[ae]", "\\*[Ae]", "ae");
349 store_letter("\\*(oe", "\\*(Oe", "oe");
350 store_letter("\\*[oe]", "\\*[Oe]", "oe");
352 store_token("\\*3", TOKEN_LOWER, "y", "Y");
353 store_token("\\*8", TOKEN_LOWER, "ss", "SS");
354 store_token("\\*q", TOKEN_LOWER, "o", "O");
// Helper whose constructor fills in the token table.
struct token_initer {
  token_initer();
};

// The single instance; constructing it populates token_table.  It is
// defined after token_table and default_token_info in this file, so
// both of those are constructed before its constructor runs.
static token_initer the_token_initer;
363 token_initer::token_initer()
365 init_ascii();
366 init_latin1();
367 init_special_chars();
368 init_strings();
369 default_token_info.set(TOKEN_OTHER);