Sync-to-go: update copyright for 2015
[s-roff.git] / src / pre-refer / token.cpp
blob821136de99fc7f1f47ab398041b76f977bde0439
/*@
 * Copyright (c) 2014 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
 *
 * Copyright (C) 1989 - 1992, 2001 Free Software Foundation, Inc.
 * Written by James Clark (jjc@jclark.com)
 *
 * This is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2, or (at your option) any later
 * version.
 *
 * This is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with groff; see the file COPYING.  If not, write to the Free Software
 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
 */
22 #include "config.h"
23 #include "refer-config.h"
25 #include "refer.h"
26 #include "token.h"
28 class token_table_entry
30 public:
31 const char *tok;
32 token_info ti;
33 token_table_entry();
36 token_table_entry token_table[TOKEN_TABLE_SIZE];
37 int ntokens = 0;
// Advance *ptr past a name that begins at *ptr, never moving beyond `end'.
//
// The first character is always consumed and selects the form:
//   '('   - up to two further characters are consumed;
//   '['   - characters are consumed up to and including the next ']'
//           (or up to `end' when there is none);
//   other - nothing further is consumed.
// Does nothing when *ptr is already at or past `end'.
static void skip_name(const char **ptr, const char *end)
{
  if (*ptr < end) {
    switch (*(*ptr)++) {
    case '(':
      // Two-character name; each step is bounds-checked separately so a
      // truncated name stops exactly at `end'.
      if (*ptr < end) {
        *ptr += 1;
        if (*ptr < end)
          *ptr += 1;
      }
      break;
    case '[':
      while (*ptr < end)
        if (*(*ptr)++ == ']')
          break;
      break;
    }
  }
}
59 int get_token(const char **ptr, const char *end)
61 if (*ptr >= end)
62 return 0;
63 char c = *(*ptr)++;
64 if (c == '\\' && *ptr < end) {
65 switch (**ptr) {
66 default:
67 *ptr += 1;
68 break;
69 case '(':
70 case '[':
71 skip_name(ptr, end);
72 break;
73 case '*':
74 case 'f':
75 *ptr += 1;
76 skip_name(ptr, end);
77 break;
80 return 1;
83 token_info::token_info()
84 : type(TOKEN_OTHER), sort_key(0), other_case(0)
88 void token_info::set(token_type t, const char *sk, const char *oc)
90 assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
91 type = t;
92 sort_key = sk;
93 other_case = oc;
96 void token_info::sortify(const char *start, const char *end, string &result)
97 const
99 if (sort_key)
100 result += sort_key;
101 else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
102 for (; start < end; start++)
103 if (csalpha(*start))
104 result += cmlower(*start);
108 int token_info::sortify_non_empty(const char *start, const char *end) const
110 if (sort_key)
111 return *sort_key != '\0';
112 if (type != TOKEN_UPPER && type != TOKEN_LOWER)
113 return 0;
114 for (; start < end; start++)
115 if (csalpha(*start))
116 return 1;
117 return 0;
120 void token_info::lower_case(const char *start, const char *end,
121 string &result) const
123 if (type != TOKEN_UPPER) {
124 while (start < end)
125 result += *start++;
127 else if (other_case)
128 result += other_case;
129 else {
130 while (start < end)
131 result += cmlower(*start++);
135 void token_info::upper_case(const char *start, const char *end,
136 string &result) const
138 if (type != TOKEN_LOWER) {
139 while (start < end)
140 result += *start++;
142 else if (other_case)
143 result += other_case;
144 else {
145 while (start < end)
146 result += cmupper(*start++);
150 token_table_entry::token_table_entry()
151 : tok(0)
155 static void store_token(const char *tok, token_type typ,
156 const char *sk = 0, const char *oc = 0)
158 unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
159 for (;;) {
160 if (token_table[n].tok == 0) {
161 if (++ntokens == TOKEN_TABLE_SIZE)
162 assert(0);
163 token_table[n].tok = tok;
164 break;
166 if (strcmp(tok, token_table[n].tok) == 0)
167 break;
168 if (n == 0)
169 n = TOKEN_TABLE_SIZE - 1;
170 else
171 --n;
173 token_table[n].ti.set(typ, sk, oc);
176 token_info default_token_info; // FIXME
178 const token_info *lookup_token(const char *start, const char *end)
180 unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
181 for (;;) {
182 if (token_table[n].tok == 0)
183 break;
184 if (strlen(token_table[n].tok) == size_t(end - start)
185 && memcmp(token_table[n].tok, start, end - start) == 0)
186 return &(token_table[n].ti);
187 if (n == 0)
188 n = TOKEN_TABLE_SIZE - 1;
189 else
190 --n;
192 return &default_token_info;
195 static void init_ascii()
197 const char *p;
198 for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
199 char buf[2];
200 buf[0] = *p;
201 buf[1] = '\0';
202 store_token(strsave(buf), TOKEN_LOWER);
203 buf[0] = cmupper(buf[0]);
204 store_token(strsave(buf), TOKEN_UPPER);
206 for (p = "0123456789"; *p; p++) {
207 char buf[2];
208 buf[0] = *p;
209 buf[1] = '\0';
210 const char *s = strsave(buf);
211 store_token(s, TOKEN_OTHER, s);
213 for (p = ".,:;?!"; *p; p++) {
214 char buf[2];
215 buf[0] = *p;
216 buf[1] = '\0';
217 store_token(strsave(buf), TOKEN_PUNCT);
219 store_token("-", TOKEN_HYPHEN);
222 static void store_letter(const char *lower, const char *upper,
223 const char *sort_key = 0)
225 store_token(lower, TOKEN_LOWER, sort_key, upper);
226 store_token(upper, TOKEN_UPPER, sort_key, lower);
229 static void init_letter(unsigned char uc_code, unsigned char lc_code,
230 const char *sort_key)
232 char lbuf[2];
233 lbuf[0] = lc_code;
234 lbuf[1] = 0;
235 char ubuf[2];
236 ubuf[0] = uc_code;
237 ubuf[1] = 0;
238 store_letter(strsave(lbuf), strsave(ubuf), sort_key);
241 static void init_latin1()
243 init_letter(0xc0, 0xe0, "a");
244 init_letter(0xc1, 0xe1, "a");
245 init_letter(0xc2, 0xe2, "a");
246 init_letter(0xc3, 0xe3, "a");
247 init_letter(0xc4, 0xe4, "a");
248 init_letter(0xc5, 0xe5, "a");
249 init_letter(0xc6, 0xe6, "ae");
250 init_letter(0xc7, 0xe7, "c");
251 init_letter(0xc8, 0xe8, "e");
252 init_letter(0xc9, 0xe9, "e");
253 init_letter(0xca, 0xea, "e");
254 init_letter(0xcb, 0xeb, "e");
255 init_letter(0xcc, 0xec, "i");
256 init_letter(0xcd, 0xed, "i");
257 init_letter(0xce, 0xee, "i");
258 init_letter(0xcf, 0xef, "i");
260 init_letter(0xd0, 0xf0, "d");
261 init_letter(0xd1, 0xf1, "n");
262 init_letter(0xd2, 0xf2, "o");
263 init_letter(0xd3, 0xf3, "o");
264 init_letter(0xd4, 0xf4, "o");
265 init_letter(0xd5, 0xf5, "o");
266 init_letter(0xd6, 0xf6, "o");
267 init_letter(0xd8, 0xf8, "o");
268 init_letter(0xd9, 0xf9, "u");
269 init_letter(0xda, 0xfa, "u");
270 init_letter(0xdb, 0xfb, "u");
271 init_letter(0xdc, 0xfc, "u");
272 init_letter(0xdd, 0xfd, "y");
273 init_letter(0xde, 0xfe, THORN_SORT_KEY);
275 store_token("\337", TOKEN_LOWER, "ss", "SS");
276 store_token("\377", TOKEN_LOWER, "y", "Y");
279 static void init_two_char_letter(char l1, char l2, char u1, char u2,
280 const char *sk = 0)
282 char buf[6];
283 buf[0] = '\\';
284 buf[1] = '(';
285 buf[2] = l1;
286 buf[3] = l2;
287 buf[4] = '\0';
288 const char *p = strsave(buf);
289 buf[2] = u1;
290 buf[3] = u2;
291 store_letter(p, strsave(buf), sk);
292 buf[1] = '[';
293 buf[4] = ']';
294 buf[5] = '\0';
295 p = strsave(buf);
296 buf[2] = l1;
297 buf[3] = l2;
298 store_letter(strsave(buf), p, sk);
301 static void init_special_chars()
303 const char *p;
304 for (p = "':^`~"; *p; p++)
305 for (const char *q = "aeiouy"; *q; q++) {
306 // Use a variable to work around bug in gcc 2.0
307 char c = cmupper(*q);
308 init_two_char_letter(*p, *q, *p, c);
310 for (p = "/l/o~n,coeaeij"; *p; p += 2) {
311 // Use variables to work around bug in gcc 2.0
312 char c0 = cmupper(p[0]);
313 char c1 = cmupper(p[1]);
314 init_two_char_letter(p[0], p[1], c0, c1);
316 init_two_char_letter('v', 's', 'v', 'S', "s");
317 init_two_char_letter('v', 'z', 'v', 'Z', "z");
318 init_two_char_letter('o', 'a', 'o', 'A', "a");
319 init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
320 init_two_char_letter('-', 'd', '-', 'D');
322 store_token("\\(ss", TOKEN_LOWER, 0, "SS");
323 store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
325 store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
326 store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
327 store_token("\\(hy", TOKEN_HYPHEN);
328 store_token("\\[hy]", TOKEN_HYPHEN);
329 store_token("\\(en", TOKEN_RANGE_SEP);
330 store_token("\\[en]", TOKEN_RANGE_SEP);
333 static void init_strings()
335 char buf[6];
336 buf[0] = '\\';
337 buf[1] = '*';
338 for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
339 buf[2] = *p;
340 buf[3] = '\0';
341 store_token(strsave(buf), TOKEN_ACCENT);
342 buf[2] = '[';
343 buf[3] = *p;
344 buf[4] = ']';
345 buf[5] = '\0';
346 store_token(strsave(buf), TOKEN_ACCENT);
349 // -ms special letters
350 store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
351 store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
352 store_letter("\\*(d-", "\\*(D-");
353 store_letter("\\*[d-]", "\\*[D-]");
354 store_letter("\\*(ae", "\\*(Ae", "ae");
355 store_letter("\\*[ae]", "\\*[Ae]", "ae");
356 store_letter("\\*(oe", "\\*(Oe", "oe");
357 store_letter("\\*[oe]", "\\*[Oe]", "oe");
359 store_token("\\*3", TOKEN_LOWER, "y", "Y");
360 store_token("\\*8", TOKEN_LOWER, "ss", "SS");
361 store_token("\\*q", TOKEN_LOWER, "o", "O");
// Helper type whose constructor fills the token table.
struct token_initer {
  token_initer();
};
368 static token_initer the_token_initer; // FIXME static ctor init
370 token_initer::token_initer()
372 init_ascii();
373 init_latin1();
374 init_special_chars();
375 init_strings();
376 default_token_info.set(TOKEN_OTHER);
// s-it2-mode