2 * Copyright (c) 2014 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
4 * Copyright (C) 1989 - 1992, 2001 Free Software Foundation, Inc.
5 * Written by James Clark (jjc@jclark.com)
7 * This is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License as published by the Free
9 * Software Foundation; either version 2, or (at your option) any later
12 * This is distributed in the hope that it will be useful, but WITHOUT ANY
13 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 * You should have received a copy of the GNU General Public License along
18 * with groff; see the file COPYING. If not, write to the Free Software
19 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
23 #include "refer-config.h"
28 class token_table_entry
36 token_table_entry token_table
[TOKEN_TABLE_SIZE
];
39 static void skip_name(const char **ptr
, const char *end
)
59 int get_token(const char **ptr
, const char *end
)
64 if (c
== '\\' && *ptr
< end
) {
83 token_info::token_info()
84 : type(TOKEN_OTHER
), sort_key(0), other_case(0)
88 void token_info::set(token_type t
, const char *sk
, const char *oc
)
90 assert(oc
== 0 || t
== TOKEN_UPPER
|| t
== TOKEN_LOWER
);
96 void token_info::sortify(const char *start
, const char *end
, string
&result
)
101 else if (type
== TOKEN_UPPER
|| type
== TOKEN_LOWER
) {
102 for (; start
< end
; start
++)
104 result
+= cmlower(*start
);
108 int token_info::sortify_non_empty(const char *start
, const char *end
) const
111 return *sort_key
!= '\0';
112 if (type
!= TOKEN_UPPER
&& type
!= TOKEN_LOWER
)
114 for (; start
< end
; start
++)
120 void token_info::lower_case(const char *start
, const char *end
,
121 string
&result
) const
123 if (type
!= TOKEN_UPPER
) {
128 result
+= other_case
;
131 result
+= cmlower(*start
++);
135 void token_info::upper_case(const char *start
, const char *end
,
136 string
&result
) const
138 if (type
!= TOKEN_LOWER
) {
143 result
+= other_case
;
146 result
+= cmupper(*start
++);
150 token_table_entry::token_table_entry()
155 static void store_token(const char *tok
, token_type typ
,
156 const char *sk
= 0, const char *oc
= 0)
158 unsigned n
= hash_string(tok
, strlen(tok
)) % TOKEN_TABLE_SIZE
;
160 if (token_table
[n
].tok
== 0) {
161 if (++ntokens
== TOKEN_TABLE_SIZE
)
163 token_table
[n
].tok
= tok
;
166 if (strcmp(tok
, token_table
[n
].tok
) == 0)
169 n
= TOKEN_TABLE_SIZE
- 1;
173 token_table
[n
].ti
.set(typ
, sk
, oc
);
176 token_info default_token_info
; // FIXME
178 const token_info
*lookup_token(const char *start
, const char *end
)
180 unsigned n
= hash_string(start
, end
- start
) % TOKEN_TABLE_SIZE
;
182 if (token_table
[n
].tok
== 0)
184 if (strlen(token_table
[n
].tok
) == size_t(end
- start
)
185 && memcmp(token_table
[n
].tok
, start
, end
- start
) == 0)
186 return &(token_table
[n
].ti
);
188 n
= TOKEN_TABLE_SIZE
- 1;
192 return &default_token_info
;
195 static void init_ascii()
198 for (p
= "abcdefghijklmnopqrstuvwxyz"; *p
; p
++) {
202 store_token(strsave(buf
), TOKEN_LOWER
);
203 buf
[0] = cmupper(buf
[0]);
204 store_token(strsave(buf
), TOKEN_UPPER
);
206 for (p
= "0123456789"; *p
; p
++) {
210 const char *s
= strsave(buf
);
211 store_token(s
, TOKEN_OTHER
, s
);
213 for (p
= ".,:;?!"; *p
; p
++) {
217 store_token(strsave(buf
), TOKEN_PUNCT
);
219 store_token("-", TOKEN_HYPHEN
);
222 static void store_letter(const char *lower
, const char *upper
,
223 const char *sort_key
= 0)
225 store_token(lower
, TOKEN_LOWER
, sort_key
, upper
);
226 store_token(upper
, TOKEN_UPPER
, sort_key
, lower
);
229 static void init_letter(unsigned char uc_code
, unsigned char lc_code
,
230 const char *sort_key
)
238 store_letter(strsave(lbuf
), strsave(ubuf
), sort_key
);
241 static void init_latin1()
243 init_letter(0xc0, 0xe0, "a");
244 init_letter(0xc1, 0xe1, "a");
245 init_letter(0xc2, 0xe2, "a");
246 init_letter(0xc3, 0xe3, "a");
247 init_letter(0xc4, 0xe4, "a");
248 init_letter(0xc5, 0xe5, "a");
249 init_letter(0xc6, 0xe6, "ae");
250 init_letter(0xc7, 0xe7, "c");
251 init_letter(0xc8, 0xe8, "e");
252 init_letter(0xc9, 0xe9, "e");
253 init_letter(0xca, 0xea, "e");
254 init_letter(0xcb, 0xeb, "e");
255 init_letter(0xcc, 0xec, "i");
256 init_letter(0xcd, 0xed, "i");
257 init_letter(0xce, 0xee, "i");
258 init_letter(0xcf, 0xef, "i");
260 init_letter(0xd0, 0xf0, "d");
261 init_letter(0xd1, 0xf1, "n");
262 init_letter(0xd2, 0xf2, "o");
263 init_letter(0xd3, 0xf3, "o");
264 init_letter(0xd4, 0xf4, "o");
265 init_letter(0xd5, 0xf5, "o");
266 init_letter(0xd6, 0xf6, "o");
267 init_letter(0xd8, 0xf8, "o");
268 init_letter(0xd9, 0xf9, "u");
269 init_letter(0xda, 0xfa, "u");
270 init_letter(0xdb, 0xfb, "u");
271 init_letter(0xdc, 0xfc, "u");
272 init_letter(0xdd, 0xfd, "y");
273 init_letter(0xde, 0xfe, THORN_SORT_KEY
);
275 store_token("\337", TOKEN_LOWER
, "ss", "SS");
276 store_token("\377", TOKEN_LOWER
, "y", "Y");
279 static void init_two_char_letter(char l1
, char l2
, char u1
, char u2
,
288 const char *p
= strsave(buf
);
291 store_letter(p
, strsave(buf
), sk
);
298 store_letter(strsave(buf
), p
, sk
);
301 static void init_special_chars()
304 for (p
= "':^`~"; *p
; p
++)
305 for (const char *q
= "aeiouy"; *q
; q
++) {
306 // Use a variable to work around bug in gcc 2.0
307 char c
= cmupper(*q
);
308 init_two_char_letter(*p
, *q
, *p
, c
);
310 for (p
= "/l/o~n,coeaeij"; *p
; p
+= 2) {
311 // Use variables to work around bug in gcc 2.0
312 char c0
= cmupper(p
[0]);
313 char c1
= cmupper(p
[1]);
314 init_two_char_letter(p
[0], p
[1], c0
, c1
);
316 init_two_char_letter('v', 's', 'v', 'S', "s");
317 init_two_char_letter('v', 'z', 'v', 'Z', "z");
318 init_two_char_letter('o', 'a', 'o', 'A', "a");
319 init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY
);
320 init_two_char_letter('-', 'd', '-', 'D');
322 store_token("\\(ss", TOKEN_LOWER
, 0, "SS");
323 store_token("\\[ss]", TOKEN_LOWER
, 0, "SS");
325 store_token("\\(Sd", TOKEN_LOWER
, "d", "\\(-D");
326 store_token("\\[Sd]", TOKEN_LOWER
, "d", "\\[-D]");
327 store_token("\\(hy", TOKEN_HYPHEN
);
328 store_token("\\[hy]", TOKEN_HYPHEN
);
329 store_token("\\(en", TOKEN_RANGE_SEP
);
330 store_token("\\[en]", TOKEN_RANGE_SEP
);
333 static void init_strings()
338 for (const char *p
= "'`^^,:~v_o./;"; *p
; p
++) {
341 store_token(strsave(buf
), TOKEN_ACCENT
);
346 store_token(strsave(buf
), TOKEN_ACCENT
);
349 // -ms special letters
350 store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY
);
351 store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY
);
352 store_letter("\\*(d-", "\\*(D-");
353 store_letter("\\*[d-]", "\\*[D-]");
354 store_letter("\\*(ae", "\\*(Ae", "ae");
355 store_letter("\\*[ae]", "\\*[Ae]", "ae");
356 store_letter("\\*(oe", "\\*(Oe", "oe");
357 store_letter("\\*[oe]", "\\*[Oe]", "oe");
359 store_token("\\*3", TOKEN_LOWER
, "y", "Y");
360 store_token("\\*8", TOKEN_LOWER
, "ss", "SS");
361 store_token("\\*q", TOKEN_LOWER
, "o", "O");
364 struct token_initer
{
368 static token_initer the_token_initer
; // FIXME static ctor init
370 token_initer::token_initer()
374 init_special_chars();
376 default_token_info
.set(TOKEN_OTHER
);