Sync-to-go: update copyright for 2015
[s-roff.git] / src / pre-refer / label.y
blob67ecd4621395b252995eab5ed2823b57cf26c165
/*@
 * Copyright (c) 2014 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
 *
 * Copyright (C) 1989 - 1992, 2000, 2004, 2007
 *    Free Software Foundation, Inc.
 *      Written by James Clark (jjc@jclark.com)
 *
 * This is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2, or (at your option) any later
 * version.
 *
 * This is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with groff; see the file COPYING.  If not, write to the Free Software
 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
 */

%{
24 #include "config.h"
25 #include "refer-config.h"
27 #include "refid.h"
29 #include "refer.h"
30 #include "ref.h"
31 #include "token.h"
33 int yylex();
34 void yyerror(const char *);
35 int yyparse();
37 static const char *format_serial(char c, int n);
39 class label_info
41 public:
42 int start;
43 int length;
44 int count;
45 int total;
46 label_info(const string &);
49 label_info *lookup_label(const string &label);
51 class expression
53 public:
54 enum {
55 // Does the tentative label depend on the reference?
56 CONTAINS_VARIABLE = 01,
57 CONTAINS_STAR = 02,
58 CONTAINS_FORMAT = 04,
59 CONTAINS_AT = 010
62 virtual ~expression() { }
63 virtual void evaluate(int, const reference &, string &,
64 substring_position &) = 0;
65 virtual unsigned analyze() { return 0; }
68 class at_expr
69 : public expression
71 public:
72 at_expr() { }
73 void evaluate(int, const reference &, string &, substring_position &);
74 unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
77 class format_expr
78 : public expression
80 char type;
81 int width;
82 int first_number;
84 public:
85 format_expr(char c, int w = 0, int f = 1)
86 : type(c), width(w), first_number(f) { }
87 void evaluate(int, const reference &, string &, substring_position &);
88 unsigned analyze() { return CONTAINS_FORMAT; }
91 class field_expr
92 : public expression
94 int number;
95 char name;
97 public:
98 field_expr(char nm, int num) : number(num), name(nm) { }
99 void evaluate(int, const reference &, string &, substring_position &);
100 unsigned analyze() { return CONTAINS_VARIABLE; }
103 class literal_expr
104 : public expression
106 string s;
108 public:
109 literal_expr(const char *ptr, int len) : s(ptr, len) { }
110 void evaluate(int, const reference &, string &, substring_position &);
113 class unary_expr
114 : public expression
116 protected:
117 expression *expr;
119 public:
120 unary_expr(expression *e) : expr(e) { }
121 ~unary_expr() { delete expr; }
122 void evaluate(int, const reference &, string &, substring_position &) = 0;
123 unsigned analyze() { return expr ? expr->analyze() : 0; }
126 // This caches the analysis of an expression.
128 class analyzed_expr
129 : public unary_expr
131 unsigned flags;
133 public:
134 analyzed_expr(expression *);
135 void evaluate(int, const reference &, string &, substring_position &);
136 unsigned analyze() { return flags; }
139 class star_expr
140 : public unary_expr
142 public:
143 star_expr(expression *e) : unary_expr(e) { }
144 void evaluate(int, const reference &, string &, substring_position &);
145 unsigned analyze() {
146 return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
147 | CONTAINS_STAR);
151 typedef void map_func(const char *, const char *, string &);
153 class map_expr
154 : public unary_expr
156 map_func *func;
158 public:
159 map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
160 void evaluate(int, const reference &, string &, substring_position &);
163 typedef const char *extractor_func(const char *, const char *, const char **);
165 class extractor_expr
166 : public unary_expr
168 int part;
169 extractor_func *func;
171 public:
172 enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
173 extractor_expr(expression *e, extractor_func *f, int pt)
174 : unary_expr(e), part(pt), func(f) { }
175 void evaluate(int, const reference &, string &, substring_position &);
178 class truncate_expr
179 : public unary_expr
181 int n;
183 public:
184 truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
185 void evaluate(int, const reference &, string &, substring_position &);
188 class separator_expr
189 : public unary_expr
191 public:
192 separator_expr(expression *e) : unary_expr(e) { }
193 void evaluate(int, const reference &, string &, substring_position &);
196 class binary_expr
197 : public expression
199 protected:
200 expression *expr1;
201 expression *expr2;
203 public:
204 binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
205 ~binary_expr() { delete expr1; delete expr2; }
206 void evaluate(int, const reference &, string &, substring_position &) = 0;
207 unsigned analyze() {
208 return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
212 class alternative_expr
213 : public binary_expr
215 public:
216 alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
217 void evaluate(int, const reference &, string &, substring_position &);
220 class list_expr
221 : public binary_expr
223 public:
224 list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
225 void evaluate(int, const reference &, string &, substring_position &);
228 class substitute_expr
229 : public binary_expr
231 public:
232 substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
233 void evaluate(int, const reference &, string &, substring_position &);
236 class ternary_expr
237 : public expression
239 protected:
240 expression *expr1;
241 expression *expr2;
242 expression *expr3;
244 public:
245 ternary_expr(expression *e1, expression *e2, expression *e3)
246 : expr1(e1), expr2(e2), expr3(e3) { }
247 ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
248 void evaluate(int, const reference &, string &, substring_position &) = 0;
249 unsigned analyze() {
250 return ((expr1 ? expr1->analyze() : 0)
251 | (expr2 ? expr2->analyze() : 0)
252 | (expr3 ? expr3->analyze() : 0));
256 class conditional_expr
257 : public ternary_expr
259 public:
260 conditional_expr(expression *e1, expression *e2, expression *e3)
261 : ternary_expr(e1, e2, e3) { }
262 void evaluate(int, const reference &, string &, substring_position &);
// Currently installed expressions, one per kind of label specification.
// Each is replaced by the matching set_*_label_spec() function.
static expression *parsed_label = 0;
static expression *parsed_date_label = 0;
static expression *parsed_short_label = 0;

// Set by the top-level grammar rule after a successful parse.
static expression *parse_result;

// Text of all quoted literals seen by yylex() during the current parse;
// TOKEN_LITERAL values are (start, len) ranges into this buffer.
string literals;

%}
/* Semantic value types shared by the lexer and the grammar actions. */
%union {
  int num;                                /* letter code or numeric value */
  expression *expr;                       /* sub-tree built so far */
  struct { int ndigits; int val; } dig;   /* digit count and value */
  struct { int start; int len; } str;     /* range inside `literals' */
}

/* uppercase or lowercase letter */
%token <num> TOKEN_LETTER
/* literal characters */
%token <str> TOKEN_LITERAL
/* digit */
%token <num> TOKEN_DIGIT

%type <expr> conditional
%type <expr> alternative
%type <expr> list
%type <expr> string
%type <expr> substitute
%type <expr> optional_conditional
%type <num> number
%type <dig> digits
%type <num> optional_number
%type <num> flag

%%
/* Start symbol.  An empty specification yields a null parse_result;
   anything else is wrapped so that its analysis is cached. */
expr:
        optional_conditional
                { parse_result = ($1 ? new analyzed_expr($1) : 0); }
        ;
/* a ? b : c -- the middle part may be empty. */
conditional:
        alternative
                { $$ = $1; }
        | alternative '?' optional_conditional ':' conditional
                { $$ = new conditional_expr($1, $3, $5); }
        ;
/* A conditional, or nothing at all (represented by a null pointer). */
optional_conditional:
        /* empty */
                { $$ = 0; }
        | conditional
                { $$ = $1; }
        ;
/* a | b uses b when a is empty; a & b is a conditional without an
   else part. */
alternative:
        list
                { $$ = $1; }
        | alternative '|' list
                { $$ = new alternative_expr($1, $3); }
        | alternative '&' list
                { $$ = new conditional_expr($1, $3, 0); }
        ;
/* Juxtaposition concatenates. */
list:
        substitute
                { $$ = $1; }
        | list substitute
                { $$ = new list_expr($1, $2); }
        ;
/* a ~ b -- see substitute_expr. */
substitute:
        string
                { $$ = $1; }
        | substitute '~' string
                { $$ = new substitute_expr($1, $3); }
        ;
/* Primary expressions and their postfix operators.
   NOTE(review): the '@' symbol of the first alternative is restored here;
   without it the alternative is empty, `string' becomes nullable and the
   at_expr action could never be tied to a token.  Confirm against the
   upstream grammar. */
string:
        '@'
                { $$ = new at_expr; }
        | TOKEN_LITERAL
                {
                  $$ = new literal_expr(literals.contents() + $1.start,
                                        $1.len);
                }
        | TOKEN_LETTER
                { $$ = new field_expr($1, 0); }
        | TOKEN_LETTER number
                { $$ = new field_expr($1, $2 - 1); }
        | '%' TOKEN_LETTER
                {
                  switch ($2) {
                  case 'I':
                  case 'i':
                  case 'A':
                  case 'a':
                    $$ = new format_expr($2);
                    break;
                  default:
                    command_error("unrecognized format `%1'", char($2));
                    /* Recover with a usable node so parsing can go on. */
                    $$ = new format_expr('a');
                    break;
                  }
                }
        | '%' digits
                {
                  $$ = new format_expr('0', $2.ndigits, $2.val);
                }
        | string '.' flag TOKEN_LETTER optional_number
                {
                  /* $5 (optional_number) is accepted but not used by any
                     of the functions below. */
                  switch ($4) {
                  case 'l':
                    $$ = new map_expr($1, lowercase);
                    break;
                  case 'u':
                    $$ = new map_expr($1, uppercase);
                    break;
                  case 'c':
                    $$ = new map_expr($1, capitalize);
                    break;
                  case 'r':
                    $$ = new map_expr($1, reverse_name);
                    break;
                  case 'a':
                    $$ = new map_expr($1, abbreviate_name);
                    break;
                  case 'y':
                    $$ = new extractor_expr($1, find_year, $3);
                    break;
                  case 'n':
                    $$ = new extractor_expr($1, find_last_name, $3);
                    break;
                  default:
                    /* Unknown function: keep the operand unchanged. */
                    $$ = $1;
                    command_error("unknown function `%1'", char($4));
                    break;
                  }
                }
        | string '+' number
                { $$ = new truncate_expr($1, $3); }
        | string '-' number
                { $$ = new truncate_expr($1, -$3); }
        | string '*'
                { $$ = new star_expr($1); }
        | '(' optional_conditional ')'
                { $$ = $2; }
        | '<' optional_conditional '>'
                { $$ = new separator_expr($2); }
        ;
/* A number, or -1 when absent. */
optional_number:
        /* empty */
                { $$ = -1; }
        | number
                { $$ = $1; }
        ;
/* Decimal number assembled digit by digit. */
number:
        TOKEN_DIGIT
                { $$ = $1; }
        | number TOKEN_DIGIT
                { $$ = $1*10 + $2; }
        ;
/* Like `number', but also counts the digits (leading zeros included). */
digits:
        TOKEN_DIGIT
                { $$.ndigits = 1; $$.val = $1; }
        | digits TOKEN_DIGIT
                { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
        ;
/* Optional sign in front of a function letter: 0, +1 or -1.  The values
   match extractor_expr::MATCH, BEFORE and AFTER. */
flag:
        /* empty */
                { $$ = 0; }
        | '+'
                { $$ = 1; }
        | '-'
                { $$ = -1; }
        ;

%%
450 /* bison defines const to be empty unless __STDC__ is defined, which it
451 isn't under cfront */
453 #ifdef const // FIXME
454 #undef const
455 #endif
// Lexer cursor over the label specification currently being parsed.
const char *spec_ptr;   // next character yylex() will read
const char *spec_end;   // terminating NUL of the specification
const char *spec_cur;   // start of the most recent token, used by yyerror()
// Upper-case letters indexed 0..25; used by format_serial() for the
// `A' serial format.  Read-only, hence const.
static const char uppercase_array[] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
  'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
  'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
  'Y', 'Z',
};
// Lower-case letters indexed 0..25; used by format_serial() for the
// `a' serial format.  Read-only, hence const.
static const char lowercase_array[] = {
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
  'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
  'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
  'y', 'z',
};
475 int yylex()
477 while (spec_ptr < spec_end && csspace(*spec_ptr))
478 spec_ptr++;
479 spec_cur = spec_ptr;
480 if (spec_ptr >= spec_end)
481 return 0;
482 unsigned char c = *spec_ptr++;
483 if (csalpha(c)) {
484 yylval.num = c;
485 return TOKEN_LETTER;
487 if (csdigit(c)) {
488 yylval.num = c - '0';
489 return TOKEN_DIGIT;
491 if (c == '\'') {
492 yylval.str.start = literals.length();
493 for (; spec_ptr < spec_end; spec_ptr++) {
494 if (*spec_ptr == '\'') {
495 if (++spec_ptr < spec_end && *spec_ptr == '\'')
496 literals += '\'';
497 else {
498 yylval.str.len = literals.length() - yylval.str.start;
499 return TOKEN_LITERAL;
502 else
503 literals += *spec_ptr;
505 yylval.str.len = literals.length() - yylval.str.start;
506 return TOKEN_LITERAL;
508 return c;
511 int set_label_spec(const char *label_spec)
513 spec_cur = spec_ptr = label_spec;
514 spec_end = strchr(label_spec, '\0');
515 literals.clear();
516 if (yyparse())
517 return 0;
518 delete parsed_label;
519 parsed_label = parse_result;
520 return 1;
523 int set_date_label_spec(const char *label_spec)
525 spec_cur = spec_ptr = label_spec;
526 spec_end = strchr(label_spec, '\0');
527 literals.clear();
528 if (yyparse())
529 return 0;
530 delete parsed_date_label;
531 parsed_date_label = parse_result;
532 return 1;
535 int set_short_label_spec(const char *label_spec)
537 spec_cur = spec_ptr = label_spec;
538 spec_end = strchr(label_spec, '\0');
539 literals.clear();
540 if (yyparse())
541 return 0;
542 delete parsed_short_label;
543 parsed_short_label = parse_result;
544 return 1;
547 void yyerror(const char *message)
549 if (spec_cur < spec_end)
550 command_error("label specification %1 before `%2'", message, spec_cur);
551 else
552 command_error("label specification %1 at end of string",
553 message, spec_cur);
556 void at_expr::evaluate(int tentative, const reference &ref,
557 string &result, substring_position &)
559 if (tentative)
560 ref.canonicalize_authors(result);
561 else {
562 const char *end, *start = ref.get_authors(&end);
563 if (start)
564 result.append(start, end - start);
568 void format_expr::evaluate(int tentative, const reference &ref,
569 string &result, substring_position &)
571 if (tentative)
572 return;
573 const label_info *lp = ref.get_label_ptr();
574 int num = lp == 0 ? ref.get_number() : lp->count;
575 if (type != '0')
576 result += format_serial(type, num + 1);
577 else {
578 const char *ptr = i_to_a(num + first_number);
579 int pad = width - strlen(ptr);
580 while (--pad >= 0)
581 result += '0';
582 result += ptr;
586 static const char *format_serial(char c, int n)
588 assert(n > 0);
589 static char buf[128]; // more than enough.
590 switch (c) {
591 case 'i':
592 case 'I':
594 char *p = buf;
595 // troff uses z and w to represent 10000 and 5000 in Roman
596 // numerals; I can find no historical basis for this usage
597 const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
598 if (n >= 40000)
599 return i_to_a(n);
600 while (n >= 10000) {
601 *p++ = s[0];
602 n -= 10000;
604 for (int i = 1000; i > 0; i /= 10, s += 2) {
605 int m = n/i;
606 n -= m*i;
607 switch (m) {
608 case 3:
609 *p++ = s[2];
610 /* falls through */
611 case 2:
612 *p++ = s[2];
613 /* falls through */
614 case 1:
615 *p++ = s[2];
616 break;
617 case 4:
618 *p++ = s[2];
619 *p++ = s[1];
620 break;
621 case 8:
622 *p++ = s[1];
623 *p++ = s[2];
624 *p++ = s[2];
625 *p++ = s[2];
626 break;
627 case 7:
628 *p++ = s[1];
629 *p++ = s[2];
630 *p++ = s[2];
631 break;
632 case 6:
633 *p++ = s[1];
634 *p++ = s[2];
635 break;
636 case 5:
637 *p++ = s[1];
638 break;
639 case 9:
640 *p++ = s[2];
641 *p++ = s[0];
644 *p = 0;
645 break;
647 case 'a':
648 case 'A':
650 char *p = buf;
651 // this is derived from troff/reg.c
652 while (n > 0) {
653 int d = n % 26;
654 if (d == 0)
655 d = 26;
656 n -= d;
657 n /= 26;
658 *p++ = c == 'a' ? lowercase_array[d - 1] :
659 uppercase_array[d - 1];
661 *p-- = 0;
662 // Reverse it.
663 char *q = buf;
664 while (q < p) {
665 char temp = *q;
666 *q = *p;
667 *p = temp;
668 --p;
669 ++q;
671 break;
673 default:
674 assert(0);
676 return buf;
679 void field_expr::evaluate(int, const reference &ref,
680 string &result, substring_position &)
682 const char *end;
683 const char *start = ref.get_field(name, &end);
684 if (start) {
685 start = nth_field(number, start, &end);
686 if (start)
687 result.append(start, end - start);
691 void literal_expr::evaluate(int, const reference &,
692 string &result, substring_position &)
694 result += s;
697 analyzed_expr::analyzed_expr(expression *e)
698 : unary_expr(e), flags(e ? e->analyze() : 0)
702 void analyzed_expr::evaluate(int tentative, const reference &ref,
703 string &result, substring_position &pos)
705 if (expr)
706 expr->evaluate(tentative, ref, result, pos);
709 void star_expr::evaluate(int tentative, const reference &ref,
710 string &result, substring_position &pos)
712 const label_info *lp = ref.get_label_ptr();
713 if (!tentative
714 && (lp == 0 || lp->total > 1)
715 && expr)
716 expr->evaluate(tentative, ref, result, pos);
719 void separator_expr::evaluate(int tentative, const reference &ref,
720 string &result, substring_position &pos)
722 int start_length = result.length();
723 int is_first = pos.start < 0;
724 if (expr)
725 expr->evaluate(tentative, ref, result, pos);
726 if (is_first) {
727 pos.start = start_length;
728 pos.length = result.length() - start_length;
732 void map_expr::evaluate(int tentative, const reference &ref,
733 string &result, substring_position &)
735 if (expr) {
736 string temp;
737 substring_position temp_pos;
738 expr->evaluate(tentative, ref, temp, temp_pos);
739 (*func)(temp.contents(), temp.contents() + temp.length(), result);
743 void extractor_expr::evaluate(int tentative, const reference &ref,
744 string &result, substring_position &)
746 if (expr) {
747 string temp;
748 substring_position temp_pos;
749 expr->evaluate(tentative, ref, temp, temp_pos);
750 const char *end, *start = (*func)(temp.contents(),
751 temp.contents() + temp.length(),
752 &end);
753 switch (part) {
754 case BEFORE:
755 if (start)
756 result.append(temp.contents(), start - temp.contents());
757 else
758 result += temp;
759 break;
760 case MATCH:
761 if (start)
762 result.append(start, end - start);
763 break;
764 case AFTER:
765 if (start)
766 result.append(end, temp.contents() + temp.length() - end);
767 break;
768 default:
769 assert(0);
774 static void first_part(int len, const char *ptr, const char *end,
775 string &result)
777 for (;;) {
778 const char *token_start = ptr;
779 if (!get_token(&ptr, end))
780 break;
781 const token_info *ti = lookup_token(token_start, ptr);
782 int counts = ti->sortify_non_empty(token_start, ptr);
783 if (counts && --len < 0)
784 break;
785 if (counts || ti->is_accent())
786 result.append(token_start, ptr - token_start);
790 static void last_part(int len, const char *ptr, const char *end,
791 string &result)
793 const char *start = ptr;
794 int count = 0;
795 for (;;) {
796 const char *token_start = ptr;
797 if (!get_token(&ptr, end))
798 break;
799 const token_info *ti = lookup_token(token_start, ptr);
800 if (ti->sortify_non_empty(token_start, ptr))
801 count++;
803 ptr = start;
804 int skip = count - len;
805 if (skip > 0) {
806 for (;;) {
807 const char *token_start = ptr;
808 if (!get_token(&ptr, end))
809 assert(0);
810 const token_info *ti = lookup_token(token_start, ptr);
811 if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
812 ptr = token_start;
813 break;
817 first_part(len, ptr, end, result);
820 void truncate_expr::evaluate(int tentative, const reference &ref,
821 string &result, substring_position &)
823 if (expr) {
824 string temp;
825 substring_position temp_pos;
826 expr->evaluate(tentative, ref, temp, temp_pos);
827 const char *start = temp.contents();
828 const char *end = start + temp.length();
829 if (n > 0)
830 first_part(n, start, end, result);
831 else if (n < 0)
832 last_part(-n, start, end, result);
836 void alternative_expr::evaluate(int tentative, const reference &ref,
837 string &result, substring_position &pos)
839 int start_length = result.length();
840 if (expr1)
841 expr1->evaluate(tentative, ref, result, pos);
842 if (result.length() == start_length && expr2)
843 expr2->evaluate(tentative, ref, result, pos);
846 void list_expr::evaluate(int tentative, const reference &ref,
847 string &result, substring_position &pos)
849 if (expr1)
850 expr1->evaluate(tentative, ref, result, pos);
851 if (expr2)
852 expr2->evaluate(tentative, ref, result, pos);
855 void substitute_expr::evaluate(int tentative, const reference &ref,
856 string &result, substring_position &pos)
858 int start_length = result.length();
859 if (expr1)
860 expr1->evaluate(tentative, ref, result, pos);
861 if (result.length() > start_length && result[result.length() - 1] == '-') {
862 // ought to see if pos covers the -
863 result.set_length(result.length() - 1);
864 if (expr2)
865 expr2->evaluate(tentative, ref, result, pos);
869 void conditional_expr::evaluate(int tentative, const reference &ref,
870 string &result, substring_position &pos)
872 string temp;
873 substring_position temp_pos;
874 if (expr1)
875 expr1->evaluate(tentative, ref, temp, temp_pos);
876 if (temp.length() > 0) {
877 if (expr2)
878 expr2->evaluate(tentative, ref, result, pos);
880 else {
881 if (expr3)
882 expr3->evaluate(tentative, ref, result, pos);
886 void reference::pre_compute_label()
888 if (parsed_label != 0
889 && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
890 label.clear();
891 substring_position temp_pos;
892 parsed_label->evaluate(1, *this, label, temp_pos);
893 label_ptr = lookup_label(label);
897 void reference::compute_label()
899 label.clear();
900 if (parsed_label)
901 parsed_label->evaluate(0, *this, label, separator_pos);
902 if (short_label_flag && parsed_short_label)
903 parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
904 if (date_as_label) {
905 string new_date;
906 if (parsed_date_label) {
907 substring_position temp_pos;
908 parsed_date_label->evaluate(0, *this, new_date, temp_pos);
910 set_date(new_date);
912 if (label_ptr)
913 label_ptr->count += 1;
916 void reference::immediate_compute_label()
918 if (label_ptr)
919 label_ptr->total = 2; // force use of disambiguator
920 compute_label();
923 int reference::merge_labels(reference **v, int n, label_type type,
924 string &result)
926 if (abbreviate_label_ranges)
927 return merge_labels_by_number(v, n, type, result);
928 else
929 return merge_labels_by_parts(v, n, type, result);
932 int reference::merge_labels_by_number(reference **v, int n, label_type type,
933 string &result)
935 if (n <= 1)
936 return 0;
937 int num = get_number();
938 // Only merge three or more labels.
939 if (v[0]->get_number() != num + 1
940 || v[1]->get_number() != num + 2)
941 return 0;
942 int i;
943 for (i = 2; i < n; i++)
944 if (v[i]->get_number() != num + i + 1)
945 break;
946 result = get_label(type);
947 result += label_range_indicator;
948 result += v[i - 1]->get_label(type);
949 return i;
952 const substring_position &reference::get_separator_pos(label_type type) const
954 if (type == SHORT_LABEL && short_label_flag)
955 return short_separator_pos;
956 else
957 return separator_pos;
960 const string &reference::get_label(label_type type) const
962 if (type == SHORT_LABEL && short_label_flag)
963 return short_label;
964 else
965 return label;
968 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
969 string &result)
971 if (n <= 0)
972 return 0;
973 const string &lb = get_label(type);
974 const substring_position &sp = get_separator_pos(type);
975 if (sp.start < 0
976 || sp.start != v[0]->get_separator_pos(type).start
977 || memcmp(lb.contents(), v[0]->get_label(type).contents(),
978 sp.start) != 0)
979 return 0;
980 result = lb;
981 int i = 0;
982 do {
983 result += separate_label_second_parts;
984 const substring_position &s = v[i]->get_separator_pos(type);
985 int sep_end_pos = s.start + s.length;
986 result.append(v[i]->get_label(type).contents() + sep_end_pos,
987 v[i]->get_label(type).length() - sep_end_pos);
988 } while (++i < n
989 && sp.start == v[i]->get_separator_pos(type).start
990 && memcmp(lb.contents(), v[i]->get_label(type).contents(),
991 sp.start) == 0);
992 return i;
995 string label_pool;
997 label_info::label_info(const string &s)
998 : start(label_pool.length()), length(s.length()), count(0), total(1)
1000 label_pool += s;
1003 static label_info **label_table = 0;
1004 static int label_table_size = 0;
1005 static int label_table_used = 0;
1007 label_info *lookup_label(const string &label)
1009 if (label_table == 0) {
1010 label_table = new label_info *[17];
1011 label_table_size = 17;
1012 for (int i = 0; i < 17; i++)
1013 label_table[i] = 0;
1015 unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
1016 label_info **ptr;
1017 for (ptr = label_table + h;
1018 *ptr != 0;
1019 (ptr == label_table)
1020 ? (ptr = label_table + label_table_size - 1)
1021 : ptr--)
1022 if ((*ptr)->length == label.length()
1023 && memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
1024 label.length()) == 0) {
1025 (*ptr)->total += 1;
1026 return *ptr;
1028 label_info *result = *ptr = new label_info(label);
1029 if (++label_table_used * 2 > label_table_size) {
1030 // Rehash the table.
1031 label_info **old_table = label_table;
1032 int old_size = label_table_size;
1033 label_table_size = next_size(label_table_size);
1034 label_table = new label_info *[label_table_size];
1035 int i;
1036 for (i = 0; i < label_table_size; i++)
1037 label_table[i] = 0;
1038 for (i = 0; i < old_size; i++)
1039 if (old_table[i]) {
1040 h = hash_string(label_pool.contents() + old_table[i]->start,
1041 old_table[i]->length);
1042 label_info **p;
1043 for (p = label_table + (h % label_table_size);
1044 *p != 0;
1045 (p == label_table)
1046 ? (p = label_table + label_table_size - 1)
1047 : --p)
1049 *p = old_table[i];
1051 a_delete old_table;
1053 return result;
1056 void clear_labels()
1058 for (int i = 0; i < label_table_size; i++) {
1059 delete label_table[i];
1060 label_table[i] = 0;
1062 label_table_used = 0;
1063 label_pool.clear();
1066 static void consider_authors(reference **start, reference **end, int i);
1068 void compute_labels(reference **v, int n)
1070 if (parsed_label
1071 && (parsed_label->analyze() & expression::CONTAINS_AT)
1072 && sort_fields.length() >= 2
1073 && sort_fields[0] == 'A'
1074 && sort_fields[1] == '+')
1075 consider_authors(v, v + n, 0);
1076 for (int i = 0; i < n; i++)
1077 v[i]->compute_label();
/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
   where 0 <= i <= N if there exists a reference with a list of authors
   <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
   and Aj = Bj for 0 <= j < i.  In this case we can't say ``A0,
   A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
   <B0,B1,...,BM>.  If a reference needs author i we only have to call
   need_author(j) for some j >= i such that the reference also needs
   author j. */

/* This function handles 2 tasks:
   determine which authors are needed (cannot be elided with et al.);
   determine which authors can have only last names in the labels.

   References >= start and < end have the same first i author names.
   Also they're sorted by A+. */
1097 static void consider_authors(reference **start, reference **end, int i)
1099 if (start >= end)
1100 return;
1101 reference **p = start;
1102 if (i >= (*p)->get_nauthors()) {
1103 for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1105 if (p < end && i > 0) {
1106 // If we have an author list <A B C> and an author list <A B C D>,
1107 // then both lists need C.
1108 for (reference **q = start; q < end; q++)
1109 (*q)->need_author(i - 1);
1111 start = p;
1113 while (p < end) {
1114 reference **last_name_start = p;
1115 reference **name_start = p;
1116 for (++p;
1117 p < end && i < (*p)->get_nauthors()
1118 && same_author_last_name(**last_name_start, **p, i);
1119 p++) {
1120 if (!same_author_name(**name_start, **p, i)) {
1121 consider_authors(name_start, p, i + 1);
1122 name_start = p;
1125 consider_authors(name_start, p, i + 1);
1126 if (last_name_start == name_start) {
1127 for (reference **q = last_name_start; q < p; q++)
1128 (*q)->set_last_name_unambiguous(i);
1130 // If we have an author list <A B C D> and <A B C E>, then the lists
1131 // need author D and E respectively.
1132 if (name_start > start || p < end) {
1133 for (reference **q = last_name_start; q < p; q++)
1134 (*q)->need_author(i);
1139 int same_author_last_name(const reference &r1, const reference &r2, int n)
1141 const char *ae1;
1142 const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1143 const char *ae2;
1144 const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1145 if (!as1 && !as2) return 1; // they are the same
1146 if (!as1 || !as2) return 0;
1147 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1150 int same_author_name(const reference &r1, const reference &r2, int n)
1152 const char *ae1;
1153 const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1154 const char *ae2;
1155 const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1156 if (!as1 && !as2) return 1; // they are the same
1157 if (!as1 || !as2) return 0;
1158 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1161 void int_set::set(int i)
1163 assert(i >= 0);
1164 int bytei = i >> 3;
1165 if (bytei >= v.length()) {
1166 int old_length = v.length();
1167 v.set_length(bytei + 1);
1168 for (int j = old_length; j <= bytei; j++)
1169 v[j] = 0;
1171 v[bytei] |= 1 << (i & 7);
1174 int int_set::get(int i) const
1176 assert(i >= 0);
1177 int bytei = i >> 3;
1178 return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1181 void reference::set_last_name_unambiguous(int i)
1183 last_name_unambiguous.set(i);
1186 void reference::need_author(int n)
1188 if (n > last_needed_author)
1189 last_needed_author = n;
1192 const char *reference::get_authors(const char **end) const
1194 if (!computed_authors) {
1195 ((reference *)this)->computed_authors = 1;
1196 string &result = ((reference *)this)->authors;
1197 int na = get_nauthors();
1198 result.clear();
1199 for (int i = 0; i < na; i++) {
1200 if (last_name_unambiguous.get(i)) {
1201 const char *e, *start = get_author_last_name(i, &e);
1202 assert(start != 0);
1203 result.append(start, e - start);
1205 else {
1206 const char *e, *start = get_author(i, &e);
1207 assert(start != 0);
1208 result.append(start, e - start);
1210 if (i == last_needed_author
1211 && et_al.length() > 0
1212 && et_al_min_elide > 0
1213 && last_needed_author + et_al_min_elide < na
1214 && na >= et_al_min_total) {
1215 result += et_al;
1216 break;
1218 if (i < na - 1) {
1219 if (na == 2)
1220 result += join_authors_exactly_two;
1221 else if (i < na - 2)
1222 result += join_authors_default;
1223 else
1224 result += join_authors_last_two;
1228 const char *start = authors.contents();
1229 *end = start + authors.length();
1230 return start;
1233 int reference::get_nauthors() const
1235 if (nauthors < 0) {
1236 const char *dummy;
1237 int na;
1238 for (na = 0; get_author(na, &dummy) != 0; na++)
1240 ((reference *)this)->nauthors = na;
1242 return nauthors;
1245 // s-it2-mode