contrib/groff-1.19/src/preproc/refer/label.y

   1 /* -*- C++ -*-
   2    Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
   3    Free Software Foundation, Inc.
   4      Written by James Clark (jjc@jclark.com)
   5
   6 This file is part of groff.
   7
   8 groff is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 2, or (at your option) any later
  11 version.
  12
  13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License along
  19 with groff; see the file COPYING.  If not, write to the Free Software
  20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  21
  22 %{
  23
  24 #include "refer.h"
  25 #include "refid.h"
  26 #include "ref.h"
  27 #include "token.h"
  28
  29 int yylex();
  30 void yyerror(const char *);
  31 int yyparse();
  32
  33 static const char *format_serial(char c, int n);
  34
  35 struct label_info {
  36   int start;
  37   int length;
  38   int count;
  39   int total;
  40   label_info(const string &);
  41 };
  42
  43 label_info *lookup_label(const string &label);
  44
  45 struct expression {
  46   enum {
  47     // Does the tentative label depend on the reference?
  48     CONTAINS_VARIABLE = 01,
  49     CONTAINS_STAR = 02,
  50     CONTAINS_FORMAT = 04,
  51     CONTAINS_AT = 010
  52   };
  53   virtual ~expression() { }
  54   virtual void evaluate(int, const reference &, string &,
  55                         substring_position &) = 0;
  56   virtual unsigned analyze() { return 0; }
  57 };
  58
  59 class at_expr : public expression {
  60 public:
  61   at_expr() { }
  62   void evaluate(int, const reference &, string &, substring_position &);
  63   unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
  64 };
  65
  66 class format_expr : public expression {
  67   char type;
  68   int width;
  69   int first_number;
  70 public:
  71   format_expr(char c, int w = 0, int f = 1)
  72     : type(c), width(w), first_number(f) { }
  73   void evaluate(int, const reference &, string &, substring_position &);
  74   unsigned analyze() { return CONTAINS_FORMAT; }
  75 };
  76
  77 class field_expr : public expression {
  78   int number;
  79   char name;
  80 public:
  81   field_expr(char nm, int num) : number(num), name(nm) { }
  82   void evaluate(int, const reference &, string &, substring_position &);
  83   unsigned analyze() { return CONTAINS_VARIABLE; }
  84 };
  85
  86 class literal_expr : public expression {
  87   string s;
  88 public:
  89   literal_expr(const char *ptr, int len) : s(ptr, len) { }
  90   void evaluate(int, const reference &, string &, substring_position &);
  91 };
  92
  93 class unary_expr : public expression {
  94 protected:
  95   expression *expr;
  96 public:
  97   unary_expr(expression *e) : expr(e) { }
  98   ~unary_expr() { delete expr; }
  99   void evaluate(int, const reference &, string &, substring_position &) = 0;
 100   unsigned analyze() { return expr ? expr->analyze() : 0; }
 101 };
 102
 103 // This caches the analysis of an expression.
 104
 105 class analyzed_expr : public unary_expr {
 106   unsigned flags;
 107 public:
 108   analyzed_expr(expression *);
 109   void evaluate(int, const reference &, string &, substring_position &);
 110   unsigned analyze() { return flags; }
 111 };
 112
 113 class star_expr : public unary_expr {
 114 public:
 115   star_expr(expression *e) : unary_expr(e) { }
 116   void evaluate(int, const reference &, string &, substring_position &);
 117   unsigned analyze() {
 118     return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
 119             | CONTAINS_STAR);
 120   }
 121 };
 122
 123 typedef void map_func(const char *, const char *, string &);
 124
 125 class map_expr : public unary_expr {
 126   map_func *func;
 127 public:
 128   map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
 129   void evaluate(int, const reference &, string &, substring_position &);
 130 };
 131
 132 typedef const char *extractor_func(const char *, const char *, const char **);
 133
 134 class extractor_expr : public unary_expr {
 135   int part;
 136   extractor_func *func;
 137 public:
 138   enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
 139   extractor_expr(expression *e, extractor_func *f, int pt)
 140     : unary_expr(e), part(pt), func(f) { }
 141   void evaluate(int, const reference &, string &, substring_position &);
 142 };
 143
 144 class truncate_expr : public unary_expr {
 145   int n;
 146 public:
 147   truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
 148   void evaluate(int, const reference &, string &, substring_position &);
 149 };
 150
 151 class separator_expr : public unary_expr {
 152 public:
 153   separator_expr(expression *e) : unary_expr(e) { }
 154   void evaluate(int, const reference &, string &, substring_position &);
 155 };
 156
 157 class binary_expr : public expression {
 158 protected:
 159   expression *expr1;
 160   expression *expr2;
 161 public:
 162   binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
 163   ~binary_expr() { delete expr1; delete expr2; }
 164   void evaluate(int, const reference &, string &, substring_position &) = 0;
 165   unsigned analyze() {
 166     return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
 167   }
 168 };
 169
 170 class alternative_expr : public binary_expr {
 171 public:
 172   alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
 173   void evaluate(int, const reference &, string &, substring_position &);
 174 };
 175
 176 class list_expr : public binary_expr {
 177 public:
 178   list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
 179   void evaluate(int, const reference &, string &, substring_position &);
 180 };
 181
 182 class substitute_expr : public binary_expr {
 183 public:
 184   substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
 185   void evaluate(int, const reference &, string &, substring_position &);
 186 };
 187
 188 class ternary_expr : public expression {
 189 protected:
 190   expression *expr1;
 191   expression *expr2;
 192   expression *expr3;
 193 public:
 194   ternary_expr(expression *e1, expression *e2, expression *e3)
 195     : expr1(e1), expr2(e2), expr3(e3) { }
 196   ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
 197   void evaluate(int, const reference &, string &, substring_position &) = 0;
 198   unsigned analyze() {
 199     return ((expr1 ? expr1->analyze() : 0)
 200             | (expr2 ? expr2->analyze() : 0)
 201             | (expr3 ? expr3->analyze() : 0));
 202   }
 203 };
 204
 205 class conditional_expr : public ternary_expr {
 206 public:
 207   conditional_expr(expression *e1, expression *e2, expression *e3)
 208     : ternary_expr(e1, e2, e3) { }
 209   void evaluate(int, const reference &, string &, substring_position &);
 210 };
 211
 212 static expression *parsed_label = 0;
 213 static expression *parsed_date_label = 0;
 214 static expression *parsed_short_label = 0;
 215
 216 static expression *parse_result;
 217
 218 string literals;
 219
 220 %}
 221
 222 %union {
 223   int num;
 224   expression *expr;
 225   struct { int ndigits; int val; } dig;
 226   struct { int start; int len; } str;
 227 }
 228
 229 /* uppercase or lowercase letter */
 230 %token <num> TOKEN_LETTER
 231 /* literal characters */
 232 %token <str> TOKEN_LITERAL
 233 /* digit */
 234 %token <num> TOKEN_DIGIT
 235
 236 %type <expr> conditional
 237 %type <expr> alternative
 238 %type <expr> list
 239 %type <expr> string
 240 %type <expr> substitute
 241 %type <expr> optional_conditional
 242 %type <num> number
 243 %type <dig> digits
 244 %type <num> optional_number
 245 %type <num> flag
 246
 247 %%
 248
 249 expr:
 250         optional_conditional
 251                 { parse_result = ($1 ? new analyzed_expr($1) : 0); }
 252         ;
 253
 254 conditional:
 255         alternative
 256                 { $$ = $1; }
 257         | alternative '?' optional_conditional ':' conditional
 258                 { $$ = new conditional_expr($1, $3, $5); }
 259         ;
 260
 261 optional_conditional:
 262         /* empty */
 263                 { $$ = 0; }
 264         | conditional
 265                 { $$ = $1; }
 266         ;
 267
 268 alternative:
 269         list
 270                 { $$ = $1; }
 271         | alternative '|' list
 272                 { $$ = new alternative_expr($1, $3); }
 273         | alternative '&' list
 274                 { $$ = new conditional_expr($1, $3, 0); }
 275         ;
 276
 277 list:
 278         substitute
 279                 { $$ = $1; }
 280         | list substitute
 281                 { $$ = new list_expr($1, $2); }
 282         ;
 283
 284 substitute:
 285         string
 286                 { $$ = $1; }
 287         | substitute '~' string
 288                 { $$ = new substitute_expr($1, $3); }
 289         ;
 290
 291 string:
 292         '@'
 293                 { $$ = new at_expr; }
 294         | TOKEN_LITERAL
 295                 {
 296                   $$ = new literal_expr(literals.contents() + $1.start,
 297                                         $1.len);
 298                 }
 299         | TOKEN_LETTER
 300                 { $$ = new field_expr($1, 0); }
 301         | TOKEN_LETTER number
 302                 { $$ = new field_expr($1, $2 - 1); }
 303         | '%' TOKEN_LETTER
 304                 {
 305                   switch ($2) {
 306                   case 'I':
 307                   case 'i':
 308                   case 'A':
 309                   case 'a':
 310                     $$ = new format_expr($2);
 311                     break;
 312                   default:
 313                     command_error("unrecognized format `%1'", char($2));
 314                     $$ = new format_expr('a');
 315                     break;
 316                   }
 317                 }
 318
 319         | '%' digits
 320                 {
 321                   $$ = new format_expr('0', $2.ndigits, $2.val);
 322                 }
 323         | string '.' flag TOKEN_LETTER optional_number
 324                 {
 325                   switch ($4) {
 326                   case 'l':
 327                     $$ = new map_expr($1, lowercase);
 328                     break;
 329                   case 'u':
 330                     $$ = new map_expr($1, uppercase);
 331                     break;
 332                   case 'c':
 333                     $$ = new map_expr($1, capitalize);
 334                     break;
 335                   case 'r':
 336                     $$ = new map_expr($1, reverse_name);
 337                     break;
 338                   case 'a':
 339                     $$ = new map_expr($1, abbreviate_name);
 340                     break;
 341                   case 'y':
 342                     $$ = new extractor_expr($1, find_year, $3);
 343                     break;
 344                   case 'n':
 345                     $$ = new extractor_expr($1, find_last_name, $3);
 346                     break;
 347                   default:
 348                     $$ = $1;
 349                     command_error("unknown function `%1'", char($4));
 350                     break;
 351                   }
 352                 }
 353
 354         | string '+' number
 355                 { $$ = new truncate_expr($1, $3); }
 356         | string '-' number
 357                 { $$ = new truncate_expr($1, -$3); }
 358         | string '*'
 359                 { $$ = new star_expr($1); }
 360         | '(' optional_conditional ')'
 361                 { $$ = $2; }
 362         | '<' optional_conditional '>'
 363                 { $$ = new separator_expr($2); }
 364         ;
 365
 366 optional_number:
 367         /* empty */
 368                 { $$ = -1; }
 369         | number
 370                 { $$ = $1; }
 371         ;
 372
 373 number:
 374         TOKEN_DIGIT
 375                 { $$ = $1; }
 376         | number TOKEN_DIGIT
 377                 { $$ = $1*10 + $2; }
 378         ;
 379
 380 digits:
 381         TOKEN_DIGIT
 382                 { $$.ndigits = 1; $$.val = $1; }
 383         | digits TOKEN_DIGIT
 384                 { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
 385         ;
 386
 387
 388 flag:
 389         /* empty */
 390                 { $$ = 0; }
 391         | '+'
 392                 { $$ = 1; }
 393         | '-'
 394                 { $$ = -1; }
 395         ;
 396
 397 %%
 398
 399 /* bison defines const to be empty unless __STDC__ is defined, which it
 400 isn't under cfront */
 401
 402 #ifdef const
 403 #undef const
 404 #endif
 405
 406 const char *spec_ptr;
 407 const char *spec_end;
 408 const char *spec_cur;
 409
 410 static char uppercase_array[] = {
 411   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 412   'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 413   'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 414   'Y', 'Z',
 415 };
 416
 417 static char lowercase_array[] = {
 418   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
 419   'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
 420   'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
 421   'y', 'z',
 422 };
 423
 424 int yylex()
 425 {
 426   while (spec_ptr < spec_end && csspace(*spec_ptr))
 427     spec_ptr++;
 428   spec_cur = spec_ptr;
 429   if (spec_ptr >= spec_end)
 430     return 0;
 431   unsigned char c = *spec_ptr++;
 432   if (csalpha(c)) {
 433     yylval.num = c;
 434     return TOKEN_LETTER;
 435   }
 436   if (csdigit(c)) {
 437     yylval.num = c - '0';
 438     return TOKEN_DIGIT;
 439   }
 440   if (c == '\'') {
 441     yylval.str.start = literals.length();
 442     for (; spec_ptr < spec_end; spec_ptr++) {
 443       if (*spec_ptr == '\'') {
 444         if (++spec_ptr < spec_end && *spec_ptr == '\'')
 445           literals += '\'';
 446         else {
 447           yylval.str.len = literals.length() - yylval.str.start;
 448           return TOKEN_LITERAL;
 449         }
 450       }
 451       else
 452         literals += *spec_ptr;
 453     }
 454     yylval.str.len = literals.length() - yylval.str.start;
 455     return TOKEN_LITERAL;
 456   }
 457   return c;
 458 }
 459
 460 int set_label_spec(const char *label_spec)
 461 {
 462   spec_cur = spec_ptr = label_spec;
 463   spec_end = strchr(label_spec, '\0');
 464   literals.clear();
 465   if (yyparse())
 466     return 0;
 467   delete parsed_label;
 468   parsed_label = parse_result;
 469   return 1;
 470 }
 471
 472 int set_date_label_spec(const char *label_spec)
 473 {
 474   spec_cur = spec_ptr = label_spec;
 475   spec_end = strchr(label_spec, '\0');
 476   literals.clear();
 477   if (yyparse())
 478     return 0;
 479   delete parsed_date_label;
 480   parsed_date_label = parse_result;
 481   return 1;
 482 }
 483
 484 int set_short_label_spec(const char *label_spec)
 485 {
 486   spec_cur = spec_ptr = label_spec;
 487   spec_end = strchr(label_spec, '\0');
 488   literals.clear();
 489   if (yyparse())
 490     return 0;
 491   delete parsed_short_label;
 492   parsed_short_label = parse_result;
 493   return 1;
 494 }
 495
 496 void yyerror(const char *message)
 497 {
 498   if (spec_cur < spec_end)
 499     command_error("label specification %1 before `%2'", message, spec_cur);
 500   else
 501     command_error("label specification %1 at end of string",
 502                   message, spec_cur);
 503 }
 504
 505 void at_expr::evaluate(int tentative, const reference &ref,
 506                        string &result, substring_position &)
 507 {
 508   if (tentative)
 509     ref.canonicalize_authors(result);
 510   else {
 511     const char *end, *start = ref.get_authors(&end);
 512     if (start)
 513       result.append(start, end - start);
 514   }
 515 }
 516
 517 void format_expr::evaluate(int tentative, const reference &ref,
 518                            string &result, substring_position &)
 519 {
 520   if (tentative)
 521     return;
 522   const label_info *lp = ref.get_label_ptr();
 523   int num = lp == 0 ? ref.get_number() : lp->count;
 524   if (type != '0')
 525     result += format_serial(type, num + 1);
 526   else {
 527     const char *ptr = i_to_a(num + first_number);
 528     int pad = width - strlen(ptr);
 529     while (--pad >= 0)
 530       result += '0';
 531     result += ptr;
 532   }
 533 }
 534
 535 static const char *format_serial(char c, int n)
 536 {
 537   assert(n > 0);
 538   static char buf[128]; // more than enough.
 539   switch (c) {
 540   case 'i':
 541   case 'I':
 542     {
 543       char *p = buf;
 544       // troff uses z and w to represent 10000 and 5000 in Roman
 545       // numerals; I can find no historical basis for this usage
 546       const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
 547       if (n >= 40000)
 548         return i_to_a(n);
 549       while (n >= 10000) {
 550         *p++ = s[0];
 551         n -= 10000;
 552       }
 553       for (int i = 1000; i > 0; i /= 10, s += 2) {
 554         int m = n/i;
 555         n -= m*i;
 556         switch (m) {
 557         case 3:
 558           *p++ = s[2];
 559           /* falls through */
 560         case 2:
 561           *p++ = s[2];
 562           /* falls through */
 563         case 1:
 564           *p++ = s[2];
 565           break;
 566         case 4:
 567           *p++ = s[2];
 568           *p++ = s[1];
 569           break;
 570         case 8:
 571           *p++ = s[1];
 572           *p++ = s[2];
 573           *p++ = s[2];
 574           *p++ = s[2];
 575           break;
 576         case 7:
 577           *p++ = s[1];
 578           *p++ = s[2];
 579           *p++ = s[2];
 580           break;
 581         case 6:
 582           *p++ = s[1];
 583           *p++ = s[2];
 584           break;
 585         case 5:
 586           *p++ = s[1];
 587           break;
 588         case 9:
 589           *p++ = s[2];
 590           *p++ = s[0];
 591         }
 592       }
 593       *p = 0;
 594       break;
 595     }
 596   case 'a':
 597   case 'A':
 598     {
 599       char *p = buf;
 600       // this is derived from troff/reg.c
 601       while (n > 0) {
 602         int d = n % 26;
 603         if (d == 0)
 604           d = 26;
 605         n -= d;
 606         n /= 26;
 607         *p++ = c == 'a' ? lowercase_array[d - 1] :
 608                                uppercase_array[d - 1];
 609       }
 610       *p-- = 0;
 611       // Reverse it.
 612       char *q = buf;
 613       while (q < p) {
 614         char temp = *q;
 615         *q = *p;
 616         *p = temp;
 617         --p;
 618         ++q;
 619       }
 620       break;
 621     }
 622   default:
 623     assert(0);
 624   }
 625   return buf;
 626 }
 627
 628 void field_expr::evaluate(int, const reference &ref,
 629                           string &result, substring_position &)
 630 {
 631   const char *end;
 632   const char *start = ref.get_field(name, &end);
 633   if (start) {
 634     start = nth_field(number, start, &end);
 635     if (start)
 636       result.append(start, end - start);
 637   }
 638 }
 639
 640 void literal_expr::evaluate(int, const reference &,
 641                             string &result, substring_position &)
 642 {
 643   result += s;
 644 }
 645
 646 analyzed_expr::analyzed_expr(expression *e)
 647 : unary_expr(e), flags(e ? e->analyze() : 0)
 648 {
 649 }
 650
 651 void analyzed_expr::evaluate(int tentative, const reference &ref,
 652                              string &result, substring_position &pos)
 653 {
 654   if (expr)
 655     expr->evaluate(tentative, ref, result, pos);
 656 }
 657
 658 void star_expr::evaluate(int tentative, const reference &ref,
 659                          string &result, substring_position &pos)
 660 {
 661   const label_info *lp = ref.get_label_ptr();
 662   if (!tentative
 663       && (lp == 0 || lp->total > 1)
 664       && expr)
 665     expr->evaluate(tentative, ref, result, pos);
 666 }
 667
 668 void separator_expr::evaluate(int tentative, const reference &ref,
 669                               string &result, substring_position &pos)
 670 {
 671   int start_length = result.length();
 672   int is_first = pos.start < 0;
 673   if (expr)
 674     expr->evaluate(tentative, ref, result, pos);
 675   if (is_first) {
 676     pos.start = start_length;
 677     pos.length = result.length() - start_length;
 678   }
 679 }
 680
 681 void map_expr::evaluate(int tentative, const reference &ref,
 682                         string &result, substring_position &)
 683 {
 684   if (expr) {
 685     string temp;
 686     substring_position temp_pos;
 687     expr->evaluate(tentative, ref, temp, temp_pos);
 688     (*func)(temp.contents(), temp.contents() + temp.length(), result);
 689   }
 690 }
 691
 692 void extractor_expr::evaluate(int tentative, const reference &ref,
 693                               string &result, substring_position &)
 694 {
 695   if (expr) {
 696     string temp;
 697     substring_position temp_pos;
 698     expr->evaluate(tentative, ref, temp, temp_pos);
 699     const char *end, *start = (*func)(temp.contents(),
 700                                       temp.contents() + temp.length(),
 701                                       &end);
 702     switch (part) {
 703     case BEFORE:
 704       if (start)
 705         result.append(temp.contents(), start - temp.contents());
 706       else
 707         result += temp;
 708       break;
 709     case MATCH:
 710       if (start)
 711         result.append(start, end - start);
 712       break;
 713     case AFTER:
 714       if (start)
 715         result.append(end, temp.contents() + temp.length() - end);
 716       break;
 717     default:
 718       assert(0);
 719     }
 720   }
 721 }
 722
 723 static void first_part(int len, const char *ptr, const char *end,
 724                           string &result)
 725 {
 726   for (;;) {
 727     const char *token_start = ptr;
 728     if (!get_token(&ptr, end))
 729       break;
 730     const token_info *ti = lookup_token(token_start, ptr);
 731     int counts = ti->sortify_non_empty(token_start, ptr);
 732     if (counts && --len < 0)
 733       break;
 734     if (counts || ti->is_accent())
 735       result.append(token_start, ptr - token_start);
 736   }
 737 }
 738
 739 static void last_part(int len, const char *ptr, const char *end,
 740                       string &result)
 741 {
 742   const char *start = ptr;
 743   int count = 0;
 744   for (;;) {
 745     const char *token_start = ptr;
 746     if (!get_token(&ptr, end))
 747       break;
 748     const token_info *ti = lookup_token(token_start, ptr);
 749     if (ti->sortify_non_empty(token_start, ptr))
 750       count++;
 751   }
 752   ptr = start;
 753   int skip = count - len;
 754   if (skip > 0) {
 755     for (;;) {
 756       const char *token_start = ptr;
 757       if (!get_token(&ptr, end))
 758         assert(0);
 759       const token_info *ti = lookup_token(token_start, ptr);
 760       if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
 761         ptr = token_start;
 762         break;
 763       }
 764     }
 765   }
 766   first_part(len, ptr, end, result);
 767 }
 768
 769 void truncate_expr::evaluate(int tentative, const reference &ref,
 770                              string &result, substring_position &)
 771 {
 772   if (expr) {
 773     string temp;
 774     substring_position temp_pos;
 775     expr->evaluate(tentative, ref, temp, temp_pos);
 776     const char *start = temp.contents();
 777     const char *end = start + temp.length();
 778     if (n > 0)
 779       first_part(n, start, end, result);
 780     else if (n < 0)
 781       last_part(-n, start, end, result);
 782   }
 783 }
 784
 785 void alternative_expr::evaluate(int tentative, const reference &ref,
 786                                 string &result, substring_position &pos)
 787 {
 788   int start_length = result.length();
 789   if (expr1)
 790     expr1->evaluate(tentative, ref, result, pos);
 791   if (result.length() == start_length && expr2)
 792     expr2->evaluate(tentative, ref, result, pos);
 793 }
 794
 795 void list_expr::evaluate(int tentative, const reference &ref,
 796                          string &result, substring_position &pos)
 797 {
 798   if (expr1)
 799     expr1->evaluate(tentative, ref, result, pos);
 800   if (expr2)
 801     expr2->evaluate(tentative, ref, result, pos);
 802 }
 803
 804 void substitute_expr::evaluate(int tentative, const reference &ref,
 805                                string &result, substring_position &pos)
 806 {
 807   int start_length = result.length();
 808   if (expr1)
 809     expr1->evaluate(tentative, ref, result, pos);
 810   if (result.length() > start_length && result[result.length() - 1] == '-') {
 811     // ought to see if pos covers the -
 812     result.set_length(result.length() - 1);
 813     if (expr2)
 814       expr2->evaluate(tentative, ref, result, pos);
 815   }
 816 }
 817
 818 void conditional_expr::evaluate(int tentative, const reference &ref,
 819                                 string &result, substring_position &pos)
 820 {
 821   string temp;
 822   substring_position temp_pos;
 823   if (expr1)
 824     expr1->evaluate(tentative, ref, temp, temp_pos);
 825   if (temp.length() > 0) {
 826     if (expr2)
 827       expr2->evaluate(tentative, ref, result, pos);
 828   }
 829   else {
 830     if (expr3)
 831       expr3->evaluate(tentative, ref, result, pos);
 832   }
 833 }
 834
 835 void reference::pre_compute_label()
 836 {
 837   if (parsed_label != 0
 838       && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
 839     label.clear();
 840     substring_position temp_pos;
 841     parsed_label->evaluate(1, *this, label, temp_pos);
 842     label_ptr = lookup_label(label);
 843   }
 844 }
 845
 846 void reference::compute_label()
 847 {
 848   label.clear();
 849   if (parsed_label)
 850     parsed_label->evaluate(0, *this, label, separator_pos);
 851   if (short_label_flag && parsed_short_label)
 852     parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
 853   if (date_as_label) {
 854     string new_date;
 855     if (parsed_date_label) {
 856       substring_position temp_pos;
 857       parsed_date_label->evaluate(0, *this, new_date, temp_pos);
 858     }
 859     set_date(new_date);
 860   }
 861   if (label_ptr)
 862     label_ptr->count += 1;
 863 }
 864
 865 void reference::immediate_compute_label()
 866 {
 867   if (label_ptr)
 868     label_ptr->total = 2;       // force use of disambiguator
 869   compute_label();
 870 }
 871
 872 int reference::merge_labels(reference **v, int n, label_type type,
 873                             string &result)
 874 {
 875   if (abbreviate_label_ranges)
 876     return merge_labels_by_number(v, n, type, result);
 877   else
 878     return merge_labels_by_parts(v, n, type, result);
 879 }
 880
 881 int reference::merge_labels_by_number(reference **v, int n, label_type type,
 882                                       string &result)
 883 {
 884   if (n <= 1)
 885     return 0;
 886   int num = get_number();
 887   // Only merge three or more labels.
 888   if (v[0]->get_number() != num + 1
 889       || v[1]->get_number() != num + 2)
 890     return 0;
 891   int i;
 892   for (i = 2; i < n; i++)
 893     if (v[i]->get_number() != num + i + 1)
 894       break;
 895   result = get_label(type);
 896   result += label_range_indicator;
 897   result += v[i - 1]->get_label(type);
 898   return i;
 899 }
 900
 901 const substring_position &reference::get_separator_pos(label_type type) const
 902 {
 903   if (type == SHORT_LABEL && short_label_flag)
 904     return short_separator_pos;
 905   else
 906     return separator_pos;
 907 }
 908
 909 const string &reference::get_label(label_type type) const
 910 {
 911   if (type == SHORT_LABEL && short_label_flag)
 912     return short_label;
 913   else
 914     return label;
 915 }
 916
 917 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
 918                                      string &result)
 919 {
 920   if (n <= 0)
 921     return 0;
 922   const string &lb = get_label(type);
 923   const substring_position &sp = get_separator_pos(type);
 924   if (sp.start < 0
 925       || sp.start != v[0]->get_separator_pos(type).start
 926       || memcmp(lb.contents(), v[0]->get_label(type).contents(),
 927                 sp.start) != 0)
 928     return 0;
 929   result = lb;
 930   int i = 0;
 931   do {
 932     result += separate_label_second_parts;
 933     const substring_position &s = v[i]->get_separator_pos(type);
 934     int sep_end_pos = s.start + s.length;
 935     result.append(v[i]->get_label(type).contents() + sep_end_pos,
 936                   v[i]->get_label(type).length() - sep_end_pos);
 937   } while (++i < n
 938            && sp.start == v[i]->get_separator_pos(type).start
 939            && memcmp(lb.contents(), v[i]->get_label(type).contents(),
 940                      sp.start) == 0);
 941   return i;
 942 }
 943
 944 string label_pool;
 945
 946 label_info::label_info(const string &s)
 947 : start(label_pool.length()), length(s.length()), count(0), total(1)
 948 {
 949   label_pool += s;
 950 }
 951
 952 static label_info **label_table = 0;
 953 static int label_table_size = 0;
 954 static int label_table_used = 0;
 955
 956 label_info *lookup_label(const string &label)
 957 {
 958   if (label_table == 0) {
 959     label_table = new label_info *[17];
 960     label_table_size = 17;
 961     for (int i = 0; i < 17; i++)
 962       label_table[i] = 0;
 963   }
 964   unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
 965   label_info **ptr;
 966   for (ptr = label_table + h;
 967        *ptr != 0;
 968        (ptr == label_table)
 969        ? (ptr = label_table + label_table_size - 1)
 970        : ptr--)
 971     if ((*ptr)->length == label.length()
 972         && memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
 973                   label.length()) == 0) {
 974       (*ptr)->total += 1;
 975       return *ptr;
 976     }
 977   label_info *result = *ptr = new label_info(label);
 978   if (++label_table_used * 2 > label_table_size) {
 979     // Rehash the table.
 980     label_info **old_table = label_table;
 981     int old_size = label_table_size;
 982     label_table_size = next_size(label_table_size);
 983     label_table = new label_info *[label_table_size];
 984     int i;
 985     for (i = 0; i < label_table_size; i++)
 986       label_table[i] = 0;
 987     for (i = 0; i < old_size; i++)
 988       if (old_table[i]) {
 989         h = hash_string(label_pool.contents() + old_table[i]->start,
 990                         old_table[i]->length);
 991         label_info **p;
 992         for (p = label_table + (h % label_table_size);
 993              *p != 0;
 994              (p == label_table)
 995              ? (p = label_table + label_table_size - 1)
 996              : --p)
 997             ;
 998         *p = old_table[i];
 999         }
1000     a_delete old_table;
1001   }
1002   return result;
1003 }
1004
1005 void clear_labels()
1006 {
1007   for (int i = 0; i < label_table_size; i++) {
1008     delete label_table[i];
1009     label_table[i] = 0;
1010   }
1011   label_table_used = 0;
1012   label_pool.clear();
1013 }
1014
1015 static void consider_authors(reference **start, reference **end, int i);
1016
1017 void compute_labels(reference **v, int n)
1018 {
1019   if (parsed_label
1020       && (parsed_label->analyze() & expression::CONTAINS_AT)
1021       && sort_fields.length() >= 2
1022       && sort_fields[0] == 'A'
1023       && sort_fields[1] == '+')
1024     consider_authors(v, v + n, 0);
1025   for (int i = 0; i < n; i++)
1026     v[i]->compute_label();
1027 }
1028
1029
1030 /* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1031 where 0 <= i <= N if there exists a reference with a list of authors
1032 <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1033 and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1034 A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1035 <B0,B1,...,BM>.  If a reference needs author i we only have to call
1036 need_author(j) for some j >= i such that the reference also needs
1037 author j. */
1038
1039 /* This function handles 2 tasks:
1040 determine which authors are needed (cannot be elided with et al.);
1041 determine which authors can have only last names in the labels.
1042
1043 References >= start and < end have the same first i author names.
1044 Also they're sorted by A+. */
1045
1046 static void consider_authors(reference **start, reference **end, int i)
1047 {
1048   if (start >= end)
1049     return;
1050   reference **p = start;
1051   if (i >= (*p)->get_nauthors()) {
1052     for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1053       ;
1054     if (p < end && i > 0) {
1055       // If we have an author list <A B C> and an author list <A B C D>,
1056       // then both lists need C.
1057       for (reference **q = start; q < end; q++)
1058         (*q)->need_author(i - 1);
1059     }
1060     start = p;
1061   }
1062   while (p < end) {
1063     reference **last_name_start = p;
1064     reference **name_start = p;
1065     for (++p;
1066          p < end && i < (*p)->get_nauthors()
1067          && same_author_last_name(**last_name_start, **p, i);
1068          p++) {
1069       if (!same_author_name(**name_start, **p, i)) {
1070         consider_authors(name_start, p, i + 1);
1071         name_start = p;
1072       }
1073     }
1074     consider_authors(name_start, p, i + 1);
1075     if (last_name_start == name_start) {
1076       for (reference **q = last_name_start; q < p; q++)
1077         (*q)->set_last_name_unambiguous(i);
1078     }
1079     // If we have an author list <A B C D> and <A B C E>, then the lists
1080     // need author D and E respectively.
1081     if (name_start > start || p < end) {
1082       for (reference **q = last_name_start; q < p; q++)
1083         (*q)->need_author(i);
1084     }
1085   }
1086 }
1087
1088 int same_author_last_name(const reference &r1, const reference &r2, int n)
1089 {
1090   const char *ae1;
1091   const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1092   const char *ae2;
1093   const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1094   if (!as1 && !as2) return 1;   // they are the same
1095   if (!as1 || !as2) return 0;
1096   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1097 }
1098
1099 int same_author_name(const reference &r1, const reference &r2, int n)
1100 {
1101   const char *ae1;
1102   const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1103   const char *ae2;
1104   const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1105   if (!as1 && !as2) return 1;   // they are the same
1106   if (!as1 || !as2) return 0;
1107   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1108 }
1109
1110
1111 void int_set::set(int i)
1112 {
1113   assert(i >= 0);
1114   int bytei = i >> 3;
1115   if (bytei >= v.length()) {
1116     int old_length = v.length();
1117     v.set_length(bytei + 1);
1118     for (int j = old_length; j <= bytei; j++)
1119       v[j] = 0;
1120   }
1121   v[bytei] |= 1 << (i & 7);
1122 }
1123
1124 int int_set::get(int i) const
1125 {
1126   assert(i >= 0);
1127   int bytei = i >> 3;
1128   return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1129 }
1130
1131 void reference::set_last_name_unambiguous(int i)
1132 {
1133   last_name_unambiguous.set(i);
1134 }
1135
1136 void reference::need_author(int n)
1137 {
1138   if (n > last_needed_author)
1139     last_needed_author = n;
1140 }
1141
1142 const char *reference::get_authors(const char **end) const
1143 {
1144   if (!computed_authors) {
1145     ((reference *)this)->computed_authors = 1;
1146     string &result = ((reference *)this)->authors;
1147     int na = get_nauthors();
1148     result.clear();
1149     for (int i = 0; i < na; i++) {
1150       if (last_name_unambiguous.get(i)) {
1151         const char *e, *start = get_author_last_name(i, &e);
1152         assert(start != 0);
1153         result.append(start, e - start);
1154       }
1155       else {
1156         const char *e, *start = get_author(i, &e);
1157         assert(start != 0);
1158         result.append(start, e - start);
1159       }
1160       if (i == last_needed_author
1161           && et_al.length() > 0
1162           && et_al_min_elide > 0
1163           && last_needed_author + et_al_min_elide < na
1164           && na >= et_al_min_total) {
1165         result += et_al;
1166         break;
1167       }
1168       if (i < na - 1) {
1169         if (na == 2)
1170           result += join_authors_exactly_two;
1171         else if (i < na - 2)
1172           result += join_authors_default;
1173         else
1174           result += join_authors_last_two;
1175       }
1176     }
1177   }
1178   const char *start = authors.contents();
1179   *end = start + authors.length();
1180   return start;
1181 }
1182
1183 int reference::get_nauthors() const
1184 {
1185   if (nauthors < 0) {
1186     const char *dummy;
1187     int na;
1188     for (na = 0; get_author(na, &dummy) != 0; na++)
1189       ;
1190     ((reference *)this)->nauthors = na;
1191   }
1192   return nauthors;
1193 }