locale/programs/ld-collate.c

   1 /* Copyright (C) 1995-2003, 2005-2008, 2009, 2011 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published
   7    by the Free Software Foundation; version 2 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 #include <errno.h>
  23 #include <error.h>
  24 #include <stdlib.h>
  25 #include <wchar.h>
  26 #include <sys/param.h>
  27
  28 #include "localedef.h"
  29 #include "charmap.h"
  30 #include "localeinfo.h"
  31 #include "linereader.h"
  32 #include "locfile.h"
  33 #include "elem-hash.h"
  34
  35 /* Uncomment the following line in the production version.  */
  36 /* #define NDEBUG 1 */
  37 #include <assert.h>
  38
  39 #define obstack_chunk_alloc malloc
  40 #define obstack_chunk_free free
  41
  42 static inline void
  43 __attribute ((always_inline))
  44 obstack_int32_grow (struct obstack *obstack, int32_t data)
  45 {
  46   if (sizeof (int32_t) == sizeof (int))
  47     obstack_int_grow (obstack, data);
  48   else
  49     obstack_grow (obstack, &data, sizeof (int32_t));
  50 }
  51
  52 static inline void
  53 __attribute ((always_inline))
  54 obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
  55 {
  56   if (sizeof (int32_t) == sizeof (int))
  57     obstack_int_grow_fast (obstack, data);
  58   else
  59     obstack_grow (obstack, &data, sizeof (int32_t));
  60 }
  61
  62 /* Forward declaration.  */
  63 struct element_t;
  64
  65 /* Data type for list of strings.  */
  66 struct section_list
  67 {
  68   /* Successor in the known_sections list.  */
  69   struct section_list *def_next;
  70   /* Successor in the sections list.  */
  71   struct section_list *next;
  72   /* Name of the section.  */
  73   const char *name;
  74   /* First element of this section.  */
  75   struct element_t *first;
  76   /* Last element of this section.  */
  77   struct element_t *last;
  78   /* These are the rules for this section.  */
  79   enum coll_sort_rule *rules;
  80   /* Index of the rule set in the appropriate section of the output file.  */
  81   int ruleidx;
  82 };
  83
struct element_t;

/* A counted array of element pointers.  insert_weights allocates one
   of these per sorting rule for every element to hold the weights of
   that level.  */
struct element_list_t
{
  /* Number of elements.  */
  int cnt;

  /* Array of CNT element pointers.  insert_weights stores a null
     pointer for an IGNOREd level and ELEMENT_ELLIPSIS2 as placeholder
     for ellipsis ranges.  */
  struct element_t **w;
};
  93
  94 /* Data type for collating element.  */
  95 struct element_t
  96 {
  97   const char *name;
  98
  99   const char *mbs;
 100   size_t nmbs;
 101   const uint32_t *wcs;
 102   size_t nwcs;
 103   int *mborder;
 104   int wcorder;
 105
 106   /* The following is a bit mask which bits are set if this element is
 107      used in the appropriate level.  Interesting for the singlebyte
 108      weight computation.
 109
 110      XXX The type here restricts the number of levels to 32.  It could
 111      be changed if necessary but I doubt this is necessary.  */
 112   unsigned int used_in_level;
 113
 114   struct element_list_t *weights;
 115
 116   /* Nonzero if this is a real character definition.  */
 117   int is_character;
 118
 119   /* Order of the character in the sequence.  This information will
 120      be used in range expressions.  */
 121   int mbseqorder;
 122   int wcseqorder;
 123
 124   /* Where does the definition come from.  */
 125   const char *file;
 126   size_t line;
 127
 128   /* Which section does this belong to.  */
 129   struct section_list *section;
 130
 131   /* Predecessor and successor in the order list.  */
 132   struct element_t *last;
 133   struct element_t *next;
 134
 135   /* Next element in multibyte output list.  */
 136   struct element_t *mbnext;
 137   struct element_t *mblast;
 138
 139   /* Next element in wide character output list.  */
 140   struct element_t *wcnext;
 141   struct element_t *wclast;
 142 };
 143
/* Special element values.  These are small integers cast to element
   pointers, i.e. markers rather than addresses of real element_t
   objects.  insert_weights stores ELEMENT_ELLIPSIS2 as placeholder
   weight for entries defined through an ellipsis.  */
#define ELEMENT_ELLIPSIS2       ((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3       ((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4       ((struct element_t *) 3)
 148
/* Data type for collating symbol.  Objects are created by new_symbol
   with ORDER, FILE and LINE cleared.  */
struct symbol_t
{
  /* Name of the symbol (a private copy made by new_symbol).  */
  const char *name;

  /* Point to place in the order list.  NULL until an element has been
     associated with the symbol (find_element creates one on demand).  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
 161
/* The header "3level.h" is included three times below.  Each inclusion
   is parameterized through the TABLE, ELEMENT and DEFAULT macros
   (plus optional feature macros) defined immediately before it.
   NOTE(review): the macros are redefined without #undef here, so the
   header presumably undefines them itself -- confirm in 3level.h.  */

/* Sparse table of struct element_t *.  */
#define TABLE wchead_table
#define ELEMENT struct element_t *
#define DEFAULT NULL
#define ITERATE
#define NO_FINALIZE
#include "3level.h"

/* Sparse table of int32_t.  */
#define TABLE collidx_table
#define ELEMENT int32_t
#define DEFAULT 0
#include "3level.h"

/* Sparse table of uint32_t.  The default entry is the all-ones
   value.  */
#define TABLE collseq_table
#define ELEMENT uint32_t
#define DEFAULT ~((uint32_t) 0)
#include "3level.h"
 181
 182
 183 /* Simple name list for the preprocessor.  */
 184 struct name_list
 185 {
 186   struct name_list *next;
 187   char str[0];
 188 };
 189
 190
 191 /* The real definition of the struct for the LC_COLLATE locale.  */
 192 struct locale_collate_t
 193 {
 194   int col_weight_max;
 195   int cur_weight_max;
 196
 197   /* List of known scripts.  */
 198   struct section_list *known_sections;
 199   /* List of used sections.  */
 200   struct section_list *sections;
 201   /* Current section using definition.  */
 202   struct section_list *current_section;
 203   /* There always can be an unnamed section.  */
 204   struct section_list unnamed_section;
 205   /* Flag whether the unnamed section has been defined.  */
 206   bool unnamed_section_defined;
 207   /* To make handling of errors easier we have another section.  */
 208   struct section_list error_section;
 209   /* Sometimes we are defining the values for collating symbols before
 210      the first actual section.  */
 211   struct section_list symbol_section;
 212
 213   /* Start of the order list.  */
 214   struct element_t *start;
 215
 216   /* The undefined element.  */
 217   struct element_t undefined;
 218
 219   /* This is the cursor for `reorder_after' insertions.  */
 220   struct element_t *cursor;
 221
 222   /* This value is used when handling ellipsis.  */
 223   struct element_t ellipsis_weight;
 224
 225   /* Known collating elements.  */
 226   hash_table elem_table;
 227
 228   /* Known collating symbols.  */
 229   hash_table sym_table;
 230
 231   /* Known collation sequences.  */
 232   hash_table seq_table;
 233
 234   struct obstack mempool;
 235
 236   /* The LC_COLLATE category is a bit special as it is sometimes possible
 237      that the definitions from more than one input file contains information.
 238      Therefore we keep all relevant input in a list.  */
 239   struct locale_collate_t *next;
 240
 241   /* Arrays with heads of the list for each of the leading bytes in
 242      the multibyte sequences.  */
 243   struct element_t *mbheads[256];
 244
 245   /* Arrays with heads of the list for each of the leading bytes in
 246      the multibyte sequences.  */
 247   struct wchead_table wcheads;
 248
 249   /* The arrays with the collation sequence order.  */
 250   unsigned char mbseqorder[256];
 251   struct collseq_table wcseqorder;
 252
 253   /* State of the preprocessor.  */
 254   enum
 255     {
 256       else_none = 0,
 257       else_ignore,
 258       else_seen
 259     }
 260     else_action;
 261 };
 262
 263
/* We have a few global variables which are used for reading all
   LC_COLLATE category descriptions in all files.  */

/* Number of sorting rules (levels).  Zero until read_directions has
   processed a rule list for the first time; read_directions then
   requires the same number of rules of every later list.  */
static uint32_t nrules;

/* List of defined preprocessor symbols.  */
static struct name_list *defined;
 270
 271
 272 /* We need UTF-8 encoding of numbers.  */
 273 static inline int
 274 __attribute ((always_inline))
 275 utf8_encode (char *buf, int val)
 276 {
 277   int retval;
 278
 279   if (val < 0x80)
 280     {
 281       *buf++ = (char) val;
 282       retval = 1;
 283     }
 284   else
 285     {
 286       int step;
 287
 288       for (step = 2; step < 6; ++step)
 289         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
 290           break;
 291       retval = step;
 292
 293       *buf = (unsigned char) (~0xff >> step);
 294       --step;
 295       do
 296         {
 297           buf[step] = 0x80 | (val & 0x3f);
 298           val >>= 6;
 299         }
 300       while (--step > 0);
 301       *buf |= val;
 302     }
 303
 304   return retval;
 305 }
 306
 307
 308 static struct section_list *
 309 make_seclist_elem (struct locale_collate_t *collate, const char *string,
 310                    struct section_list *next)
 311 {
 312   struct section_list *newp;
 313
 314   newp = (struct section_list *) obstack_alloc (&collate->mempool,
 315                                                 sizeof (*newp));
 316   newp->next = next;
 317   newp->name = string;
 318   newp->first = NULL;
 319   newp->last = NULL;
 320
 321   return newp;
 322 }
 323
 324
 325 static struct element_t *
 326 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
 327              const uint32_t *wcs, const char *name, size_t namelen,
 328              int is_character)
 329 {
 330   struct element_t *newp;
 331
 332   newp = (struct element_t *) obstack_alloc (&collate->mempool,
 333                                              sizeof (*newp));
 334   newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
 335                                                     name, namelen);
 336   if (mbs != NULL)
 337     {
 338       newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
 339       newp->nmbs = mbslen;
 340     }
 341   else
 342     {
 343       newp->mbs = NULL;
 344       newp->nmbs = 0;
 345     }
 346   if (wcs != NULL)
 347     {
 348       size_t nwcs = wcslen ((wchar_t *) wcs);
 349       uint32_t zero = 0;
 350       obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
 351       obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
 352       newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
 353       newp->nwcs = nwcs;
 354     }
 355   else
 356     {
 357       newp->wcs = NULL;
 358       newp->nwcs = 0;
 359     }
 360   newp->mborder = NULL;
 361   newp->wcorder = 0;
 362   newp->used_in_level = 0;
 363   newp->is_character = is_character;
 364
 365   /* Will be assigned later.  XXX  */
 366   newp->mbseqorder = 0;
 367   newp->wcseqorder = 0;
 368
 369   /* Will be allocated later.  */
 370   newp->weights = NULL;
 371
 372   newp->file = NULL;
 373   newp->line = 0;
 374
 375   newp->section = collate->current_section;
 376
 377   newp->last = NULL;
 378   newp->next = NULL;
 379
 380   newp->mbnext = NULL;
 381   newp->mblast = NULL;
 382
 383   newp->wcnext = NULL;
 384   newp->wclast = NULL;
 385
 386   return newp;
 387 }
 388
 389
 390 static struct symbol_t *
 391 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
 392 {
 393   struct symbol_t *newp;
 394
 395   newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
 396
 397   newp->name = obstack_copy0 (&collate->mempool, name, len);
 398   newp->order = NULL;
 399
 400   newp->file = NULL;
 401   newp->line = 0;
 402
 403   return newp;
 404 }
 405
 406
 407 /* Test whether this name is already defined somewhere.  */
 408 static int
 409 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
 410                  const struct charmap_t *charmap,
 411                  struct repertoire_t *repertoire, const char *symbol,
 412                  size_t symbol_len)
 413 {
 414   void *ignore = NULL;
 415
 416   if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
 417     {
 418       lr_error (ldfile, _("`%.*s' already defined in charmap"),
 419                 (int) symbol_len, symbol);
 420       return 1;
 421     }
 422
 423   if (repertoire != NULL
 424       && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
 425           == 0))
 426     {
 427       lr_error (ldfile, _("`%.*s' already defined in repertoire"),
 428                 (int) symbol_len, symbol);
 429       return 1;
 430     }
 431
 432   if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
 433     {
 434       lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
 435                 (int) symbol_len, symbol);
 436       return 1;
 437     }
 438
 439   if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
 440     {
 441       lr_error (ldfile, _("`%.*s' already defined as collating element"),
 442                 (int) symbol_len, symbol);
 443       return 1;
 444     }
 445
 446   return 0;
 447 }
 448
 449
 450 /* Read the direction specification.  */
 451 static void
 452 read_directions (struct linereader *ldfile, struct token *arg,
 453                  const struct charmap_t *charmap,
 454                  struct repertoire_t *repertoire, struct localedef_t *result)
 455 {
 456   int cnt = 0;
 457   int max = nrules ?: 10;
 458   enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
 459   int warned = 0;
 460   struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
 461
 462   while (1)
 463     {
 464       int valid = 0;
 465
 466       if (arg->tok == tok_forward)
 467         {
 468           if (rules[cnt] & sort_backward)
 469             {
 470               if (! warned)
 471                 {
 472                   lr_error (ldfile, _("\
 473 %s: `forward' and `backward' are mutually excluding each other"),
 474                             "LC_COLLATE");
 475                   warned = 1;
 476                 }
 477             }
 478           else if (rules[cnt] & sort_forward)
 479             {
 480               if (! warned)
 481                 {
 482                   lr_error (ldfile, _("\
 483 %s: `%s' mentioned more than once in definition of weight %d"),
 484                             "LC_COLLATE", "forward", cnt + 1);
 485                 }
 486             }
 487           else
 488             rules[cnt] |= sort_forward;
 489
 490           valid = 1;
 491         }
 492       else if (arg->tok == tok_backward)
 493         {
 494           if (rules[cnt] & sort_forward)
 495             {
 496               if (! warned)
 497                 {
 498                   lr_error (ldfile, _("\
 499 %s: `forward' and `backward' are mutually excluding each other"),
 500                             "LC_COLLATE");
 501                   warned = 1;
 502                 }
 503             }
 504           else if (rules[cnt] & sort_backward)
 505             {
 506               if (! warned)
 507                 {
 508                   lr_error (ldfile, _("\
 509 %s: `%s' mentioned more than once in definition of weight %d"),
 510                             "LC_COLLATE", "backward", cnt + 1);
 511                 }
 512             }
 513           else
 514             rules[cnt] |= sort_backward;
 515
 516           valid = 1;
 517         }
 518       else if (arg->tok == tok_position)
 519         {
 520           if (rules[cnt] & sort_position)
 521             {
 522               if (! warned)
 523                 {
 524                   lr_error (ldfile, _("\
 525 %s: `%s' mentioned more than once in definition of weight %d"),
 526                             "LC_COLLATE", "position", cnt + 1);
 527                 }
 528             }
 529           else
 530             rules[cnt] |= sort_position;
 531
 532           valid = 1;
 533         }
 534
 535       if (valid)
 536         arg = lr_token (ldfile, charmap, result, repertoire, verbose);
 537
 538       if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
 539           || arg->tok == tok_semicolon)
 540         {
 541           if (! valid && ! warned)
 542             {
 543               lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 544               warned = 1;
 545             }
 546
 547           /* See whether we have to increment the counter.  */
 548           if (arg->tok != tok_comma && rules[cnt] != 0)
 549             {
 550               /* Add the default `forward' if we have seen only `position'.  */
 551               if (rules[cnt] == sort_position)
 552                 rules[cnt] = sort_position | sort_forward;
 553
 554               ++cnt;
 555             }
 556
 557           if (arg->tok == tok_eof || arg->tok == tok_eol)
 558             /* End of line or file, so we exit the loop.  */
 559             break;
 560
 561           if (nrules == 0)
 562             {
 563               /* See whether we have enough room in the array.  */
 564               if (cnt == max)
 565                 {
 566                   max += 10;
 567                   rules = (enum coll_sort_rule *) xrealloc (rules,
 568                                                             max
 569                                                             * sizeof (*rules));
 570                   memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
 571                 }
 572             }
 573           else
 574             {
 575               if (cnt == nrules)
 576                 {
 577                   /* There must not be any more rule.  */
 578                   if (! warned)
 579                     {
 580                       lr_error (ldfile, _("\
 581 %s: too many rules; first entry only had %d"),
 582                                 "LC_COLLATE", nrules);
 583                       warned = 1;
 584                     }
 585
 586                   lr_ignore_rest (ldfile, 0);
 587                   break;
 588                 }
 589             }
 590         }
 591       else
 592         {
 593           if (! warned)
 594             {
 595               lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 596               warned = 1;
 597             }
 598         }
 599
 600       arg = lr_token (ldfile, charmap, result, repertoire, verbose);
 601     }
 602
 603   if (nrules == 0)
 604     {
 605       /* Now we know how many rules we have.  */
 606       nrules = cnt;
 607       rules = (enum coll_sort_rule *) xrealloc (rules,
 608                                                 nrules * sizeof (*rules));
 609     }
 610   else
 611     {
 612       if (cnt < nrules)
 613         {
 614           /* Not enough rules in this specification.  */
 615           if (! warned)
 616             lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
 617
 618           do
 619             rules[cnt] = sort_forward;
 620           while (++cnt < nrules);
 621         }
 622     }
 623
 624   collate->current_section->rules = rules;
 625 }
 626
 627
 628 static struct element_t *
 629 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
 630               const char *str, size_t len)
 631 {
 632   void *result = NULL;
 633
 634   /* Search for the entries among the collation sequences already define.  */
 635   if (find_entry (&collate->seq_table, str, len, &result) != 0)
 636     {
 637       /* Nope, not define yet.  So we see whether it is a
 638          collation symbol.  */
 639       void *ptr;
 640
 641       if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
 642         {
 643           /* It's a collation symbol.  */
 644           struct symbol_t *sym = (struct symbol_t *) ptr;
 645           result = sym->order;
 646
 647           if (result == NULL)
 648             result = sym->order = new_element (collate, NULL, 0, NULL,
 649                                                NULL, 0, 0);
 650         }
 651       else if (find_entry (&collate->elem_table, str, len, &result) != 0)
 652         {
 653           /* It's also no collation element.  So it is a character
 654              element defined later.  */
 655           result = new_element (collate, NULL, 0, NULL, str, len, 1);
 656           /* Insert it into the sequence table.  */
 657           insert_entry (&collate->seq_table, str, len, result);
 658         }
 659     }
 660
 661   return (struct element_t *) result;
 662 }
 663
 664
 665 static void
 666 unlink_element (struct locale_collate_t *collate)
 667 {
 668   if (collate->cursor == collate->start)
 669     {
 670       assert (collate->cursor->next == NULL);
 671       assert (collate->cursor->last == NULL);
 672       collate->cursor = NULL;
 673     }
 674   else
 675     {
 676       if (collate->cursor->next != NULL)
 677         collate->cursor->next->last = collate->cursor->last;
 678       if (collate->cursor->last != NULL)
 679         collate->cursor->last->next = collate->cursor->next;
 680       collate->cursor = collate->cursor->last;
 681     }
 682 }
 683
 684
 685 static void
 686 insert_weights (struct linereader *ldfile, struct element_t *elem,
 687                 const struct charmap_t *charmap,
 688                 struct repertoire_t *repertoire, struct localedef_t *result,
 689                 enum token_t ellipsis)
 690 {
 691   int weight_cnt;
 692   struct token *arg;
 693   struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
 694
 695   /* Initialize all the fields.  */
 696   elem->file = ldfile->fname;
 697   elem->line = ldfile->lineno;
 698
 699   elem->last = collate->cursor;
 700   elem->next = collate->cursor ? collate->cursor->next : NULL;
 701   if (collate->cursor != NULL && collate->cursor->next != NULL)
 702     collate->cursor->next->last = elem;
 703   if (collate->cursor != NULL)
 704     collate->cursor->next = elem;
 705   if (collate->start == NULL)
 706     {
 707       assert (collate->cursor == NULL);
 708       collate->start = elem;
 709     }
 710
 711   elem->section = collate->current_section;
 712
 713   if (collate->current_section->first == NULL)
 714     collate->current_section->first = elem;
 715   if (collate->current_section->last == collate->cursor)
 716     collate->current_section->last = elem;
 717
 718   collate->cursor = elem;
 719
 720   elem->weights = (struct element_list_t *)
 721     obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
 722   memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
 723
 724   weight_cnt = 0;
 725
 726   arg = lr_token (ldfile, charmap, result, repertoire, verbose);
 727   do
 728     {
 729       if (arg->tok == tok_eof || arg->tok == tok_eol)
 730         break;
 731
 732       if (arg->tok == tok_ignore)
 733         {
 734           /* The weight for this level has to be ignored.  We use the
 735              null pointer to indicate this.  */
 736           elem->weights[weight_cnt].w = (struct element_t **)
 737             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 738           elem->weights[weight_cnt].w[0] = NULL;
 739           elem->weights[weight_cnt].cnt = 1;
 740         }
 741       else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
 742         {
 743           char ucs4str[10];
 744           struct element_t *val;
 745           char *symstr;
 746           size_t symlen;
 747
 748           if (arg->tok == tok_bsymbol)
 749             {
 750               symstr = arg->val.str.startmb;
 751               symlen = arg->val.str.lenmb;
 752             }
 753           else
 754             {
 755               snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
 756               symstr = ucs4str;
 757               symlen = 9;
 758             }
 759
 760           val = find_element (ldfile, collate, symstr, symlen);
 761           if (val == NULL)
 762             break;
 763
 764           elem->weights[weight_cnt].w = (struct element_t **)
 765             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 766           elem->weights[weight_cnt].w[0] = val;
 767           elem->weights[weight_cnt].cnt = 1;
 768         }
 769       else if (arg->tok == tok_string)
 770         {
 771           /* Split the string up in the individual characters and put
 772              the element definitions in the list.  */
 773           const char *cp = arg->val.str.startmb;
 774           int cnt = 0;
 775           struct element_t *charelem;
 776           struct element_t **weights = NULL;
 777           int max = 0;
 778
 779           if (*cp == '\0')
 780             {
 781               lr_error (ldfile, _("%s: empty weight string not allowed"),
 782                         "LC_COLLATE");
 783               lr_ignore_rest (ldfile, 0);
 784               break;
 785             }
 786
 787           do
 788             {
 789               if (*cp == '<')
 790                 {
 791                   /* Ahh, it's a bsymbol or an UCS4 value.  If it's
 792                      the latter we have to unify the name.  */
 793                   const char *startp = ++cp;
 794                   size_t len;
 795
 796                   while (*cp != '>')
 797                     {
 798                       if (*cp == ldfile->escape_char)
 799                         ++cp;
 800                       if (*cp == '\0')
 801                         /* It's a syntax error.  */
 802                         goto syntax;
 803
 804                       ++cp;
 805                     }
 806
 807                   if (cp - startp == 5 && startp[0] == 'U'
 808                       && isxdigit (startp[1]) && isxdigit (startp[2])
 809                       && isxdigit (startp[3]) && isxdigit (startp[4]))
 810                     {
 811                       unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
 812                       char *newstr;
 813
 814                       newstr = (char *) xmalloc (10);
 815                       snprintf (newstr, 10, "U%08X", ucs4);
 816                       startp = newstr;
 817
 818                       len = 9;
 819                     }
 820                   else
 821                     len = cp - startp;
 822
 823                   charelem = find_element (ldfile, collate, startp, len);
 824                   ++cp;
 825                 }
 826               else
 827                 {
 828                   /* People really shouldn't use characters directly in
 829                      the string.  Especially since it's not really clear
 830                      what this means.  We interpret all characters in the
 831                      string as if that would be bsymbols.  Otherwise we
 832                      would have to match back to bsymbols somehow and this
 833                      is normally not what people normally expect.  */
 834                   charelem = find_element (ldfile, collate, cp++, 1);
 835                 }
 836
 837               if (charelem == NULL)
 838                 {
 839                   /* We ignore the rest of the line.  */
 840                   lr_ignore_rest (ldfile, 0);
 841                   break;
 842                 }
 843
 844               /* Add the pointer.  */
 845               if (cnt >= max)
 846                 {
 847                   struct element_t **newp;
 848                   max += 10;
 849                   newp = (struct element_t **)
 850                     alloca (max * sizeof (struct element_t *));
 851                   memcpy (newp, weights, cnt * sizeof (struct element_t *));
 852                   weights = newp;
 853                 }
 854               weights[cnt++] = charelem;
 855             }
 856           while (*cp != '\0');
 857
 858           /* Now store the information.  */
 859           elem->weights[weight_cnt].w = (struct element_t **)
 860             obstack_alloc (&collate->mempool,
 861                            cnt * sizeof (struct element_t *));
 862           memcpy (elem->weights[weight_cnt].w, weights,
 863                   cnt * sizeof (struct element_t *));
 864           elem->weights[weight_cnt].cnt = cnt;
 865
 866           /* We don't need the string anymore.  */
 867           free (arg->val.str.startmb);
 868         }
 869       else if (ellipsis != tok_none
 870                && (arg->tok == tok_ellipsis2
 871                    || arg->tok == tok_ellipsis3
 872                    || arg->tok == tok_ellipsis4))
 873         {
 874           /* It must be the same ellipsis as used in the initial column.  */
 875           if (arg->tok != ellipsis)
 876             lr_error (ldfile, _("\
 877 %s: weights must use the same ellipsis symbol as the name"),
 878                       "LC_COLLATE");
 879
 880           /* The weight for this level will depend on the element
 881              iterating over the range.  Put a placeholder.  */
 882           elem->weights[weight_cnt].w = (struct element_t **)
 883             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 884           elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
 885           elem->weights[weight_cnt].cnt = 1;
 886         }
 887       else
 888         {
 889         syntax:
 890           /* It's a syntax error.  */
 891           lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 892           lr_ignore_rest (ldfile, 0);
 893           break;
 894         }
 895
 896       arg = lr_token (ldfile, charmap, result, repertoire, verbose);
 897       /* This better should be the end of the line or a semicolon.  */
 898       if (arg->tok == tok_semicolon)
 899         /* OK, ignore this and read the next token.  */
 900         arg = lr_token (ldfile, charmap, result, repertoire, verbose);
 901       else if (arg->tok != tok_eof && arg->tok != tok_eol)
 902         {
 903           /* It's a syntax error.  */
 904           lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 905           lr_ignore_rest (ldfile, 0);
 906           break;
 907         }
 908     }
 909   while (++weight_cnt < nrules);
 910
 911   if (weight_cnt < nrules)
 912     {
 913       /* This means the rest of the line uses the current element as
 914          the weight.  */
 915       do
 916         {
 917           elem->weights[weight_cnt].w = (struct element_t **)
 918             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 919           if (ellipsis == tok_none)
 920             elem->weights[weight_cnt].w[0] = elem;
 921           else
 922             elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
 923           elem->weights[weight_cnt].cnt = 1;
 924         }
 925       while (++weight_cnt < nrules);
 926     }
 927   else
 928     {
 929       if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
 930         {
 931           /* Too many rule values.  */
 932           lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
 933           lr_ignore_rest (ldfile, 0);
 934         }
 935       else
 936         lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
 937     }
 938 }
 939
 940
 941 static int
 942 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
 943               const struct charmap_t *charmap, struct repertoire_t *repertoire,
 944               struct localedef_t *result)
 945 {
 946   /* First find out what kind of symbol this is.  */
 947   struct charseq *seq;
 948   uint32_t wc;
 949   struct element_t *elem = NULL;
 950   struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
 951
 952   /* Try to find the character in the charmap.  */
 953   seq = charmap_find_value (charmap, symstr, symlen);
 954
 955   /* Determine the wide character.  */
 956   if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
 957     {
 958       wc = repertoire_find_value (repertoire, symstr, symlen);
 959       if (seq != NULL)
 960         seq->ucs4 = wc;
 961     }
 962   else
 963     wc = seq->ucs4;
 964
 965   if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
 966     {
 967       /* It's no character, so look through the collation elements and
 968          symbol list.  */
 969       void *ptr = elem;
 970       if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
 971         {
 972           void *result;
 973           struct symbol_t *sym = NULL;
 974
 975           /* It's also collation element.  Therefore it's either a
 976              collating symbol or it's a character which is not
 977              supported by the character set.  In the later case we
 978              simply create a dummy entry.  */
 979           if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
 980             {
 981               /* It's a collation symbol.  */
 982               sym = (struct symbol_t *) result;
 983
 984               elem = sym->order;
 985             }
 986
 987           if (elem == NULL)
 988             {
 989               elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
 990
 991               if (sym != NULL)
 992                 sym->order = elem;
 993               else
 994                 /* Enter a fake element in the sequence table.  This
 995                    won't cause anything in the output since there is
 996                    no multibyte or wide character associated with
 997                    it.  */
 998                 insert_entry (&collate->seq_table, symstr, symlen, elem);
 999             }
1000         }
1001       else
1002         /* Copy the result back.  */
1003         elem = ptr;
1004     }
1005   else
1006     {
1007       /* Otherwise the symbols stands for a character.  */
1008       void *ptr = elem;
1009       if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
1010         {
1011           uint32_t wcs[2] = { wc, 0 };
1012
1013           /* We have to allocate an entry.  */
1014           elem = new_element (collate,
1015                               seq != NULL ? (char *) seq->bytes : NULL,
1016                               seq != NULL ? seq->nbytes : 0,
1017                               wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
1018                               symstr, symlen, 1);
1019
1020           /* And add it to the table.  */
1021           if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1022             /* This cannot happen.  */
1023             assert (! "Internal error");
1024         }
1025       else
1026         {
1027           /* Copy the result back.  */
1028           elem = ptr;
1029
1030           /* Maybe the character was used before the definition.  In this case
1031              we have to insert the byte sequences now.  */
1032           if (elem->mbs == NULL && seq != NULL)
1033             {
1034               elem->mbs = obstack_copy0 (&collate->mempool,
1035                                          seq->bytes, seq->nbytes);
1036               elem->nmbs = seq->nbytes;
1037             }
1038
1039           if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1040             {
1041               uint32_t wcs[2] = { wc, 0 };
1042
1043               elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1044               elem->nwcs = 1;
1045             }
1046         }
1047     }
1048
1049   /* Test whether this element is not already in the list.  */
1050   if (elem->next != NULL || elem == collate->cursor)
1051     {
1052       lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1053                 (int) symlen, symstr, elem->file, elem->line);
1054       lr_ignore_rest (ldfile, 0);
1055       return 1;
1056     }
1057
1058   insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1059
1060   return 0;
1061 }
1062
1063
1064 static void
1065 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1066                  enum token_t ellipsis, const struct charmap_t *charmap,
1067                  struct repertoire_t *repertoire,
1068                  struct localedef_t *result)
1069 {
1070   struct element_t *startp;
1071   struct element_t *endp;
1072   struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1073
1074   /* Unlink the entry added for the ellipsis.  */
1075   unlink_element (collate);
1076   startp = collate->cursor;
1077
1078   /* Process and add the end-entry.  */
1079   if (symstr != NULL
1080       && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1081     /* Something went wrong with inserting the to-value.  This means
1082        we cannot process the ellipsis.  */
1083     return;
1084
1085   /* Reset the cursor.  */
1086   collate->cursor = startp;
1087
1088   /* Now we have to handle many different situations:
1089      - we have to distinguish between the three different ellipsis forms
1090      - the is the ellipsis at the beginning, in the middle, or at the end.
1091   */
1092   endp = collate->cursor->next;
1093   assert (symstr == NULL || endp != NULL);
1094
1095   /* XXX The following is probably very wrong since also collating symbols
1096      can appear in ranges.  But do we want/can refine the test for that?  */
1097 #if 0
1098   /* Both, the start and the end symbol, must stand for characters.  */
1099   if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1100       || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1101     {
1102       lr_error (ldfile, _("\
1103 %s: the start and the end symbol of a range must stand for characters"),
1104                 "LC_COLLATE");
1105       return;
1106     }
1107 #endif
1108
1109   if (ellipsis == tok_ellipsis3)
1110     {
1111       /* One requirement we make here: the length of the byte
1112          sequences for the first and end character must be the same.
1113          This is mainly to prevent unwanted effects and this is often
1114          not what is wanted.  */
1115       size_t len = (startp->mbs != NULL ? startp->nmbs
1116                     : (endp->mbs != NULL ? endp->nmbs : 0));
1117       char mbcnt[len + 1];
1118       char mbend[len + 1];
1119
1120       /* Well, this should be caught somewhere else already.  Just to
1121          make sure.  */
1122       assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1123       assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1124
1125       if (startp != NULL && endp != NULL
1126           && startp->mbs != NULL && endp->mbs != NULL
1127           && startp->nmbs != endp->nmbs)
1128         {
1129           lr_error (ldfile, _("\
1130 %s: byte sequences of first and last character must have the same length"),
1131                     "LC_COLLATE");
1132           return;
1133         }
1134
1135       /* Determine whether we have to generate multibyte sequences.  */
1136       if ((startp == NULL || startp->mbs != NULL)
1137           && (endp == NULL || endp->mbs != NULL))
1138         {
1139           int cnt;
1140           int ret;
1141
1142           /* Prepare the beginning byte sequence.  This is either from the
1143              beginning byte sequence or it is all nulls if it was an
1144              initial ellipsis.  */
1145           if (startp == NULL || startp->mbs == NULL)
1146             memset (mbcnt, '\0', len);
1147           else
1148             {
1149               memcpy (mbcnt, startp->mbs, len);
1150
1151               /* And increment it so that the value is the first one we will
1152                  try to insert.  */
1153               for (cnt = len - 1; cnt >= 0; --cnt)
1154                 if (++mbcnt[cnt] != '\0')
1155                   break;
1156             }
1157           mbcnt[len] = '\0';
1158
1159           /* And the end sequence.  */
1160           if (endp == NULL || endp->mbs == NULL)
1161             memset (mbend, '\0', len);
1162           else
1163             memcpy (mbend, endp->mbs, len);
1164           mbend[len] = '\0';
1165
1166           /* Test whether we have a correct range.  */
1167           ret = memcmp (mbcnt, mbend, len);
1168           if (ret >= 0)
1169             {
1170               if (ret > 0)
1171                 lr_error (ldfile, _("%s: byte sequence of first character of \
1172 range is not lower than that of the last character"), "LC_COLLATE");
1173               return;
1174             }
1175
1176           /* Generate the byte sequences data.  */
1177           while (1)
1178             {
1179               struct charseq *seq;
1180
1181               /* Quite a bit of work ahead.  We have to find the character
1182                  definition for the byte sequence and then determine the
1183                  wide character belonging to it.  */
1184               seq = charmap_find_symbol (charmap, mbcnt, len);
1185               if (seq != NULL)
1186                 {
1187                   struct element_t *elem;
1188                   size_t namelen;
1189
1190                   /* I don't think this can ever happen.  */
1191                   assert (seq->name != NULL);
1192                   namelen = strlen (seq->name);
1193
1194                   if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1195                     seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1196                                                        namelen);
1197
1198                   /* Now we are ready to insert the new value in the
1199                      sequence.  Find out whether the element is
1200                      already known.  */
1201                   void *ptr;
1202                   if (find_entry (&collate->seq_table, seq->name, namelen,
1203                                   &ptr) != 0)
1204                     {
1205                       uint32_t wcs[2] = { seq->ucs4, 0 };
1206
1207                       /* We have to allocate an entry.  */
1208                       elem = new_element (collate, mbcnt, len,
1209                                           seq->ucs4 == ILLEGAL_CHAR_VALUE
1210                                           ? NULL : wcs, seq->name,
1211                                           namelen, 1);
1212
1213                       /* And add it to the table.  */
1214                       if (insert_entry (&collate->seq_table, seq->name,
1215                                         namelen, elem) != 0)
1216                         /* This cannot happen.  */
1217                         assert (! "Internal error");
1218                     }
1219                   else
1220                     /* Copy the result.  */
1221                     elem = ptr;
1222
1223                   /* Test whether this element is not already in the list.  */
1224                   if (elem->next != NULL || (collate->cursor != NULL
1225                                              && elem->next == collate->cursor))
1226                     {
1227                       lr_error (ldfile, _("\
1228 order for `%.*s' already defined at %s:%Zu"),
1229                                 (int) namelen, seq->name,
1230                                 elem->file, elem->line);
1231                       goto increment;
1232                     }
1233
1234                   /* Enqueue the new element.  */
1235                   elem->last = collate->cursor;
1236                   if (collate->cursor == NULL)
1237                     elem->next = NULL;
1238                   else
1239                     {
1240                       elem->next = collate->cursor->next;
1241                       elem->last->next = elem;
1242                       if (elem->next != NULL)
1243                         elem->next->last = elem;
1244                     }
1245                   if (collate->start == NULL)
1246                     {
1247                       assert (collate->cursor == NULL);
1248                       collate->start = elem;
1249                     }
1250                   collate->cursor = elem;
1251
1252                  /* Add the weight value.  We take them from the
1253                     `ellipsis_weights' member of `collate'.  */
1254                   elem->weights = (struct element_list_t *)
1255                     obstack_alloc (&collate->mempool,
1256                                    nrules * sizeof (struct element_list_t));
1257                   for (cnt = 0; cnt < nrules; ++cnt)
1258                     if (collate->ellipsis_weight.weights[cnt].cnt == 1
1259                         && (collate->ellipsis_weight.weights[cnt].w[0]
1260                             == ELEMENT_ELLIPSIS2))
1261                       {
1262                         elem->weights[cnt].w = (struct element_t **)
1263                           obstack_alloc (&collate->mempool,
1264                                          sizeof (struct element_t *));
1265                         elem->weights[cnt].w[0] = elem;
1266                         elem->weights[cnt].cnt = 1;
1267                       }
1268                     else
1269                       {
1270                         /* Simply use the weight from `ellipsis_weight'.  */
1271                         elem->weights[cnt].w =
1272                           collate->ellipsis_weight.weights[cnt].w;
1273                         elem->weights[cnt].cnt =
1274                           collate->ellipsis_weight.weights[cnt].cnt;
1275                       }
1276                 }
1277
1278               /* Increment for the next round.  */
1279             increment:
1280               for (cnt = len - 1; cnt >= 0; --cnt)
1281                 if (++mbcnt[cnt] != '\0')
1282                   break;
1283
1284               /* Find out whether this was all.  */
1285               if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1286                 /* Yep, that's all.  */
1287                 break;
1288             }
1289         }
1290     }
1291   else
1292     {
1293       /* For symbolic range we naturally must have a beginning and an
1294          end specified by the user.  */
1295       if (startp == NULL)
1296         lr_error (ldfile, _("\
1297 %s: symbolic range ellipsis must not directly follow `order_start'"),
1298                   "LC_COLLATE");
1299       else if (endp == NULL)
1300         lr_error (ldfile, _("\
1301 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1302                   "LC_COLLATE");
1303       else
1304         {
1305           /* Determine the range.  To do so we have to determine the
1306              common prefix of the both names and then the numeric
1307              values of both ends.  */
1308           size_t lenfrom = strlen (startp->name);
1309           size_t lento = strlen (endp->name);
1310           char buf[lento + 1];
1311           int preflen = 0;
1312           long int from;
1313           long int to;
1314           char *cp;
1315           int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1316
1317           if (lenfrom != lento)
1318             {
1319             invalid_range:
1320               lr_error (ldfile, _("\
1321 `%s' and `%.*s' are not valid names for symbolic range"),
1322                         startp->name, (int) lento, endp->name);
1323               return;
1324             }
1325
1326           while (startp->name[preflen] == endp->name[preflen])
1327             if (startp->name[preflen] == '\0')
1328               /* Nothing to be done.  The start and end point are identical
1329                  and while inserting the end point we have already given
1330                  the user an error message.  */
1331               return;
1332             else
1333               ++preflen;
1334
1335           errno = 0;
1336           from = strtol (startp->name + preflen, &cp, base);
1337           if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1338             goto invalid_range;
1339
1340           errno = 0;
1341           to = strtol (endp->name + preflen, &cp, base);
1342           if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1343             goto invalid_range;
1344
1345           /* Copy the prefix.  */
1346           memcpy (buf, startp->name, preflen);
1347
1348           /* Loop over all values.  */
1349           for (++from; from < to; ++from)
1350             {
1351               struct element_t *elem = NULL;
1352               struct charseq *seq;
1353               uint32_t wc;
1354               int cnt;
1355
1356               /* Generate the name.  */
1357               sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1358                        (int) (lenfrom - preflen), from);
1359
1360               /* Look whether this name is already defined.  */
1361               void *ptr;
1362               if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1363                 {
1364                   /* Copy back the result.  */
1365                   elem = ptr;
1366
1367                   if (elem->next != NULL || (collate->cursor != NULL
1368                                              && elem->next == collate->cursor))
1369                     {
1370                       lr_error (ldfile, _("\
1371 %s: order for `%.*s' already defined at %s:%Zu"),
1372                                 "LC_COLLATE", (int) lenfrom, buf,
1373                                 elem->file, elem->line);
1374                       continue;
1375                     }
1376
1377                   if (elem->name == NULL)
1378                     {
1379                       lr_error (ldfile, _("%s: `%s' must be a character"),
1380                                 "LC_COLLATE", buf);
1381                       continue;
1382                     }
1383                 }
1384
1385               if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1386                 {
1387                   /* Search for a character of this name.  */
1388                   seq = charmap_find_value (charmap, buf, lenfrom);
1389                   if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1390                     {
1391                       wc = repertoire_find_value (repertoire, buf, lenfrom);
1392
1393                       if (seq != NULL)
1394                         seq->ucs4 = wc;
1395                     }
1396                   else
1397                     wc = seq->ucs4;
1398
1399                   if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1400                     /* We don't know anything about a character with this
1401                        name.  XXX Should we warn?  */
1402                     continue;
1403
1404                   if (elem == NULL)
1405                     {
1406                       uint32_t wcs[2] = { wc, 0 };
1407
1408                       /* We have to allocate an entry.  */
1409                       elem = new_element (collate,
1410                                           seq != NULL
1411                                           ? (char *) seq->bytes : NULL,
1412                                           seq != NULL ? seq->nbytes : 0,
1413                                           wc == ILLEGAL_CHAR_VALUE
1414                                           ? NULL : wcs, buf, lenfrom, 1);
1415                     }
1416                   else
1417                     {
1418                       /* Update the element.  */
1419                       if (seq != NULL)
1420                         {
1421                           elem->mbs = obstack_copy0 (&collate->mempool,
1422                                                      seq->bytes, seq->nbytes);
1423                           elem->nmbs = seq->nbytes;
1424                         }
1425
1426                       if (wc != ILLEGAL_CHAR_VALUE)
1427                         {
1428                           uint32_t zero = 0;
1429
1430                           obstack_grow (&collate->mempool,
1431                                         &wc, sizeof (uint32_t));
1432                           obstack_grow (&collate->mempool,
1433                                         &zero, sizeof (uint32_t));
1434                           elem->wcs = obstack_finish (&collate->mempool);
1435                           elem->nwcs = 1;
1436                         }
1437                     }
1438
1439                   elem->file = ldfile->fname;
1440                   elem->line = ldfile->lineno;
1441                   elem->section = collate->current_section;
1442                 }
1443
1444               /* Enqueue the new element.  */
1445               elem->last = collate->cursor;
1446               elem->next = collate->cursor->next;
1447               elem->last->next = elem;
1448               if (elem->next != NULL)
1449                 elem->next->last = elem;
1450               collate->cursor = elem;
1451
1452               /* Now add the weights.  They come from the `ellipsis_weights'
1453                  member of `collate'.  */
1454               elem->weights = (struct element_list_t *)
1455                 obstack_alloc (&collate->mempool,
1456                                nrules * sizeof (struct element_list_t));
1457               for (cnt = 0; cnt < nrules; ++cnt)
1458                 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1459                     && (collate->ellipsis_weight.weights[cnt].w[0]
1460                         == ELEMENT_ELLIPSIS2))
1461                   {
1462                     elem->weights[cnt].w = (struct element_t **)
1463                       obstack_alloc (&collate->mempool,
1464                                      sizeof (struct element_t *));
1465                     elem->weights[cnt].w[0] = elem;
1466                     elem->weights[cnt].cnt = 1;
1467                   }
1468                 else
1469                   {
1470                     /* Simly use the weight from `ellipsis_weight'.  */
1471                     elem->weights[cnt].w =
1472                       collate->ellipsis_weight.weights[cnt].w;
1473                     elem->weights[cnt].cnt =
1474                       collate->ellipsis_weight.weights[cnt].cnt;
1475                   }
1476             }
1477         }
1478     }
1479 }
1480
1481
1482 static void
1483 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1484                  struct localedef_t *copy_locale, int ignore_content)
1485 {
1486   if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1487     {
1488       struct locale_collate_t *collate;
1489
1490       if (copy_locale == NULL)
1491         {
1492           collate = locale->categories[LC_COLLATE].collate =
1493             (struct locale_collate_t *)
1494             xcalloc (1, sizeof (struct locale_collate_t));
1495
1496           /* Init the various data structures.  */
1497           init_hash (&collate->elem_table, 100);
1498           init_hash (&collate->sym_table, 100);
1499           init_hash (&collate->seq_table, 500);
1500           obstack_init (&collate->mempool);
1501
1502           collate->col_weight_max = -1;
1503         }
1504       else
1505         /* Reuse the copy_locale's data structures.  */
1506         collate = locale->categories[LC_COLLATE].collate =
1507           copy_locale->categories[LC_COLLATE].collate;
1508     }
1509
1510   ldfile->translate_strings = 0;
1511   ldfile->return_widestr = 0;
1512 }
1513
1514
1515 void
1516 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1517 {
1518   /* Now is the time when we can assign the individual collation
1519      values for all the symbols.  We have possibly different values
1520      for the wide- and the multibyte-character symbols.  This is done
1521      since it might make a difference in the encoding if there is in
1522      some cases no multibyte-character but there are wide-characters.
1523      (The other way around it is not important since theencoded
1524      collation value in the wide-character case is 32 bits wide and
1525      therefore requires no encoding).
1526
1527      The lowest collation value assigned is 2.  Zero is reserved for
1528      the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1529      functions and 1 is used to separate the individual passes for the
1530      different rules.
1531
1532      We also have to construct is list with all the bytes/words which
1533      can come first in a sequence, followed by all the elements which
1534      also start with this byte/word.  The order is reverse which has
1535      among others the important effect that longer strings are located
1536      first in the list.  This is required for the output data since
1537      the algorithm used in `strcoll' etc depends on this.
1538
1539      The multibyte case is easy.  We simply sort into an array with
1540      256 elements.  */
1541   struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1542   int mbact[nrules];
1543   int wcact;
1544   int mbseqact;
1545   int wcseqact;
1546   struct element_t *runp;
1547   int i;
1548   int need_undefined = 0;
1549   struct section_list *sect;
1550   int ruleidx;
1551   int nr_wide_elems = 0;
1552
1553   if (collate == NULL)
1554     {
1555       /* No data, no check.  */
1556       if (! be_quiet)
1557         WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1558                                 "LC_COLLATE"));
1559       return;
1560     }
1561
1562   /* If this assertion is hit change the type in `element_t'.  */
1563   assert (nrules <= sizeof (runp->used_in_level) * 8);
1564
1565   /* Make sure that the `position' rule is used either in all sections
1566      or in none.  */
1567   for (i = 0; i < nrules; ++i)
1568     for (sect = collate->sections; sect != NULL; sect = sect->next)
1569       if (sect != collate->current_section
1570           && sect->rules != NULL
1571           && ((sect->rules[i] & sort_position)
1572               != (collate->current_section->rules[i] & sort_position)))
1573         {
1574           WITH_CUR_LOCALE (error (0, 0, _("\
1575 %s: `position' must be used for a specific level in all sections or none"),
1576                                   "LC_COLLATE"));
1577           break;
1578         }
1579
1580   /* Find out which elements are used at which level.  At the same
1581      time we find out whether we have any undefined symbols.  */
1582   runp = collate->start;
1583   while (runp != NULL)
1584     {
1585       if (runp->mbs != NULL)
1586         {
1587           for (i = 0; i < nrules; ++i)
1588             {
1589               int j;
1590
1591               for (j = 0; j < runp->weights[i].cnt; ++j)
1592                 /* A NULL pointer as the weight means IGNORE.  */
1593                 if (runp->weights[i].w[j] != NULL)
1594                   {
1595                     if (runp->weights[i].w[j]->weights == NULL)
1596                       {
1597                         WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1598                                                         runp->line,
1599                                                         _("symbol `%s' not defined"),
1600                                                         runp->weights[i].w[j]->name));
1601
1602                         need_undefined = 1;
1603                         runp->weights[i].w[j] = &collate->undefined;
1604                       }
1605                     else
1606                       /* Set the bit for the level.  */
1607                       runp->weights[i].w[j]->used_in_level |= 1 << i;
1608                   }
1609             }
1610         }
1611
1612       /* Up to the next entry.  */
1613       runp = runp->next;
1614     }
1615
1616   /* Walk through the list of defined sequences and assign weights.  Also
1617      create the data structure which will allow generating the single byte
1618      character based tables.
1619
1620      Since at each time only the weights for each of the rules are
1621      only compared to other weights for this rule it is possible to
1622      assign more compact weight values than simply counting all
1623      weights in sequence.  We can assign weights from 3, one for each
1624      rule individually and only for those elements, which are actually
1625      used for this rule.
1626
1627      Why is this important?  It is not for the wide char table.  But
1628      it is for the singlebyte output since here larger numbers have to
1629      be encoded to make it possible to emit the value as a byte
1630      string.  */
1631   for (i = 0; i < nrules; ++i)
1632     mbact[i] = 2;
1633   wcact = 2;
1634   mbseqact = 0;
1635   wcseqact = 0;
1636   runp = collate->start;
1637   while (runp != NULL)
1638     {
1639       /* Determine the order.  */
1640       if (runp->used_in_level != 0)
1641         {
1642           runp->mborder = (int *) obstack_alloc (&collate->mempool,
1643                                                  nrules * sizeof (int));
1644
1645           for (i = 0; i < nrules; ++i)
1646             if ((runp->used_in_level & (1 << i)) != 0)
1647               runp->mborder[i] = mbact[i]++;
1648             else
1649               runp->mborder[i] = 0;
1650         }
1651
1652       if (runp->mbs != NULL)
1653         {
1654           struct element_t **eptr;
1655           struct element_t *lastp = NULL;
1656
1657           /* Find the point where to insert in the list.  */
1658           eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1659           while (*eptr != NULL)
1660             {
1661               if ((*eptr)->nmbs < runp->nmbs)
1662                 break;
1663
1664               if ((*eptr)->nmbs == runp->nmbs)
1665                 {
1666                   int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1667
1668                   if (c == 0)
1669                     {
1670                       /* This should not happen.  It means that we have
1671                          to symbols with the same byte sequence.  It is
1672                          of course an error.  */
1673                       WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1674                                                       (*eptr)->line,
1675                                                       _("\
1676 symbol `%s' has the same encoding as"), (*eptr)->name);
1677                                        error_at_line (0, 0, runp->file,
1678                                                       runp->line,
1679                                                       _("symbol `%s'"),
1680                                                       runp->name));
1681                       goto dont_insert;
1682                     }
1683                   else if (c < 0)
1684                     /* Insert it here.  */
1685                     break;
1686                 }
1687
1688               /* To the next entry.  */
1689               lastp = *eptr;
1690               eptr = &(*eptr)->mbnext;
1691             }
1692
1693           /* Set the pointers.  */
1694           runp->mbnext = *eptr;
1695           runp->mblast = lastp;
1696           if (*eptr != NULL)
1697             (*eptr)->mblast = runp;
1698           *eptr = runp;
1699         dont_insert:
1700           ;
1701         }
1702
1703       if (runp->used_in_level)
1704         {
1705           runp->wcorder = wcact++;
1706
1707           /* We take the opportunity to count the elements which have
1708              wide characters.  */
1709           ++nr_wide_elems;
1710         }
1711
1712       if (runp->is_character)
1713         {
1714           if (runp->nmbs == 1)
1715             collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1716
1717           runp->wcseqorder = wcseqact++;
1718         }
1719       else if (runp->mbs != NULL && runp->weights != NULL)
1720         /* This is for collation elements.  */
1721         runp->wcseqorder = wcseqact++;
1722
1723       /* Up to the next entry.  */
1724       runp = runp->next;
1725     }
1726
1727   /* Find out whether any of the `mbheads' entries is unset.  In this
1728      case we use the UNDEFINED entry.  */
1729   for (i = 1; i < 256; ++i)
1730     if (collate->mbheads[i] == NULL)
1731       {
1732         need_undefined = 1;
1733         collate->mbheads[i] = &collate->undefined;
1734       }
1735
1736   /* Now to the wide character case.  */
1737   collate->wcheads.p = 6;
1738   collate->wcheads.q = 10;
1739   wchead_table_init (&collate->wcheads);
1740
1741   collate->wcseqorder.p = 6;
1742   collate->wcseqorder.q = 10;
1743   collseq_table_init (&collate->wcseqorder);
1744
1745   /* Start adding.  */
1746   runp = collate->start;
1747   while (runp != NULL)
1748     {
1749       if (runp->wcs != NULL)
1750         {
1751           struct element_t *e;
1752           struct element_t **eptr;
1753           struct element_t *lastp;
1754
1755           /* Insert the collation sequence value.  */
1756           if (runp->is_character)
1757             collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1758                                runp->wcseqorder);
1759
1760           /* Find the point where to insert in the list.  */
1761           e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1762           eptr = &e;
1763           lastp = NULL;
1764           while (*eptr != NULL)
1765             {
1766               if ((*eptr)->nwcs < runp->nwcs)
1767                 break;
1768
1769               if ((*eptr)->nwcs == runp->nwcs)
1770                 {
1771                   int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1772                                    (wchar_t *) runp->wcs, runp->nwcs);
1773
1774                   if (c == 0)
1775                     {
1776                       /* This should not happen.  It means that we have
1777                          two symbols with the same byte sequence.  It is
1778                          of course an error.  */
1779                       WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1780                                                       (*eptr)->line,
1781                                                       _("\
1782 symbol `%s' has the same encoding as"), (*eptr)->name);
1783                                        error_at_line (0, 0, runp->file,
1784                                                       runp->line,
1785                                                       _("symbol `%s'"),
1786                                                       runp->name));
1787                       goto dont_insertwc;
1788                     }
1789                   else if (c < 0)
1790                     /* Insert it here.  */
1791                     break;
1792                 }
1793
1794               /* To the next entry.  */
1795               lastp = *eptr;
1796               eptr = &(*eptr)->wcnext;
1797             }
1798
1799           /* Set the pointers.  */
1800           runp->wcnext = *eptr;
1801           runp->wclast = lastp;
1802           if (*eptr != NULL)
1803             (*eptr)->wclast = runp;
1804           *eptr = runp;
1805           if (eptr == &e)
1806             wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1807         dont_insertwc:
1808           ;
1809         }
1810
1811       /* Up to the next entry.  */
1812       runp = runp->next;
1813     }
1814
1815   collseq_table_finalize (&collate->wcseqorder);
1816
1817   /* Now determine whether the UNDEFINED entry is needed and if yes,
1818      whether it was defined.  */
1819   collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1820   if (collate->undefined.file == NULL)
1821     {
1822       if (need_undefined)
1823         {
1824           /* This seems not to be enforced by recent standards.  Don't
1825              emit an error, simply append UNDEFINED at the end.  */
1826           if (0)
1827             WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1828
1829           /* Add UNDEFINED at the end.  */
1830           collate->undefined.mborder =
1831             (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1832
1833           for (i = 0; i < nrules; ++i)
1834             collate->undefined.mborder[i] = mbact[i]++;
1835         }
1836
1837       /* In any case we will need the definition for the wide character
1838          case.  But we will not complain that it is missing since the
1839          specification strangely enough does not seem to account for
1840          this.  */
1841       collate->undefined.wcorder = wcact++;
1842     }
1843
1844   /* Finally, try to unify the rules for the sections.  Whenever the rules
1845      for a section are the same as those for another section give the
1846      ruleset the same index.  Since there are never many section we can
1847      use an O(n^2) algorithm here.  */
1848   sect = collate->sections;
1849   while (sect != NULL && sect->rules == NULL)
1850     sect = sect->next;
1851
1852   /* Bail out if we have no sections because of earlier errors.  */
1853   if (sect == NULL)
1854     {
1855       WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1856                               _("too many errors; giving up")));
1857       return;
1858     }
1859
1860   ruleidx = 0;
1861   do
1862     {
1863       struct section_list *osect = collate->sections;
1864
1865       while (osect != sect)
1866         if (osect->rules != NULL
1867             && memcmp (osect->rules, sect->rules,
1868                        nrules * sizeof (osect->rules[0])) == 0)
1869           break;
1870         else
1871           osect = osect->next;
1872
1873       if (osect == sect)
1874         sect->ruleidx = ruleidx++;
1875       else
1876         sect->ruleidx = osect->ruleidx;
1877
1878       /* Next section.  */
1879       do
1880         sect = sect->next;
1881       while (sect != NULL && sect->rules == NULL);
1882     }
1883   while (sect != NULL);
1884   /* We are currently not prepared for more than 128 rulesets.  But this
1885      should never really be a problem.  */
1886   assert (ruleidx <= 128);
1887 }
1888
1889
1890 static int32_t
1891 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1892                struct element_t *elem)
1893 {
1894   size_t cnt;
1895   int32_t retval;
1896
1897   /* Optimize the use of UNDEFINED.  */
1898   if (elem == &collate->undefined)
1899     /* The weights are already inserted.  */
1900     return 0;
1901
1902   /* This byte can start exactly one collation element and this is
1903      a single byte.  We can directly give the index to the weights.  */
1904   retval = obstack_object_size (pool);
1905
1906   /* Construct the weight.  */
1907   for (cnt = 0; cnt < nrules; ++cnt)
1908     {
1909       char buf[elem->weights[cnt].cnt * 7];
1910       int len = 0;
1911       int i;
1912
1913       for (i = 0; i < elem->weights[cnt].cnt; ++i)
1914         /* Encode the weight value.  We do nothing for IGNORE entries.  */
1915         if (elem->weights[cnt].w[i] != NULL)
1916           len += utf8_encode (&buf[len],
1917                               elem->weights[cnt].w[i]->mborder[cnt]);
1918
1919       /* And add the buffer content.  */
1920       obstack_1grow (pool, len);
1921       obstack_grow (pool, buf, len);
1922     }
1923
1924   return retval | ((elem->section->ruleidx & 0x7f) << 24);
1925 }
1926
1927
1928 static int32_t
1929 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1930                  struct element_t *elem)
1931 {
1932   size_t cnt;
1933   int32_t retval;
1934
1935   /* Optimize the use of UNDEFINED.  */
1936   if (elem == &collate->undefined)
1937     /* The weights are already inserted.  */
1938     return 0;
1939
1940   /* This byte can start exactly one collation element and this is
1941      a single byte.  We can directly give the index to the weights.  */
1942   retval = obstack_object_size (pool) / sizeof (int32_t);
1943
1944   /* Construct the weight.  */
1945   for (cnt = 0; cnt < nrules; ++cnt)
1946     {
1947       int32_t buf[elem->weights[cnt].cnt];
1948       int i;
1949       int32_t j;
1950
1951       for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1952         if (elem->weights[cnt].w[i] != NULL)
1953           buf[j++] = elem->weights[cnt].w[i]->wcorder;
1954
1955       /* And add the buffer content.  */
1956       obstack_int32_grow (pool, j);
1957
1958       obstack_grow (pool, buf, j * sizeof (int32_t));
1959     }
1960
1961   return retval | ((elem->section->ruleidx & 0x7f) << 24);
1962 }
1963
/* If localedef is ever threaded, this would need to be a __thread
   variable.

   Context for add_to_tablewc.  That function is handed to
   wchead_table_iterate and receives only the character and the element,
   so everything else it needs is passed through this variable.
   collate_output fills it in before the iteration and clears it again
   with memset afterwards.  */
static struct
{
  /* Pool passed to output_weightwc; receives the weight records.  */
  struct obstack *weightpool;
  /* Pool receiving the per-character lists of sequence entries.  */
  struct obstack *extrapool;
  /* Pool receiving the weight indices of compressed series.  */
  struct obstack *indpool;
  /* Collation data being written; passed on to output_weightwc.  */
  struct locale_collate_t *collate;
  /* Table the per-character index is added to.  */
  struct collidx_table *tablewc;
} atwc;
1973
1974 static void add_to_tablewc (uint32_t ch, struct element_t *runp);
1975
1976 static void
1977 add_to_tablewc (uint32_t ch, struct element_t *runp)
1978 {
1979   if (runp->wcnext == NULL && runp->nwcs == 1)
1980     {
1981       int32_t weigthidx = output_weightwc (atwc.weightpool, atwc.collate,
1982                                            runp);
1983       collidx_table_add (atwc.tablewc, ch, weigthidx);
1984     }
1985   else
1986     {
1987       /* As for the singlebyte table, we recognize sequences and
1988          compress them.  */
1989
1990       collidx_table_add (atwc.tablewc, ch,
1991                          -(obstack_object_size (atwc.extrapool)
1992                          / sizeof (uint32_t)));
1993
1994       do
1995         {
1996           /* Store the current index in the weight table.  We know that
1997              the current position in the `extrapool' is aligned on a
1998              32-bit address.  */
1999           int32_t weightidx;
2000           int added;
2001
2002           /* Find out wether this is a single entry or we have more than
2003              one consecutive entry.  */
2004           if (runp->wcnext != NULL
2005               && runp->nwcs == runp->wcnext->nwcs
2006               && wmemcmp ((wchar_t *) runp->wcs,
2007                           (wchar_t *)runp->wcnext->wcs,
2008                           runp->nwcs - 1) == 0
2009               && (runp->wcs[runp->nwcs - 1]
2010                   == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2011             {
2012               int i;
2013               struct element_t *series_startp = runp;
2014               struct element_t *curp;
2015
2016               /* Now add first the initial byte sequence.  */
2017               added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2018               if (sizeof (int32_t) == sizeof (int))
2019                 obstack_make_room (atwc.extrapool, added);
2020
2021               /* More than one consecutive entry.  We mark this by having
2022                  a negative index into the indirect table.  */
2023               obstack_int32_grow_fast (atwc.extrapool,
2024                                        -(obstack_object_size (atwc.indpool)
2025                                          / sizeof (int32_t)));
2026               obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2027
2028               do
2029                 runp = runp->wcnext;
2030               while (runp->wcnext != NULL
2031                      && runp->nwcs == runp->wcnext->nwcs
2032                      && wmemcmp ((wchar_t *) runp->wcs,
2033                                  (wchar_t *)runp->wcnext->wcs,
2034                                  runp->nwcs - 1) == 0
2035                      && (runp->wcs[runp->nwcs - 1]
2036                          == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2037
2038               /* Now walk backward from here to the beginning.  */
2039               curp = runp;
2040
2041               for (i = 1; i < runp->nwcs; ++i)
2042                 obstack_int32_grow_fast (atwc.extrapool, curp->wcs[i]);
2043
2044               /* Now find the end of the consecutive sequence and
2045                  add all the indeces in the indirect pool.  */
2046               do
2047                 {
2048                   weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2049                                                curp);
2050                   obstack_int32_grow (atwc.indpool, weightidx);
2051
2052                   curp = curp->wclast;
2053                 }
2054               while (curp != series_startp);
2055
2056               /* Add the final weight.  */
2057               weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2058                                            curp);
2059               obstack_int32_grow (atwc.indpool, weightidx);
2060
2061               /* And add the end byte sequence.  Without length this
2062                  time.  */
2063               for (i = 1; i < curp->nwcs; ++i)
2064                 obstack_int32_grow (atwc.extrapool, curp->wcs[i]);
2065             }
2066           else
2067             {
2068               /* A single entry.  Simply add the index and the length and
2069                  string (except for the first character which is already
2070                  tested for).  */
2071               int i;
2072
2073               /* Output the weight info.  */
2074               weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2075                                            runp);
2076
2077               added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2078               if (sizeof (int) == sizeof (int32_t))
2079                 obstack_make_room (atwc.extrapool, added);
2080
2081               obstack_int32_grow_fast (atwc.extrapool, weightidx);
2082               obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2083               for (i = 1; i < runp->nwcs; ++i)
2084                 obstack_int32_grow_fast (atwc.extrapool, runp->wcs[i]);
2085             }
2086
2087           /* Next entry.  */
2088           runp = runp->wcnext;
2089         }
2090       while (runp != NULL);
2091     }
2092 }
2093
2094 void
2095 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
2096                 const char *output_path)
2097 {
2098   struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
2099   const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
2100   struct iovec iov[2 + nelems];
2101   struct locale_file data;
2102   uint32_t idx[nelems];
2103   size_t cnt;
2104   size_t ch;
2105   int32_t tablemb[256];
2106   struct obstack weightpool;
2107   struct obstack extrapool;
2108   struct obstack indirectpool;
2109   struct section_list *sect;
2110   struct collidx_table tablewc;
2111   uint32_t elem_size;
2112   uint32_t *elem_table;
2113   int i;
2114   struct element_t *runp;
2115
2116   data.magic = LIMAGIC (LC_COLLATE);
2117   data.n = nelems;
2118   iov[0].iov_base = (void *) &data;
2119   iov[0].iov_len = sizeof (data);
2120
2121   iov[1].iov_base = (void *) idx;
2122   iov[1].iov_len = sizeof (idx);
2123
2124   idx[0] = iov[0].iov_len + iov[1].iov_len;
2125   cnt = 0;
2126
2127   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
2128   iov[2 + cnt].iov_base = &nrules;
2129   iov[2 + cnt].iov_len = sizeof (uint32_t);
2130   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2131   ++cnt;
2132
2133   /* If we have no LC_COLLATE data emit only the number of rules as zero.  */
2134   if (collate == NULL)
2135     {
2136       int32_t dummy = 0;
2137
2138       while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
2139         {
2140           /* The words have to be handled specially.  */
2141           if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
2142             {
2143               iov[2 + cnt].iov_base = &dummy;
2144               iov[2 + cnt].iov_len = sizeof (int32_t);
2145             }
2146           else
2147             {
2148               iov[2 + cnt].iov_base = NULL;
2149               iov[2 + cnt].iov_len = 0;
2150             }
2151
2152           if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
2153             idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2154           ++cnt;
2155         }
2156
2157       assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2158
2159       write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2160
2161       return;
2162     }
2163
2164   obstack_init (&weightpool);
2165   obstack_init (&extrapool);
2166   obstack_init (&indirectpool);
2167
2168   /* Since we are using the sign of an integer to mark indirection the
2169      offsets in the arrays we are indirectly referring to must not be
2170      zero since -0 == 0.  Therefore we add a bit of dummy content.  */
2171   obstack_int32_grow (&extrapool, 0);
2172   obstack_int32_grow (&indirectpool, 0);
2173
2174   /* Prepare the ruleset table.  */
2175   for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2176     if (sect->rules != NULL && sect->ruleidx == i)
2177       {
2178         int j;
2179
2180         obstack_make_room (&weightpool, nrules);
2181
2182         for (j = 0; j < nrules; ++j)
2183           obstack_1grow_fast (&weightpool, sect->rules[j]);
2184         ++i;
2185       }
2186   /* And align the output.  */
2187   i = (nrules * i) % __alignof__ (int32_t);
2188   if (i > 0)
2189     do
2190       obstack_1grow (&weightpool, '\0');
2191     while (++i < __alignof__ (int32_t));
2192
2193   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2194   iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2195   iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2196   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2197   ++cnt;
2198
2199   /* Generate the 8-bit table.  Walk through the lists of sequences
2200      starting with the same byte and add them one after the other to
2201      the table.  In case we have more than one sequence starting with
2202      the same byte we have to use extra indirection.
2203
2204      First add a record for the NUL byte.  This entry will never be used
2205      so it does not matter.  */
2206   tablemb[0] = 0;
2207
2208   /* Now insert the `UNDEFINED' value if it is used.  Since this value
2209      will probably be used more than once it is good to store the
2210      weights only once.  */
2211   if (collate->undefined.used_in_level != 0)
2212     output_weight (&weightpool, collate, &collate->undefined);
2213
2214   for (ch = 1; ch < 256; ++ch)
2215     if (collate->mbheads[ch]->mbnext == NULL
2216         && collate->mbheads[ch]->nmbs <= 1)
2217       {
2218         tablemb[ch] = output_weight (&weightpool, collate,
2219                                      collate->mbheads[ch]);
2220       }
2221     else
2222       {
2223         /* The entries in the list are sorted by length and then
2224            alphabetically.  This is the order in which we will add the
2225            elements to the collation table.  This allows simply walking
2226            the table in sequence and stopping at the first matching
2227            entry.  Since the longer sequences are coming first in the
2228            list they have the possibility to match first, just as it
2229            has to be.  In the worst case we are walking to the end of
2230            the list where we put, if no singlebyte sequence is defined
2231            in the locale definition, the weights for UNDEFINED.
2232
2233            To reduce the length of the search list we compress them a bit.
2234            This happens by collecting sequences of consecutive byte
2235            sequences in one entry (having and begin and end byte sequence)
2236            and add only one index into the weight table.  We can find the
2237            consecutive entries since they are also consecutive in the list.  */
2238         struct element_t *runp = collate->mbheads[ch];
2239         struct element_t *lastp;
2240
2241         assert ((obstack_object_size (&extrapool)
2242                  & (__alignof__ (int32_t) - 1)) == 0);
2243
2244         tablemb[ch] = -obstack_object_size (&extrapool);
2245
2246         do
2247           {
2248             /* Store the current index in the weight table.  We know that
2249                the current position in the `extrapool' is aligned on a
2250                32-bit address.  */
2251             int32_t weightidx;
2252             int added;
2253
2254             /* Find out wether this is a single entry or we have more than
2255                one consecutive entry.  */
2256             if (runp->mbnext != NULL
2257                 && runp->nmbs == runp->mbnext->nmbs
2258                 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2259                 && (runp->mbs[runp->nmbs - 1]
2260                     == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2261               {
2262                 int i;
2263                 struct element_t *series_startp = runp;
2264                 struct element_t *curp;
2265
2266                 /* Compute how much space we will need.  */
2267                 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2268                           + __alignof__ (int32_t) - 1)
2269                          & ~(__alignof__ (int32_t) - 1));
2270                 assert ((obstack_object_size (&extrapool)
2271                          & (__alignof__ (int32_t) - 1)) == 0);
2272                 obstack_make_room (&extrapool, added);
2273
2274                 /* More than one consecutive entry.  We mark this by having
2275                    a negative index into the indirect table.  */
2276                 obstack_int32_grow_fast (&extrapool,
2277                                          -(obstack_object_size (&indirectpool)
2278                                            / sizeof (int32_t)));
2279
2280                 /* Now search first the end of the series.  */
2281                 do
2282                   runp = runp->mbnext;
2283                 while (runp->mbnext != NULL
2284                        && runp->nmbs == runp->mbnext->nmbs
2285                        && memcmp (runp->mbs, runp->mbnext->mbs,
2286                                   runp->nmbs - 1) == 0
2287                        && (runp->mbs[runp->nmbs - 1]
2288                            == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2289
2290                 /* Now walk backward from here to the beginning.  */
2291                 curp = runp;
2292
2293                 assert (runp->nmbs <= 256);
2294                 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2295                 for (i = 1; i < curp->nmbs; ++i)
2296                   obstack_1grow_fast (&extrapool, curp->mbs[i]);
2297
2298                 /* Now find the end of the consecutive sequence and
2299                    add all the indeces in the indirect pool.  */
2300                 do
2301                   {
2302                     weightidx = output_weight (&weightpool, collate, curp);
2303                     obstack_int32_grow (&indirectpool, weightidx);
2304
2305                     curp = curp->mblast;
2306                   }
2307                 while (curp != series_startp);
2308
2309                 /* Add the final weight.  */
2310                 weightidx = output_weight (&weightpool, collate, curp);
2311                 obstack_int32_grow (&indirectpool, weightidx);
2312
2313                 /* And add the end byte sequence.  Without length this
2314                    time.  */
2315                 for (i = 1; i < curp->nmbs; ++i)
2316                   obstack_1grow_fast (&extrapool, curp->mbs[i]);
2317               }
2318             else
2319               {
2320                 /* A single entry.  Simply add the index and the length and
2321                    string (except for the first character which is already
2322                    tested for).  */
2323                 int i;
2324
2325                 /* Output the weight info.  */
2326                 weightidx = output_weight (&weightpool, collate, runp);
2327
2328                 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2329                           + __alignof__ (int32_t) - 1)
2330                          & ~(__alignof__ (int32_t) - 1));
2331                 assert ((obstack_object_size (&extrapool)
2332                          & (__alignof__ (int32_t) - 1)) == 0);
2333                 obstack_make_room (&extrapool, added);
2334
2335                 obstack_int32_grow_fast (&extrapool, weightidx);
2336                 assert (runp->nmbs <= 256);
2337                 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2338
2339                 for (i = 1; i < runp->nmbs; ++i)
2340                   obstack_1grow_fast (&extrapool, runp->mbs[i]);
2341               }
2342
2343             /* Add alignment bytes if necessary.  */
2344             while ((obstack_object_size (&extrapool)
2345                     & (__alignof__ (int32_t) - 1)) != 0)
2346               obstack_1grow_fast (&extrapool, '\0');
2347
2348             /* Next entry.  */
2349             lastp = runp;
2350             runp = runp->mbnext;
2351           }
2352         while (runp != NULL);
2353
2354         assert ((obstack_object_size (&extrapool)
2355                  & (__alignof__ (int32_t) - 1)) == 0);
2356
2357         /* If the final entry in the list is not a single character we
2358            add an UNDEFINED entry here.  */
2359         if (lastp->nmbs != 1)
2360           {
2361             int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2362                          & ~(__alignof__ (int32_t) - 1));
2363             obstack_make_room (&extrapool, added);
2364
2365             obstack_int32_grow_fast (&extrapool, 0);
2366             /* XXX What rule? We just pick the first.  */
2367             obstack_1grow_fast (&extrapool, 0);
2368             /* Length is zero.  */
2369             obstack_1grow_fast (&extrapool, 0);
2370
2371             /* Add alignment bytes if necessary.  */
2372             while ((obstack_object_size (&extrapool)
2373                     & (__alignof__ (int32_t) - 1)) != 0)
2374               obstack_1grow_fast (&extrapool, '\0');
2375           }
2376       }
2377
2378   /* Add padding to the tables if necessary.  */
2379   while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2380          != 0)
2381     obstack_1grow (&weightpool, 0);
2382
2383   /* Now add the four tables.  */
2384   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2385   iov[2 + cnt].iov_base = tablemb;
2386   iov[2 + cnt].iov_len = sizeof (tablemb);
2387   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2388   assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2389   ++cnt;
2390
2391   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2392   iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2393   iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2394   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2395   ++cnt;
2396
2397   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2398   iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2399   iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2400   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2401   ++cnt;
2402
2403   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2404   iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2405   iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2406   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2407   assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2408   ++cnt;
2409
2410
2411   /* Now the same for the wide character table.  We need to store some
2412      more information here.  */
2413   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2414   iov[2 + cnt].iov_base = NULL;
2415   iov[2 + cnt].iov_len = 0;
2416   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2417   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2418   ++cnt;
2419
2420   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2421   iov[2 + cnt].iov_base = NULL;
2422   iov[2 + cnt].iov_len = 0;
2423   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2424   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2425   ++cnt;
2426
2427   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2428   iov[2 + cnt].iov_base = NULL;
2429   iov[2 + cnt].iov_len = 0;
2430   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2431   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2432   ++cnt;
2433
2434   /* Since we are using the sign of an integer to mark indirection the
2435      offsets in the arrays we are indirectly referring to must not be
2436      zero since -0 == 0.  Therefore we add a bit of dummy content.  */
2437   obstack_int32_grow (&extrapool, 0);
2438   obstack_int32_grow (&indirectpool, 0);
2439
2440   /* Now insert the `UNDEFINED' value if it is used.  Since this value
2441      will probably be used more than once it is good to store the
2442      weights only once.  */
2443   if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2444     abort ();
2445
2446   /* Generate the table.  Walk through the lists of sequences starting
2447      with the same wide character and add them one after the other to
2448      the table.  In case we have more than one sequence starting with
2449      the same byte we have to use extra indirection.  */
2450   tablewc.p = 6;
2451   tablewc.q = 10;
2452   collidx_table_init (&tablewc);
2453
2454   atwc.weightpool = &weightpool;
2455   atwc.extrapool = &extrapool;
2456   atwc.indpool = &indirectpool;
2457   atwc.collate = collate;
2458   atwc.tablewc = &tablewc;
2459
2460   wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2461
2462   memset (&atwc, 0, sizeof (atwc));
2463
2464   collidx_table_finalize (&tablewc);
2465
2466   /* Now add the four tables.  */
2467   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2468   iov[2 + cnt].iov_base = tablewc.result;
2469   iov[2 + cnt].iov_len = tablewc.result_size;
2470   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2471   assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2472   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2473   ++cnt;
2474
2475   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2476   iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2477   iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2478   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2479   assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2480   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2481   ++cnt;
2482
2483   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2484   iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2485   iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2486   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2487   assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2488   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2489   ++cnt;
2490
2491   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2492   iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2493   iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2494   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2495   assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2496   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2497   ++cnt;
2498
2499
2500   /* Finally write the table with collation element names out.  It is
2501      a hash table with a simple function which gets the name of the
2502      character as the input.  One character might have many names.  The
2503      value associated with the name is an index into the weight table
2504      where we are then interested in the first-level weight value.
2505
2506      To determine how large the table should be we are counting the
2507      elements we have to put in.  Since we are using internal chaining
2508      using a secondary hash function we have to make the table a bit
2509      larger to avoid extremely long search times.  We can achieve
2510      good results with a 40% larger table than there are entries.  */
2511   elem_size = 0;
2512   runp = collate->start;
2513   while (runp != NULL)
2514     {
2515       if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2516         /* Yep, the element really counts.  */
2517         ++elem_size;
2518
2519       runp = runp->next;
2520     }
2521   /* Add 40% and find the next prime number.  */
2522   elem_size = next_prime (elem_size * 1.4);
2523
2524   /* Allocate the table.  Each entry consists of two words: the hash
2525      value and an index in a secondary table which provides the index
2526      into the weight table and the string itself (so that a match can
2527      be determined).  */
2528   elem_table = (uint32_t *) obstack_alloc (&extrapool,
2529                                            elem_size * 2 * sizeof (uint32_t));
2530   memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2531
2532   /* Now add the elements.  */
2533   runp = collate->start;
2534   while (runp != NULL)
2535     {
2536       if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2537         {
2538           /* Compute the hash value of the name.  */
2539           uint32_t namelen = strlen (runp->name);
2540           uint32_t hash = elem_hash (runp->name, namelen);
2541           size_t idx = hash % elem_size;
2542 #ifndef NDEBUG
2543           size_t start_idx = idx;
2544 #endif
2545
2546           if (elem_table[idx * 2] != 0)
2547             {
2548               /* The spot is already taken.  Try iterating using the value
2549                  from the secondary hashing function.  */
2550               size_t iter = hash % (elem_size - 2) + 1;
2551
2552               do
2553                 {
2554                   idx += iter;
2555                   if (idx >= elem_size)
2556                     idx -= elem_size;
2557                   assert (idx != start_idx);
2558                 }
2559               while (elem_table[idx * 2] != 0);
2560             }
2561           /* This is the spot where we will insert the value.  */
2562           elem_table[idx * 2] = hash;
2563           elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2564
2565           /* The string itself including length.  */
2566           obstack_1grow (&extrapool, namelen);
2567           obstack_grow (&extrapool, runp->name, namelen);
2568
2569           /* And the multibyte representation.  */
2570           obstack_1grow (&extrapool, runp->nmbs);
2571           obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2572
2573           /* And align again to 32 bits.  */
2574           if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2575             obstack_grow (&extrapool, "\0\0",
2576                           (sizeof (int32_t)
2577                            - ((1 + namelen + 1 + runp->nmbs)
2578                               % sizeof (int32_t))));
2579
2580           /* Now some 32-bit values: multibyte collation sequence,
2581              wide char string (including length), and wide char
2582              collation sequence.  */
2583           obstack_int32_grow (&extrapool, runp->mbseqorder);
2584
2585           obstack_int32_grow (&extrapool, runp->nwcs);
2586           obstack_grow (&extrapool, runp->wcs,
2587                         runp->nwcs * sizeof (uint32_t));
2588
2589           obstack_int32_grow (&extrapool, runp->wcseqorder);
2590         }
2591
2592       runp = runp->next;
2593     }
2594
2595   /* Prepare to write out this data.  */
2596   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2597   iov[2 + cnt].iov_base = &elem_size;
2598   iov[2 + cnt].iov_len = sizeof (int32_t);
2599   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2600   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2601   ++cnt;
2602
2603   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2604   iov[2 + cnt].iov_base = elem_table;
2605   iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2606   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2607   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2608   ++cnt;
2609
2610   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2611   iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2612   iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2613   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2614   ++cnt;
2615
2616   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2617   iov[2 + cnt].iov_base = collate->mbseqorder;
2618   iov[2 + cnt].iov_len = 256;
2619   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2620   ++cnt;
2621
2622   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2623   iov[2 + cnt].iov_base = collate->wcseqorder.result;
2624   iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2625   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2626   assert (idx[cnt] % __alignof__ (int32_t) == 0);
2627   ++cnt;
2628
2629   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2630   iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2631   iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2632   ++cnt;
2633
2634   assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2635
2636   write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2637
2638   obstack_free (&weightpool, NULL);
2639   obstack_free (&extrapool, NULL);
2640   obstack_free (&indirectpool, NULL);
2641 }
2642
2643
2644 /* Read and discard tokens until one that ends the conditional part
2645    being skipped shows up.  The result is tok_eof or tok_end when one
2646    of those is read first, tok_endif for an `endif', and -- only if
2647    TO_ENDIF is zero -- tok_else, tok_elifdef or tok_elifndef.  COLLATE
2648    is merely handed on to the recursive call.  */
2649 static enum token_t
2650 skip_to (struct linereader *ldfile, struct locale_collate_t *collate,
2651          const struct charmap_t *charmap, int to_endif)
2652 {
2653   for (;;)
2654     {
2655       enum token_t tok = lr_token (ldfile, charmap, NULL, NULL, 0)->tok;
2656
2657       if (tok == tok_eof || tok == tok_end)
2658         return tok;
2659
2660       if (tok == tok_endif || (tok == tok_else && !to_endif))
2661         {
2662           lr_ignore_rest (ldfile, 1);
2663           return tok;
2664         }
2665       /* For these the remainder of the line is left unread.  */
2666       if (!to_endif && (tok == tok_elifdef || tok == tok_elifndef))
2667         return tok;
2668
2669       if (tok == tok_else)
2670         lr_error (ldfile, _("%s: more then one 'else'"), "LC_COLLATE");
2671       else if (tok == tok_ifdef || tok == tok_ifndef)
2672         {
2673           lr_error (ldfile, _("%s: nested conditionals not supported"),
2674                     "LC_COLLATE");
2675           tok = skip_to (ldfile, collate, charmap, tok_endif);
2676           if (tok == tok_eof || tok == tok_end)
2677             return tok;
2678         }
2679
2680       lr_ignore_rest (ldfile, 0);
2681     }
2682
2683
2684 void
2685 collate_read (struct linereader *ldfile, struct localedef_t *result,
2686               const struct charmap_t *charmap, const char *repertoire_name,
2687               int ignore_content)
2688 {
2689   struct repertoire_t *repertoire = NULL;
2690   struct locale_collate_t *collate;
2691   struct token *now;
2692   struct token *arg = NULL;
2693   enum token_t nowtok;
2694   enum token_t was_ellipsis = tok_none;
2695   struct localedef_t *copy_locale = NULL;
2696   /* Parsing state:
2697      0 - start
2698      1 - between `order-start' and `order-end'
2699      2 - after `order-end'
2700      3 - after `reorder-after', waiting for `reorder-end'
2701      4 - after `reorder-end'
2702      5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2703      6 - after `reorder-sections-end'
2704   */
2705   int state = 0;
2706
2707   /* Get the repertoire we have to use.  */
2708   if (repertoire_name != NULL)
2709     repertoire = repertoire_read (repertoire_name);
2710
2711   /* The rest of the line containing `LC_COLLATE' must be free.  */
2712   lr_ignore_rest (ldfile, 1);
2713
2714   while (1)
2715     {
2716       do
2717         {
2718           now = lr_token (ldfile, charmap, result, NULL, verbose);
2719           nowtok = now->tok;
2720         }
2721       while (nowtok == tok_eol);
2722
2723       if (nowtok != tok_define)
2724         break;
2725
2726       if (ignore_content)
2727         lr_ignore_rest (ldfile, 0);
2728       else
2729         {
2730           arg = lr_token (ldfile, charmap, result, NULL, verbose);
2731           if (arg->tok != tok_ident)
2732             SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2733           else
2734             {
2735               /* Simply add the new symbol.  */
2736               struct name_list *newsym = xmalloc (sizeof (*newsym)
2737                                                   + arg->val.str.lenmb + 1);
2738               memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
2739               newsym->str[arg->val.str.lenmb] = '\0';
2740               newsym->next = defined;
2741               defined = newsym;
2742
2743               lr_ignore_rest (ldfile, 1);
2744             }
2745         }
2746     }
2747
2748   if (nowtok == tok_copy)
2749     {
2750       now = lr_token (ldfile, charmap, result, NULL, verbose);
2751       if (now->tok != tok_string)
2752         {
2753           SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2754
2755         skip_category:
2756           do
2757             now = lr_token (ldfile, charmap, result, NULL, verbose);
2758           while (now->tok != tok_eof && now->tok != tok_end);
2759
2760           if (now->tok != tok_eof
2761               || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2762                   now->tok == tok_eof))
2763             lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2764           else if (now->tok != tok_lc_collate)
2765             {
2766               lr_error (ldfile, _("\
2767 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2768               lr_ignore_rest (ldfile, 0);
2769             }
2770           else
2771             lr_ignore_rest (ldfile, 1);
2772
2773           return;
2774         }
2775
2776       if (! ignore_content)
2777         {
2778           /* Get the locale definition.  */
2779           copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2780                                      repertoire_name, charmap, NULL);
2781           if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2782             {
2783               /* Not yet loaded.  So do it now.  */
2784               if (locfile_read (copy_locale, charmap) != 0)
2785                 goto skip_category;
2786             }
2787
2788           if (copy_locale->categories[LC_COLLATE].collate == NULL)
2789             return;
2790         }
2791
2792       lr_ignore_rest (ldfile, 1);
2793
2794       now = lr_token (ldfile, charmap, result, NULL, verbose);
2795       nowtok = now->tok;
2796     }
2797
2798   /* Prepare the data structures.  */
2799   collate_startup (ldfile, result, copy_locale, ignore_content);
2800   collate = result->categories[LC_COLLATE].collate;
2801
2802   while (1)
2803     {
2804       char ucs4buf[10];
2805       char *symstr;
2806       size_t symlen;
2807
2808       /* Of course we don't proceed beyond the end of file.  */
2809       if (nowtok == tok_eof)
2810         break;
2811
2812       /* Ignore empty lines.  */
2813       if (nowtok == tok_eol)
2814         {
2815           now = lr_token (ldfile, charmap, result, NULL, verbose);
2816           nowtok = now->tok;
2817           continue;
2818         }
2819
2820       switch (nowtok)
2821         {
2822         case tok_copy:
2823           /* Allow copying other locales.  */
2824           now = lr_token (ldfile, charmap, result, NULL, verbose);
2825           if (now->tok != tok_string)
2826             goto err_label;
2827
2828           if (! ignore_content)
2829             load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2830                          charmap, result);
2831
2832           lr_ignore_rest (ldfile, 1);
2833           break;
2834
2835         case tok_coll_weight_max:
2836           /* Ignore the rest of the line if we don't need the input of
2837              this line.  */
2838           if (ignore_content)
2839             {
2840               lr_ignore_rest (ldfile, 0);
2841               break;
2842             }
2843
2844           if (state != 0)
2845             goto err_label;
2846
2847           arg = lr_token (ldfile, charmap, result, NULL, verbose);
2848           if (arg->tok != tok_number)
2849             goto err_label;
2850           if (collate->col_weight_max != -1)
2851             lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2852                       "LC_COLLATE", "col_weight_max");
2853           else
2854             collate->col_weight_max = arg->val.num;
2855           lr_ignore_rest (ldfile, 1);
2856           break;
2857
2858         case tok_section_symbol:
2859           /* Ignore the rest of the line if we don't need the input of
2860              this line.  */
2861           if (ignore_content)
2862             {
2863               lr_ignore_rest (ldfile, 0);
2864               break;
2865             }
2866
2867           if (state != 0)
2868             goto err_label;
2869
2870           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2871           if (arg->tok != tok_bsymbol)
2872             goto err_label;
2873           else if (!ignore_content)
2874             {
2875               /* Check whether this section is already known.  */
2876               struct section_list *known = collate->sections;
2877               while (known != NULL)
2878                 {
2879                   if (strcmp (known->name, arg->val.str.startmb) == 0)
2880                     break;
2881                   known = known->next;
2882                 }
2883
2884               if (known != NULL)
2885                 {
2886                   lr_error (ldfile,
2887                             _("%s: duplicate declaration of section `%s'"),
2888                             "LC_COLLATE", arg->val.str.startmb);
2889                   free (arg->val.str.startmb);
2890                 }
2891               else
2892                 collate->sections = make_seclist_elem (collate,
2893                                                        arg->val.str.startmb,
2894                                                        collate->sections);
2895
2896               lr_ignore_rest (ldfile, known == NULL);
2897             }
2898           else
2899             {
2900               free (arg->val.str.startmb);
2901               lr_ignore_rest (ldfile, 0);
2902             }
2903           break;
2904
2905         case tok_collating_element:
2906           /* Ignore the rest of the line if we don't need the input of
2907              this line.  */
2908           if (ignore_content)
2909             {
2910               lr_ignore_rest (ldfile, 0);
2911               break;
2912             }
2913
2914           if (state != 0 && state != 2)
2915             goto err_label;
2916
2917           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2918           if (arg->tok != tok_bsymbol)
2919             goto err_label;
2920           else
2921             {
2922               const char *symbol = arg->val.str.startmb;
2923               size_t symbol_len = arg->val.str.lenmb;
2924
2925               /* Next the `from' keyword.  */
2926               arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2927               if (arg->tok != tok_from)
2928                 {
2929                   free ((char *) symbol);
2930                   goto err_label;
2931                 }
2932
2933               ldfile->return_widestr = 1;
2934               ldfile->translate_strings = 1;
2935
2936               /* Finally the string with the replacement.  */
2937               arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2938
2939               ldfile->return_widestr = 0;
2940               ldfile->translate_strings = 0;
2941
2942               if (arg->tok != tok_string)
2943                 goto err_label;
2944
2945               if (!ignore_content && symbol != NULL)
2946                 {
2947                   /* The name is already defined.  */
2948                   if (check_duplicate (ldfile, collate, charmap,
2949                                        repertoire, symbol, symbol_len))
2950                     goto col_elem_free;
2951
2952                   if (arg->val.str.startmb != NULL)
2953                     insert_entry (&collate->elem_table, symbol, symbol_len,
2954                                   new_element (collate,
2955                                                arg->val.str.startmb,
2956                                                arg->val.str.lenmb - 1,
2957                                                arg->val.str.startwc,
2958                                                symbol, symbol_len, 0));
2959                 }
2960               else
2961                 {
2962                 col_elem_free:
2963                   free ((char *) symbol);
2964                   free (arg->val.str.startmb);
2965                   free (arg->val.str.startwc);
2966                 }
2967               lr_ignore_rest (ldfile, 1);
2968             }
2969           break;
2970
2971         case tok_collating_symbol:
2972           /* Ignore the rest of the line if we don't need the input of
2973              this line.  */
2974           if (ignore_content)
2975             {
2976               lr_ignore_rest (ldfile, 0);
2977               break;
2978             }
2979
2980           if (state != 0 && state != 2)
2981             goto err_label;
2982
2983           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2984           if (arg->tok != tok_bsymbol)
2985             goto err_label;
2986           else
2987             {
2988               char *symbol = arg->val.str.startmb;
2989               size_t symbol_len = arg->val.str.lenmb;
2990               char *endsymbol = NULL;
2991               size_t endsymbol_len = 0;
2992               enum token_t ellipsis = tok_none;
2993
2994               arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2995               if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2996                 {
2997                   ellipsis = arg->tok;
2998
2999                   arg = lr_token (ldfile, charmap, result, repertoire,
3000                                   verbose);
3001                   if (arg->tok != tok_bsymbol)
3002                     {
3003                       free (symbol);
3004                       goto err_label;
3005                     }
3006
3007                   endsymbol = arg->val.str.startmb;
3008                   endsymbol_len = arg->val.str.lenmb;
3009
3010                   lr_ignore_rest (ldfile, 1);
3011                 }
3012               else if (arg->tok != tok_eol)
3013                 {
3014                   free (symbol);
3015                   goto err_label;
3016                 }
3017
3018               if (!ignore_content)
3019                 {
3020                   if (symbol == NULL
3021                       || (ellipsis != tok_none && endsymbol == NULL))
3022                     {
3023                       lr_error (ldfile, _("\
3024 %s: unknown character in collating symbol name"),
3025                                 "LC_COLLATE");
3026                       goto col_sym_free;
3027                     }
3028                   else if (ellipsis == tok_none)
3029                     {
3030                       /* A single symbol, no ellipsis.  */
3031                       if (check_duplicate (ldfile, collate, charmap,
3032                                            repertoire, symbol, symbol_len))
3033                         /* The name is already defined.  */
3034                         goto col_sym_free;
3035
3036                       insert_entry (&collate->sym_table, symbol, symbol_len,
3037                                     new_symbol (collate, symbol, symbol_len));
3038                     }
3039                   else if (symbol_len != endsymbol_len)
3040                     {
3041                     col_sym_inv_range:
3042                       lr_error (ldfile,
3043                                 _("invalid names for character range"));
3044                       goto col_sym_free;
3045                     }
3046                   else
3047                     {
3048                       /* Oh my, we have to handle an ellipsis.  First, as
3049                          usual, determine the common prefix and then
3050                          convert the rest into a range.  */
3051                       size_t prefixlen;
3052                       unsigned long int from;
3053                       unsigned long int to;
3054                       char *endp;
3055
3056                       for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
3057                         if (symbol[prefixlen] != endsymbol[prefixlen])
3058                           break;
3059
3060                       /* Convert the rest into numbers.  */
3061                       symbol[symbol_len] = '\0';
3062                       from = strtoul (&symbol[prefixlen], &endp,
3063                                       ellipsis == tok_ellipsis2 ? 16 : 10);
3064                       if (*endp != '\0')
3065                         goto col_sym_inv_range;
3066
3067                       endsymbol[symbol_len] = '\0';
3068                       to = strtoul (&endsymbol[prefixlen], &endp,
3069                                     ellipsis == tok_ellipsis2 ? 16 : 10);
3070                       if (*endp != '\0')
3071                         goto col_sym_inv_range;
3072
3073                       if (from > to)
3074                         goto col_sym_inv_range;
3075
3076                       /* Now loop over all entries.  */
3077                       while (from <= to)
3078                         {
3079                           char *symbuf;
3080
3081                           symbuf = (char *) obstack_alloc (&collate->mempool,
3082                                                            symbol_len + 1);
3083
3084                           /* Create the name.  */
3085                           sprintf (symbuf,
3086                                    ellipsis == tok_ellipsis2
3087                                    ? "%.*s%.*lX" : "%.*s%.*lu",
3088                                    (int) prefixlen, symbol,
3089                                    (int) (symbol_len - prefixlen), from);
3090
3091                           if (check_duplicate (ldfile, collate, charmap,
3092                                                repertoire, symbuf, symbol_len))
3093                             /* The name is already defined.  */
3094                             goto col_sym_free;
3095
3096                           insert_entry (&collate->sym_table, symbuf,
3097                                         symbol_len,
3098                                         new_symbol (collate, symbuf,
3099                                                     symbol_len));
3100
3101                           /* Increment the counter.  */
3102                           ++from;
3103                         }
3104
3105                       goto col_sym_free;
3106                     }
3107                 }
3108               else
3109                 {
3110                 col_sym_free:
3111                   free (symbol);
3112                   free (endsymbol);
3113                 }
3114             }
3115           break;
3116
3117         case tok_symbol_equivalence:
3118           /* Ignore the rest of the line if we don't need the input of
3119              this line.  */
3120           if (ignore_content)
3121             {
3122               lr_ignore_rest (ldfile, 0);
3123               break;
3124             }
3125
3126           if (state != 0)
3127             goto err_label;
3128
3129           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3130           if (arg->tok != tok_bsymbol)
3131             goto err_label;
3132           else
3133             {
3134               const char *newname = arg->val.str.startmb;
3135               size_t newname_len = arg->val.str.lenmb;
3136               const char *symname;
3137               size_t symname_len;
3138               void *symval;     /* Actually struct symbol_t*  */
3139
3140               arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3141               if (arg->tok != tok_bsymbol)
3142                 {
3143                   free ((char *) newname);
3144                   goto err_label;
3145                 }
3146
3147               symname = arg->val.str.startmb;
3148               symname_len = arg->val.str.lenmb;
3149
3150               if (newname == NULL)
3151                 {
3152                   lr_error (ldfile, _("\
3153 %s: unknown character in equivalent definition name"),
3154                             "LC_COLLATE");
3155
3156                 sym_equiv_free:
3157                   free ((char *) newname);
3158                   free ((char *) symname);
3159                   break;
3160                 }
3161               if (symname == NULL)
3162                 {
3163                   lr_error (ldfile, _("\
3164 %s: unknown character in equivalent definition value"),
3165                             "LC_COLLATE");
3166                   goto sym_equiv_free;
3167                 }
3168
3169               /* See whether the symbol name is already defined.  */
3170               if (find_entry (&collate->sym_table, symname, symname_len,
3171                               &symval) != 0)
3172                 {
3173                   lr_error (ldfile, _("\
3174 %s: unknown symbol `%s' in equivalent definition"),
3175                             "LC_COLLATE", symname);
3176                   goto sym_equiv_free;
3177                 }
3178
3179               if (insert_entry (&collate->sym_table,
3180                                 newname, newname_len, symval) < 0)
3181                 {
3182                   lr_error (ldfile, _("\
3183 error while adding equivalent collating symbol"));
3184                   goto sym_equiv_free;
3185                 }
3186
3187               free ((char *) symname);
3188             }
3189           lr_ignore_rest (ldfile, 1);
3190           break;
3191
3192         case tok_script:
3193           /* Ignore the rest of the line if we don't need the input of
3194              this line.  */
3195           if (ignore_content)
3196             {
3197               lr_ignore_rest (ldfile, 0);
3198               break;
3199             }
3200
3201           /* We get told about the scripts we know.  */
3202           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3203           if (arg->tok != tok_bsymbol)
3204             goto err_label;
3205           else
3206             {
3207               struct section_list *runp = collate->known_sections;
3208               char *name;
3209
3210               while (runp != NULL)
3211                 if (strncmp (runp->name, arg->val.str.startmb,
3212                              arg->val.str.lenmb) == 0
3213                     && runp->name[arg->val.str.lenmb] == '\0')
3214                   break;
3215                 else
3216                   runp = runp->def_next;
3217
3218               if (runp != NULL)
3219                 {
3220                   lr_error (ldfile, _("duplicate definition of script `%s'"),
3221                             runp->name);
3222                   lr_ignore_rest (ldfile, 0);
3223                   break;
3224                 }
3225
3226               runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3227               name = (char *) xmalloc (arg->val.str.lenmb + 1);
3228               memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3229               name[arg->val.str.lenmb] = '\0';
3230               runp->name = name;
3231
3232               runp->def_next = collate->known_sections;
3233               collate->known_sections = runp;
3234             }
3235           lr_ignore_rest (ldfile, 1);
3236           break;
3237
3238         case tok_order_start:
3239           /* Ignore the rest of the line if we don't need the input of
3240              this line.  */
3241           if (ignore_content)
3242             {
3243               lr_ignore_rest (ldfile, 0);
3244               break;
3245             }
3246
3247           if (state != 0 && state != 1 && state != 2)
3248             goto err_label;
3249           state = 1;
3250
3251           /* The 14652 draft does not specify whether all `order_start' lines
3252              must contain the same number of sort-rules, but 14651 does.  So
3253              we require this here as well.  */
3254           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3255           if (arg->tok == tok_bsymbol)
3256             {
3257               /* This better should be a section name.  */
3258               struct section_list *sp = collate->known_sections;
3259               while (sp != NULL
3260                      && (sp->name == NULL
3261                          || strncmp (sp->name, arg->val.str.startmb,
3262                                      arg->val.str.lenmb) != 0
3263                          || sp->name[arg->val.str.lenmb] != '\0'))
3264                 sp = sp->def_next;
3265
3266               if (sp == NULL)
3267                 {
3268                   lr_error (ldfile, _("\
3269 %s: unknown section name `%.*s'"),
3270                             "LC_COLLATE", (int) arg->val.str.lenmb,
3271                             arg->val.str.startmb);
3272                   /* We use the error section.  */
3273                   collate->current_section = &collate->error_section;
3274
3275                   if (collate->error_section.first == NULL)
3276                     {
3277                       /* Insert &collate->error_section at the end of
3278                          the collate->sections list.  */
3279                       if (collate->sections == NULL)
3280                         collate->sections = &collate->error_section;
3281                       else
3282                         {
3283                           sp = collate->sections;
3284                           while (sp->next != NULL)
3285                             sp = sp->next;
3286
3287                           sp->next = &collate->error_section;
3288                         }
3289                       collate->error_section.next = NULL;
3290                     }
3291                 }
3292               else
3293                 {
3294                   /* One should not be allowed to open the same
3295                      section twice.  */
3296                   if (sp->first != NULL)
3297                     lr_error (ldfile, _("\
3298 %s: multiple order definitions for section `%s'"),
3299                               "LC_COLLATE", sp->name);
3300                   else
3301                     {
3302                       /* Insert sp in the collate->sections list,
3303                          right after collate->current_section.  */
3304                       if (collate->current_section != NULL)
3305                         {
3306                           sp->next = collate->current_section->next;
3307                           collate->current_section->next = sp;
3308                         }
3309                       else if (collate->sections == NULL)
3310                         /* This is the first section to be defined.  */
3311                         collate->sections = sp;
3312
3313                       collate->current_section = sp;
3314                     }
3315
3316                   /* Next should come the end of the line or a semicolon.  */
3317                   arg = lr_token (ldfile, charmap, result, repertoire,
3318                                   verbose);
3319                   if (arg->tok == tok_eol)
3320                     {
3321                       uint32_t cnt;
3322
3323                       /* This means we have exactly one rule: `forward'.  */
3324                       if (nrules > 1)
3325                         lr_error (ldfile, _("\
3326 %s: invalid number of sorting rules"),
3327                                   "LC_COLLATE");
3328                       else
3329                         nrules = 1;
3330                       sp->rules = obstack_alloc (&collate->mempool,
3331                                                  (sizeof (enum coll_sort_rule)
3332                                                   * nrules));
3333                       for (cnt = 0; cnt < nrules; ++cnt)
3334                         sp->rules[cnt] = sort_forward;
3335
3336                       /* Next line.  */
3337                       break;
3338                     }
3339
3340                   /* Get the next token.  */
3341                   arg = lr_token (ldfile, charmap, result, repertoire,
3342                                   verbose);
3343                 }
3344             }
3345           else
3346             {
3347               /* There is no section symbol.  Therefore we use the unnamed
3348                  section.  */
3349               collate->current_section = &collate->unnamed_section;
3350
3351               if (collate->unnamed_section_defined)
3352                 lr_error (ldfile, _("\
3353 %s: multiple order definitions for unnamed section"),
3354                           "LC_COLLATE");
3355               else
3356                 {
3357                   /* Insert &collate->unnamed_section at the beginning of
3358                      the collate->sections list.  */
3359                   collate->unnamed_section.next = collate->sections;
3360                   collate->sections = &collate->unnamed_section;
3361                   collate->unnamed_section_defined = true;
3362                 }
3363             }
3364
3365           /* Now read the direction names.  */
3366           read_directions (ldfile, arg, charmap, repertoire, result);
3367
3368           /* From now we need the strings untranslated.  */
3369           ldfile->translate_strings = 0;
3370           break;
3371
3372         case tok_order_end:
3373           /* Ignore the rest of the line if we don't need the input of
3374              this line.  */
3375           if (ignore_content)
3376             {
3377               lr_ignore_rest (ldfile, 0);
3378               break;
3379             }
3380
3381           if (state != 1)
3382             goto err_label;
3383
3384           /* Handle ellipsis at end of list.  */
3385           if (was_ellipsis != tok_none)
3386             {
3387               handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3388                                repertoire, result);
3389               was_ellipsis = tok_none;
3390             }
3391
3392           state = 2;
3393           lr_ignore_rest (ldfile, 1);
3394           break;
3395
3396         case tok_reorder_after:
3397           /* Ignore the rest of the line if we don't need the input of
3398              this line.  */
3399           if (ignore_content)
3400             {
3401               lr_ignore_rest (ldfile, 0);
3402               break;
3403             }
3404
3405           if (state == 1)
3406             {
3407               lr_error (ldfile, _("%s: missing `order_end' keyword"),
3408                         "LC_COLLATE");
3409               state = 2;
3410
3411               /* Handle ellipsis at end of list.  */
3412               if (was_ellipsis != tok_none)
3413                 {
3414                   handle_ellipsis (ldfile, arg->val.str.startmb,
3415                                    arg->val.str.lenmb, was_ellipsis, charmap,
3416                                    repertoire, result);
3417                   was_ellipsis = tok_none;
3418                 }
3419             }
3420           else if (state == 0 && copy_locale == NULL)
3421             goto err_label;
3422           else if (state != 0 && state != 2 && state != 3)
3423             goto err_label;
3424           state = 3;
3425
3426           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3427           if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3428             {
3429               /* Find this symbol in the sequence table.  */
3430               char ucsbuf[10];
3431               char *startmb;
3432               size_t lenmb;
3433               struct element_t *insp;
3434               int no_error = 1;
3435               void *ptr;
3436
3437               if (arg->tok == tok_bsymbol)
3438                 {
3439                   startmb = arg->val.str.startmb;
3440                   lenmb = arg->val.str.lenmb;
3441                 }
3442               else
3443                 {
3444                   sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3445                   startmb = ucsbuf;
3446                   lenmb = 9;
3447                 }
3448
3449               if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3450                 /* Yes, the symbol exists.  Simply point the cursor
3451                    to it.  */
3452                 collate->cursor = (struct element_t *) ptr;
3453               else
3454                 {
3455                   struct symbol_t *symbp;
3456                   void *ptr;
3457
3458                   if (find_entry (&collate->sym_table, startmb, lenmb,
3459                                   &ptr) == 0)
3460                     {
3461                       symbp = ptr;
3462
3463                       if (symbp->order->last != NULL
3464                           || symbp->order->next != NULL)
3465                         collate->cursor = symbp->order;
3466                       else
3467                         {
3468                           /* This is a collating symbol but its position
3469                              is not yet defined.  */
3470                           lr_error (ldfile, _("\
3471 %s: order for collating symbol %.*s not yet defined"),
3472                                     "LC_COLLATE", (int) lenmb, startmb);
3473                           collate->cursor = NULL;
3474                           no_error = 0;
3475                         }
3476                     }
3477                   else if (find_entry (&collate->elem_table, startmb, lenmb,
3478                                        &ptr) == 0)
3479                     {
3480                       insp = (struct element_t *) ptr;
3481
3482                       if (insp->last != NULL || insp->next != NULL)
3483                         collate->cursor = insp;
3484                       else
3485                         {
3486                           /* This is a collating element but its position
3487                              is not yet defined.  */
3488                           lr_error (ldfile, _("\
3489 %s: order for collating element %.*s not yet defined"),
3490                                     "LC_COLLATE", (int) lenmb, startmb);
3491                           collate->cursor = NULL;
3492                           no_error = 0;
3493                         }
3494                     }
3495                   else
3496                     {
3497                       /* This is bad.  The symbol after which we have to
3498                          insert does not exist.  */
3499                       lr_error (ldfile, _("\
3500 %s: cannot reorder after %.*s: symbol not known"),
3501                                 "LC_COLLATE", (int) lenmb, startmb);
3502                       collate->cursor = NULL;
3503                       no_error = 0;
3504                     }
3505                 }
3506
3507               lr_ignore_rest (ldfile, no_error);
3508             }
3509           else
3510             /* This must not happen.  */
3511             goto err_label;
3512           break;
3513
3514         case tok_reorder_end:
3515           /* Ignore the rest of the line if we don't need the input of
3516              this line.  */
3517           if (ignore_content)
3518             break;
3519
3520           if (state != 3)
3521             goto err_label;
3522           state = 4;
3523           lr_ignore_rest (ldfile, 1);
3524           break;
3525
3526         case tok_reorder_sections_after:
3527           /* Ignore the rest of the line if we don't need the input of
3528              this line.  */
3529           if (ignore_content)
3530             {
3531               lr_ignore_rest (ldfile, 0);
3532               break;
3533             }
3534
3535           if (state == 1)
3536             {
3537               lr_error (ldfile, _("%s: missing `order_end' keyword"),
3538                         "LC_COLLATE");
3539               state = 2;
3540
3541               /* Handle ellipsis at end of list.  */
3542               if (was_ellipsis != tok_none)
3543                 {
3544                   handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3545                                    repertoire, result);
3546                   was_ellipsis = tok_none;
3547                 }
3548             }
3549           else if (state == 3)
3550             {
3551               WITH_CUR_LOCALE (error (0, 0, _("\
3552 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3553               state = 4;
3554             }
3555           else if (state != 2 && state != 4)
3556             goto err_label;
3557           state = 5;
3558
3559           /* Get the name of the sections we are adding after.  */
3560           arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3561           if (arg->tok == tok_bsymbol)
3562             {
3563               /* Now find a section with this name.  */
3564               struct section_list *runp = collate->sections;
3565
3566               while (runp != NULL)
3567                 {
3568                   if (runp->name != NULL
3569                       && strlen (runp->name) == arg->val.str.lenmb
3570                       && memcmp (runp->name, arg->val.str.startmb,
3571                                  arg->val.str.lenmb) == 0)
3572                     break;
3573
3574                   runp = runp->next;
3575                 }
3576
3577               if (runp != NULL)
3578                 collate->current_section = runp;
3579               else
3580                 {
3581                   /* This is bad.  The section after which we have to
3582                      reorder does not exist.  Therefore we cannot
3583                      process the whole rest of this reorder
3584                      specification.  */
3585                   lr_error (ldfile, _("%s: section `%.*s' not known"),
3586                             "LC_COLLATE", (int) arg->val.str.lenmb,
3587                             arg->val.str.startmb);
3588
3589                   do
3590                     {
3591                       lr_ignore_rest (ldfile, 0);
3592
3593                       now = lr_token (ldfile, charmap, result, NULL, verbose);
3594                     }
3595                   while (now->tok == tok_reorder_sections_after
3596                          || now->tok == tok_reorder_sections_end
3597                          || now->tok == tok_end);
3598
3599                   /* Process the token we just saw.  */
3600                   nowtok = now->tok;
3601                   continue;
3602                 }
3603             }
3604           else
3605             /* This must not happen.  */
3606             goto err_label;
3607           break;
3608
3609         case tok_reorder_sections_end:
3610           /* Ignore the rest of the line if we don't need the input of
3611              this line.  */
3612           if (ignore_content)
3613             break;
3614
3615           if (state != 5)
3616             goto err_label;
3617           state = 6;
3618           lr_ignore_rest (ldfile, 1);
3619           break;
3620
3621         case tok_bsymbol:
3622         case tok_ucs4:
3623           /* Ignore the rest of the line if we don't need the input of
3624              this line.  */
3625           if (ignore_content)
3626             {
3627               lr_ignore_rest (ldfile, 0);
3628               break;
3629             }
3630
3631           if (state != 0 && state != 1 && state != 3 && state != 5)
3632             goto err_label;
3633
3634           if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3635             goto err_label;
3636
3637           if (nowtok == tok_ucs4)
3638             {
3639               snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3640               symstr = ucs4buf;
3641               symlen = 9;
3642             }
3643           else if (arg != NULL)
3644             {
3645               symstr = arg->val.str.startmb;
3646               symlen = arg->val.str.lenmb;
3647             }
3648           else
3649             {
3650               lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3651                         (int) ldfile->token.val.str.lenmb,
3652                         ldfile->token.val.str.startmb);
3653               break;
3654             }
3655
3656           struct element_t *seqp;
3657           if (state == 0)
3658             {
3659               /* We are outside an `order_start' region.  This means
3660                  we must only accept definitions of values for
3661                  collation symbols since these are purely abstract
3662                  values and don't need directions associated.  */
3663               void *ptr;
3664
3665               if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3666                 {
3667                   seqp = ptr;
3668
3669                   /* It's already defined.  First check whether this
3670                      is really a collating symbol.  */
3671                   if (seqp->is_character)
3672                     goto err_label;
3673
3674                   goto move_entry;
3675                 }
3676               else
3677                 {
3678                   void *result;
3679
3680                   if (find_entry (&collate->sym_table, symstr, symlen,
3681                                   &result) != 0)
3682                     /* No collating symbol, it's an error.  */
3683                     goto err_label;
3684
3685                   /* Maybe this is the first time we define a symbol
3686                      value and it is before the first actual section.  */
3687                   if (collate->sections == NULL)
3688                     collate->sections = collate->current_section =
3689                       &collate->symbol_section;
3690                 }
3691
3692               if (was_ellipsis != tok_none)
3693                 {
3694                   handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3695                                    charmap, repertoire, result);
3696
3697                   /* Remember that we processed the ellipsis.  */
3698                   was_ellipsis = tok_none;
3699
3700                   /* And don't add the value a second time.  */
3701                   break;
3702                 }
3703             }
3704           else if (state == 3)
3705             {
3706               /* It is possible that we already have this collation sequence.
3707                  In this case we move the entry.  */
3708               void *sym;
3709               void *ptr;
3710
3711               /* If the symbol after which we have to insert was not found
3712                  ignore all entries.  */
3713               if (collate->cursor == NULL)
3714                 {
3715                   lr_ignore_rest (ldfile, 0);
3716                   break;
3717                 }
3718
3719               if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3720                 {
3721                   seqp = (struct element_t *) ptr;
3722                   goto move_entry;
3723                 }
3724
3725               if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3726                   && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3727                 goto move_entry;
3728
3729               if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3730                   && (seqp = (struct element_t *) ptr,
3731                       seqp->last != NULL || seqp->next != NULL
3732                       || (collate->start != NULL && seqp == collate->start)))
3733                 {
3734                 move_entry:
3735                   /* Remove the entry from the old position.  */
3736                   if (seqp->last == NULL)
3737                     collate->start = seqp->next;
3738                   else
3739                     seqp->last->next = seqp->next;
3740                   if (seqp->next != NULL)
3741                     seqp->next->last = seqp->last;
3742
3743                   /* We also have to check whether this entry is the
3744                      first or last of a section.  */
3745                   if (seqp->section->first == seqp)
3746                     {
3747                       if (seqp->section->first == seqp->section->last)
3748                         /* This section has no content anymore.  */
3749                         seqp->section->first = seqp->section->last = NULL;
3750                       else
3751                         seqp->section->first = seqp->next;
3752                     }
3753                   else if (seqp->section->last == seqp)
3754                     seqp->section->last = seqp->last;
3755
3756                   /* Now insert it in the new place.  */
3757                   insert_weights (ldfile, seqp, charmap, repertoire, result,
3758                                   tok_none);
3759                   break;
3760                 }
3761
3762               /* Otherwise we just add a new entry.  */
3763             }
3764           else if (state == 5)
3765             {
3766               /* We are reordering sections.  Find the named section.  */
3767               struct section_list *runp = collate->sections;
3768               struct section_list *prevp = NULL;
3769
3770               while (runp != NULL)
3771                 {
3772                   if (runp->name != NULL
3773                       && strlen (runp->name) == symlen
3774                       && memcmp (runp->name, symstr, symlen) == 0)
3775                     break;
3776
3777                   prevp = runp;
3778                   runp = runp->next;
3779                 }
3780
3781               if (runp == NULL)
3782                 {
3783                   lr_error (ldfile, _("%s: section `%.*s' not known"),
3784                             "LC_COLLATE", (int) symlen, symstr);
3785                   lr_ignore_rest (ldfile, 0);
3786                 }
3787               else
3788                 {
3789                   if (runp != collate->current_section)
3790                     {
3791                       /* Remove the named section from the old place and
3792                          insert it in the new one.  */
3793                       prevp->next = runp->next;
3794
3795                       runp->next = collate->current_section->next;
3796                       collate->current_section->next = runp;
3797                       collate->current_section = runp;
3798                     }
3799
3800                   /* Process the rest of the line which might change
3801                      the collation rules.  */
3802                   arg = lr_token (ldfile, charmap, result, repertoire,
3803                                   verbose);
3804                   if (arg->tok != tok_eof && arg->tok != tok_eol)
3805                     read_directions (ldfile, arg, charmap, repertoire,
3806                                      result);
3807                 }
3808               break;
3809             }
3810           else if (was_ellipsis != tok_none)
3811             {
3812               /* Using the information in the `ellipsis_weight'
3813                  element and this and the last value we have to handle
3814                  the ellipsis now.  */
3815               assert (state == 1);
3816
3817               handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3818                                repertoire, result);
3819
3820               /* Remember that we processed the ellipsis.  */
3821               was_ellipsis = tok_none;
3822
3823               /* And don't add the value a second time.  */
3824               break;
3825             }
3826
3827           /* Now insert in the new place.  */
3828           insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3829           break;
3830
3831         case tok_undefined:
3832           /* Ignore the rest of the line if we don't need the input of
3833              this line.  */
3834           if (ignore_content)
3835             {
3836               lr_ignore_rest (ldfile, 0);
3837               break;
3838             }
3839
3840           if (state != 1)
3841             goto err_label;
3842
3843           if (was_ellipsis != tok_none)
3844             {
3845               lr_error (ldfile,
3846                         _("%s: cannot have `%s' as end of ellipsis range"),
3847                         "LC_COLLATE", "UNDEFINED");
3848
3849               unlink_element (collate);
3850               was_ellipsis = tok_none;
3851             }
3852
3853           /* See whether UNDEFINED already appeared somewhere.  */
3854           if (collate->undefined.next != NULL
3855               || &collate->undefined == collate->cursor)
3856             {
3857               lr_error (ldfile,
3858                         _("%s: order for `%.*s' already defined at %s:%Zu"),
3859                         "LC_COLLATE", 9, "UNDEFINED",
3860                         collate->undefined.file,
3861                         collate->undefined.line);
3862               lr_ignore_rest (ldfile, 0);
3863             }
3864           else
3865             /* Parse the weights.  */
3866              insert_weights (ldfile, &collate->undefined, charmap,
3867                              repertoire, result, tok_none);
3868           break;
3869
3870         case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3871         case tok_ellipsis3: /* absolute ellipsis */
3872         case tok_ellipsis4: /* symbolic decimal ellipsis */
3873           /* This is the symbolic (decimal or hexadecimal) or absolute
3874              ellipsis.  */
3875           if (was_ellipsis != tok_none)
3876             goto err_label;
3877
3878           if (state != 0 && state != 1 && state != 3)
3879             goto err_label;
3880
3881           was_ellipsis = nowtok;
3882
3883           insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3884                           repertoire, result, nowtok);
3885           break;
3886
3887         case tok_end:
3888         seen_end:
3889           /* Next we assume `LC_COLLATE'.  */
3890           if (!ignore_content)
3891             {
3892               if (state == 0 && copy_locale == NULL)
3893                 /* We must either see a copy statement or have
3894                    ordering values.  */
3895                 lr_error (ldfile,
3896                           _("%s: empty category description not allowed"),
3897                           "LC_COLLATE");
3898               else if (state == 1)
3899                 {
3900                   lr_error (ldfile, _("%s: missing `order_end' keyword"),
3901                             "LC_COLLATE");
3902
3903                   /* Handle ellipsis at end of list.  */
3904                   if (was_ellipsis != tok_none)
3905                     {
3906                       handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3907                                        repertoire, result);
3908                       was_ellipsis = tok_none;
3909                     }
3910                 }
3911               else if (state == 3)
3912                 WITH_CUR_LOCALE (error (0, 0, _("\
3913 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3914               else if (state == 5)
3915                 WITH_CUR_LOCALE (error (0, 0, _("\
3916 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3917             }
3918           arg = lr_token (ldfile, charmap, result, NULL, verbose);
3919           if (arg->tok == tok_eof)
3920             break;
3921           if (arg->tok == tok_eol)
3922             lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3923           else if (arg->tok != tok_lc_collate)
3924             lr_error (ldfile, _("\
3925 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3926           lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3927           return;
3928
3929         case tok_define:
3930           if (ignore_content)
3931             {
3932               lr_ignore_rest (ldfile, 0);
3933               break;
3934             }
3935
3936           arg = lr_token (ldfile, charmap, result, NULL, verbose);
3937           if (arg->tok != tok_ident)
3938             goto err_label;
3939
3940           /* Simply add the new symbol.  */
3941           struct name_list *newsym = xmalloc (sizeof (*newsym)
3942                                               + arg->val.str.lenmb + 1);
3943           memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
3944           newsym->str[arg->val.str.lenmb] = '\0';
3945           newsym->next = defined;
3946           defined = newsym;
3947
3948           lr_ignore_rest (ldfile, 1);
3949           break;
3950
3951         case tok_undef:
3952           if (ignore_content)
3953             {
3954               lr_ignore_rest (ldfile, 0);
3955               break;
3956             }
3957
3958           arg = lr_token (ldfile, charmap, result, NULL, verbose);
3959           if (arg->tok != tok_ident)
3960             goto err_label;
3961
3962           /* Remove _all_ occurrences of the symbol from the list.  */
3963           struct name_list *prevdef = NULL;
3964           struct name_list *curdef = defined;
3965           while (curdef != NULL)
3966             if (strncmp (arg->val.str.startmb, curdef->str,
3967                          arg->val.str.lenmb) == 0
3968                 && curdef->str[arg->val.str.lenmb] == '\0')
3969               {
3970                 if (prevdef == NULL)
3971                   defined = curdef->next;
3972                 else
3973                   prevdef->next = curdef->next;
3974
3975                 struct name_list *olddef = curdef;
3976                 curdef = curdef->next;
3977
3978                 free (olddef);
3979               }
3980             else
3981               {
3982                 prevdef = curdef;
3983                 curdef = curdef->next;
3984               }
3985
3986           lr_ignore_rest (ldfile, 1);
3987           break;
3988
3989         case tok_ifdef:
3990         case tok_ifndef:
3991           if (ignore_content)
3992             {
3993               lr_ignore_rest (ldfile, 0);
3994               break;
3995             }
3996
3997         found_ifdef:
3998           arg = lr_token (ldfile, charmap, result, NULL, verbose);
3999           if (arg->tok != tok_ident)
4000             goto err_label;
4001           lr_ignore_rest (ldfile, 1);
4002
4003           if (collate->else_action == else_none)
4004             {
4005               curdef = defined;
4006               while (curdef != NULL)
4007                 if (strncmp (arg->val.str.startmb, curdef->str,
4008                              arg->val.str.lenmb) == 0
4009                     && curdef->str[arg->val.str.lenmb] == '\0')
4010                   break;
4011                 else
4012                   curdef = curdef->next;
4013
4014               if ((nowtok == tok_ifdef && curdef != NULL)
4015                   || (nowtok == tok_ifndef && curdef == NULL))
4016                 {
4017                   /* We have to use the if-branch.  */
4018                   collate->else_action = else_ignore;
4019                 }
4020               else
4021                 {
4022                   /* We have to use the else-branch, if there is one.  */
4023                   nowtok = skip_to (ldfile, collate, charmap, 0);
4024                   if (nowtok == tok_else)
4025                     collate->else_action = else_seen;
4026                   else if (nowtok == tok_elifdef)
4027                     {
4028                       nowtok = tok_ifdef;
4029                       goto found_ifdef;
4030                     }
4031                   else if (nowtok == tok_elifndef)
4032                     {
4033                       nowtok = tok_ifndef;
4034                       goto found_ifdef;
4035                     }
4036                   else if (nowtok == tok_eof)
4037                     goto seen_eof;
4038                   else if (nowtok == tok_end)
4039                     goto seen_end;
4040                 }
4041             }
4042           else
4043             {
4044               /* XXX Should it really become necessary to support nested
4045                  preprocessor handling we will push the state here.  */
4046               lr_error (ldfile, _("%s: nested conditionals not supported"),
4047                         "LC_COLLATE");
4048               nowtok = skip_to (ldfile, collate, charmap, 1);
4049               if (nowtok == tok_eof)
4050                 goto seen_eof;
4051               else if (nowtok == tok_end)
4052                 goto seen_end;
4053             }
4054           break;
4055
4056         case tok_elifdef:
4057         case tok_elifndef:
4058         case tok_else:
4059           if (ignore_content)
4060             {
4061               lr_ignore_rest (ldfile, 0);
4062               break;
4063             }
4064
4065           lr_ignore_rest (ldfile, 1);
4066
4067           if (collate->else_action == else_ignore)
4068             {
4069               /* Ignore everything until the endif.  */
4070               nowtok = skip_to (ldfile, collate, charmap, 1);
4071               if (nowtok == tok_eof)
4072                 goto seen_eof;
4073               else if (nowtok == tok_end)
4074                 goto seen_end;
4075             }
4076           else
4077             {
4078               assert (collate->else_action == else_none);
4079               lr_error (ldfile, _("\
4080 %s: '%s' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE",
4081                         nowtok == tok_else ? "else"
4082                         : nowtok == tok_elifdef ? "elifdef" : "elifndef");
4083             }
4084           break;
4085
4086         case tok_endif:
4087           if (ignore_content)
4088             {
4089               lr_ignore_rest (ldfile, 0);
4090               break;
4091             }
4092
4093           lr_ignore_rest (ldfile, 1);
4094
4095           if (collate->else_action != else_ignore
4096               && collate->else_action != else_seen)
4097             lr_error (ldfile, _("\
4098 %s: 'endif' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE");
4099
4100           /* XXX If we support nested preprocessor directives we pop
4101              the state here.  */
4102           collate->else_action = else_none;
4103           break;
4104
4105         default:
4106         err_label:
4107           SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
4108         }
4109
4110       /* Prepare for the next round.  */
4111       now = lr_token (ldfile, charmap, result, NULL, verbose);
4112       nowtok = now->tok;
4113     }
4114
4115  seen_eof:
4116   /* When we come here we reached the end of the file.  */
4117   lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
4118 }