bin/localedef/collate.c

   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11
  12 /*
  13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
  14  */
  15
  16 /*
  17  * LC_COLLATE database generation routines for localedef.
  18  */
  19
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22 #include <errno.h>
  23 #include <string.h>
  24 #include <sys/types.h>
  25 #include <sys/avl.h>
  26 #include <string.h>
  27 #include <unistd.h>
  28 #include <wchar.h>
  29 #include <widec.h>
  30 #include <limits.h>
  31 #include "localedef.h"
  32 #include "parser.h"
  33 #include "collatefile.h"
  34
  35 /*
  36  * Design notes.
  37  *
  38  * It will be extremely helpful to the reader if they have access to
  39  * the localedef and locale file format specifications available.
  40  * Latest versions of these are available from www.opengroup.org.
  41  *
  42  * The design for the collation code is a bit complex.  The goal is a
  43  * single collation database as described in collate.h (in
  44  * libc/port/locale).  However, there are some other tidbits:
  45  *
  46  * a) The substitution entries are now a directly indexable array.  A
  47  * priority elsewhere in the table is taken as an index into the
  48  * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
  49  * set.  (The bit is cleared and the result is the index into the
  50  * table.
  51  *
  52  * b) We eliminate duplicate entries into the substitution table.
  53  * This saves a lot of space.
  54  *
  55  * c) The priorities for each level are "compressed", so that each
  56  * sorting level has consecutively numbered priorities starting at 1.
  57  * (O is reserved for the ignore priority.)  This means sort levels
  58  * which only have a few distinct priorities can represent the
  59  * priority level in fewer bits, which makes the strxfrm output
  60  * smaller.
  61  *
  62  * d) We record the total number of priorities so that strxfrm can
  63  * figure out how many bytes to expand a numeric priority into.
  64  *
  65  * e) For the UNDEFINED pass (the last pass), we record the maximum
  66  * number of bits needed to uniquely prioritize these entries, so that
  67  * the last pass can also use smaller strxfrm output when possible.
  68  *
  69  * f) Priorities with the sign bit set are verboten.  This works out
  70  * because no active character set needs that bit to carry significant
  71  * information once the character is in wide form.
  72  *
  73  * To process the entire data to make the database, we actually run
  74  * multiple passes over the data.
  75  *
  76  * The first pass, which is done at parse time, identifies elements,
  77  * substitutions, and such, and records them in priority order.  As
  78  * some priorities can refer to other priorities, using forward
  79  * references, we use a table of references indicating whether the
  80  * priority's value has been resolved, or whether it is still a
  81  * reference.
  82  *
  83  * The second pass walks over all the items in priority order, noting
  84  * that they are used directly, and not just an indirect reference.
  85  * This is done by creating a "weight" structure for the item.  The
  86  * weights are stashed in an AVL tree sorted by relative "priority".
  87  *
  88  * The third pass walks over all the weight structures, in priority
  89  * order, and assigns a new monotonically increasing (per sort level)
  90  * weight value to them.  These are the values that will actually be
  91  * written to the file.
  92  *
  93  * The fourth pass just writes the data out.
  94  */
  95
  96 /*
  97  * In order to resolve the priorities, we create a table of priorities.
  98  * Entries in the table can be in one of three states.
  99  *
 100  * UNKNOWN is for newly allocated entries, and indicates that nothing
 101  * is known about the priority.  (For example, when new entries are created
 102  * for collating-symbols, this is the value assigned for them until the
 103  * collating symbol's order has been determined.
 104  *
 105  * RESOLVED is used for an entry where the priority indicates the final
 106  * numeric weight.
 107  *
 108  * REFER is used for entries that reference other entries.  Typically
 109  * this is used for forward references.  A collating-symbol can never
 110  * have this value.
 111  *
 112  * The "pass" field is used during final resolution to aid in detection
 113  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
 114  * priority dependent on <A>.)
 115  */
 116 typedef enum {
 117         UNKNOWN,        /* priority is totally unknown */
 118         RESOLVED,       /* priority value fully resolved */
 119         REFER           /* priority is a reference (index) */
 120 } res_t;
 121
 122 typedef struct weight {
 123         int32_t         pri;
 124         int             opt;
 125         avl_node_t      avl;
 126 } weight_t;
 127
 128 typedef struct priority {
 129         res_t           res;
 130         int32_t         pri;
 131         int             pass;
 132         int             lineno;
 133 } collpri_t;
 134
 135 #define NUM_WT  collinfo.directive_count
 136
 137 /*
 138  * These are the abstract collating symbols, which are just a symbolic
 139  * way to reference a priority.
 140  */
 141 struct collsym {
 142         char            *name;
 143         int32_t         ref;
 144         avl_node_t      avl;
 145 };
 146
 147 /*
 148  * These are also abstract collating symbols, but we allow them to have
 149  * different priorities at different levels.
 150  */
 151 typedef struct collundef {
 152         char            *name;
 153         int32_t         ref[COLL_WEIGHTS_MAX];
 154         avl_node_t      avl;
 155 } collundef_t;
 156
 157 /*
 158  * These are called "chains" in libc.  This records the fact that two
 159  * more characters should be treated as a single collating entity when
 160  * they appear together.  For example, in Spanish <C><h> gets collated
 161  * as a character between <C> and <D>.
 162  */
 163 struct collelem {
 164         char            *symbol;
 165         wchar_t         *expand;
 166         int32_t         ref[COLL_WEIGHTS_MAX];
 167         avl_node_t      avl_bysymbol;
 168         avl_node_t      avl_byexpand;
 169 };
 170
 171 /*
 172  * Individual characters have a sequence of weights as well.
 173  */
 174 typedef struct collchar {
 175         wchar_t         wc;
 176         int32_t         ref[COLL_WEIGHTS_MAX];
 177         avl_node_t      avl;
 178 } collchar_t;
 179
 180 /*
 181  * Substitution entries.  The key is itself a priority.  Note that
 182  * when we create one of these, we *automatically* wind up with a
 183  * fully resolved priority for the key, because creation of
 184  * substitutions creates a resolved priority at the same time.
 185  */
 186 typedef struct {
 187         int32_t         key;
 188         int32_t         ref[COLLATE_STR_LEN];
 189         avl_node_t      avl;
 190         avl_node_t      avl_ref;
 191 } subst_t;
 192
 193 static avl_tree_t       collsyms;
 194 static avl_tree_t       collundefs;
 195 static avl_tree_t       elem_by_symbol;
 196 static avl_tree_t       elem_by_expand;
 197 static avl_tree_t       collchars;
 198 static avl_tree_t       substs[COLL_WEIGHTS_MAX];
 199 static avl_tree_t       substs_ref[COLL_WEIGHTS_MAX];
 200 static avl_tree_t       weights[COLL_WEIGHTS_MAX];
 201 static int32_t          nweight[COLL_WEIGHTS_MAX];
 202
 203 /*
 204  * This is state tracking for the ellipsis token.  Note that we start
 205  * the initial values so that the ellipsis logic will think we got a
 206  * magic starting value of NUL.  It starts at minus one because the
 207  * starting point is exclusive -- i.e. the starting point is not
 208  * itself handled by the ellipsis code.
 209  */
 210 static int currorder = EOF;
 211 static int lastorder = EOF;
 212 static collelem_t *currelem;
 213 static collchar_t *currchar;
 214 static collundef_t *currundef;
 215 static wchar_t ellipsis_start = 0;
 216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
 217
 218 /*
 219  * We keep a running tally of weights.
 220  */
 221 static int nextpri = 1;
 222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
 223
 224 /*
 225  * This array collects up the weights for each level.
 226  */
 227 static int32_t order_weights[COLL_WEIGHTS_MAX];
 228 static int curr_weight = 0;
 229 static int32_t subst_weights[COLLATE_STR_LEN];
 230 static int curr_subst = 0;
 231
 232 /*
 233  * Some initial priority values.
 234  */
 235 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
 236 static int32_t pri_ignore;
 237
 238 static collate_info_t collinfo;
 239
 240 static collpri_t        *prilist = NULL;
 241 static int              numpri = 0;
 242 static int              maxpri = 0;
 243
 244 static void start_order(int);
 245
 246 static int32_t
 247 new_pri(void)
 248 {
 249         int i;
 250
 251         if (numpri >= maxpri) {
 252                 maxpri = maxpri ? maxpri * 2 : 1024;
 253                 prilist = reallocarray(prilist, maxpri, sizeof (collpri_t));
 254                 if (prilist == NULL) {
 255                         errf(_("out of memory"));
 256                         return (-1);
 257                 }
 258                 for (i = numpri; i < maxpri; i++) {
 259                         prilist[i].res = UNKNOWN;
 260                         prilist[i].pri = 0;
 261                         prilist[i].pass = 0;
 262                 }
 263         }
 264         return (numpri++);
 265 }
 266
 267 static collpri_t *
 268 get_pri(int32_t ref)
 269 {
 270         if ((ref < 0) || (ref > numpri)) {
 271                 INTERR;
 272                 return (NULL);
 273         }
 274         return (&prilist[ref]);
 275 }
 276
 277 static void
 278 set_pri(int32_t ref, int32_t v, res_t res)
 279 {
 280         collpri_t       *pri;
 281
 282         pri = get_pri(ref);
 283
 284         if ((res == REFER) && ((v < 0) || (v >= numpri))) {
 285                 INTERR;
 286         }
 287
 288         /* Resolve self references */
 289         if ((res == REFER) && (ref == v)) {
 290                 v = nextpri;
 291                 res = RESOLVED;
 292         }
 293
 294         if (pri->res != UNKNOWN) {
 295                 warn(_("repeated item in order list (first on %d)"),
 296                     pri->lineno);
 297                 return;
 298         }
 299         pri->lineno = lineno;
 300         pri->pri = v;
 301         pri->res = res;
 302 }
 303
 304 static int32_t
 305 resolve_pri(int32_t ref)
 306 {
 307         collpri_t       *pri;
 308         static int32_t  pass = 0;
 309
 310         pri = get_pri(ref);
 311         pass++;
 312         while (pri->res == REFER) {
 313                 if (pri->pass == pass) {
 314                         /* report a line with the circular symbol */
 315                         lineno = pri->lineno;
 316                         errf(_("circular reference in order list"));
 317                         return (-1);
 318                 }
 319                 if ((pri->pri < 0) || (pri->pri >= numpri)) {
 320                         INTERR;
 321                         return (-1);
 322                 }
 323                 pri->pass = pass;
 324                 pri = &prilist[pri->pri];
 325         }
 326
 327         if (pri->res == UNKNOWN) {
 328                 return (-1);
 329         }
 330         if (pri->res != RESOLVED)
 331                 INTERR;
 332
 333         return (pri->pri);
 334 }
 335
 336 static int
 337 weight_compare(const void *n1, const void *n2)
 338 {
 339         int32_t k1 = ((const weight_t *)n1)->pri;
 340         int32_t k2 = ((const weight_t *)n2)->pri;
 341
 342         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 343 }
 344
 345 static int
 346 collsym_compare(const void *n1, const void *n2)
 347 {
 348         const collsym_t *c1 = n1;
 349         const collsym_t *c2 = n2;
 350         int rv;
 351
 352         rv = strcmp(c1->name, c2->name);
 353         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 354 }
 355
 356 static int
 357 collundef_compare(const void *n1, const void *n2)
 358 {
 359         const collundef_t *c1 = n1;
 360         const collundef_t *c2 = n2;
 361         int rv;
 362
 363         rv = strcmp(c1->name, c2->name);
 364         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 365 }
 366
 367 static int
 368 element_compare_symbol(const void *n1, const void *n2)
 369 {
 370         const collelem_t *c1 = n1;
 371         const collelem_t *c2 = n2;
 372         int rv;
 373
 374         rv = strcmp(c1->symbol, c2->symbol);
 375         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 376 }
 377
 378 static int
 379 element_compare_expand(const void *n1, const void *n2)
 380 {
 381         const collelem_t *c1 = n1;
 382         const collelem_t *c2 = n2;
 383         int rv;
 384
 385         rv = wcscmp(c1->expand, c2->expand);
 386         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 387 }
 388
 389 static int
 390 collchar_compare(const void *n1, const void *n2)
 391 {
 392         wchar_t k1 = ((const collchar_t *)n1)->wc;
 393         wchar_t k2 = ((const collchar_t *)n2)->wc;
 394
 395         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 396 }
 397
 398 static int
 399 subst_compare(const void *n1, const void *n2)
 400 {
 401         int32_t k1 = ((const subst_t *)n1)->key;
 402         int32_t k2 = ((const subst_t *)n2)->key;
 403
 404         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 405 }
 406
 407 static int
 408 subst_compare_ref(const void *n1, const void *n2)
 409 {
 410         int32_t *c1 = ((subst_t *)n1)->ref;
 411         int32_t *c2 = ((subst_t *)n2)->ref;
 412         int rv;
 413
 414         rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
 415         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 416 }
 417
 418 void
 419 init_collate(void)
 420 {
 421         int i;
 422
 423         avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
 424             offsetof(collsym_t, avl));
 425
 426         avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
 427             offsetof(collundef_t, avl));
 428
 429         avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
 430             offsetof(collelem_t, avl_bysymbol));
 431         avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
 432             offsetof(collelem_t, avl_byexpand));
 433
 434         avl_create(&collchars, collchar_compare, sizeof (collchar_t),
 435             offsetof(collchar_t, avl));
 436
 437         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 438                 avl_create(&substs[i], subst_compare, sizeof (subst_t),
 439                     offsetof(subst_t, avl));
 440                 avl_create(&substs_ref[i], subst_compare_ref,
 441                     sizeof (subst_t), offsetof(subst_t, avl_ref));
 442                 avl_create(&weights[i], weight_compare, sizeof (weight_t),
 443                     offsetof(weight_t, avl));
 444                 nweight[i] = 1;
 445         }
 446
 447         (void) memset(&collinfo, 0, sizeof (collinfo));
 448
 449         /* allocate some initial priorities */
 450         pri_ignore = new_pri();
 451
 452         set_pri(pri_ignore, 0, RESOLVED);
 453
 454         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 455                 pri_undefined[i] = new_pri();
 456
 457                 /* we will override this later */
 458                 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
 459         }
 460 }
 461
 462 void
 463 define_collsym(char *name)
 464 {
 465         collsym_t       *sym;
 466         avl_index_t     where;
 467
 468         if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
 469                 errf(_("out of memory"));
 470                 return;
 471         }
 472         sym->name = name;
 473         sym->ref = new_pri();
 474
 475         if (avl_find(&collsyms, sym, &where) != NULL) {
 476                 /*
 477                  * This should never happen because we are only called
 478                  * for undefined symbols.
 479                  */
 480                 INTERR;
 481                 return;
 482         }
 483         avl_insert(&collsyms, sym, where);
 484 }
 485
 486 collsym_t *
 487 lookup_collsym(char *name)
 488 {
 489         collsym_t       srch;
 490
 491         srch.name = name;
 492         return (avl_find(&collsyms, &srch, NULL));
 493 }
 494
 495 collelem_t *
 496 lookup_collelem(char *symbol)
 497 {
 498         collelem_t      srch;
 499
 500         srch.symbol = symbol;
 501         return (avl_find(&elem_by_symbol, &srch, NULL));
 502 }
 503
 504 static collundef_t *
 505 get_collundef(char *name)
 506 {
 507         collundef_t     srch;
 508         collundef_t     *ud;
 509         avl_index_t     where;
 510         int             i;
 511
 512         srch.name = name;
 513         if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
 514                 if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
 515                     ((ud->name = strdup(name)) == NULL)) {
 516                         errf(_("out of memory"));
 517                         return (NULL);
 518                 }
 519                 for (i = 0; i < NUM_WT; i++) {
 520                         ud->ref[i] = new_pri();
 521                 }
 522                 avl_insert(&collundefs, ud, where);
 523         }
 524         add_charmap_undefined(name);
 525         return (ud);
 526 }
 527
 528 static collchar_t *
 529 get_collchar(wchar_t wc, int create)
 530 {
 531         collchar_t      srch;
 532         collchar_t      *cc;
 533         avl_index_t     where;
 534         int             i;
 535
 536         srch.wc = wc;
 537         cc = avl_find(&collchars, &srch, &where);
 538         if ((cc == NULL) && create) {
 539                 if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
 540                         errf(_("out of memory"));
 541                         return (NULL);
 542                 }
 543                 for (i = 0; i < NUM_WT; i++) {
 544                         cc->ref[i] = new_pri();
 545                 }
 546                 cc->wc = wc;
 547                 avl_insert(&collchars, cc, where);
 548         }
 549         return (cc);
 550 }
 551
 552 void
 553 end_order_collsym(collsym_t *sym)
 554 {
 555         start_order(T_COLLSYM);
 556         /* update the weight */
 557
 558         set_pri(sym->ref, nextpri, RESOLVED);
 559         nextpri++;
 560 }
 561
 562 void
 563 end_order(void)
 564 {
 565         int             i;
 566         int32_t         pri;
 567         int32_t         ref;
 568         collpri_t       *p;
 569
 570         /* advance the priority/weight */
 571         pri = nextpri;
 572
 573         switch (currorder) {
 574         case T_CHAR:
 575                 for (i = 0; i < NUM_WT; i++) {
 576                         if (((ref = order_weights[i]) < 0) ||
 577                             ((p = get_pri(ref)) == NULL) ||
 578                             (p->pri == -1)) {
 579                                 /* unspecified weight is a self reference */
 580                                 set_pri(currchar->ref[i], pri, RESOLVED);
 581                         } else {
 582                                 set_pri(currchar->ref[i], ref, REFER);
 583                         }
 584                         order_weights[i] = -1;
 585                 }
 586
 587                 /* leave a cookie trail in case next symbol is ellipsis */
 588                 ellipsis_start = currchar->wc + 1;
 589                 currchar = NULL;
 590                 break;
 591
 592         case T_ELLIPSIS:
 593                 /* save off the weights were we can find them */
 594                 for (i = 0; i < NUM_WT; i++) {
 595                         ellipsis_weights[i] = order_weights[i];
 596                         order_weights[i] = -1;
 597                 }
 598                 break;
 599
 600         case T_COLLELEM:
 601                 if (currelem == NULL) {
 602                         INTERR;
 603                 } else {
 604                         for (i = 0; i < NUM_WT; i++) {
 605
 606                                 if (((ref = order_weights[i]) < 0) ||
 607                                     ((p = get_pri(ref)) == NULL) ||
 608                                     (p->pri == -1)) {
 609                                         set_pri(currelem->ref[i], pri,
 610                                             RESOLVED);
 611                                 } else {
 612                                         set_pri(currelem->ref[i], ref, REFER);
 613                                 }
 614                                 order_weights[i] = -1;
 615                         }
 616                 }
 617                 break;
 618
 619         case T_UNDEFINED:
 620                 for (i = 0; i < NUM_WT; i++) {
 621                         if (((ref = order_weights[i]) < 0) ||
 622                             ((p = get_pri(ref)) == NULL) ||
 623                             (p->pri == -1)) {
 624                                 set_pri(pri_undefined[i], -1, RESOLVED);
 625                         } else {
 626                                 set_pri(pri_undefined[i], ref, REFER);
 627                         }
 628                         order_weights[i] = -1;
 629                 }
 630                 break;
 631
 632         case T_SYMBOL:
 633                 for (i = 0; i < NUM_WT; i++) {
 634                         if (((ref = order_weights[i]) < 0) ||
 635                             ((p = get_pri(ref)) == NULL) ||
 636                             (p->pri == -1)) {
 637                                 set_pri(currundef->ref[i], pri, RESOLVED);
 638                         } else {
 639                                 set_pri(currundef->ref[i], ref, REFER);
 640                         }
 641                         order_weights[i] = -1;
 642                 }
 643                 break;
 644
 645         default:
 646                 INTERR;
 647         }
 648
 649         nextpri++;
 650 }
 651
 652 static void
 653 start_order(int type)
 654 {
 655         int     i;
 656
 657         lastorder = currorder;
 658         currorder = type;
 659
 660         /* this is used to protect ELLIPSIS processing */
 661         if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
 662                 errf(_("character value expected"));
 663         }
 664
 665         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 666                 order_weights[i] = -1;
 667         }
 668         curr_weight = 0;
 669 }
 670
 671 void
 672 start_order_undefined(void)
 673 {
 674         start_order(T_UNDEFINED);
 675 }
 676
 677 void
 678 start_order_symbol(char *name)
 679 {
 680         currundef = get_collundef(name);
 681         start_order(T_SYMBOL);
 682 }
 683
 684 void
 685 start_order_char(wchar_t wc)
 686 {
 687         collchar_t      *cc;
 688         int32_t         ref;
 689
 690         start_order(T_CHAR);
 691
 692         /*
 693          * If we last saw an ellipsis, then we need to close the range.
 694          * Handle that here.  Note that we have to be careful because the
 695          * items *inside* the range are treated exclusiveley to the items
 696          * outside of the range.  The ends of the range can have quite
 697          * different weights than the range members.
 698          */
 699         if (lastorder == T_ELLIPSIS) {
 700                 int             i;
 701
 702                 if (wc < ellipsis_start) {
 703                         errf(_("malformed range!"));
 704                         return;
 705                 }
 706                 while (ellipsis_start < wc) {
 707                         /*
 708                          * pick all of the saved weights for the
 709                          * ellipsis.  note that -1 encodes for the
 710                          * ellipsis itself, which means to take the
 711                          * current relative priority.
 712                          */
 713                         if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
 714                                 INTERR;
 715                                 return;
 716                         }
 717                         for (i = 0; i < NUM_WT; i++) {
 718                                 collpri_t *p;
 719                                 if (((ref = ellipsis_weights[i]) == -1) ||
 720                                     ((p = get_pri(ref)) == NULL) ||
 721                                     (p->pri == -1)) {
 722                                         set_pri(cc->ref[i], nextpri, RESOLVED);
 723                                 } else {
 724                                         set_pri(cc->ref[i], ref, REFER);
 725                                 }
 726                                 ellipsis_weights[i] = 0;
 727                         }
 728                         ellipsis_start++;
 729                         nextpri++;
 730                 }
 731         }
 732
 733         currchar = get_collchar(wc, 1);
 734 }
 735
 736 void
 737 start_order_collelem(collelem_t *e)
 738 {
 739         start_order(T_COLLELEM);
 740         currelem = e;
 741 }
 742
 743 void
 744 start_order_ellipsis(void)
 745 {
 746         int     i;
 747
 748         start_order(T_ELLIPSIS);
 749
 750         if (lastorder != T_CHAR) {
 751                 errf(_("illegal starting point for range"));
 752                 return;
 753         }
 754
 755         for (i = 0; i < NUM_WT; i++) {
 756                 ellipsis_weights[i] = order_weights[i];
 757         }
 758 }
 759
 760 void
 761 define_collelem(char *name, wchar_t *wcs)
 762 {
 763         collelem_t      *e;
 764         avl_index_t     where1;
 765         avl_index_t     where2;
 766         int             i;
 767
 768         if (wcslen(wcs) >= COLLATE_STR_LEN) {
 769                 errf(_("expanded collation element too long"));
 770                 return;
 771         }
 772
 773         if ((e = calloc(sizeof (*e), 1)) == NULL) {
 774                 errf(_("out of memory"));
 775                 return;
 776         }
 777         e->expand = wcs;
 778         e->symbol = name;
 779
 780         /*
 781          * This is executed before the order statement, so we don't
 782          * know how many priorities we *really* need.  We allocate one
 783          * for each possible weight.  Not a big deal, as collating-elements
 784          * prove to be quite rare.
 785          */
 786         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 787                 e->ref[i] = new_pri();
 788         }
 789
 790         /* A character sequence can only reduce to one element. */
 791         if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
 792             (avl_find(&elem_by_expand, e, &where2) != NULL)) {
 793                 errf(_("duplicate collating element definition"));
 794                 return;
 795         }
 796         avl_insert(&elem_by_symbol, e, where1);
 797         avl_insert(&elem_by_expand, e, where2);
 798 }
 799
 800 void
 801 add_order_bit(int kw)
 802 {
 803         uint8_t bit = DIRECTIVE_UNDEF;
 804
 805         switch (kw) {
 806         case T_FORWARD:
 807                 bit = DIRECTIVE_FORWARD;
 808                 break;
 809         case T_BACKWARD:
 810                 bit = DIRECTIVE_BACKWARD;
 811                 break;
 812         case T_POSITION:
 813                 bit = DIRECTIVE_POSITION;
 814                 break;
 815         default:
 816                 INTERR;
 817                 break;
 818         }
 819         collinfo.directive[collinfo.directive_count] |= bit;
 820 }
 821
 822 void
 823 add_order_directive(void)
 824 {
 825         if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
 826                 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX);
 827         }
 828         collinfo.directive_count++;
 829 }
 830
 831 static void
 832 add_order_pri(int32_t ref)
 833 {
 834         if (curr_weight >= NUM_WT) {
 835                 errf(_("too many weights (max %d)"), NUM_WT);
 836                 return;
 837         }
 838         order_weights[curr_weight] = ref;
 839         curr_weight++;
 840 }
 841
 842 void
 843 add_order_collsym(collsym_t *s)
 844 {
 845         add_order_pri(s->ref);
 846 }
 847
 848 void
 849 add_order_char(wchar_t wc)
 850 {
 851         collchar_t *cc;
 852
 853         if ((cc = get_collchar(wc, 1)) == NULL) {
 854                 INTERR;
 855                 return;
 856         }
 857
 858         add_order_pri(cc->ref[curr_weight]);
 859 }
 860
 861 void
 862 add_order_collelem(collelem_t *e)
 863 {
 864         add_order_pri(e->ref[curr_weight]);
 865 }
 866
 867 void
 868 add_order_ignore(void)
 869 {
 870         add_order_pri(pri_ignore);
 871 }
 872
 873 void
 874 add_order_symbol(char *sym)
 875 {
 876         collundef_t *c;
 877         if ((c = get_collundef(sym)) == NULL) {
 878                 INTERR;
 879                 return;
 880         }
 881         add_order_pri(c->ref[curr_weight]);
 882 }
 883
 884 void
 885 add_order_ellipsis(void)
 886 {
 887         /* special 0 value indicates self reference */
 888         add_order_pri(0);
 889 }
 890
 891 void
 892 add_order_subst(void)
 893 {
 894         subst_t srch;
 895         subst_t *s;
 896         avl_index_t where;
 897         int i;
 898
 899         (void) memset(&srch, 0, sizeof (srch));
 900         for (i = 0; i < curr_subst; i++) {
 901                 srch.ref[i] = subst_weights[i];
 902                 subst_weights[i] = 0;
 903         }
 904         s = avl_find(&substs_ref[curr_weight], &srch, &where);
 905
 906         if (s == NULL) {
 907                 if ((s = calloc(sizeof (*s), 1)) == NULL) {
 908                         errf(_("out of memory"));
 909                         return;
 910                 }
 911                 s->key = new_pri();
 912
 913                 /*
 914                  * We use a self reference for our key, but we set a
 915                  * high bit to indicate that this is a substitution
 916                  * reference.  This will expedite table lookups later,
 917                  * and prevent table lookups for situations that don't
 918                  * require it.  (In short, its a big win, because we
 919                  * can skip a lot of binary searching.)
 920                  */
 921                 set_pri(s->key,
 922                     (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
 923                     RESOLVED);
 924                 nextsubst[curr_weight] += 1;
 925
 926                 for (i = 0; i < curr_subst; i++) {
 927                         s->ref[i] = srch.ref[i];
 928                 }
 929
 930                 avl_insert(&substs_ref[curr_weight], s, where);
 931
 932                 if (avl_find(&substs[curr_weight], s, &where) != NULL) {
 933                         INTERR;
 934                         return;
 935                 }
 936                 avl_insert(&substs[curr_weight], s, where);
 937         }
 938         curr_subst = 0;
 939
 940
 941         /*
 942          * We are using the current (unique) priority as a search key
 943          * in the substitution table.
 944          */
 945         add_order_pri(s->key);
 946 }
 947
 948 static void
 949 add_subst_pri(int32_t ref)
 950 {
 951         if (curr_subst >= COLLATE_STR_LEN) {
 952                 errf(_("substitution string is too long"));
 953                 return;
 954         }
 955         subst_weights[curr_subst] = ref;
 956         curr_subst++;
 957 }
 958
 959 void
 960 add_subst_char(wchar_t wc)
 961 {
 962         collchar_t *cc;
 963
 964
 965         if (((cc = get_collchar(wc, 1)) == NULL) ||
 966             (cc->wc != wc)) {
 967                 INTERR;
 968                 return;
 969         }
 970         /* we take the weight for the character at that position */
 971         add_subst_pri(cc->ref[curr_weight]);
 972 }
 973
 974 void
 975 add_subst_collelem(collelem_t *e)
 976 {
 977         add_subst_pri(e->ref[curr_weight]);
 978 }
 979
 980 void
 981 add_subst_collsym(collsym_t *s)
 982 {
 983         add_subst_pri(s->ref);
 984 }
 985
 986 void
 987 add_subst_symbol(char *ptr)
 988 {
 989         collundef_t *cu;
 990
 991         if ((cu = get_collundef(ptr)) != NULL) {
 992                 add_subst_pri(cu->ref[curr_weight]);
 993         }
 994 }
 995
 996 void
 997 add_weight(int32_t ref, int pass)
 998 {
 999         weight_t srch;
1000         weight_t *w;
1001         avl_index_t where;
1002
1003         srch.pri = resolve_pri(ref);
1004
1005         /* No translation of ignores */
1006         if (srch.pri == 0)
1007                 return;
1008
1009         /* Substitution priorities are not weights */
1010         if (srch.pri & COLLATE_SUBST_PRIORITY)
1011                 return;
1012
1013         if (avl_find(&weights[pass], &srch, &where) != NULL)
1014                 return;
1015
1016         if ((w = calloc(sizeof (*w), 1)) == NULL) {
1017                 errf(_("out of memory"));
1018                 return;
1019         }
1020         w->pri = srch.pri;
1021         avl_insert(&weights[pass], w, where);
1022 }
1023
1024 void
1025 add_weights(int32_t *refs)
1026 {
1027         int i;
1028         for (i = 0; i < NUM_WT; i++) {
1029                 add_weight(refs[i], i);
1030         }
1031 }
1032
1033 int32_t
1034 get_weight(int32_t ref, int pass)
1035 {
1036         weight_t        srch;
1037         weight_t        *w;
1038         int32_t         pri;
1039
1040         pri = resolve_pri(ref);
1041         if (pri & COLLATE_SUBST_PRIORITY) {
1042                 return (pri);
1043         }
1044         if (pri <= 0) {
1045                 return (pri);
1046         }
1047         srch.pri = pri;
1048         if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1049                 INTERR;
1050                 return (-1);
1051         }
1052         return (w->opt);
1053 }
1054
1055 void
1056 dump_collate(void)
1057 {
1058         FILE                    *f;
1059         int                     i, j, n;
1060         size_t                  sz;
1061         int32_t                 pri;
1062         collelem_t              *ce;
1063         collchar_t              *cc;
1064         subst_t                 *sb;
1065         char                    vers[COLLATE_STR_LEN];
1066         collate_char_t          chars[UCHAR_MAX + 1];
1067         collate_large_t         *large;
1068         collate_subst_t         *subst[COLL_WEIGHTS_MAX];
1069         collate_chain_t         *chain;
1070
1071         /*
1072          * We have to run throught a preliminary pass to identify all the
1073          * weights that we use for each sorting level.
1074          */
1075         for (i = 0; i < NUM_WT; i++) {
1076                 add_weight(pri_ignore, i);
1077         }
1078         for (i = 0; i < NUM_WT; i++) {
1079                 for (sb = avl_first(&substs[i]); sb;
1080                     sb = AVL_NEXT(&substs[i], sb)) {
1081                         for (j = 0; sb->ref[j]; j++) {
1082                                 add_weight(sb->ref[j], i);
1083                         }
1084                 }
1085         }
1086         for (ce = avl_first(&elem_by_expand);
1087             ce != NULL;
1088             ce = AVL_NEXT(&elem_by_expand, ce)) {
1089                 add_weights(ce->ref);
1090         }
1091         for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1092                 add_weights(cc->ref);
1093         }
1094
1095         /*
1096          * Now we walk the entire set of weights, removing the gaps
1097          * in the weights.  This gives us optimum usage.  The walk
1098          * occurs in priority.
1099          */
1100         for (i = 0; i < NUM_WT; i++) {
1101                 weight_t *w;
1102                 for (w = avl_first(&weights[i]); w;
1103                     w = AVL_NEXT(&weights[i], w)) {
1104                         w->opt = nweight[i];
1105                         nweight[i] += 1;
1106                 }
1107         }
1108
1109         (void) memset(&chars, 0, sizeof (chars));
1110         (void) memset(vers, 0, COLLATE_STR_LEN);
1111         (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1112
1113         /*
1114          * We need to make sure we arrange for the UNDEFINED field
1115          * to show up.  Also, set the total weight counts.
1116          */
1117         for (i = 0; i < NUM_WT; i++) {
1118                 if (resolve_pri(pri_undefined[i]) == -1) {
1119                         set_pri(pri_undefined[i], -1, RESOLVED);
1120                         /* they collate at the end of everything else */
1121                         collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1122                 }
1123                 collinfo.pri_count[i] = nweight[i];
1124         }
1125
1126         collinfo.pri_count[NUM_WT] = max_wide();
1127         collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1128         collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1129
1130         /*
1131          * Ordinary character priorities
1132          */
1133         for (i = 0; i <= UCHAR_MAX; i++) {
1134                 if ((cc = get_collchar(i, 0)) != NULL) {
1135                         for (j = 0; j < NUM_WT; j++) {
1136                                 chars[i].pri[j] = get_weight(cc->ref[j], j);
1137                         }
1138                 } else {
1139                         for (j = 0; j < NUM_WT; j++) {
1140                                 chars[i].pri[j] =
1141                                     get_weight(pri_undefined[j], j);
1142                         }
1143                         /*
1144                          * Per POSIX, for undefined characters, we
1145                          * also have to add a last item, which is the
1146                          * character code.
1147                          */
1148                         chars[i].pri[NUM_WT] = i;
1149                 }
1150         }
1151
1152         /*
1153          * Substitution tables
1154          */
1155         for (i = 0; i < NUM_WT; i++) {
1156                 collate_subst_t *st = NULL;
1157                 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1158                 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1159                         errf(_("out of memory"));
1160                         return;
1161                 }
1162                 n = 0;
1163                 for (sb = avl_first(&substs[i]); sb;
1164                     sb = AVL_NEXT(&substs[i], sb)) {
1165                         if ((st[n].key = resolve_pri(sb->key)) < 0) {
1166                                 /* by definition these resolve! */
1167                                 INTERR;
1168                         }
1169                         if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1170                                 INTERR;
1171                         }
1172                         for (j = 0; sb->ref[j]; j++) {
1173                                 st[n].pri[j] = get_weight(sb->ref[j], i);
1174                         }
1175                         n++;
1176                 }
1177                 if (n != collinfo.subst_count[i])
1178                         INTERR;
1179                 subst[i] = st;
1180         }
1181
1182
1183         /*
1184          * Chains, i.e. collating elements
1185          */
1186         collinfo.chain_count = avl_numnodes(&elem_by_expand);
1187         chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1188         if (chain == NULL) {
1189                 errf(_("out of memory"));
1190                 return;
1191         }
1192         for (n = 0, ce = avl_first(&elem_by_expand);
1193             ce != NULL;
1194             ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1195                 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1196                 for (i = 0; i < NUM_WT; i++) {
1197                         chain[n].pri[i] = get_weight(ce->ref[i], i);
1198                 }
1199         }
1200         if (n != collinfo.chain_count)
1201                 INTERR;
1202
1203         /*
1204          * Large (> UCHAR_MAX) character priorities
1205          */
1206         large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1207         if (large == NULL) {
1208                 errf(_("out of memory"));
1209                 return;
1210         }
1211
1212         i = 0;
1213         for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1214                 int     undef = 0;
1215                 /* we already gathered those */
1216                 if (cc->wc <= UCHAR_MAX)
1217                         continue;
1218                 for (j = 0; j < NUM_WT; j++) {
1219                         if ((pri = get_weight(cc->ref[j], j)) < 0) {
1220                                 undef = 1;
1221                         }
1222                         if (undef && (pri >= 0)) {
1223                                 /* if undefined, then all priorities are */
1224                                 INTERR;
1225                         } else {
1226                                 large[i].pri.pri[j] = pri;
1227                         }
1228                 }
1229                 if (!undef) {
1230                         large[i].val = cc->wc;
1231                         collinfo.large_count = i++;
1232                 }
1233         }
1234
1235         if ((f = open_category()) == NULL) {
1236                 return;
1237         }
1238
1239         /* Time to write the entire data set out */
1240
1241         if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1242             (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1243             (wr_category(&chars, sizeof (chars), f) < 0)) {
1244                 return;
1245         }
1246
1247         for (i = 0; i < NUM_WT; i++) {
1248                 sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
1249                 if (wr_category(subst[i], sz, f) < 0) {
1250                         return;
1251                 }
1252         }
1253         sz = sizeof (collate_chain_t) * collinfo.chain_count;
1254         if (wr_category(chain, sz, f) < 0) {
1255                 return;
1256         }
1257         sz = sizeof (collate_large_t) * collinfo.large_count;
1258         if (wr_category(large, sz, f) < 0) {
1259                 return;
1260         }
1261
1262         close_category(f);
1263 }