usr.bin/sort/fields.c

   1 /*      $NetBSD: fields.c,v 1.19 2008/04/28 20:24:15 martin Exp $       */
   2
   3 /*-
   4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Ben Harris and Jaromir Dolecek.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*-
  33  * Copyright (c) 1993
  34  *      The Regents of the University of California.  All rights reserved.
  35  *
  36  * This code is derived from software contributed to Berkeley by
  37  * Peter McIlroy.
  38  *
  39  * Redistribution and use in source and binary forms, with or without
  40  * modification, are permitted provided that the following conditions
  41  * are met:
  42  * 1. Redistributions of source code must retain the above copyright
  43  *    notice, this list of conditions and the following disclaimer.
  44  * 2. Redistributions in binary form must reproduce the above copyright
  45  *    notice, this list of conditions and the following disclaimer in the
  46  *    documentation and/or other materials provided with the distribution.
  47  * 3. Neither the name of the University nor the names of its contributors
  48  *    may be used to endorse or promote products derived from this software
  49  *    without specific prior written permission.
  50  *
  51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  61  * SUCH DAMAGE.
  62  */
  63
  64 /* Subroutines to generate sort keys. */
  65
  66 #include "sort.h"
  67
  68 #ifndef lint
  69 __RCSID("$NetBSD: fields.c,v 1.19 2008/04/28 20:24:15 martin Exp $");
  70 __SCCSID("@(#)fields.c  8.1 (Berkeley) 6/6/93");
  71 #endif /* not lint */
  72
  73 #define SKIP_BLANKS(ptr) {                                      \
  74         if (BLANK & d_mask[*(ptr)])                             \
  75                 while (BLANK & d_mask[*(++(ptr))]);             \
  76 }
  77
  78 #define NEXTCOL(pos) {                                          \
  79         if (!SEP_FLAG)                                          \
  80                 while (BLANK & l_d_mask[*(++pos)]);             \
  81         while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
  82 }
  83
  84 static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
  85 static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
  86
  87 #define DECIMAL '.'
  88 #define OFFSET 128
  89
  90 u_char TENS[10];        /* TENS[0] = REC_D <= 128 ? 130 - '0' : 2 -'0'... */
  91 u_char NEGTENS[10];     /* NEGTENS[0] = REC_D <= 128 ? 126 + '0' : 252 +'0' */
  92 u_char *OFF_TENS, *OFF_NTENS;   /* TENS - '0', NEGTENS - '0' */
  93 u_char fnum[NBINS], rnum[NBINS];
  94
  95 /*
  96  * constructs sort key with leading recheader, followed by the key,
  97  * followed by the original line.
  98  */
  99 length_t
 100 enterkey(keybuf, line, size, fieldtable)
 101         RECHEADER *keybuf;      /* pointer to start of key */
 102         DBT *line;
 103         int size;
 104         struct field fieldtable[];
 105 {
 106         int i;
 107         u_char *l_d_mask;
 108         u_char *lineend, *pos;
 109         u_char *endkey, *keypos;
 110         struct coldesc *clpos;
 111         int col = 1;
 112         struct field *ftpos;
 113         l_d_mask = d_mask;
 114         pos = (u_char *) line->data - 1;
 115         lineend = (u_char *) line->data + line->size-1;
 116                                 /* don't include rec_delimiter */
 117
 118         for (i = 0; i < ncols; i++) {
 119                 clpos = clist + i;
 120                 for (; (col < clpos->num) && (pos < lineend); col++) {
 121                         NEXTCOL(pos);
 122                 }
 123                 if (pos >= lineend)
 124                         break;
 125                 clpos->start = SEP_FLAG ? pos + 1 : pos;
 126                 NEXTCOL(pos);
 127                 clpos->end = pos;
 128                 col++;
 129                 if (pos >= lineend) {
 130                         clpos->end = lineend;
 131                         i++;
 132                         break;
 133                 }
 134         }
 135         for (; i <= ncols; i++)
 136                 clist[i].start = clist[i].end = lineend;
 137         if (clist[0].start < (u_char *) line->data)
 138                 clist[0].start++;
 139
 140         keypos = keybuf->data;
 141         endkey = (u_char *) keybuf + size - line->size;
 142         for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++)
 143                 if ((keypos = enterfield(keypos, endkey, ftpos,
 144                     fieldtable->flags)) == NULL)
 145                         return (1);
 146
 147         keybuf->offset = keypos - keybuf->data;
 148         keybuf->length = keybuf->offset + line->size;
 149         if (keybuf->length + sizeof(TRECHEADER) > (length_t)size) {
 150                 /* line too long for buffer */
 151                 return (1);
 152         }
 153
 154         /*
 155          * Make [s]radixsort() only sort by relevant part of key if:
 156          * 1. we want to choose unique items by relevant field[s]
 157          * 2. we want stable sort and so the items should be sorted only by
 158          *    the relevant field[s]
 159          */
 160         if (UNIQUE || (stable_sort && keybuf->offset < line->size))
 161                 keypos[-1] = REC_D;
 162
 163         memcpy(keybuf->data + keybuf->offset, line->data, line->size);
 164         return (0);
 165 }
 166
 167 /*
 168  * constructs a field (as defined by -k) within a key
 169  */
 170 static u_char *
 171 enterfield(tablepos, endkey, cur_fld, gflags)
 172         struct field *cur_fld;
 173         u_char *tablepos, *endkey;
 174         int gflags;
 175 {
 176         u_char *start, *end, *lineend, *mask, *lweight;
 177         struct column icol, tcol;
 178         u_int flags;
 179         u_int Rflag;
 180
 181         icol = cur_fld->icol;
 182         tcol = cur_fld->tcol;
 183         flags = cur_fld->flags;
 184         start = icol.p->start;
 185         lineend = clist[ncols].end;
 186         if (flags & BI)
 187                 SKIP_BLANKS(start);
 188         start += icol.indent;
 189         start = min(start, lineend);
 190
 191         if (!tcol.num)
 192                 end = lineend;
 193         else {
 194                 if (tcol.indent) {
 195                         end = tcol.p->start;
 196                         if (flags & BT)
 197                                 SKIP_BLANKS(end);
 198                         end += tcol.indent;
 199                         end = min(end, lineend);
 200                 } else
 201                         end = tcol.p->end;
 202         }
 203
 204         if (flags & N) {
 205                 Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
 206                 return number(tablepos, endkey, start, end, Rflag);
 207         }
 208
 209         mask = cur_fld->mask;
 210         lweight = cur_fld->weights;
 211         for (; start < end; start++)
 212                 if (mask[*start]) {
 213                         if (*start <= 1) {
 214                                 if (tablepos+2 >= endkey)
 215                                         return (NULL);
 216                                 *tablepos++ = lweight[1];
 217                                 *tablepos++ = lweight[*start ? 2 : 1];
 218                         } else {
 219                                 if (tablepos+1 >= endkey)
 220                                         return (NULL);
 221                                 *tablepos++ = lweight[*start];
 222                         }
 223                 }
 224         *tablepos++ = lweight[0];
 225         return (tablepos == endkey ? NULL : tablepos);
 226 }
 227
 228 /* Uses the first bin to assign sign, expsign, 0, and the first
 229  * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
 230  *   When sorting in forward order:
 231  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
 232  * else use (0-99)->(2-102).
 233  * If the exponent is >=61, use another byte for each additional 253
 234  * in the exponent. Cutoff is at 567.
 235  * To avoid confusing the exponent and the mantissa, use a field delimiter
 236  * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
 237  * only time a field delimiter can come in that position.
 238  * Reverse order is done analagously.
 239  */
 240
 241 static u_char *
 242 number(pos, bufend, line, lineend, Rflag)
 243         u_char *line, *pos, *bufend, *lineend;
 244         int Rflag;
 245 {
 246         int or_sign, parity = 0;
 247         int expincr = 1, exponent = -1;
 248         int bite, expsign = 1, sign = 1, zeroskip = 0;
 249         u_char lastvalue='0', *nonzero=NULL, *tline, *C_TENS;
 250         u_char *nweights;
 251
 252         if (Rflag)
 253                 nweights = rnum;
 254         else
 255                 nweights = fnum;
 256         if (pos > bufend - 8)
 257                 return (NULL);
 258         /*
 259          * or_sign sets the sort direction:
 260          *      (-r: +/-)(sign: +/-)(expsign: +/-)
 261          */
 262         or_sign = sign ^ expsign ^ Rflag;
 263         SKIP_BLANKS(line);
 264         if (*line == '-') {     /* set the sign */
 265                 or_sign ^= 1;
 266                 sign = 0;
 267                 line++;
 268         }
 269         /* eat initial zeroes */
 270         for (; *line == '0' && line < lineend; line++)
 271                 zeroskip = 1;
 272         /* calculate exponents < 0 */
 273         if (*line == DECIMAL) {
 274                 exponent = 1;
 275                 while (*++line == '0' && line < lineend)
 276                         exponent++;
 277                 expincr = 0;
 278                 expsign = 0;
 279         }
 280         /* next character better be a digit */
 281         if (*line < '1' || *line > '9' || line >= lineend) {
 282                 /* only exit if we didn't skip any zero number */
 283                 if (!zeroskip) {
 284                         *pos++ = nweights[127];
 285                         return (pos);
 286                 }
 287         }
 288         if (expincr) {
 289                 for (tline = line-1; *++tline >= '0' &&
 290                     *tline <= '9' && tline < lineend;)
 291                         exponent++;
 292         }
 293         if (exponent > 567) {
 294                 *pos++ = nweights[sign ? (expsign ? 254 : 128)
 295                                         : (expsign ? 0 : 126)];
 296                 warnx("exponent out of bounds");
 297                 return (pos);
 298         }
 299         bite = min(exponent, 61);
 300         *pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
 301                                 : (expsign ? 64-bite : 64+bite)];
 302         if (bite >= 61) {
 303                 do {
 304                         exponent -= bite;
 305                         bite = min(exponent, 254);
 306                         *pos++ = nweights[or_sign ? 254-bite : bite];
 307                 } while (bite == 254);
 308         }
 309         C_TENS = or_sign ? OFF_NTENS : OFF_TENS;
 310         for (; line < lineend; line++) {
 311                 if (*line >= '0' && *line <= '9') {
 312                         if (parity) {
 313                                 *pos++ = C_TENS[lastvalue] + (or_sign ? - *line
 314                                                 : *line);
 315                                 if (pos == bufend)
 316                                         return (NULL);
 317                                 if (*line != '0' || lastvalue != '0')
 318                                         nonzero = pos;
 319                         } else
 320                                 lastvalue = *line;
 321                         parity ^= 1;
 322                 } else if (*line == DECIMAL) {
 323                         if (!expincr)   /* a decimal already occurred once */
 324                                 break;
 325                         expincr = 0;
 326                 } else
 327                         break;
 328         }
 329         if ((parity && lastvalue != '0') || !nonzero) {
 330                 *pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
 331                                         OFF_TENS[lastvalue] + '0';
 332         } else
 333                 pos = nonzero;
 334         if (pos > bufend-1)
 335                 return (NULL);
 336         *pos++ = or_sign ? nweights[254] : nweights[0];
 337         return (pos);
 338 }
 339
 340 /* This forces a gap around the record delimiter
 341  * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
 342  * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
 343  */
 344 void
 345 num_init()
 346 {
 347         int i;
 348         TENS[0] = REC_D <=128 ? 130 - '0' : 2 - '0';
 349         NEGTENS[0] = REC_D <=128 ? 126 + '0' : 254 + '0';
 350         OFF_TENS = TENS - '0';
 351         OFF_NTENS = NEGTENS - '0';
 352         for (i = 1; i < 10; i++) {
 353                 TENS[i] = TENS[i - 1] + 10;
 354                 NEGTENS[i] = NEGTENS[i - 1] - 10;
 355         }
 356         for (i = 0; i < REC_D; i++) {
 357                 fnum[i] = i;
 358                 rnum[255 - i] = i;
 359         }
 360         for (i = REC_D; i <255; i++) {
 361                 fnum[i] = i + 1;
 362                 rnum[255 - i] = i - 1;
 363         }
 364 }