join(1): use getline() instead of fgetln()
[unleashed.git] / bin / join / join.c
blobac62cf83cd180166d3eeeff992e7651e1cbca379
1 /* $OpenBSD: join.c,v 1.27 2015/10/09 01:37:07 deraadt Exp $ */
3 /*-
4 * Copyright (c) 1991, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Steve Hayman of the Computer Science Department, Indiana University,
9 * Michiro Hikida and David Goodenough.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
36 #include <err.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
42 #define MAXIMUM(a, b) (((a) > (b)) ? (a) : (b))
45 * There's a structure per input file which encapsulates the state of the
46 * file. We repeatedly read lines from each file until we've read in all
47 * the consecutive lines from the file with a common join field. Then we
48 * compare the set of lines with an equivalent set from the other file.
50 typedef struct {
51 char *line; /* line */
52 u_long linealloc; /* line allocated count */
53 char **fields; /* line field(s) */
54 u_long fieldcnt; /* line field(s) count */
55 u_long fieldalloc; /* line field(s) allocated count */
56 u_long cfieldc; /* current field count */
57 long fpos; /* fpos of start of field */
58 } LINE;
60 typedef struct {
61 FILE *fp; /* file descriptor */
62 u_long joinf; /* join field (-1, -2, -j) */
63 int unpair; /* output unpairable lines (-a) */
64 u_long number; /* 1 for file 1, 2 for file 2 */
65 LINE *set; /* set of lines with same field */
66 int pushbool; /* if pushback is set */
67 u_long pushback; /* line on the stack */
68 u_long setcnt; /* set count */
69 u_long setalloc; /* set allocated count */
70 u_long setusedc; /* sets used */
71 } INPUT;
72 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0, 0 },
73 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0, 0 };
75 typedef struct {
76 u_long filenum; /* file number */
77 u_long fieldno; /* field number */
78 } OLIST;
79 OLIST *olist; /* output field list */
80 u_long olistcnt; /* output field list count */
81 u_long olistalloc; /* output field allocated count */
83 int joinout = 1; /* show lines with matched join fields (-v) */
84 int needsep; /* need separator character */
85 int spans = 1; /* span multiple delimiters (-t) */
86 char *empty; /* empty field replacement string (-e) */
87 char *tabchar = " \t"; /* delimiter characters (-t) */
89 int cmp(LINE *, u_long, LINE *, u_long);
90 void fieldarg(char *);
91 void joinlines(INPUT *, INPUT *);
92 void obsolete(char **);
93 void outfield(LINE *, u_long, int);
94 void outoneline(INPUT *, LINE *);
95 void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
96 void slurp(INPUT *);
97 void slurpit(INPUT *);
98 void usage(void);
101 main(int argc, char *argv[])
103 INPUT *F1, *F2;
104 int aflag, ch, cval, vflag;
105 char *end;
107 if (pledge("stdio rpath", NULL) == -1)
108 err(1, "pledge");
110 F1 = &input1;
111 F2 = &input2;
113 aflag = vflag = 0;
114 obsolete(argv);
115 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) {
116 switch (ch) {
117 case '\01': /* See comment in obsolete(). */
118 aflag = 1;
119 F1->unpair = F2->unpair = 1;
120 break;
121 case '1':
122 if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
123 errx(1, "-1 option field number less than 1");
124 if (*end)
125 errx(1, "illegal field number -- %s", optarg);
126 --F1->joinf;
127 break;
128 case '2':
129 if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
130 errx(1, "-2 option field number less than 1");
131 if (*end)
132 errx(1, "illegal field number -- %s", optarg);
133 --F2->joinf;
134 break;
135 case 'a':
136 aflag = 1;
137 switch(strtol(optarg, &end, 10)) {
138 case 1:
139 F1->unpair = 1;
140 break;
141 case 2:
142 F2->unpair = 1;
143 break;
144 default:
145 errx(1, "-a option file number not 1 or 2");
146 break;
148 if (*end)
149 errx(1, "illegal file number -- %s", optarg);
150 break;
151 case 'e':
152 empty = optarg;
153 break;
154 case 'j':
155 if ((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
156 errx(1, "-j option field number less than 1");
157 if (*end)
158 errx(1, "illegal field number -- %s", optarg);
159 --F1->joinf;
160 --F2->joinf;
161 break;
162 case 'o':
163 fieldarg(optarg);
164 break;
165 case 't':
166 spans = 0;
167 if (strlen(tabchar = optarg) != 1)
168 errx(1, "illegal tab character specification");
169 break;
170 case 'v':
171 vflag = 1;
172 joinout = 0;
173 switch (strtol(optarg, &end, 10)) {
174 case 1:
175 F1->unpair = 1;
176 break;
177 case 2:
178 F2->unpair = 1;
179 break;
180 default:
181 errx(1, "-v option file number not 1 or 2");
182 break;
184 if (*end)
185 errx(1, "illegal file number -- %s", optarg);
186 break;
187 case '?':
188 default:
189 usage();
192 argc -= optind;
193 argv += optind;
195 if (aflag && vflag)
196 errx(1, "the -a and -v options are mutually exclusive");
198 if (argc != 2)
199 usage();
201 /* Open the files; "-" means stdin. */
202 if (!strcmp(*argv, "-"))
203 F1->fp = stdin;
204 else if ((F1->fp = fopen(*argv, "r")) == NULL)
205 err(1, "%s", *argv);
206 ++argv;
207 if (!strcmp(*argv, "-"))
208 F2->fp = stdin;
209 else if ((F2->fp = fopen(*argv, "r")) == NULL)
210 err(1, "%s", *argv);
211 if (F1->fp == stdin && F2->fp == stdin)
212 errx(1, "only one input file may be stdin");
214 if (pledge("stdio", NULL) == -1)
215 err(1, "pledge");
217 F1->setusedc = 0;
218 F2->setusedc = 0;
219 slurp(F1);
220 slurp(F2);
221 F1->set->cfieldc = 0;
222 F2->set->cfieldc = 0;
225 * We try to let the files have the same field value, advancing
226 * whoever falls behind and always advancing the file(s) we output
227 * from.
229 while (F1->setcnt && F2->setcnt) {
230 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
231 if (cval == 0) {
232 /* Oh joy, oh rapture, oh beauty divine! */
233 if (joinout)
234 joinlines(F1, F2);
235 slurp(F1);
236 slurp(F2);
238 else {
239 if (F1->unpair
240 && (cval < 0 || F2->set->cfieldc == F2->setusedc -1)) {
241 joinlines(F1, NULL);
242 slurp(F1);
244 else if (cval < 0)
245 /* File 1 takes the lead... */
246 slurp(F1);
247 if (F2->unpair
248 && (cval > 0 || F1->set->cfieldc == F1->setusedc -1)) {
249 joinlines(F2, NULL);
250 slurp(F2);
252 else if (cval > 0)
253 /* File 2 takes the lead... */
254 slurp(F2);
259 * Now that one of the files is used up, optionally output any
260 * remaining lines from the other file.
262 if (F1->unpair)
263 while (F1->setcnt) {
264 joinlines(F1, NULL);
265 slurp(F1);
267 if (F2->unpair)
268 while (F2->setcnt) {
269 joinlines(F2, NULL);
270 slurp(F2);
273 return 0;
276 /* wrapper around slurpit() to keep track of what field we are on */
277 void slurp(INPUT *F)
279 long fpos;
280 u_long cfieldc;
282 if (F->set == NULL) {
283 fpos = 0;
284 cfieldc = 0;
286 else {
287 fpos = F->set->fpos;
288 cfieldc = F->set->cfieldc;
290 slurpit(F);
291 if (F->set == NULL)
292 return;
293 else if (fpos != F->set->fpos)
294 F->set->cfieldc = cfieldc+1;
297 void
298 slurpit(INPUT *F)
300 LINE *lp, *lastlp, tmp;
301 size_t len, linesize;
302 u_long cnt;
303 char *bp, *fieldp, *line;
304 long fpos;
306 * Read all of the lines from an input file that have the same
307 * join field.
310 F->setcnt = 0;
311 line = NULL;
312 linesize = 0;
313 for (lastlp = NULL; ; ++F->setcnt, lastlp = lp) {
315 * If we're out of space to hold line structures, allocate
316 * more. Initialize the structure so that we know that this
317 * is new space.
319 if (F->setcnt == F->setalloc) {
320 LINE *p;
321 u_long newsize = F->setalloc + 50;
322 cnt = F->setalloc;
323 if ((p = reallocarray(F->set, newsize, sizeof(LINE)))
324 == NULL)
325 err(1, NULL);
326 F->set = p;
327 F->setalloc = newsize;
328 memset(F->set + cnt, 0, 50 * sizeof(LINE));
329 /* re-set lastlp in case it moved */
330 if (lastlp != NULL)
331 lastlp = &F->set[F->setcnt - 1];
334 * Get any pushed back line, else get the next line. Allocate
335 * space as necessary. If taking the line from the stack swap
336 * the two structures so that we don't lose space allocated to
337 * either structure. This could be avoided by doing another
338 * level of indirection, but it's probably okay as is.
340 lp = &F->set[F->setcnt];
341 if (F->pushbool) {
342 tmp = F->set[F->setcnt];
343 F->set[F->setcnt] = F->set[F->pushback];
344 F->set[F->pushback] = tmp;
345 F->pushbool = 0;
346 continue;
348 if ((len = getline(&line, &linesize, F->fp)) == -1) {
349 free(line);
350 return;
353 * we depend on knowing on what field we are, one safe way is
354 * the file position.
356 fpos = ftell(F->fp) - len;
357 if (lp->linealloc <= len + 1) {
358 char *p;
359 u_long newsize = lp->linealloc +
360 MAXIMUM(100, len + 1 - lp->linealloc);
361 if ((p = realloc(lp->line, newsize)) == NULL)
362 err(1, NULL);
363 lp->line = p;
364 lp->linealloc = newsize;
366 F->setusedc++;
367 memmove(lp->line, line, len);
368 lp->fpos = fpos;
369 /* Replace trailing newline, if it exists. */
370 if (lp->line[len - 1] == '\n')
371 lp->line[len - 1] = '\0';
373 /* Split the line into fields, allocate space as necessary. */
374 lp->fieldcnt = 0;
375 bp = lp->line;
376 while ((fieldp = strsep(&bp, tabchar)) != NULL) {
377 if (spans && *fieldp == '\0')
378 continue;
379 if (lp->fieldcnt == lp->fieldalloc) {
380 char **p;
381 u_long newsize = lp->fieldalloc + 50;
382 if ((p = reallocarray(lp->fields, newsize,
383 sizeof(char *))) == NULL)
384 err(1, NULL);
385 lp->fields = p;
386 lp->fieldalloc = newsize;
388 lp->fields[lp->fieldcnt++] = fieldp;
391 /* See if the join field value has changed. */
392 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
393 F->pushbool = 1;
394 F->pushback = F->setcnt;
395 break;
398 free(line);
402 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
404 if (lp1->fieldcnt <= fieldno1)
405 return (-1);
406 else if (lp2->fieldcnt <= fieldno2)
407 return (1);
408 return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
411 void
412 joinlines(INPUT *F1, INPUT *F2)
414 u_long cnt1, cnt2;
417 * Output the results of a join comparison. The output may be from
418 * either file 1 or file 2 (in which case the first argument is the
419 * file from which to output) or from both.
421 if (F2 == NULL) {
422 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
423 outoneline(F1, &F1->set[cnt1]);
424 return;
426 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
427 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
428 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
431 void
432 outoneline(INPUT *F, LINE *lp)
434 u_long cnt;
437 * Output a single line from one of the files, according to the
438 * join rules. This happens when we are writing unmatched single
439 * lines. Output empty fields in the right places.
441 if (olist)
442 for (cnt = 0; cnt < olistcnt; ++cnt) {
443 if (olist[cnt].filenum == F->number)
444 outfield(lp, olist[cnt].fieldno, 0);
445 else if (olist[cnt].filenum == 0)
446 outfield(lp, F->joinf, 0);
447 else
448 outfield(lp, 0, 1);
450 else {
452 * Output the join field, then the remaining fields from F
454 outfield(lp, F->joinf, 0);
455 for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
456 if (F->joinf != cnt)
457 outfield(lp, cnt, 0);
460 putchar('\n');
461 if (ferror(stdout))
462 err(1, "stdout");
463 needsep = 0;
466 void
467 outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
469 u_long cnt;
471 /* Output a pair of lines according to the join list (if any). */
472 if (olist) {
473 for (cnt = 0; cnt < olistcnt; ++cnt)
474 if (olist[cnt].filenum == 0) {
475 if (lp1->fieldcnt >= F1->joinf)
476 outfield(lp1, F1->joinf, 0);
477 else
478 outfield(lp2, F2->joinf, 0);
479 } else if (olist[cnt].filenum == 1)
480 outfield(lp1, olist[cnt].fieldno, 0);
481 else /* if (olist[cnt].filenum == 2) */
482 outfield(lp2, olist[cnt].fieldno, 0);
483 } else {
485 * Output the join field, then the remaining fields from F1
486 * and F2.
488 outfield(lp1, F1->joinf, 0);
489 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
490 if (F1->joinf != cnt)
491 outfield(lp1, cnt, 0);
492 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
493 if (F2->joinf != cnt)
494 outfield(lp2, cnt, 0);
496 putchar('\n');
497 if (ferror(stdout))
498 err(1, "stdout");
499 needsep = 0;
502 void
503 outfield(LINE *lp, u_long fieldno, int out_empty)
505 if (needsep++)
506 putchar((int)*tabchar);
507 if (!ferror(stdout)) {
508 if (lp->fieldcnt <= fieldno || out_empty) {
509 if (empty != NULL)
510 fputs(empty, stdout);
511 } else {
512 if (*lp->fields[fieldno] == '\0')
513 return;
514 fputs(lp->fields[fieldno], stdout);
517 if (ferror(stdout))
518 err(1, "stdout");
522 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
523 * fields.
525 void
526 fieldarg(char *option)
528 u_long fieldno, filenum;
529 char *end, *token;
531 while ((token = strsep(&option, ", \t")) != NULL) {
532 if (*token == '\0')
533 continue;
534 if (token[0] == '0')
535 filenum = fieldno = 0;
536 else if ((token[0] == '1' || token[0] == '2') &&
537 token[1] == '.') {
538 filenum = token[0] - '0';
539 fieldno = strtol(token + 2, &end, 10);
540 if (*end)
541 errx(1, "malformed -o option field");
542 if (fieldno == 0)
543 errx(1, "field numbers are 1 based");
544 --fieldno;
545 } else
546 errx(1, "malformed -o option field");
547 if (olistcnt == olistalloc) {
548 OLIST *p;
549 u_long newsize = olistalloc + 50;
550 if ((p = reallocarray(olist, newsize, sizeof(OLIST)))
551 == NULL)
552 err(1, NULL);
553 olist = p;
554 olistalloc = newsize;
556 olist[olistcnt].filenum = filenum;
557 olist[olistcnt].fieldno = fieldno;
558 ++olistcnt;
562 void
563 obsolete(char **argv)
565 size_t len;
566 char **p, *ap, *t;
568 while ((ap = *++argv) != NULL) {
569 /* Return if "--". */
570 if (ap[0] == '-' && ap[1] == '-')
571 return;
572 /* skip if not an option */
573 if (ap[0] != '-')
574 continue;
575 switch (ap[1]) {
576 case 'a':
578 * The original join allowed "-a", which meant the
579 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2
580 * only specifies this as "-a 1" and "a -2", so we
581 * have to use another option flag, one that is
582 * unlikely to ever be used or accidentally entered
583 * on the command line. (Well, we could reallocate
584 * the argv array, but that hardly seems worthwhile.)
586 if (ap[2] == '\0' && (argv[1] == NULL ||
587 (strcmp(argv[1], "1") != 0 &&
588 strcmp(argv[1], "2") != 0))) {
589 ap[1] = '\01';
590 warnx("-a option used without an argument; "
591 "reverting to historical behavior");
593 break;
594 case 'j':
596 * The original join allowed "-j[12] arg" and "-j arg".
597 * Convert the former to "-[12] arg". Don't convert
598 * the latter since getopt(3) can handle it.
600 switch(ap[2]) {
601 case '1':
602 case '2':
603 if (ap[3] != '\0')
604 goto jbad;
605 ap[1] = ap[2];
606 ap[2] = '\0';
607 break;
608 case '\0':
609 break;
610 default:
611 jbad: warnx("unknown option -- %s", ap + 1);
612 usage();
614 break;
615 case 'o':
617 * The original join allowed "-o arg arg".
618 * Convert to "-o arg -o arg".
620 if (ap[2] != '\0' || argv[1] == NULL)
621 break;
622 for (p = argv + 2; *p != NULL; ++p) {
623 if (p[0][0] == '0' || ((p[0][0] != '1' &&
624 p[0][0] != '2') || p[0][1] != '.'))
625 break;
626 len = strlen(*p);
627 if (len - 2 != strspn(*p + 2, "0123456789"))
628 break;
629 if ((t = malloc(len + 3)) == NULL)
630 err(1, NULL);
631 t[0] = '-';
632 t[1] = 'o';
633 memmove(t + 2, *p, len + 1);
634 *p = t;
636 argv = p - 1;
637 break;
642 void
643 usage(void)
645 int len;
646 extern char *__progname;
648 len = strlen(__progname) + sizeof("usage: ");
649 (void)fprintf(stderr, "usage: %s [-1 field] [-2 field] "
650 "[-a file_number | -v file_number] [-e string]\n"
651 "%*s[-o list] [-t char] file1 file2\n",
652 __progname, len, "");
653 exit(1);