1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
20 ***********************************************************************/
24 * AT&T Bell Laboratories
26 * cut fields or columns from each line of a file
/*
 * Option and help description for the cut command; b_cut passes this
 * string to optget().  It declares the options b (bytes), c (characters),
 * d (delimiter), f (fields), n (split), R|r (reclen), s (suppress),
 * D (line delimiter) and N (newline), plus the NAME, DESCRIPTION and
 * SEE ALSO help sections.
 *
 * NOTE(review): this listing looks incomplete -- the reclen entry is never
 * closed, the entries for exit values 0 and >0 have no visible enclosing
 * section, and no terminating semicolon is visible.  Confirm against the
 * full file before editing.
 * NOTE(review): the help text contains misspellings (contatenating,
 * optiion, passsed).  The long option name line-delimeter is also
 * misspelled, but it is part of the command line interface, so renaming it
 * would need an alias.
 */
29 static const char usage
[] =
30 "[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]"
32 "[+NAME?cut - cut out selected columns or fields of each line of a file]"
33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34 "from one or more files, contatenating them on standard output.]"
35 "[+?The option argument \alist\a is a comma-separated or blank-separated "
36 "list of positive numbers and ranges. Ranges can be of three "
37 "forms. The first is two positive integers separated by a hyphen "
38 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39 "\ahigh\a. The second is a positive number preceded by a hyphen "
40 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41 "\ahigh\a. The last is a positive number followed by a hyphen "
42 "(\alow\a\b-\b), which represents all fields from \alow\a to the "
43 "last field, inclusive. Elements in the \alist\a can be repeated, "
44 "can overlap, and can appear in any order. The order of the "
45 "output is that of the input.]"
46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48 "cuts from standard input. The start of the file is defined "
49 "as the current offset.]"
/* options that take a list argument, and the field delimiter option */
50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51 "[c:characters]:[list?\bcut\b based on a list of character counts.]"
52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53 "to \adelim\a. The default is the \btab\b character.]"
54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55 "character specified with the \b-d\b optiion.]"
56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]"
/* numeric option: fixed record length */
57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58 "records of length \areclen\a when used with the \b-b\b or \b-c\b "
60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61 "when used with the \b-f\b option. By default, lines with no "
62 "delimiters will be passsed in untouched.]"
63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64 "the \b-f\b option is set to \aldelim\a. The default is the "
65 "\bnewline\b character.]"
66 "[N!:newline?Output new-lines at end of each record when used "
67 "with the \b-b\b or \b-c\b option.]"
/* exit status entries */
72 "[+0?All files processed successfully.]"
73 "[+>0?One or more files failed to open or could not be read.]"
75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
/*
 * Fragment of the type declarations.  Only the opening of the Delim_s
 * typedef is visible; its members are not shown here (elsewhere in the file
 * delimiters are accessed through .chr, .str and .len).
 *
 * NOTE(review): the two members below are reached as cut->space and
 * cut->list in cutinit/cutcols/cutfields, so they appear to belong to Cut_t
 * rather than Delim_s; the declarations in between are missing from this
 * listing.
 */
81 typedef struct Delim_s
/* per-byte classification table, one entry for every unsigned char value */
99 unsigned char space
[UCHAR_MAX
+1];
/*
 * Declared with two ints, but cutinit allocates strlen(list text) extra
 * ints after the struct, so the array is used beyond its declared size;
 * that is why it has to stay the last member.
 */
100 int list
[2]; /* NOTE: must be last member */
/*
 * Mode bit: cutinit sets cut->nlflag when this bit is present in mode, and
 * cutcols only writes the line delimiter after a record when nlflag is
 * clear.
 */
110 #define C_NONEWLINE 32
/*
 * Decode one multibyte character: calls through the ast.mb_towc function
 * pointer with the address of w, the byte pointer p and the byte count n.
 * w must be an lvalue because its address is taken.  Callers in cutfields
 * treat a result <= 0 as a failed or incomplete decode.
 */
116 #define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n)
/*
 * qsort() comparator for the two-int entries of cut->list.
 *
 * Only the first int of each entry is compared; the second int of an entry
 * never takes part in the ordering.
 *
 *   a, b - pointers to the first int of the two entries being compared
 *
 * Returns -1 if *a sorts before *b, 1 if it sorts after, 0 if they are
 * equal.  Explicit comparisons are used instead of a subtraction so that
 * the result cannot overflow.
 */
static int
mycomp(const void* a, const void* b)
{
	const int lhs = *(const int*)a;
	const int rhs = *(const int*)b;

	if (lhs < rhs)
		return -1;
	if (lhs > rhs)
		return 1;
	return 0;
}
/*
 * Build the Cut_t descriptor that cutcols() and cutfields() work from.
 *
 *   mode   - bit mask of C_* flags; C_CHARS, C_BYTES, C_NOSPLIT, C_SUPRESS
 *            and C_NONEWLINE are tested here
 *   str    - text of the list to parse (presumably the argument of the
 *            b, c or f option -- confirm against b_cut)
 *   wdelim - field delimiter, copied into cut->wdelim
 *   ldelim - line delimiter, copied into cut->ldelim
 *   reclen - stored unchanged in cut->reclen
 *
 * Terminates through error(ERROR_exit(1), ...) when the allocation fails or
 * the list is rejected.
 *
 * NOTE(review): large parts of the body (return type, braces, most of the
 * list parser and the return statement) are missing from this listing; the
 * comments below describe only the statements that are visible.
 */
133 cutinit(int mode
, char* str
, Delim_t
* wdelim
, Delim_t
* ldelim
, size_t reclen
)
138 register int range
= 0;
139 register char* cp
= str
;
/*
 * Allocate the descriptor from the stak allocator with one extra int per
 * byte of the list text, which gives the trailing list array room to grow.
 */
142 if (!(cut
= (Cut_t
*)stakalloc(sizeof(Cut_t
) + strlen(cp
) * sizeof(int))))
143 error(ERROR_exit(1), "out of space");
/*
 * Remember the result of mbwide() in cut->mb.  When it is non-zero the lower
 * half of the byte class table is cleared and the upper half is marked
 * SP_WIDE.
 */
144 if (cut
->mb
= mbwide())
146 memset(cut
->space
, 0, sizeof(cut
->space
) / 2);
147 memset(cut
->space
+ sizeof(cut
->space
) / 2, SP_WIDE
, sizeof(cut
->space
) / 2);
/*
 * Clear the whole table (presumably the alternative branch; the else keyword
 * is not visible in this listing).
 */
150 memset(cut
->space
, 0, sizeof(cut
->space
));
/* keep a copy of the field delimiter; a single-byte one is classed SP_WORD */
151 cut
->wdelim
= *wdelim
;
152 if (wdelim
->len
== 1)
153 cut
->space
[wdelim
->chr
] = SP_WORD
;
/*
 * Keep a copy of the line delimiter.  eob is the delimiter byte when the
 * delimiter is a single byte, otherwise 0; that byte is classed SP_LINE.
 */
154 cut
->ldelim
= *ldelim
;
155 cut
->eob
= (ldelim
->len
== 1) ? ldelim
->chr
: 0;
156 cut
->space
[cut
->eob
] = SP_LINE
;
/* character mode only has an effect when cut->mb is set */
157 cut
->cflag
= (mode
&C_CHARS
) && cut
->mb
;
/* nosplit needs both C_BYTES and C_NOSPLIT in mode, and cut->mb set */
158 cut
->nosplit
= (mode
&(C_BYTES
|C_NOSPLIT
)) == (C_BYTES
|C_NOSPLIT
) && cut
->mb
;
159 cut
->sflag
= (mode
&C_SUPRESS
) != 0;
160 cut
->nlflag
= (mode
&C_NONEWLINE
) != 0;
161 cut
->reclen
= reclen
;
/* list parsing: test for blanks and tabs in the list text (loop body not visible) */
168 while(*cp
==' ' || *cp
=='\t')
/*
 * Turn the upper bound n into a length relative to range; a zero n (open
 * ended range) becomes HUGE-1.  A negative result is rejected.
 */
176 if((n
= (n
? (n
-range
) : (HUGE
-1))) < 0)
177 error(ERROR_exit(1),"invalid range for c/f option");
/*
 * n becomes one more than the number of two-int entries between cut->list
 * and lp; the entries are then sorted on their first int by mycomp.
 */
190 n
= 1 + (lp
-cut
->list
)/2;
191 qsort(lp
=cut
->list
,n
,2*sizeof(*lp
),mycomp
);
192 /* eliminate overlapping regions */
/* walk the sorted entries two ints at a time until the HUGE sentinel */
193 for(n
=0,range
= -2,dp
=lp
; *lp
!=HUGE
; lp
+=2)
/* c is how far this entry reaches past range */
202 if((c
= lp
[0]+lp
[1]-range
)>0)
/* store through dp, which starts at the beginning of the same array */
210 range
= *dp
++ = lp
[0];
216 range
+= (*dp
++ = lp
[1]);
221 /* convert ranges into gaps */
222 for(n
=0; *lp
!=HUGE
; lp
+=2)
/* both rejections report the same message; their conditions are not visible */
235 error(ERROR_exit(1),"bad list for c/f option");
242 error(ERROR_exit(1),"bad list for c/f option");
250 * cut each line of file <fdin> and put results to <fdout> using list <list>
/*
 * Byte/character mode: read records from fdin and write the parts selected
 * by cut->list to fdout.
 *
 *   cut   - descriptor built by cutinit
 *   fdin  - input stream
 *   fdout - output stream
 *
 * NOTE(review): the return type, braces, several declarations (len, bp, c,
 * z, xx, must) and the conditions that choose between the two scanning
 * variants are missing from this listing; the comments below describe only
 * the statements that are visible.
 */
254 cutcols(Cut_t
* cut
, Sfio_t
* fdin
, Sfio_t
* fdout
)
258 register int ncol
= 0;
259 register const int* lp
= cut
->list
;
261 register int skip
; /* non-zero for don't copy */
/* fixed-length record mode: a non-zero reclen reserves exactly len bytes */
267 if (len
= cut
->reclen
)
268 bp
= sfreserve(fdin
, len
, -1);
/*
 * Read one newline terminated record (presumably the alternative branch;
 * the else keyword is not visible in this listing).
 */
270 bp
= sfgetr(fdin
, '\n', 0);
/* nothing read: retry with SF_LASTR before giving up */
271 if (!bp
&& !(bp
= sfgetr(fdin
, 0, SF_LASTR
)))
/* restart at the head of the list; its first entry seeds both ncol and skip */
275 if (!(ncol
= skip
= *(lp
= cut
->list
)))
/* first scanning variant: w is the smaller of len and ncol */
282 register const char* s
= bp
;
283 register int w
= len
< ncol
? len
: ncol
;
/* mblen gives the byte length of the character at s; <= 0 is handled apart */
290 else if ((z
= mblen(s
, w
)) <= 0)
295 bp
= (char*)(s
= xx
);
/* ncol is reduced to a 0/1 flag */
309 ncol
= !w
&& ncol
>= len
;
/* second scanning variant: w starts at the full record length */
313 register const char* s
= bp
;
314 register int w
= len
;
317 while (w
> 0 && ncol
> 0)
/* bytes without the high bit set are tested without calling mblen */
320 if (!(*s
& 0x80) || (z
= mblen(s
, w
)) <= 0)
327 ncol
= !w
&& (ncol
|| !skip
);
/* c is the number of bytes for this list entry, compared against len */
331 if ((c
= ncol
) > len
)
333 else if (c
== len
&& !skip
)
/* a negative sfwrite result is a write failure */
339 if (sfwrite(fdout
, (char*)bp
, c
) < 0)
/* the list is processed until ncol holds the HUGE sentinel */
349 } while (ncol
!= HUGE
);
/*
 * Terminate the output record unless nlflag is set: a delimiter longer than
 * one byte is written as a string, otherwise the single character is
 * written (the else keyword is not visible in this listing).
 */
350 if (!cut
->nlflag
&& (skip
|| must
|| cut
->reclen
))
352 if (cut
->ldelim
.len
> 1)
353 sfwrite(fdout
, cut
->ldelim
.str
, cut
->ldelim
.len
);
355 sfputc(fdout
, cut
->ldelim
.chr
);
361 * cut each line of file <fdin> and put results to <fdout> using list <list>
362 * stream <fdin> must be line buffered
/*
 * Field mode: read fdin one buffer at a time and write the fields selected
 * by cut->list to fdout.
 *
 *   cut   - descriptor built by cutinit; cut->space classifies every byte
 *           (0 for ordinary bytes, SP_WORD for a single-byte field
 *           delimiter, SP_LINE for the cut->eob byte, SP_WIDE for the upper
 *           half of the table when cut->mb is set)
 *   fdin  - input stream
 *   fdout - output stream
 *
 * NOTE(review): the return type, braces, several declarations (lastchar, w,
 * i, j, mb, fdtmp, offset) and many statements are missing from this
 * listing; the comments below describe only the statements that are
 * visible.
 */
366 cutfields(Cut_t
* cut
, Sfio_t
* fdin
, Sfio_t
* fdout
)
368 register unsigned char *sp
= cut
->space
;
369 register unsigned char *cp
;
370 register unsigned char *wp
;
371 register int c
, nfields
;
372 register const int *lp
= cut
->list
;
373 register unsigned char *copy
;
374 register int nodelim
, empty
, inword
=0;
375 register unsigned char *ep
;
376 unsigned char *bp
, *first
;
382 /* process each buffer */
/* sfvalue reports the size of the buffer that sfreserve returned */
383 while ((bp
= (unsigned char*)sfreserve(fdin
, SF_UNBOUND
, -1)) && (c
= sfvalue(fdin
)) > 0)
/*
 * Remember the byte at offset c and compare it with eob.
 * NOTE(review): presumably that byte is then overwritten with eob as an
 * end-of-buffer sentinel -- the statement is not visible; confirm.
 */
387 if((lastchar
= cp
[c
]) != cut
->eob
)
389 /* process each line in the buffer */
/* restart at the head of the list; its first entry seeds nfields */
397 if (nfields
= *(lp
= cut
->list
))
407 /* skip over non-delimiter characters */
/* dispatch on the class of the next byte and advance cp */
411 switch (c
= sp
[*(unsigned char*)cp
++])
/* decode a multibyte character; the loop body handles a failed decode */
417 while ((c
= mb2wc(w
, cp
, ep
- cp
)) <= 0)
419 /* mb char possibly spanning buffer boundary -- fun stuff */
/* fewer than mbmax() bytes remain before ep */
420 if ((ep
- cp
) < mbmax())
426 if (lastchar
!= cut
->eob
)
429 if ((c
= mb2wc(w
, cp
, ep
- cp
)) > 0)
/* write the pending region from copy up to cp */
435 if ((c
= cp
- copy
) > 0 && sfwrite(fdout
, (char*)copy
, c
) < 0)
/* loop over the bytes left in the current buffer (body not visible) */
438 for (i
= 0; i
<= (ep
- cp
); i
++)
/* fetch the next buffer; failure or an empty buffer is handled apart */
440 if (!(bp
= (unsigned char*)sfreserve(fdin
, SF_UNBOUND
, -1)) || (c
= sfvalue(fdin
)) <= 0)
/* same end-of-buffer check as for the first buffer */
444 if ((lastchar
= cp
[c
]) != cut
->eob
)
/* decode from the mb buffer, which holds j bytes */
450 if ((c
= mb2wc(w
, (char*)mb
, j
)) <= 0)
/* advance cp by c - i and restart first and bp from there */
455 first
= bp
= cp
+= c
- i
;
/* classify the decoded character against the two delimiters */
459 if (w
== cut
->ldelim
.chr
)
460 lastchar
= cut
->ldelim
.chr
;
461 else if (w
!= cut
->wdelim
.chr
)
/* write the c bytes collected in mb */
464 if (sfwrite(fdout
, (char*)mb
, c
) < 0)
479 if (c
== cut
->wdelim
.chr
)
484 if (c
== cut
->ldelim
.chr
)
/* skip bytes whose class is 0; stops at the first classified byte */
498 while (!(c
= sp
[*cp
++]));
501 /* check for end-of-line */
506 if (lastchar
== cut
->ldelim
.chr
)
508 /* restore cut->last character */
509 if (lastchar
!= cut
->eob
)
/* write the region from copy up to wp */
522 if ((c
= wp
- copy
) > 0 && sfwrite(fdout
, (char*)copy
, c
) < 0)
527 /* set to delimiter unless the first field */
528 copy
= empty
? cp
: wp
;
/* rewind the temporary stream and move offset bytes from it to fdout */
540 sfseek(fdtmp
,(Sfoff_t
)0,SEEK_SET
);
541 sfmove(fdtmp
,fdout
,offset
,-1);
/* rewind the temporary stream and reset offset to 0 */
550 sfseek(fdtmp
,offset
=0,SEEK_SET
);
/*
 * Write what is left between copy and cp, unless the line had no delimiter
 * and sflag asks for such lines to be suppressed.
 */
552 if (copy
&& (c
=cp
-copy
)>0 && (!nodelim
|| !cut
->sflag
) && sfwrite(fdout
,(char*)copy
,c
)< 0)
555 /* see whether to save in tmp file */
556 if(inword
&& nodelim
&& !cut
->sflag
&& (c
=cp
-first
)>0)
558 /* copy line to tmpfile in case no fields */
/* the temporary stream comes from sftmp(BLOCK); c bytes from first are saved */
560 fdtmp
= sftmp(BLOCK
);
561 sfwrite(fdtmp
,(char*)first
,c
);
/*
 * Entry point of the cut command.
 *
 *   argc, argv - command line
 *   context    - passed through to cmdinit
 *
 * Parses the options with optget(), builds the descriptor with cutinit(),
 * runs cutfields() or cutcols() over the input files with sfstdout as the
 * output, and returns non-zero when error_info.errors is non-zero.
 *
 * NOTE(review): the return type, braces, local declarations, the case
 * labels of the option switch and several statements are missing from this
 * listing; the comments below describe only the statements that are
 * visible.
 */
571 b_cut(int argc
, char** argv
, void* context
)
573 register char* cp
= 0;
583 cmdinit(argc
, argv
, context
, ERROR_CATALOG
, 0);
/* both delimiters start out with length 1 */
586 wdelim
.len
= ldelim
.len
= 1;
/* option loop: dispatch on the value returned by optget */
589 switch (n
= optget(argv
, usage
))
597 error(2, "f option already specified");
/*
 * Line delimiter: keep the argument text, decode its first character with
 * mbchar, and test whether that character took more than one byte.
 */
607 ldelim
.str
= opt_info
.arg
;
611 ldelim
.chr
= mbchar(s
);
612 if ((n
= s
- opt_info
.arg
) > 1)
/* otherwise the first byte of the argument is used */
618 ldelim
.chr
= *(unsigned char*)opt_info
.arg
;
/* field delimiter: same handling as the line delimiter */
622 wdelim
.str
= opt_info
.arg
;
626 wdelim
.chr
= mbchar(s
);
627 if ((n
= s
- opt_info
.arg
) > 1)
633 wdelim
.chr
= *(unsigned char*)opt_info
.arg
;
/* a byte or character list was already selected */
637 if(mode
&(C_CHARS
|C_BYTES
))
639 error(2, "c option already specified");
/* record length from the numeric option value */
654 reclen
= opt_info
.num
;
/* messages supplied by optget itself */
660 error(2, "%s", opt_info
.arg
);
663 error(ERROR_usage(2), "%s", opt_info
.arg
);
/* step past the options that optget consumed */
668 argv
+= opt_info
.index
;
/* any error recorded so far ends in a usage message */
669 if (error_info
.errors
)
670 error(ERROR_usage(2), "%s",optusage(NiL
));
673 error(2, "b, c or f option must be specified");
674 error(ERROR_usage(2), "%s", optusage(NiL
));
677 error(3, "non-empty b, c or f option must be specified");
/* the suppress bit is set without the fields bit */
678 if((mode
& (C_FIELDS
|C_SUPRESS
)) == C_SUPRESS
)
679 error(3, "s option requires f option");
680 cut
= cutinit(mode
, cp
, &wdelim
, &ldelim
, reclen
);
/*
 * File loop: a missing operand or the operand - is handled apart from a
 * named file (the statement selecting the stream is not visible); a named
 * file is opened for reading.
 */
685 if(!cp
|| streq(cp
,"-"))
687 else if(!(fp
= sfopen(NiL
,cp
,"r")))
689 error(ERROR_system(0),"%s: cannot open",cp
);
/* the condition choosing between field mode and column mode is not visible */
693 cutfields(cut
,fp
,sfstdout
);
695 cutcols(cut
,fp
,sfstdout
);
/* continue with the next operand until argv is exhausted */
698 } while(cp
= *argv
++);
/* flush standard output and report a failed flush */
699 if (sfsync(sfstdout
))
700 error(ERROR_system(0), "write error");
701 return error_info
.errors
!= 0;