1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985, 1991, 1995-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
25 #include <sys/types.h>
30 #include "argv-iter.h"
37 #include "readtokens0.h"
38 #include "safe-read.h"
41 #if !defined iswspace && !HAVE_ISWSPACE
42 # define iswspace(wc) \
43 ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
46 /* The official name of this program (e.g., no `g' prefix). */
47 #define PROGRAM_NAME "wc"
50 proper_name ("Paul Rubin"), \
51 proper_name ("David MacKenzie")
53 /* Size of atomic reads. */
54 #define BUFFER_SIZE (16 * 1024)
56 /* Cumulative number of lines, words, chars and bytes in all files so far.
57 max_line_length is the maximum over all files processed so far. */
58 static uintmax_t total_lines
;
59 static uintmax_t total_words
;
60 static uintmax_t total_chars
;
61 static uintmax_t total_bytes
;
62 static uintmax_t max_line_length
;
64 /* Which counts to print. */
65 static bool print_lines
, print_words
, print_chars
, print_bytes
;
66 static bool print_linelength
;
68 /* The print width of each count. */
69 static int number_width
;
71 /* True if we have ever read the standard input. */
72 static bool have_read_stdin
;
74 /* The result of calling fstat or stat on a file descriptor or file. */
77 /* If positive, fstat or stat has not been called yet. Otherwise,
78 this is the value returned from fstat or stat. */
81 /* If FAILED is zero, this is the file's status. */
85 /* For long options that have no equivalent short option, use a
86 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
89 FILES0_FROM_OPTION
= CHAR_MAX
+ 1
92 static struct option
const longopts
[] =
94 {"bytes", no_argument
, NULL
, 'c'},
95 {"chars", no_argument
, NULL
, 'm'},
96 {"lines", no_argument
, NULL
, 'l'},
97 {"words", no_argument
, NULL
, 'w'},
98 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
99 {"max-line-length", no_argument
, NULL
, 'L'},
100 {GETOPT_HELP_OPTION_DECL
},
101 {GETOPT_VERSION_OPTION_DECL
},
108 if (status
!= EXIT_SUCCESS
)
109 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
114 Usage: %s [OPTION]... [FILE]...\n\
115 or: %s [OPTION]... --files0-from=F\n\
117 program_name
, program_name
);
119 Print newline, word, and byte counts for each FILE, and a total line if\n\
120 more than one FILE is specified. With no FILE, or when FILE is -,\n\
121 read standard input. A word is a non-zero-length sequence of characters\n\
122 delimited by white space.\n\
123 The options below may be used to select which counts are printed, always in\n\
124 the following order: newline, word, character, byte, maximum line length.\n\
125 -c, --bytes print the byte counts\n\
126 -m, --chars print the character counts\n\
127 -l, --lines print the newline counts\n\
130 --files0-from=F read input from the files specified by\n\
131 NUL-terminated names in file F;\n\
132 If F is - then read names from standard input\n\
133 -L, --max-line-length print the length of the longest line\n\
134 -w, --words print the word counts\n\
136 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
137 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
138 emit_ancillary_info ();
143 /* FILE is the name of the file (or NULL for standard input)
144 associated with the specified counters. */
146 write_counts (uintmax_t lines
,
150 uintmax_t linelength
,
153 static char const format_sp_int
[] = " %*s";
154 char const *format_int
= format_sp_int
+ 1;
155 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
159 printf (format_int
, number_width
, umaxtostr (lines
, buf
));
160 format_int
= format_sp_int
;
164 printf (format_int
, number_width
, umaxtostr (words
, buf
));
165 format_int
= format_sp_int
;
169 printf (format_int
, number_width
, umaxtostr (chars
, buf
));
170 format_int
= format_sp_int
;
174 printf (format_int
, number_width
, umaxtostr (bytes
, buf
));
175 format_int
= format_sp_int
;
177 if (print_linelength
)
179 printf (format_int
, number_width
, umaxtostr (linelength
, buf
));
182 printf (" %s", file
);
186 /* Count lines, words, characters, bytes and the maximum line length,
as needed. FILE_X is the name of the file (or NULL for standard
187 input) that is open on descriptor FD. *FSTATUS is its status.
188 Return true if successful. */
190 wc (int fd
, char const *file_x
, struct fstatus
*fstatus
)
193 char buf
[BUFFER_SIZE
+ 1];
195 uintmax_t lines
, words
, chars
, bytes
, linelength
;
196 bool count_bytes
, count_chars
, count_complicated
;
197 char const *file
= file_x
? file_x
: _("standard input");
199 lines
= words
= chars
= bytes
= linelength
= 0;
201 /* If in the current locale, chars are equivalent to bytes, we prefer
202 counting bytes, because that's easier. */
206 count_bytes
= print_bytes
;
207 count_chars
= print_chars
;
212 count_bytes
= print_bytes
|| print_chars
;
215 count_complicated
= print_words
|| print_linelength
;
217 /* Advise the kernel of our access pattern only if we will read(). */
218 if (!count_bytes
|| count_chars
|| print_lines
|| count_complicated
)
219 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
221 /* When counting only bytes, save some line- and word-counting
222 overhead. If FD is a `regular' Unix file, using lseek is enough
223 to get its `size' in bytes. Otherwise, read blocks of BUFFER_SIZE
224 bytes at a time until EOF. Note that the `size' (number of bytes)
225 that wc reports is smaller than stats.st_size when the file is not
226 positioned at its beginning. That's why the lseek calls below are
227 necessary. For example the command
228 `(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
229 should make wc report `0' bytes. */
231 if (count_bytes
&& !count_chars
&& !print_lines
&& !count_complicated
)
233 off_t current_pos
, end_pos
;
235 if (0 < fstatus
->failed
)
236 fstatus
->failed
= fstat (fd
, &fstatus
->st
);
238 if (! fstatus
->failed
&& S_ISREG (fstatus
->st
.st_mode
)
239 && (current_pos
= lseek (fd
, 0, SEEK_CUR
)) != -1
240 && (end_pos
= lseek (fd
, 0, SEEK_END
)) != -1)
242 /* Be careful here. The current position may actually be
243 beyond the end of the file, as in the example above. */
244 bytes
= end_pos
< current_pos
? 0 : end_pos
- current_pos
;
248 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
249 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
251 if (bytes_read
== SAFE_READ_ERROR
)
253 error (0, errno
, "%s", file
);
261 else if (!count_chars
&& !count_complicated
)
263 /* Use a separate loop when counting only lines or lines and bytes --
264 but not chars or words. */
265 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
269 if (bytes_read
== SAFE_READ_ERROR
)
271 error (0, errno
, "%s", file
);
276 while ((p
= memchr (p
, '\n', (buf
+ bytes_read
) - p
)))
285 # define SUPPORT_OLD_MBRTOWC 1
286 else if (MB_CUR_MAX
> 1)
288 bool in_word
= false;
289 uintmax_t linepos
= 0;
290 mbstate_t state
= { 0, };
291 bool in_shift
= false;
292 # if SUPPORT_OLD_MBRTOWC
293 /* Back-up the state before each multibyte character conversion and
294 move the last incomplete character of the buffer to the front
295 of the buffer. This is needed because we don't know whether
296 the `mbrtowc' function updates the state when it returns -2
297 (the ISO C 99 and glibc-2.2 behaviour) or not (the amended
298 ANSI C, glibc-2.1 and Solaris 5.7 behaviour). We don't have an
299 autoconf test for this, yet. */
300 size_t prev
= 0; /* number of bytes carried over from previous round */
302 const size_t prev
= 0;
305 while ((bytes_read
= safe_read (fd
, buf
+ prev
, BUFFER_SIZE
- prev
)) > 0)
308 # if SUPPORT_OLD_MBRTOWC
309 mbstate_t backup_state
;
311 if (bytes_read
== SAFE_READ_ERROR
)
313 error (0, errno
, "%s", file
);
326 if (!in_shift
&& is_basic (*p
))
328 /* Handle most ASCII characters quickly, without calling
336 # if SUPPORT_OLD_MBRTOWC
337 backup_state
= state
;
339 n
= mbrtowc (&wide_char
, p
, bytes_read
, &state
);
340 if (n
== (size_t) -2)
342 # if SUPPORT_OLD_MBRTOWC
343 state
= backup_state
;
347 if (n
== (size_t) -1)
349 /* Remember that we read a byte, but don't complain
350 about the error. Because of the decoding error,
351 this is considered to be a byte but not a
352 character (that is, chars is not incremented). */
357 if (mbsinit (&state
))
375 if (linepos
> linelength
)
376 linelength
= linepos
;
378 goto mb_word_separator
;
380 linepos
+= 8 - (linepos
% 8);
381 goto mb_word_separator
;
391 if (iswprint (wide_char
))
393 int width
= wcwidth (wide_char
);
396 if (iswspace (wide_char
))
397 goto mb_word_separator
;
403 while (bytes_read
> 0);
405 # if SUPPORT_OLD_MBRTOWC
408 if (bytes_read
== BUFFER_SIZE
)
410 /* Encountered a very long redundant shift sequence. */
414 memmove (buf
, p
, bytes_read
);
419 if (linepos
> linelength
)
420 linelength
= linepos
;
426 bool in_word
= false;
427 uintmax_t linepos
= 0;
429 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
432 if (bytes_read
== SAFE_READ_ERROR
)
434 error (0, errno
, "%s", file
);
449 if (linepos
> linelength
)
450 linelength
= linepos
;
454 linepos
+= 8 - (linepos
% 8);
465 if (isprint (to_uchar (p
[-1])))
468 if (isspace (to_uchar (p
[-1])))
475 while (--bytes_read
);
477 if (linepos
> linelength
)
478 linelength
= linepos
;
482 if (count_chars
< print_chars
)
485 write_counts (lines
, words
, chars
, bytes
, linelength
, file_x
);
486 total_lines
+= lines
;
487 total_words
+= words
;
488 total_chars
+= chars
;
489 total_bytes
+= bytes
;
490 if (linelength
> max_line_length
)
491 max_line_length
= linelength
;
497 wc_file (char const *file
, struct fstatus
*fstatus
)
499 if (! file
|| STREQ (file
, "-"))
501 have_read_stdin
= true;
502 if (O_BINARY
&& ! isatty (STDIN_FILENO
))
503 xfreopen (NULL
, "rb", stdin
);
504 return wc (STDIN_FILENO
, file
, fstatus
);
508 int fd
= open (file
, O_RDONLY
| O_BINARY
);
511 error (0, errno
, "%s", file
);
516 bool ok
= wc (fd
, file
, fstatus
);
519 error (0, errno
, "%s", file
);
527 /* Return the file status for the NFILES files addressed by FILE.
528 Optimize the case where only one number is printed, for just one
529 file; in that case we can use a print width of 1, so we don't need
530 to stat the file. Handle the case of (nfiles == 0) in the same way;
531 that happens when we don't know how long the list of file names will be. */
533 static struct fstatus
*
534 get_input_fstatus (int nfiles
, char *const *file
)
536 struct fstatus
*fstatus
= xnmalloc (nfiles
? nfiles
: 1, sizeof *fstatus
);
540 && ((print_lines
+ print_words
+ print_chars
541 + print_bytes
+ print_linelength
)
543 fstatus
[0].failed
= 1;
548 for (i
= 0; i
< nfiles
; i
++)
549 fstatus
[i
].failed
= (! file
[i
] || STREQ (file
[i
], "-")
550 ? fstat (STDIN_FILENO
, &fstatus
[i
].st
)
551 : stat (file
[i
], &fstatus
[i
].st
));
557 /* Return a print width suitable for the NFILES files whose status is
558 recorded in FSTATUS. Optimize the same special case that
559 get_input_fstatus optimizes. */
561 static int _GL_ATTRIBUTE_PURE
562 compute_number_width (int nfiles
, struct fstatus
const *fstatus
)
566 if (0 < nfiles
&& fstatus
[0].failed
<= 0)
568 int minimum_width
= 1;
569 uintmax_t regular_total
= 0;
572 for (i
= 0; i
< nfiles
; i
++)
573 if (! fstatus
[i
].failed
)
575 if (S_ISREG (fstatus
[i
].st
.st_mode
))
576 regular_total
+= fstatus
[i
].st
.st_size
;
581 for (; 10 <= regular_total
; regular_total
/= 10)
583 if (width
< minimum_width
)
584 width
= minimum_width
;
592 main (int argc
, char **argv
)
598 char *files_from
= NULL
;
599 struct fstatus
*fstatus
;
602 initialize_main (&argc
, &argv
);
603 set_program_name (argv
[0]);
604 setlocale (LC_ALL
, "");
605 bindtextdomain (PACKAGE
, LOCALEDIR
);
606 textdomain (PACKAGE
);
608 atexit (close_stdout
);
610 /* Line buffer stdout to ensure lines are written atomically and immediately
611 so that processes running in parallel do not intersperse their output. */
612 setvbuf (stdout
, NULL
, _IOLBF
, 0);
614 print_lines
= print_words
= print_chars
= print_bytes
= false;
615 print_linelength
= false;
616 total_lines
= total_words
= total_chars
= total_bytes
= max_line_length
= 0;
618 while ((optc
= getopt_long (argc
, argv
, "clLmw", longopts
, NULL
)) != -1)
638 print_linelength
= true;
641 case FILES0_FROM_OPTION
:
645 case_GETOPT_HELP_CHAR
;
647 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
650 usage (EXIT_FAILURE
);
653 if (! (print_lines
|| print_words
|| print_chars
|| print_bytes
654 || print_linelength
))
655 print_lines
= print_words
= print_bytes
= true;
657 bool read_tokens
= false;
658 struct argv_iterator
*ai
;
663 /* When using --files0-from=F, you may not specify any files
664 on the command-line. */
667 error (0, 0, _("extra operand %s"), quote (argv
[optind
]));
668 fprintf (stderr
, "%s\n",
669 _("file operands cannot be combined with --files0-from"));
670 usage (EXIT_FAILURE
);
673 if (STREQ (files_from
, "-"))
677 stream
= fopen (files_from
, "r");
679 error (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
683 /* Read the file list into RAM if we can detect its size and that
684 size is reasonable. Otherwise, we'll read a name at a time. */
686 if (fstat (fileno (stream
), &st
) == 0
687 && S_ISREG (st
.st_mode
)
688 && st
.st_size
<= MIN (10 * 1024 * 1024, physmem_available () / 2))
691 readtokens0_init (&tok
);
692 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
693 error (EXIT_FAILURE
, 0, _("cannot read file names from %s"),
697 ai
= argv_iter_init_argv (files
);
703 ai
= argv_iter_init_stream (stream
);
708 static char *stdin_only
[] = { NULL
};
709 files
= (optind
< argc
? argv
+ optind
: stdin_only
);
710 nfiles
= (optind
< argc
? argc
- optind
: 1);
711 ai
= argv_iter_init_argv (files
);
717 fstatus
= get_input_fstatus (nfiles
, files
);
718 number_width
= compute_number_width (nfiles
, fstatus
);
722 for (i
= 0; /* */; i
++)
724 bool skip_file
= false;
725 enum argv_iter_err ai_err
;
726 char *file_name
= argv_iter (ai
, &ai_err
);
734 error (0, errno
, _("%s: read error"),
735 quotearg_colon (files_from
));
741 assert (!"unexpected error code from argv_iter");
744 if (files_from
&& STREQ (files_from
, "-") && STREQ (file_name
, "-"))
746 /* Give a better diagnostic in an unusual case:
747 printf - | wc --files0-from=- */
748 error (0, 0, _("when reading file names from stdin, "
749 "no file name of %s allowed"),
756 /* Diagnose a zero-length file name. When it's one
757 among many, knowing the record number may help.
758 FIXME: currently print the record number only with
759 --files0-from=FILE. Maybe do it for argv, too? */
760 if (files_from
== NULL
)
761 error (0, 0, "%s", _("invalid zero-length file name"));
764 /* Using the standard `filename:line-number:' prefix here is
765 not totally appropriate, since NUL is the separator, not NL,
766 but it might be better than nothing. */
767 unsigned long int file_number
= argv_iter_n_args (ai
);
768 error (0, 0, "%s:%lu: %s", quotearg_colon (files_from
),
769 file_number
, _("invalid zero-length file name"));
777 ok
&= wc_file (file_name
, &fstatus
[nfiles
? i
: 0]);
781 /* No arguments on the command line is fine. That means read from stdin.
782 However, no arguments on the --files0-from input stream
783 means don't read anything. */
784 if (ok
&& !files_from
&& argv_iter_n_args (ai
) == 0)
785 ok
&= wc_file (NULL
, &fstatus
[0]);
788 readtokens0_free (&tok
);
790 if (1 < argv_iter_n_args (ai
))
791 write_counts (total_lines
, total_words
, total_chars
, total_bytes
,
792 max_line_length
, _("total"));
798 if (have_read_stdin
&& close (STDIN_FILENO
) != 0)
799 error (EXIT_FAILURE
, errno
, "-");
801 exit (ok
? EXIT_SUCCESS
: EXIT_FAILURE
);