all: update gnulib submodule to latest
[coreutils.git] / src / wc.c
blob299a9665f68f49995f9abd69c7f6eb6abe9a3637
1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985-2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <getopt.h>
25 #include <sys/types.h>
26 #include <wchar.h>
27 #include <wctype.h>
29 #include "system.h"
30 #include "argv-iter.h"
31 #include "die.h"
32 #include "error.h"
33 #include "fadvise.h"
34 #include "mbchar.h"
35 #include "physmem.h"
36 #include "readtokens0.h"
37 #include "safe-read.h"
38 #include "stat-size.h"
39 #include "xbinary-io.h"
/* Fallback used when neither a macro nor a library function named
   iswspace is available: a value is treated as white space only when
   it is unchanged by to_uchar and isspace accepts that value.  */
#if !defined iswspace && !HAVE_ISWSPACE
# define iswspace(wc) \
    ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
#endif

/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "wc"

/* Comma-separated author list handed to the version-printing macro.  */
#define AUTHORS \
  proper_name ("Paul Rubin"), \
  proper_name ("David MacKenzie")

/* Number of bytes requested from each read call.  */
#define BUFFER_SIZE (16 * 1024)
/* Running totals of lines, words, chars and bytes over all files
   processed so far.  */
static uintmax_t total_lines;
static uintmax_t total_words;
static uintmax_t total_chars;
static uintmax_t total_bytes;

/* The maximum line length over all files processed so far.  */
static uintmax_t max_line_length;

/* Which counts to print.  */
static bool print_lines;
static bool print_words;
static bool print_chars;
static bool print_bytes;
static bool print_linelength;

/* The print width of each count.  */
static int number_width;

/* True if we have ever read the standard input.  */
static bool have_read_stdin;

/* Used to determine if file size can be determined without reading.  */
static size_t page_size;
/* Outcome of an fstat or stat call on one input, together with the
   status data that call produced.  */
struct fstatus
{
  /* Positive while fstat or stat has not been called yet.  After the
     call, this holds the value that fstat or stat returned.  */
  int failed;

  /* The file's status.  Meaningful only when FAILED is zero.  */
  struct stat st;
};
/* Long options with no short equivalent are identified by pseudo
   short options that are not characters; their values begin at
   CHAR_MAX + 1.  */
enum
{
  FILES0_FROM_OPTION = CHAR_MAX + 1
};
95 static struct option const longopts[] =
97 {"bytes", no_argument, NULL, 'c'},
98 {"chars", no_argument, NULL, 'm'},
99 {"lines", no_argument, NULL, 'l'},
100 {"words", no_argument, NULL, 'w'},
101 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
102 {"max-line-length", no_argument, NULL, 'L'},
103 {GETOPT_HELP_OPTION_DECL},
104 {GETOPT_VERSION_OPTION_DECL},
105 {NULL, 0, NULL, 0}
108 void
109 usage (int status)
111 if (status != EXIT_SUCCESS)
112 emit_try_help ();
113 else
115 printf (_("\
116 Usage: %s [OPTION]... [FILE]...\n\
117 or: %s [OPTION]... --files0-from=F\n\
119 program_name, program_name);
120 fputs (_("\
121 Print newline, word, and byte counts for each FILE, and a total line if\n\
122 more than one FILE is specified. A word is a non-zero-length sequence of\n\
123 characters delimited by white space.\n\
124 "), stdout);
126 emit_stdin_note ();
128 fputs (_("\
130 The options below may be used to select which counts are printed, always in\n\
131 the following order: newline, word, character, byte, maximum line length.\n\
132 -c, --bytes print the byte counts\n\
133 -m, --chars print the character counts\n\
134 -l, --lines print the newline counts\n\
135 "), stdout);
136 fputs (_("\
137 --files0-from=F read input from the files specified by\n\
138 NUL-terminated names in file F;\n\
139 If F is - then read names from standard input\n\
140 -L, --max-line-length print the maximum display width\n\
141 -w, --words print the word counts\n\
142 "), stdout);
143 fputs (HELP_OPTION_DESCRIPTION, stdout);
144 fputs (VERSION_OPTION_DESCRIPTION, stdout);
145 emit_ancillary_info (PROGRAM_NAME);
147 exit (status);
150 /* FILE is the name of the file (or NULL for standard input)
151 associated with the specified counters. */
152 static void
153 write_counts (uintmax_t lines,
154 uintmax_t words,
155 uintmax_t chars,
156 uintmax_t bytes,
157 uintmax_t linelength,
158 const char *file)
160 static char const format_sp_int[] = " %*s";
161 char const *format_int = format_sp_int + 1;
162 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
164 if (print_lines)
166 printf (format_int, number_width, umaxtostr (lines, buf));
167 format_int = format_sp_int;
169 if (print_words)
171 printf (format_int, number_width, umaxtostr (words, buf));
172 format_int = format_sp_int;
174 if (print_chars)
176 printf (format_int, number_width, umaxtostr (chars, buf));
177 format_int = format_sp_int;
179 if (print_bytes)
181 printf (format_int, number_width, umaxtostr (bytes, buf));
182 format_int = format_sp_int;
184 if (print_linelength)
186 printf (format_int, number_width, umaxtostr (linelength, buf));
188 if (file)
189 printf (" %s", strchr (file, '\n') ? quotef (file) : file);
190 putchar ('\n');
193 /* Count words. FILE_X is the name of the file (or NULL for standard
194 input) that is open on descriptor FD. *FSTATUS is its status.
195 CURRENT_POS is the current file offset if known, negative if unknown.
196 Return true if successful. */
197 static bool
198 wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
200 bool ok = true;
201 char buf[BUFFER_SIZE + 1];
202 size_t bytes_read;
203 uintmax_t lines, words, chars, bytes, linelength;
204 bool count_bytes, count_chars, count_complicated;
205 char const *file = file_x ? file_x : _("standard input");
207 lines = words = chars = bytes = linelength = 0;
209 /* If in the current locale, chars are equivalent to bytes, we prefer
210 counting bytes, because that's easier. */
211 #if MB_LEN_MAX > 1
212 if (MB_CUR_MAX > 1)
214 count_bytes = print_bytes;
215 count_chars = print_chars;
217 else
218 #endif
220 count_bytes = print_bytes || print_chars;
221 count_chars = false;
223 count_complicated = print_words || print_linelength;
225 /* Advise the kernel of our access pattern only if we will read(). */
226 if (!count_bytes || count_chars || print_lines || count_complicated)
227 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
229 /* When counting only bytes, save some line- and word-counting
230 overhead. If FD is a 'regular' Unix file, using lseek is enough
231 to get its 'size' in bytes. Otherwise, read blocks of BUFFER_SIZE
232 bytes at a time until EOF. Note that the 'size' (number of bytes)
233 that wc reports is smaller than stats.st_size when the file is not
234 positioned at its beginning. That's why the lseek calls below are
235 necessary. For example the command
236 '(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
237 should make wc report '0' bytes. */
239 if (count_bytes && !count_chars && !print_lines && !count_complicated)
241 bool skip_read = false;
243 if (0 < fstatus->failed)
244 fstatus->failed = fstat (fd, &fstatus->st);
246 /* For sized files, seek to one st_blksize before EOF rather than to EOF.
247 This works better for files in proc-like file systems where
248 the size is only approximate. */
249 if (! fstatus->failed && usable_st_size (&fstatus->st)
250 && 0 <= fstatus->st.st_size)
252 size_t end_pos = fstatus->st.st_size;
253 if (current_pos < 0)
254 current_pos = lseek (fd, 0, SEEK_CUR);
256 if (end_pos % page_size)
258 /* We only need special handling of /proc and /sys files etc.
259 when they're a multiple of PAGE_SIZE. In the common case
260 for files with st_size not a multiple of PAGE_SIZE,
261 it's more efficient and accurate to use st_size.
263 Be careful here. The current position may actually be
264 beyond the end of the file. As in the example above. */
266 bytes = end_pos < current_pos ? 0 : end_pos - current_pos;
267 skip_read = true;
269 else
271 off_t hi_pos = end_pos - end_pos % (ST_BLKSIZE (fstatus->st) + 1);
272 if (0 <= current_pos && current_pos < hi_pos
273 && 0 <= lseek (fd, hi_pos, SEEK_CUR))
274 bytes = hi_pos - current_pos;
278 if (! skip_read)
280 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
281 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
283 if (bytes_read == SAFE_READ_ERROR)
285 error (0, errno, "%s", quotef (file));
286 ok = false;
287 break;
289 bytes += bytes_read;
293 else if (!count_chars && !count_complicated)
295 /* Use a separate loop when counting only lines or lines and bytes --
296 but not chars or words. */
297 bool long_lines = false;
298 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
300 if (bytes_read == SAFE_READ_ERROR)
302 error (0, errno, "%s", quotef (file));
303 ok = false;
304 break;
307 bytes += bytes_read;
309 char *p = buf;
310 char *end = p + bytes_read;
311 uintmax_t plines = lines;
313 if (! long_lines)
315 /* Avoid function call overhead for shorter lines. */
316 while (p != end)
317 lines += *p++ == '\n';
319 else
321 /* memchr is more efficient with longer lines. */
322 while ((p = memchr (p, '\n', end - p)))
324 ++p;
325 ++lines;
329 /* If the average line length in the block is >= 15, then use
330 memchr for the next block, where system specific optimizations
331 may outweigh function call overhead.
332 FIXME: This line length was determined in 2015, on both
333 x86_64 and ppc64, but it's worth re-evaluating in future with
334 newer compilers, CPUs, or memchr() implementations etc. */
335 if (lines - plines <= bytes_read / 15)
336 long_lines = true;
337 else
338 long_lines = false;
341 #if MB_LEN_MAX > 1
342 # define SUPPORT_OLD_MBRTOWC 1
343 else if (MB_CUR_MAX > 1)
345 bool in_word = false;
346 uintmax_t linepos = 0;
347 mbstate_t state = { 0, };
348 bool in_shift = false;
349 # if SUPPORT_OLD_MBRTOWC
350 /* Back-up the state before each multibyte character conversion and
351 move the last incomplete character of the buffer to the front
352 of the buffer. This is needed because we don't know whether
353 the 'mbrtowc' function updates the state when it returns -2, --
354 this is the ISO C 99 and glibc-2.2 behaviour - or not - amended
355 ANSI C, glibc-2.1 and Solaris 5.7 behaviour. We don't have an
356 autoconf test for this, yet. */
357 size_t prev = 0; /* number of bytes carried over from previous round */
358 # else
359 const size_t prev = 0;
360 # endif
362 while ((bytes_read = safe_read (fd, buf + prev, BUFFER_SIZE - prev)) > 0)
364 const char *p;
365 # if SUPPORT_OLD_MBRTOWC
366 mbstate_t backup_state;
367 # endif
368 if (bytes_read == SAFE_READ_ERROR)
370 error (0, errno, "%s", quotef (file));
371 ok = false;
372 break;
375 bytes += bytes_read;
376 p = buf;
377 bytes_read += prev;
380 wchar_t wide_char;
381 size_t n;
383 if (!in_shift && is_basic (*p))
385 /* Handle most ASCII characters quickly, without calling
386 mbrtowc(). */
387 n = 1;
388 wide_char = *p;
390 else
392 in_shift = true;
393 # if SUPPORT_OLD_MBRTOWC
394 backup_state = state;
395 # endif
396 n = mbrtowc (&wide_char, p, bytes_read, &state);
397 if (n == (size_t) -2)
399 # if SUPPORT_OLD_MBRTOWC
400 state = backup_state;
401 # endif
402 break;
404 if (n == (size_t) -1)
406 /* Remember that we read a byte, but don't complain
407 about the error. Because of the decoding error,
408 this is a considered to be byte but not a
409 character (that is, chars is not incremented). */
410 p++;
411 bytes_read--;
412 continue;
414 if (mbsinit (&state))
415 in_shift = false;
416 if (n == 0)
418 wide_char = 0;
419 n = 1;
422 p += n;
423 bytes_read -= n;
424 chars++;
425 switch (wide_char)
427 case '\n':
428 lines++;
429 FALLTHROUGH;
430 case '\r':
431 case '\f':
432 if (linepos > linelength)
433 linelength = linepos;
434 linepos = 0;
435 goto mb_word_separator;
436 case '\t':
437 linepos += 8 - (linepos % 8);
438 goto mb_word_separator;
439 case ' ':
440 linepos++;
441 FALLTHROUGH;
442 case '\v':
443 mb_word_separator:
444 words += in_word;
445 in_word = false;
446 break;
447 default:
448 if (iswprint (wide_char))
450 int width = wcwidth (wide_char);
451 if (width > 0)
452 linepos += width;
453 if (iswspace (wide_char))
454 goto mb_word_separator;
455 in_word = true;
457 break;
460 while (bytes_read > 0);
462 # if SUPPORT_OLD_MBRTOWC
463 if (bytes_read > 0)
465 if (bytes_read == BUFFER_SIZE)
467 /* Encountered a very long redundant shift sequence. */
468 p++;
469 bytes_read--;
471 memmove (buf, p, bytes_read);
473 prev = bytes_read;
474 # endif
476 if (linepos > linelength)
477 linelength = linepos;
478 words += in_word;
480 #endif
481 else
483 bool in_word = false;
484 uintmax_t linepos = 0;
486 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
488 const char *p = buf;
489 if (bytes_read == SAFE_READ_ERROR)
491 error (0, errno, "%s", quotef (file));
492 ok = false;
493 break;
496 bytes += bytes_read;
499 switch (*p++)
501 case '\n':
502 lines++;
503 FALLTHROUGH;
504 case '\r':
505 case '\f':
506 if (linepos > linelength)
507 linelength = linepos;
508 linepos = 0;
509 goto word_separator;
510 case '\t':
511 linepos += 8 - (linepos % 8);
512 goto word_separator;
513 case ' ':
514 linepos++;
515 FALLTHROUGH;
516 case '\v':
517 word_separator:
518 words += in_word;
519 in_word = false;
520 break;
521 default:
522 if (isprint (to_uchar (p[-1])))
524 linepos++;
525 if (isspace (to_uchar (p[-1])))
526 goto word_separator;
527 in_word = true;
529 break;
532 while (--bytes_read);
534 if (linepos > linelength)
535 linelength = linepos;
536 words += in_word;
539 if (count_chars < print_chars)
540 chars = bytes;
542 write_counts (lines, words, chars, bytes, linelength, file_x);
543 total_lines += lines;
544 total_words += words;
545 total_chars += chars;
546 total_bytes += bytes;
547 if (linelength > max_line_length)
548 max_line_length = linelength;
550 return ok;
553 static bool
554 wc_file (char const *file, struct fstatus *fstatus)
556 if (! file || STREQ (file, "-"))
558 have_read_stdin = true;
559 xset_binary_mode (STDIN_FILENO, O_BINARY);
560 return wc (STDIN_FILENO, file, fstatus, -1);
562 else
564 int fd = open (file, O_RDONLY | O_BINARY);
565 if (fd == -1)
567 error (0, errno, "%s", quotef (file));
568 return false;
570 else
572 bool ok = wc (fd, file, fstatus, 0);
573 if (close (fd) != 0)
575 error (0, errno, "%s", quotef (file));
576 return false;
578 return ok;
583 /* Return the file status for the NFILES files addressed by FILE.
584 Optimize the case where only one number is printed, for just one
585 file; in that case we can use a print width of 1, so we don't need
586 to stat the file. Handle the case of (nfiles == 0) in the same way;
587 that happens when we don't know how long the list of file names will be. */
589 static struct fstatus *
590 get_input_fstatus (size_t nfiles, char *const *file)
592 struct fstatus *fstatus = xnmalloc (nfiles ? nfiles : 1, sizeof *fstatus);
594 if (nfiles == 0
595 || (nfiles == 1
596 && ((print_lines + print_words + print_chars
597 + print_bytes + print_linelength)
598 == 1)))
599 fstatus[0].failed = 1;
600 else
602 for (size_t i = 0; i < nfiles; i++)
603 fstatus[i].failed = (! file[i] || STREQ (file[i], "-")
604 ? fstat (STDIN_FILENO, &fstatus[i].st)
605 : stat (file[i], &fstatus[i].st));
608 return fstatus;
611 /* Return a print width suitable for the NFILES files whose status is
612 recorded in FSTATUS. Optimize the same special case that
613 get_input_fstatus optimizes. */
615 static int _GL_ATTRIBUTE_PURE
616 compute_number_width (size_t nfiles, struct fstatus const *fstatus)
618 int width = 1;
620 if (0 < nfiles && fstatus[0].failed <= 0)
622 int minimum_width = 1;
623 uintmax_t regular_total = 0;
625 for (size_t i = 0; i < nfiles; i++)
626 if (! fstatus[i].failed)
628 if (S_ISREG (fstatus[i].st.st_mode))
629 regular_total += fstatus[i].st.st_size;
630 else
631 minimum_width = 7;
634 for (; 10 <= regular_total; regular_total /= 10)
635 width++;
636 if (width < minimum_width)
637 width = minimum_width;
640 return width;
645 main (int argc, char **argv)
647 bool ok;
648 int optc;
649 size_t nfiles;
650 char **files;
651 char *files_from = NULL;
652 struct fstatus *fstatus;
653 struct Tokens tok;
655 initialize_main (&argc, &argv);
656 set_program_name (argv[0]);
657 setlocale (LC_ALL, "");
658 bindtextdomain (PACKAGE, LOCALEDIR);
659 textdomain (PACKAGE);
661 atexit (close_stdout);
663 page_size = getpagesize ();
664 /* Line buffer stdout to ensure lines are written atomically and immediately
665 so that processes running in parallel do not intersperse their output. */
666 setvbuf (stdout, NULL, _IOLBF, 0);
668 print_lines = print_words = print_chars = print_bytes = false;
669 print_linelength = false;
670 total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
672 while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
673 switch (optc)
675 case 'c':
676 print_bytes = true;
677 break;
679 case 'm':
680 print_chars = true;
681 break;
683 case 'l':
684 print_lines = true;
685 break;
687 case 'w':
688 print_words = true;
689 break;
691 case 'L':
692 print_linelength = true;
693 break;
695 case FILES0_FROM_OPTION:
696 files_from = optarg;
697 break;
699 case_GETOPT_HELP_CHAR;
701 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
703 default:
704 usage (EXIT_FAILURE);
707 if (! (print_lines || print_words || print_chars || print_bytes
708 || print_linelength))
709 print_lines = print_words = print_bytes = true;
711 bool read_tokens = false;
712 struct argv_iterator *ai;
713 if (files_from)
715 FILE *stream;
717 /* When using --files0-from=F, you may not specify any files
718 on the command-line. */
719 if (optind < argc)
721 error (0, 0, _("extra operand %s"), quoteaf (argv[optind]));
722 fprintf (stderr, "%s\n",
723 _("file operands cannot be combined with --files0-from"));
724 usage (EXIT_FAILURE);
727 if (STREQ (files_from, "-"))
728 stream = stdin;
729 else
731 stream = fopen (files_from, "r");
732 if (stream == NULL)
733 die (EXIT_FAILURE, errno, _("cannot open %s for reading"),
734 quoteaf (files_from));
737 /* Read the file list into RAM if we can detect its size and that
738 size is reasonable. Otherwise, we'll read a name at a time. */
739 struct stat st;
740 if (fstat (fileno (stream), &st) == 0
741 && S_ISREG (st.st_mode)
742 && st.st_size <= MIN (10 * 1024 * 1024, physmem_available () / 2))
744 read_tokens = true;
745 readtokens0_init (&tok);
746 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
747 die (EXIT_FAILURE, 0, _("cannot read file names from %s"),
748 quoteaf (files_from));
749 files = tok.tok;
750 nfiles = tok.n_tok;
751 ai = argv_iter_init_argv (files);
753 else
755 files = NULL;
756 nfiles = 0;
757 ai = argv_iter_init_stream (stream);
760 else
762 static char *stdin_only[] = { NULL };
763 files = (optind < argc ? argv + optind : stdin_only);
764 nfiles = (optind < argc ? argc - optind : 1);
765 ai = argv_iter_init_argv (files);
768 if (!ai)
769 xalloc_die ();
771 fstatus = get_input_fstatus (nfiles, files);
772 number_width = compute_number_width (nfiles, fstatus);
774 ok = true;
775 for (int i = 0; /* */; i++)
777 bool skip_file = false;
778 enum argv_iter_err ai_err;
779 char *file_name = argv_iter (ai, &ai_err);
780 if (!file_name)
782 switch (ai_err)
784 case AI_ERR_EOF:
785 goto argv_iter_done;
786 case AI_ERR_READ:
787 error (0, errno, _("%s: read error"),
788 quotef (files_from));
789 ok = false;
790 goto argv_iter_done;
791 case AI_ERR_MEM:
792 xalloc_die ();
793 default:
794 assert (!"unexpected error code from argv_iter");
797 if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
799 /* Give a better diagnostic in an unusual case:
800 printf - | wc --files0-from=- */
801 error (0, 0, _("when reading file names from stdin, "
802 "no file name of %s allowed"),
803 quoteaf (file_name));
804 skip_file = true;
807 if (!file_name[0])
809 /* Diagnose a zero-length file name. When it's one
810 among many, knowing the record number may help.
811 FIXME: currently print the record number only with
812 --files0-from=FILE. Maybe do it for argv, too? */
813 if (files_from == NULL)
814 error (0, 0, "%s", _("invalid zero-length file name"));
815 else
817 /* Using the standard 'filename:line-number:' prefix here is
818 not totally appropriate, since NUL is the separator, not NL,
819 but it might be better than nothing. */
820 unsigned long int file_number = argv_iter_n_args (ai);
821 error (0, 0, "%s:%lu: %s", quotef (files_from),
822 file_number, _("invalid zero-length file name"));
824 skip_file = true;
827 if (skip_file)
828 ok = false;
829 else
830 ok &= wc_file (file_name, &fstatus[nfiles ? i : 0]);
832 if (! nfiles)
833 fstatus[0].failed = 1;
835 argv_iter_done:
837 /* No arguments on the command line is fine. That means read from stdin.
838 However, no arguments on the --files0-from input stream is an error
839 means don't read anything. */
840 if (ok && !files_from && argv_iter_n_args (ai) == 0)
841 ok &= wc_file (NULL, &fstatus[0]);
843 if (read_tokens)
844 readtokens0_free (&tok);
846 if (1 < argv_iter_n_args (ai))
847 write_counts (total_lines, total_words, total_chars, total_bytes,
848 max_line_length, _("total"));
850 argv_iter_free (ai);
852 free (fstatus);
854 if (have_read_stdin && close (STDIN_FILENO) != 0)
855 die (EXIT_FAILURE, errno, "-");
857 return ok ? EXIT_SUCCESS : EXIT_FAILURE;