maint: revert "build: update gnulib submodule to latest"
[coreutils/ericb.git] / src / wc.c
blob43b46a3b9ca0bae721b8f8089ec1690e36ead92a
/* wc - print the number of lines, words, and bytes in files
   Copyright (C) 1985, 1991, 1995-2011 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>. */

/* Written by Paul Rubin, phr@ocf.berkeley.edu
   and David MacKenzie, djm@gnu.ai.mit.edu. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <getopt.h>
25 #include <sys/types.h>
26 #include <wchar.h>
27 #include <wctype.h>
29 #include "system.h"
30 #include "argv-iter.h"
31 #include "error.h"
32 #include "fadvise.h"
33 #include "mbchar.h"
34 #include "physmem.h"
35 #include "quote.h"
36 #include "quotearg.h"
37 #include "readtokens0.h"
38 #include "safe-read.h"
39 #include "xfreopen.h"
/* Fallback for systems without iswspace: treat WC as white space only
   when it fits in an unsigned char and isspace accepts it.  */
#if !defined iswspace && !HAVE_ISWSPACE
# define iswspace(wc) \
    ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
#endif

/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "wc"

#define AUTHORS \
  proper_name ("Paul Rubin"), \
  proper_name ("David MacKenzie")

/* Size of atomic reads.  */
#define BUFFER_SIZE (16 * 1024)
/* Cumulative number of lines, words, chars and bytes in all files so far.
   max_line_length is the maximum over all files processed so far.  */
static uintmax_t total_lines;
static uintmax_t total_words;
static uintmax_t total_chars;
static uintmax_t total_bytes;
static uintmax_t max_line_length;

/* Which counts to print.  */
static bool print_lines, print_words, print_chars, print_bytes;
static bool print_linelength;

/* The print width of each count.  */
static int number_width;

/* True if we have ever read the standard input.  */
static bool have_read_stdin;
/* The result of calling fstat or stat on a file descriptor or file.  */
struct fstatus
{
  /* If positive, fstat or stat has not been called yet.  Otherwise,
     this is the value returned from fstat or stat.  */
  int failed;

  /* If FAILED is zero, this is the file's status.  */
  struct stat st;
};
/* For long options that have no equivalent short option, use a
   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
enum
{
  FILES0_FROM_OPTION = CHAR_MAX + 1
};
92 static struct option const longopts[] =
94 {"bytes", no_argument, NULL, 'c'},
95 {"chars", no_argument, NULL, 'm'},
96 {"lines", no_argument, NULL, 'l'},
97 {"words", no_argument, NULL, 'w'},
98 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
99 {"max-line-length", no_argument, NULL, 'L'},
100 {GETOPT_HELP_OPTION_DECL},
101 {GETOPT_VERSION_OPTION_DECL},
102 {NULL, 0, NULL, 0}
105 void
106 usage (int status)
108 if (status != EXIT_SUCCESS)
109 fprintf (stderr, _("Try `%s --help' for more information.\n"),
110 program_name);
111 else
113 printf (_("\
114 Usage: %s [OPTION]... [FILE]...\n\
115 or: %s [OPTION]... --files0-from=F\n\
117 program_name, program_name);
118 fputs (_("\
119 Print newline, word, and byte counts for each FILE, and a total line if\n\
120 more than one FILE is specified. With no FILE, or when FILE is -,\n\
121 read standard input. A word is a non-zero-length sequence of characters\n\
122 delimited by white space.\n\
123 The options below may be used to select which counts are printed, always in\n\
124 the following order: newline, word, character, byte, maximum line length.\n\
125 -c, --bytes print the byte counts\n\
126 -m, --chars print the character counts\n\
127 -l, --lines print the newline counts\n\
128 "), stdout);
129 fputs (_("\
130 --files0-from=F read input from the files specified by\n\
131 NUL-terminated names in file F;\n\
132 If F is - then read names from standard input\n\
133 -L, --max-line-length print the length of the longest line\n\
134 -w, --words print the word counts\n\
135 "), stdout);
136 fputs (HELP_OPTION_DESCRIPTION, stdout);
137 fputs (VERSION_OPTION_DESCRIPTION, stdout);
138 emit_ancillary_info ();
140 exit (status);
143 /* FILE is the name of the file (or NULL for standard input)
144 associated with the specified counters. */
145 static void
146 write_counts (uintmax_t lines,
147 uintmax_t words,
148 uintmax_t chars,
149 uintmax_t bytes,
150 uintmax_t linelength,
151 const char *file)
153 static char const format_sp_int[] = " %*s";
154 char const *format_int = format_sp_int + 1;
155 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
157 if (print_lines)
159 printf (format_int, number_width, umaxtostr (lines, buf));
160 format_int = format_sp_int;
162 if (print_words)
164 printf (format_int, number_width, umaxtostr (words, buf));
165 format_int = format_sp_int;
167 if (print_chars)
169 printf (format_int, number_width, umaxtostr (chars, buf));
170 format_int = format_sp_int;
172 if (print_bytes)
174 printf (format_int, number_width, umaxtostr (bytes, buf));
175 format_int = format_sp_int;
177 if (print_linelength)
179 printf (format_int, number_width, umaxtostr (linelength, buf));
181 if (file)
182 printf (" %s", file);
183 putchar ('\n');
186 /* Count words. FILE_X is the name of the file (or NULL for standard
187 input) that is open on descriptor FD. *FSTATUS is its status.
188 Return true if successful. */
189 static bool
190 wc (int fd, char const *file_x, struct fstatus *fstatus)
192 bool ok = true;
193 char buf[BUFFER_SIZE + 1];
194 size_t bytes_read;
195 uintmax_t lines, words, chars, bytes, linelength;
196 bool count_bytes, count_chars, count_complicated;
197 char const *file = file_x ? file_x : _("standard input");
199 lines = words = chars = bytes = linelength = 0;
201 /* If in the current locale, chars are equivalent to bytes, we prefer
202 counting bytes, because that's easier. */
203 #if MB_LEN_MAX > 1
204 if (MB_CUR_MAX > 1)
206 count_bytes = print_bytes;
207 count_chars = print_chars;
209 else
210 #endif
212 count_bytes = print_bytes || print_chars;
213 count_chars = false;
215 count_complicated = print_words || print_linelength;
217 /* Advise the kernel of our access pattern only if we will read(). */
218 if (!count_bytes || count_chars || print_lines || count_complicated)
219 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
221 /* When counting only bytes, save some line- and word-counting
222 overhead. If FD is a `regular' Unix file, using lseek is enough
223 to get its `size' in bytes. Otherwise, read blocks of BUFFER_SIZE
224 bytes at a time until EOF. Note that the `size' (number of bytes)
225 that wc reports is smaller than stats.st_size when the file is not
226 positioned at its beginning. That's why the lseek calls below are
227 necessary. For example the command
228 `(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
229 should make wc report `0' bytes. */
231 if (count_bytes && !count_chars && !print_lines && !count_complicated)
233 off_t current_pos, end_pos;
235 if (0 < fstatus->failed)
236 fstatus->failed = fstat (fd, &fstatus->st);
238 if (! fstatus->failed && S_ISREG (fstatus->st.st_mode)
239 && (current_pos = lseek (fd, 0, SEEK_CUR)) != -1
240 && (end_pos = lseek (fd, 0, SEEK_END)) != -1)
242 /* Be careful here. The current position may actually be
243 beyond the end of the file. As in the example above. */
244 bytes = end_pos < current_pos ? 0 : end_pos - current_pos;
246 else
248 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
249 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
251 if (bytes_read == SAFE_READ_ERROR)
253 error (0, errno, "%s", file);
254 ok = false;
255 break;
257 bytes += bytes_read;
261 else if (!count_chars && !count_complicated)
263 /* Use a separate loop when counting only lines or lines and bytes --
264 but not chars or words. */
265 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
267 char *p = buf;
269 if (bytes_read == SAFE_READ_ERROR)
271 error (0, errno, "%s", file);
272 ok = false;
273 break;
276 while ((p = memchr (p, '\n', (buf + bytes_read) - p)))
278 ++p;
279 ++lines;
281 bytes += bytes_read;
284 #if MB_LEN_MAX > 1
285 # define SUPPORT_OLD_MBRTOWC 1
286 else if (MB_CUR_MAX > 1)
288 bool in_word = false;
289 uintmax_t linepos = 0;
290 mbstate_t state = { 0, };
291 bool in_shift = false;
292 # if SUPPORT_OLD_MBRTOWC
293 /* Back-up the state before each multibyte character conversion and
294 move the last incomplete character of the buffer to the front
295 of the buffer. This is needed because we don't know whether
296 the `mbrtowc' function updates the state when it returns -2, -
297 this is the ISO C 99 and glibc-2.2 behaviour - or not - amended
298 ANSI C, glibc-2.1 and Solaris 5.7 behaviour. We don't have an
299 autoconf test for this, yet. */
300 size_t prev = 0; /* number of bytes carried over from previous round */
301 # else
302 const size_t prev = 0;
303 # endif
305 while ((bytes_read = safe_read (fd, buf + prev, BUFFER_SIZE - prev)) > 0)
307 const char *p;
308 # if SUPPORT_OLD_MBRTOWC
309 mbstate_t backup_state;
310 # endif
311 if (bytes_read == SAFE_READ_ERROR)
313 error (0, errno, "%s", file);
314 ok = false;
315 break;
318 bytes += bytes_read;
319 p = buf;
320 bytes_read += prev;
323 wchar_t wide_char;
324 size_t n;
326 if (!in_shift && is_basic (*p))
328 /* Handle most ASCII characters quickly, without calling
329 mbrtowc(). */
330 n = 1;
331 wide_char = *p;
333 else
335 in_shift = true;
336 # if SUPPORT_OLD_MBRTOWC
337 backup_state = state;
338 # endif
339 n = mbrtowc (&wide_char, p, bytes_read, &state);
340 if (n == (size_t) -2)
342 # if SUPPORT_OLD_MBRTOWC
343 state = backup_state;
344 # endif
345 break;
347 if (n == (size_t) -1)
349 /* Remember that we read a byte, but don't complain
350 about the error. Because of the decoding error,
351 this is a considered to be byte but not a
352 character (that is, chars is not incremented). */
353 p++;
354 bytes_read--;
355 continue;
357 if (mbsinit (&state))
358 in_shift = false;
359 if (n == 0)
361 wide_char = 0;
362 n = 1;
365 p += n;
366 bytes_read -= n;
367 chars++;
368 switch (wide_char)
370 case '\n':
371 lines++;
372 /* Fall through. */
373 case '\r':
374 case '\f':
375 if (linepos > linelength)
376 linelength = linepos;
377 linepos = 0;
378 goto mb_word_separator;
379 case '\t':
380 linepos += 8 - (linepos % 8);
381 goto mb_word_separator;
382 case ' ':
383 linepos++;
384 /* Fall through. */
385 case '\v':
386 mb_word_separator:
387 words += in_word;
388 in_word = false;
389 break;
390 default:
391 if (iswprint (wide_char))
393 int width = wcwidth (wide_char);
394 if (width > 0)
395 linepos += width;
396 if (iswspace (wide_char))
397 goto mb_word_separator;
398 in_word = true;
400 break;
403 while (bytes_read > 0);
405 # if SUPPORT_OLD_MBRTOWC
406 if (bytes_read > 0)
408 if (bytes_read == BUFFER_SIZE)
410 /* Encountered a very long redundant shift sequence. */
411 p++;
412 bytes_read--;
414 memmove (buf, p, bytes_read);
416 prev = bytes_read;
417 # endif
419 if (linepos > linelength)
420 linelength = linepos;
421 words += in_word;
423 #endif
424 else
426 bool in_word = false;
427 uintmax_t linepos = 0;
429 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
431 const char *p = buf;
432 if (bytes_read == SAFE_READ_ERROR)
434 error (0, errno, "%s", file);
435 ok = false;
436 break;
439 bytes += bytes_read;
442 switch (*p++)
444 case '\n':
445 lines++;
446 /* Fall through. */
447 case '\r':
448 case '\f':
449 if (linepos > linelength)
450 linelength = linepos;
451 linepos = 0;
452 goto word_separator;
453 case '\t':
454 linepos += 8 - (linepos % 8);
455 goto word_separator;
456 case ' ':
457 linepos++;
458 /* Fall through. */
459 case '\v':
460 word_separator:
461 words += in_word;
462 in_word = false;
463 break;
464 default:
465 if (isprint (to_uchar (p[-1])))
467 linepos++;
468 if (isspace (to_uchar (p[-1])))
469 goto word_separator;
470 in_word = true;
472 break;
475 while (--bytes_read);
477 if (linepos > linelength)
478 linelength = linepos;
479 words += in_word;
482 if (count_chars < print_chars)
483 chars = bytes;
485 write_counts (lines, words, chars, bytes, linelength, file_x);
486 total_lines += lines;
487 total_words += words;
488 total_chars += chars;
489 total_bytes += bytes;
490 if (linelength > max_line_length)
491 max_line_length = linelength;
493 return ok;
496 static bool
497 wc_file (char const *file, struct fstatus *fstatus)
499 if (! file || STREQ (file, "-"))
501 have_read_stdin = true;
502 if (O_BINARY && ! isatty (STDIN_FILENO))
503 xfreopen (NULL, "rb", stdin);
504 return wc (STDIN_FILENO, file, fstatus);
506 else
508 int fd = open (file, O_RDONLY | O_BINARY);
509 if (fd == -1)
511 error (0, errno, "%s", file);
512 return false;
514 else
516 bool ok = wc (fd, file, fstatus);
517 if (close (fd) != 0)
519 error (0, errno, "%s", file);
520 return false;
522 return ok;
527 /* Return the file status for the NFILES files addressed by FILE.
528 Optimize the case where only one number is printed, for just one
529 file; in that case we can use a print width of 1, so we don't need
530 to stat the file. Handle the case of (nfiles == 0) in the same way;
531 that happens when we don't know how long the list of file names will be. */
533 static struct fstatus *
534 get_input_fstatus (int nfiles, char *const *file)
536 struct fstatus *fstatus = xnmalloc (nfiles ? nfiles : 1, sizeof *fstatus);
538 if (nfiles == 0
539 || (nfiles == 1
540 && ((print_lines + print_words + print_chars
541 + print_bytes + print_linelength)
542 == 1)))
543 fstatus[0].failed = 1;
544 else
546 int i;
548 for (i = 0; i < nfiles; i++)
549 fstatus[i].failed = (! file[i] || STREQ (file[i], "-")
550 ? fstat (STDIN_FILENO, &fstatus[i].st)
551 : stat (file[i], &fstatus[i].st));
554 return fstatus;
557 /* Return a print width suitable for the NFILES files whose status is
558 recorded in FSTATUS. Optimize the same special case that
559 get_input_fstatus optimizes. */
561 static int _GL_ATTRIBUTE_PURE
562 compute_number_width (int nfiles, struct fstatus const *fstatus)
564 int width = 1;
566 if (0 < nfiles && fstatus[0].failed <= 0)
568 int minimum_width = 1;
569 uintmax_t regular_total = 0;
570 int i;
572 for (i = 0; i < nfiles; i++)
573 if (! fstatus[i].failed)
575 if (S_ISREG (fstatus[i].st.st_mode))
576 regular_total += fstatus[i].st.st_size;
577 else
578 minimum_width = 7;
581 for (; 10 <= regular_total; regular_total /= 10)
582 width++;
583 if (width < minimum_width)
584 width = minimum_width;
587 return width;
592 main (int argc, char **argv)
594 bool ok;
595 int optc;
596 int nfiles;
597 char **files;
598 char *files_from = NULL;
599 struct fstatus *fstatus;
600 struct Tokens tok;
602 initialize_main (&argc, &argv);
603 set_program_name (argv[0]);
604 setlocale (LC_ALL, "");
605 bindtextdomain (PACKAGE, LOCALEDIR);
606 textdomain (PACKAGE);
608 atexit (close_stdout);
610 /* Line buffer stdout to ensure lines are written atomically and immediately
611 so that processes running in parallel do not intersperse their output. */
612 setvbuf (stdout, NULL, _IOLBF, 0);
614 print_lines = print_words = print_chars = print_bytes = false;
615 print_linelength = false;
616 total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
618 while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
619 switch (optc)
621 case 'c':
622 print_bytes = true;
623 break;
625 case 'm':
626 print_chars = true;
627 break;
629 case 'l':
630 print_lines = true;
631 break;
633 case 'w':
634 print_words = true;
635 break;
637 case 'L':
638 print_linelength = true;
639 break;
641 case FILES0_FROM_OPTION:
642 files_from = optarg;
643 break;
645 case_GETOPT_HELP_CHAR;
647 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
649 default:
650 usage (EXIT_FAILURE);
653 if (! (print_lines || print_words || print_chars || print_bytes
654 || print_linelength))
655 print_lines = print_words = print_bytes = true;
657 bool read_tokens = false;
658 struct argv_iterator *ai;
659 if (files_from)
661 FILE *stream;
663 /* When using --files0-from=F, you may not specify any files
664 on the command-line. */
665 if (optind < argc)
667 error (0, 0, _("extra operand %s"), quote (argv[optind]));
668 fprintf (stderr, "%s\n",
669 _("file operands cannot be combined with --files0-from"));
670 usage (EXIT_FAILURE);
673 if (STREQ (files_from, "-"))
674 stream = stdin;
675 else
677 stream = fopen (files_from, "r");
678 if (stream == NULL)
679 error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
680 quote (files_from));
683 /* Read the file list into RAM if we can detect its size and that
684 size is reasonable. Otherwise, we'll read a name at a time. */
685 struct stat st;
686 if (fstat (fileno (stream), &st) == 0
687 && S_ISREG (st.st_mode)
688 && st.st_size <= MIN (10 * 1024 * 1024, physmem_available () / 2))
690 read_tokens = true;
691 readtokens0_init (&tok);
692 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
693 error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
694 quote (files_from));
695 files = tok.tok;
696 nfiles = tok.n_tok;
697 ai = argv_iter_init_argv (files);
699 else
701 files = NULL;
702 nfiles = 0;
703 ai = argv_iter_init_stream (stream);
706 else
708 static char *stdin_only[] = { NULL };
709 files = (optind < argc ? argv + optind : stdin_only);
710 nfiles = (optind < argc ? argc - optind : 1);
711 ai = argv_iter_init_argv (files);
714 if (!ai)
715 xalloc_die ();
717 fstatus = get_input_fstatus (nfiles, files);
718 number_width = compute_number_width (nfiles, fstatus);
720 int i;
721 ok = true;
722 for (i = 0; /* */; i++)
724 bool skip_file = false;
725 enum argv_iter_err ai_err;
726 char *file_name = argv_iter (ai, &ai_err);
727 if (!file_name)
729 switch (ai_err)
731 case AI_ERR_EOF:
732 goto argv_iter_done;
733 case AI_ERR_READ:
734 error (0, errno, _("%s: read error"),
735 quotearg_colon (files_from));
736 ok = false;
737 goto argv_iter_done;
738 case AI_ERR_MEM:
739 xalloc_die ();
740 default:
741 assert (!"unexpected error code from argv_iter");
744 if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
746 /* Give a better diagnostic in an unusual case:
747 printf - | wc --files0-from=- */
748 error (0, 0, _("when reading file names from stdin, "
749 "no file name of %s allowed"),
750 quote (file_name));
751 skip_file = true;
754 if (!file_name[0])
756 /* Diagnose a zero-length file name. When it's one
757 among many, knowing the record number may help.
758 FIXME: currently print the record number only with
759 --files0-from=FILE. Maybe do it for argv, too? */
760 if (files_from == NULL)
761 error (0, 0, "%s", _("invalid zero-length file name"));
762 else
764 /* Using the standard `filename:line-number:' prefix here is
765 not totally appropriate, since NUL is the separator, not NL,
766 but it might be better than nothing. */
767 unsigned long int file_number = argv_iter_n_args (ai);
768 error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
769 file_number, _("invalid zero-length file name"));
771 skip_file = true;
774 if (skip_file)
775 ok = false;
776 else
777 ok &= wc_file (file_name, &fstatus[nfiles ? i : 0]);
779 argv_iter_done:
781 /* No arguments on the command line is fine. That means read from stdin.
782 However, no arguments on the --files0-from input stream is an error
783 means don't read anything. */
784 if (ok && !files_from && argv_iter_n_args (ai) == 0)
785 ok &= wc_file (NULL, &fstatus[0]);
787 if (read_tokens)
788 readtokens0_free (&tok);
790 if (1 < argv_iter_n_args (ai))
791 write_counts (total_lines, total_words, total_chars, total_bytes,
792 max_line_length, _("total"));
794 argv_iter_free (ai);
796 free (fstatus);
798 if (have_read_stdin && close (STDIN_FILENO) != 0)
799 error (EXIT_FAILURE, errno, "-");
801 exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);