build: complete the rename of get_date
[coreutils/ericb.git] / src / wc.c
blob a1922baf9fe12818f52ed8be31bc200886ea2302
1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985, 1991, 1995-2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <getopt.h>
25 #include <sys/types.h>
26 #include <wchar.h>
27 #include <wctype.h>
29 #include "system.h"
30 #include "argv-iter.h"
31 #include "error.h"
32 #include "fadvise.h"
33 #include "mbchar.h"
34 #include "physmem.h"
35 #include "quote.h"
36 #include "quotearg.h"
37 #include "readtokens0.h"
38 #include "safe-read.h"
39 #include "xfreopen.h"
#if !defined iswspace && !HAVE_ISWSPACE
/* Stand-in used when the platform supplies no iswspace: WC counts as
   white space only if it survives conversion to unsigned char and
   isspace accepts that byte.  */
# define iswspace(wc) \
    ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
#endif

/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "wc"

#define AUTHORS \
  proper_name ("Paul Rubin"), \
  proper_name ("David MacKenzie")

/* Number of bytes requested from each read.  */
#define BUFFER_SIZE (16 * 1024)

/* Running sums of lines, words, chars and bytes over every file handled
   so far.  */
static uintmax_t total_lines;
static uintmax_t total_words;
static uintmax_t total_chars;
static uintmax_t total_bytes;

/* Longest line length seen in any file handled so far.  */
static uintmax_t max_line_length;

/* Flags selecting which counts appear in the output.  */
static bool print_lines, print_words, print_chars, print_bytes;
static bool print_linelength;

/* Field width used when printing each count.  */
static int number_width;

/* Set once standard input has been read.  */
static bool have_read_stdin;
/* Outcome of an fstat or stat call made for one input.  */
struct fstatus
{
  /* Positive while neither fstat nor stat has been called for this
     entry; afterwards, whatever that call returned.  */
  int failed;

  /* The file's status; meaningful only when FAILED is zero.  */
  struct stat st;
};
/* Long options without a short-option twin are identified by values
   that cannot collide with any character: they start just past
   CHAR_MAX.  */
enum
{
  FILES0_FROM_OPTION = CHAR_MAX + 1
};
92 static struct option const longopts[] =
94 {"bytes", no_argument, NULL, 'c'},
95 {"chars", no_argument, NULL, 'm'},
96 {"lines", no_argument, NULL, 'l'},
97 {"words", no_argument, NULL, 'w'},
98 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
99 {"max-line-length", no_argument, NULL, 'L'},
100 {GETOPT_HELP_OPTION_DECL},
101 {GETOPT_VERSION_OPTION_DECL},
102 {NULL, 0, NULL, 0}
/* Emit usage information and terminate the program with exit status
   STATUS.  A STATUS other than EXIT_SUCCESS produces only a one-line
   hint on stderr; EXIT_SUCCESS produces the full synopsis and option
   descriptions on stdout.  This function does not return.  */
105 void
106 usage (int status)
/* Failure path: point the user at --help instead of printing it all.  */
108 if (status != EXIT_SUCCESS)
109 fprintf (stderr, _("Try `%s --help' for more information.\n"),
110 program_name);
111 else
113 printf (_("\
114 Usage: %s [OPTION]... [FILE]...\n\
115 or: %s [OPTION]... --files0-from=F\n\
117 program_name, program_name);
118 fputs (_("\
119 Print newline, word, and byte counts for each FILE, and a total line if\n\
120 more than one FILE is specified. With no FILE, or when FILE is -,\n\
121 read standard input. A word is a non-zero-length sequence of characters\n\
122 delimited by white space.\n\
123 -c, --bytes print the byte counts\n\
124 -m, --chars print the character counts\n\
125 -l, --lines print the newline counts\n\
126 "), stdout);
127 fputs (_("\
128 --files0-from=F read input from the files specified by\n\
129 NUL-terminated names in file F;\n\
130 If F is - then read names from standard input\n\
131 -L, --max-line-length print the length of the longest line\n\
132 -w, --words print the word counts\n\
133 "), stdout);
134 fputs (HELP_OPTION_DESCRIPTION, stdout);
135 fputs (VERSION_OPTION_DESCRIPTION, stdout);
136 emit_ancillary_info ();
/* Both paths end here: leave with the status the caller supplied.  */
138 exit (status);
141 /* FILE is the name of the file (or NULL for standard input)
142 associated with the specified counters. */
143 static void
144 write_counts (uintmax_t lines,
145 uintmax_t words,
146 uintmax_t chars,
147 uintmax_t bytes,
148 uintmax_t linelength,
149 const char *file)
151 static char const format_sp_int[] = " %*s";
152 char const *format_int = format_sp_int + 1;
153 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
155 if (print_lines)
157 printf (format_int, number_width, umaxtostr (lines, buf));
158 format_int = format_sp_int;
160 if (print_words)
162 printf (format_int, number_width, umaxtostr (words, buf));
163 format_int = format_sp_int;
165 if (print_chars)
167 printf (format_int, number_width, umaxtostr (chars, buf));
168 format_int = format_sp_int;
170 if (print_bytes)
172 printf (format_int, number_width, umaxtostr (bytes, buf));
173 format_int = format_sp_int;
175 if (print_linelength)
177 printf (format_int, number_width, umaxtostr (linelength, buf));
179 if (file)
180 printf (" %s", file);
181 putchar ('\n');
184 /* Count words. FILE_X is the name of the file (or NULL for standard
185 input) that is open on descriptor FD. *FSTATUS is its status.
186 Return true if successful. */
187 static bool
188 wc (int fd, char const *file_x, struct fstatus *fstatus)
190 bool ok = true;
191 char buf[BUFFER_SIZE + 1];
192 size_t bytes_read;
193 uintmax_t lines, words, chars, bytes, linelength;
194 bool count_bytes, count_chars, count_complicated;
195 char const *file = file_x ? file_x : _("standard input");
197 lines = words = chars = bytes = linelength = 0;
199 /* If in the current locale, chars are equivalent to bytes, we prefer
200 counting bytes, because that's easier. */
201 #if MB_LEN_MAX > 1
202 if (MB_CUR_MAX > 1)
204 count_bytes = print_bytes;
205 count_chars = print_chars;
207 else
208 #endif
210 count_bytes = print_bytes || print_chars;
211 count_chars = false;
213 count_complicated = print_words || print_linelength;
215 /* Advise the kernel of our access pattern only if we will read(). */
216 if (!count_bytes || count_chars || print_lines || count_complicated)
217 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
219 /* When counting only bytes, save some line- and word-counting
220 overhead. If FD is a `regular' Unix file, using lseek is enough
221 to get its `size' in bytes. Otherwise, read blocks of BUFFER_SIZE
222 bytes at a time until EOF. Note that the `size' (number of bytes)
223 that wc reports is smaller than stats.st_size when the file is not
224 positioned at its beginning. That's why the lseek calls below are
225 necessary. For example the command
226 `(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
227 should make wc report `0' bytes. */
229 if (count_bytes && !count_chars && !print_lines && !count_complicated)
231 off_t current_pos, end_pos;
233 if (0 < fstatus->failed)
234 fstatus->failed = fstat (fd, &fstatus->st);
236 if (! fstatus->failed && S_ISREG (fstatus->st.st_mode)
237 && (current_pos = lseek (fd, (off_t) 0, SEEK_CUR)) != -1
238 && (end_pos = lseek (fd, (off_t) 0, SEEK_END)) != -1)
240 /* Be careful here. The current position may actually be
241 beyond the end of the file. As in the example above. */
242 bytes = end_pos < current_pos ? 0 : end_pos - current_pos;
244 else
246 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
247 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
249 if (bytes_read == SAFE_READ_ERROR)
251 error (0, errno, "%s", file);
252 ok = false;
253 break;
255 bytes += bytes_read;
259 else if (!count_chars && !count_complicated)
261 /* Use a separate loop when counting only lines or lines and bytes --
262 but not chars or words. */
263 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
265 char *p = buf;
267 if (bytes_read == SAFE_READ_ERROR)
269 error (0, errno, "%s", file);
270 ok = false;
271 break;
274 while ((p = memchr (p, '\n', (buf + bytes_read) - p)))
276 ++p;
277 ++lines;
279 bytes += bytes_read;
282 #if MB_LEN_MAX > 1
283 # define SUPPORT_OLD_MBRTOWC 1
284 else if (MB_CUR_MAX > 1)
286 bool in_word = false;
287 uintmax_t linepos = 0;
288 DECLARE_ZEROED_AGGREGATE (mbstate_t, state);
289 bool in_shift = false;
290 # if SUPPORT_OLD_MBRTOWC
291 /* Back-up the state before each multibyte character conversion and
292 move the last incomplete character of the buffer to the front
293 of the buffer. This is needed because we don't know whether
294 the `mbrtowc' function updates the state when it returns -2, -
295 this is the ISO C 99 and glibc-2.2 behaviour - or not - amended
296 ANSI C, glibc-2.1 and Solaris 5.7 behaviour. We don't have an
297 autoconf test for this, yet. */
298 size_t prev = 0; /* number of bytes carried over from previous round */
299 # else
300 const size_t prev = 0;
301 # endif
303 while ((bytes_read = safe_read (fd, buf + prev, BUFFER_SIZE - prev)) > 0)
305 const char *p;
306 # if SUPPORT_OLD_MBRTOWC
307 mbstate_t backup_state;
308 # endif
309 if (bytes_read == SAFE_READ_ERROR)
311 error (0, errno, "%s", file);
312 ok = false;
313 break;
316 bytes += bytes_read;
317 p = buf;
318 bytes_read += prev;
321 wchar_t wide_char;
322 size_t n;
324 if (!in_shift && is_basic (*p))
326 /* Handle most ASCII characters quickly, without calling
327 mbrtowc(). */
328 n = 1;
329 wide_char = *p;
331 else
333 in_shift = true;
334 # if SUPPORT_OLD_MBRTOWC
335 backup_state = state;
336 # endif
337 n = mbrtowc (&wide_char, p, bytes_read, &state);
338 if (n == (size_t) -2)
340 # if SUPPORT_OLD_MBRTOWC
341 state = backup_state;
342 # endif
343 break;
345 if (n == (size_t) -1)
347 /* Remember that we read a byte, but don't complain
348 about the error. Because of the decoding error,
349 this is a considered to be byte but not a
350 character (that is, chars is not incremented). */
351 p++;
352 bytes_read--;
353 continue;
355 if (mbsinit (&state))
356 in_shift = false;
357 if (n == 0)
359 wide_char = 0;
360 n = 1;
363 p += n;
364 bytes_read -= n;
365 chars++;
366 switch (wide_char)
368 case '\n':
369 lines++;
370 /* Fall through. */
371 case '\r':
372 case '\f':
373 if (linepos > linelength)
374 linelength = linepos;
375 linepos = 0;
376 goto mb_word_separator;
377 case '\t':
378 linepos += 8 - (linepos % 8);
379 goto mb_word_separator;
380 case ' ':
381 linepos++;
382 /* Fall through. */
383 case '\v':
384 mb_word_separator:
385 words += in_word;
386 in_word = false;
387 break;
388 default:
389 if (iswprint (wide_char))
391 int width = wcwidth (wide_char);
392 if (width > 0)
393 linepos += width;
394 if (iswspace (wide_char))
395 goto mb_word_separator;
396 in_word = true;
398 break;
401 while (bytes_read > 0);
403 # if SUPPORT_OLD_MBRTOWC
404 if (bytes_read > 0)
406 if (bytes_read == BUFFER_SIZE)
408 /* Encountered a very long redundant shift sequence. */
409 p++;
410 bytes_read--;
412 memmove (buf, p, bytes_read);
414 prev = bytes_read;
415 # endif
417 if (linepos > linelength)
418 linelength = linepos;
419 words += in_word;
421 #endif
422 else
424 bool in_word = false;
425 uintmax_t linepos = 0;
427 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
429 const char *p = buf;
430 if (bytes_read == SAFE_READ_ERROR)
432 error (0, errno, "%s", file);
433 ok = false;
434 break;
437 bytes += bytes_read;
440 switch (*p++)
442 case '\n':
443 lines++;
444 /* Fall through. */
445 case '\r':
446 case '\f':
447 if (linepos > linelength)
448 linelength = linepos;
449 linepos = 0;
450 goto word_separator;
451 case '\t':
452 linepos += 8 - (linepos % 8);
453 goto word_separator;
454 case ' ':
455 linepos++;
456 /* Fall through. */
457 case '\v':
458 word_separator:
459 words += in_word;
460 in_word = false;
461 break;
462 default:
463 if (isprint (to_uchar (p[-1])))
465 linepos++;
466 if (isspace (to_uchar (p[-1])))
467 goto word_separator;
468 in_word = true;
470 break;
473 while (--bytes_read);
475 if (linepos > linelength)
476 linelength = linepos;
477 words += in_word;
480 if (count_chars < print_chars)
481 chars = bytes;
483 write_counts (lines, words, chars, bytes, linelength, file_x);
484 total_lines += lines;
485 total_words += words;
486 total_chars += chars;
487 total_bytes += bytes;
488 if (linelength > max_line_length)
489 max_line_length = linelength;
491 return ok;
494 static bool
495 wc_file (char const *file, struct fstatus *fstatus)
497 if (! file || STREQ (file, "-"))
499 have_read_stdin = true;
500 if (O_BINARY && ! isatty (STDIN_FILENO))
501 xfreopen (NULL, "rb", stdin);
502 return wc (STDIN_FILENO, file, fstatus);
504 else
506 int fd = open (file, O_RDONLY | O_BINARY);
507 if (fd == -1)
509 error (0, errno, "%s", file);
510 return false;
512 else
514 bool ok = wc (fd, file, fstatus);
515 if (close (fd) != 0)
517 error (0, errno, "%s", file);
518 return false;
520 return ok;
525 /* Return the file status for the NFILES files addressed by FILE.
526 Optimize the case where only one number is printed, for just one
527 file; in that case we can use a print width of 1, so we don't need
528 to stat the file. Handle the case of (nfiles == 0) in the same way;
529 that happens when we don't know how long the list of file names will be. */
531 static struct fstatus *
532 get_input_fstatus (int nfiles, char *const *file)
534 struct fstatus *fstatus = xnmalloc (nfiles ? nfiles : 1, sizeof *fstatus);
536 if (nfiles == 0
537 || (nfiles == 1
538 && ((print_lines + print_words + print_chars
539 + print_bytes + print_linelength)
540 == 1)))
541 fstatus[0].failed = 1;
542 else
544 int i;
546 for (i = 0; i < nfiles; i++)
547 fstatus[i].failed = (! file[i] || STREQ (file[i], "-")
548 ? fstat (STDIN_FILENO, &fstatus[i].st)
549 : stat (file[i], &fstatus[i].st));
552 return fstatus;
555 /* Return a print width suitable for the NFILES files whose status is
556 recorded in FSTATUS. Optimize the same special case that
557 get_input_fstatus optimizes. */
559 static int
560 compute_number_width (int nfiles, struct fstatus const *fstatus)
562 int width = 1;
564 if (0 < nfiles && fstatus[0].failed <= 0)
566 int minimum_width = 1;
567 uintmax_t regular_total = 0;
568 int i;
570 for (i = 0; i < nfiles; i++)
571 if (! fstatus[i].failed)
573 if (S_ISREG (fstatus[i].st.st_mode))
574 regular_total += fstatus[i].st.st_size;
575 else
576 minimum_width = 7;
579 for (; 10 <= regular_total; regular_total /= 10)
580 width++;
581 if (width < minimum_width)
582 width = minimum_width;
585 return width;
590 main (int argc, char **argv)
592 bool ok;
593 int optc;
594 int nfiles;
595 char **files;
596 char *files_from = NULL;
597 struct fstatus *fstatus;
598 struct Tokens tok;
600 initialize_main (&argc, &argv);
601 set_program_name (argv[0]);
602 setlocale (LC_ALL, "");
603 bindtextdomain (PACKAGE, LOCALEDIR);
604 textdomain (PACKAGE);
606 atexit (close_stdout);
608 /* Line buffer stdout to ensure lines are written atomically and immediately
609 so that processes running in parallel do not intersperse their output. */
610 setvbuf (stdout, NULL, _IOLBF, 0);
612 print_lines = print_words = print_chars = print_bytes = false;
613 print_linelength = false;
614 total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
616 while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
617 switch (optc)
619 case 'c':
620 print_bytes = true;
621 break;
623 case 'm':
624 print_chars = true;
625 break;
627 case 'l':
628 print_lines = true;
629 break;
631 case 'w':
632 print_words = true;
633 break;
635 case 'L':
636 print_linelength = true;
637 break;
639 case FILES0_FROM_OPTION:
640 files_from = optarg;
641 break;
643 case_GETOPT_HELP_CHAR;
645 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
647 default:
648 usage (EXIT_FAILURE);
651 if (! (print_lines || print_words || print_chars || print_bytes
652 || print_linelength))
653 print_lines = print_words = print_bytes = true;
655 bool read_tokens = false;
656 struct argv_iterator *ai;
657 if (files_from)
659 FILE *stream;
661 /* When using --files0-from=F, you may not specify any files
662 on the command-line. */
663 if (optind < argc)
665 error (0, 0, _("extra operand %s"), quote (argv[optind]));
666 fprintf (stderr, "%s\n",
667 _("file operands cannot be combined with --files0-from"));
668 usage (EXIT_FAILURE);
671 if (STREQ (files_from, "-"))
672 stream = stdin;
673 else
675 stream = fopen (files_from, "r");
676 if (stream == NULL)
677 error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
678 quote (files_from));
681 /* Read the file list into RAM if we can detect its size and that
682 size is reasonable. Otherwise, we'll read a name at a time. */
683 struct stat st;
684 if (fstat (fileno (stream), &st) == 0
685 && S_ISREG (st.st_mode)
686 && st.st_size <= MIN (10 * 1024 * 1024, physmem_available () / 2))
688 read_tokens = true;
689 readtokens0_init (&tok);
690 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
691 error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
692 quote (files_from));
693 files = tok.tok;
694 nfiles = tok.n_tok;
695 ai = argv_iter_init_argv (files);
697 else
699 files = NULL;
700 nfiles = 0;
701 ai = argv_iter_init_stream (stream);
704 else
706 static char *stdin_only[] = { NULL };
707 files = (optind < argc ? argv + optind : stdin_only);
708 nfiles = (optind < argc ? argc - optind : 1);
709 ai = argv_iter_init_argv (files);
712 fstatus = get_input_fstatus (nfiles, files);
713 number_width = compute_number_width (nfiles, fstatus);
715 int i;
716 ok = true;
717 for (i = 0; /* */; i++)
719 bool skip_file = false;
720 enum argv_iter_err ai_err;
721 char *file_name = argv_iter (ai, &ai_err);
722 if (ai_err == AI_ERR_EOF)
723 break;
724 if (!file_name)
726 switch (ai_err)
728 case AI_ERR_READ:
729 error (0, errno, _("%s: read error"), quote (files_from));
730 skip_file = true;
731 continue;
732 case AI_ERR_MEM:
733 xalloc_die ();
734 default:
735 assert (!"unexpected error code from argv_iter");
738 if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
740 /* Give a better diagnostic in an unusual case:
741 printf - | wc --files0-from=- */
742 error (0, 0, _("when reading file names from stdin, "
743 "no file name of %s allowed"),
744 quote (file_name));
745 skip_file = true;
748 if (!file_name[0])
750 /* Diagnose a zero-length file name. When it's one
751 among many, knowing the record number may help.
752 FIXME: currently print the record number only with
753 --files0-from=FILE. Maybe do it for argv, too? */
754 if (files_from == NULL)
755 error (0, 0, "%s", _("invalid zero-length file name"));
756 else
758 /* Using the standard `filename:line-number:' prefix here is
759 not totally appropriate, since NUL is the separator, not NL,
760 but it might be better than nothing. */
761 unsigned long int file_number = argv_iter_n_args (ai);
762 error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
763 file_number, _("invalid zero-length file name"));
765 skip_file = true;
768 if (skip_file)
769 ok = false;
770 else
771 ok &= wc_file (file_name, &fstatus[nfiles ? i : 0]);
774 /* No arguments on the command line is fine. That means read from stdin.
775 However, no arguments on the --files0-from input stream is an error
776 means don't read anything. */
777 if (ok && !files_from && argv_iter_n_args (ai) == 0)
778 ok &= wc_file (NULL, &fstatus[0]);
780 if (read_tokens)
781 readtokens0_free (&tok);
783 if (1 < argv_iter_n_args (ai))
784 write_counts (total_lines, total_words, total_chars, total_bytes,
785 max_line_length, _("total"));
787 argv_iter_free (ai);
789 free (fstatus);
791 if (have_read_stdin && close (STDIN_FILENO) != 0)
792 error (EXIT_FAILURE, errno, "-");
794 exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);