ls: --color now highlights files with capabilities, too
[coreutils.git] / src / wc.c
blob0bb1929f488b6ba18e957d7201648c39197dad92
1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 85, 91, 1995-2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include <sys/types.h>
25 #include <wchar.h>
26 #include <wctype.h>
28 #include "system.h"
29 #include "error.h"
30 #include "mbchar.h"
31 #include "quote.h"
32 #include "quotearg.h"
33 #include "readtokens0.h"
34 #include "safe-read.h"
/* Fallback used only when iswspace is neither a macro nor reported
   available by configure: a wide character counts as white space when
   it is unchanged by the to_uchar conversion and isspace accepts the
   converted value.  */
#if !defined iswspace && !HAVE_ISWSPACE
# define iswspace(wc) \
    ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
#endif

/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "wc"

/* Author list handed to the --version handling in main.  */
#define AUTHORS \
  proper_name ("Paul Rubin"), \
  proper_name ("David MacKenzie")

/* Size of atomic reads.  The read buffer in wc is declared one byte
   larger than this.  */
#define BUFFER_SIZE (16 * 1024)
/* Cumulative number of lines, words, chars and bytes in all files so far.
   max_line_length is the maximum over all files processed so far.
   wc adds each file's counts to these; main prints them as the
   "total" row when more than one file was given.  */
static uintmax_t total_lines;
static uintmax_t total_words;
static uintmax_t total_chars;
static uintmax_t total_bytes;
static uintmax_t max_line_length;

/* Which counts to print.  Set from the command-line options in main.  */
static bool print_lines, print_words, print_chars, print_bytes;
static bool print_linelength;

/* The print width of each count.  */
static int number_width;

/* True if we have ever read the standard input.  */
static bool have_read_stdin;
/* The result of calling fstat or stat on a file descriptor or file.  */
struct fstatus
{
  /* If positive, fstat or stat has not been called yet.  Otherwise,
     this is the value returned from fstat or stat.  */
  int failed;

  /* If FAILED is zero, this is the file's status.  */
  struct stat st;
};
/* For long options that have no equivalent short option, use a
   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
enum
{
  FILES0_FROM_OPTION = CHAR_MAX + 1
};
87 static struct option const longopts[] =
89 {"bytes", no_argument, NULL, 'c'},
90 {"chars", no_argument, NULL, 'm'},
91 {"lines", no_argument, NULL, 'l'},
92 {"words", no_argument, NULL, 'w'},
93 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
94 {"max-line-length", no_argument, NULL, 'L'},
95 {GETOPT_HELP_OPTION_DECL},
96 {GETOPT_VERSION_OPTION_DECL},
97 {NULL, 0, NULL, 0}
100 void
101 usage (int status)
103 if (status != EXIT_SUCCESS)
104 fprintf (stderr, _("Try `%s --help' for more information.\n"),
105 program_name);
106 else
108 printf (_("\
109 Usage: %s [OPTION]... [FILE]...\n\
110 or: %s [OPTION]... --files0-from=F\n\
112 program_name, program_name);
113 fputs (_("\
114 Print newline, word, and byte counts for each FILE, and a total line if\n\
115 more than one FILE is specified. With no FILE, or when FILE is -,\n\
116 read standard input.\n\
117 -c, --bytes print the byte counts\n\
118 -m, --chars print the character counts\n\
119 -l, --lines print the newline counts\n\
120 "), stdout);
121 fputs (_("\
122 --files0-from=F read input from the files specified by\n\
123 NUL-terminated names in file F\n\
124 -L, --max-line-length print the length of the longest line\n\
125 -w, --words print the word counts\n\
126 "), stdout);
127 fputs (HELP_OPTION_DESCRIPTION, stdout);
128 fputs (VERSION_OPTION_DESCRIPTION, stdout);
129 emit_bug_reporting_address ();
131 exit (status);
134 /* FILE is the name of the file (or NULL for standard input)
135 associated with the specified counters. */
136 static void
137 write_counts (uintmax_t lines,
138 uintmax_t words,
139 uintmax_t chars,
140 uintmax_t bytes,
141 uintmax_t linelength,
142 const char *file)
144 static char const format_sp_int[] = " %*s";
145 char const *format_int = format_sp_int + 1;
146 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
148 if (print_lines)
150 printf (format_int, number_width, umaxtostr (lines, buf));
151 format_int = format_sp_int;
153 if (print_words)
155 printf (format_int, number_width, umaxtostr (words, buf));
156 format_int = format_sp_int;
158 if (print_chars)
160 printf (format_int, number_width, umaxtostr (chars, buf));
161 format_int = format_sp_int;
163 if (print_bytes)
165 printf (format_int, number_width, umaxtostr (bytes, buf));
166 format_int = format_sp_int;
168 if (print_linelength)
170 printf (format_int, number_width, umaxtostr (linelength, buf));
172 if (file)
173 printf (" %s", file);
174 putchar ('\n');
177 /* Count words. FILE_X is the name of the file (or NULL for standard
178 input) that is open on descriptor FD. *FSTATUS is its status.
179 Return true if successful. */
180 static bool
181 wc (int fd, char const *file_x, struct fstatus *fstatus)
183 bool ok = true;
184 char buf[BUFFER_SIZE + 1];
185 size_t bytes_read;
186 uintmax_t lines, words, chars, bytes, linelength;
187 bool count_bytes, count_chars, count_complicated;
188 char const *file = file_x ? file_x : _("standard input");
190 lines = words = chars = bytes = linelength = 0;
192 /* If in the current locale, chars are equivalent to bytes, we prefer
193 counting bytes, because that's easier. */
194 #if HAVE_MBRTOWC && (MB_LEN_MAX > 1)
195 if (MB_CUR_MAX > 1)
197 count_bytes = print_bytes;
198 count_chars = print_chars;
200 else
201 #endif
203 count_bytes = print_bytes | print_chars;
204 count_chars = false;
206 count_complicated = print_words | print_linelength;
208 /* When counting only bytes, save some line- and word-counting
209 overhead. If FD is a `regular' Unix file, using lseek is enough
210 to get its `size' in bytes. Otherwise, read blocks of BUFFER_SIZE
211 bytes at a time until EOF. Note that the `size' (number of bytes)
212 that wc reports is smaller than stats.st_size when the file is not
213 positioned at its beginning. That's why the lseek calls below are
214 necessary. For example the command
215 `(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
216 should make wc report `0' bytes. */
218 if (count_bytes & !count_chars & !print_lines & !count_complicated)
220 off_t current_pos, end_pos;
222 if (0 < fstatus->failed)
223 fstatus->failed = fstat (fd, &fstatus->st);
225 if (! fstatus->failed && S_ISREG (fstatus->st.st_mode)
226 && (current_pos = lseek (fd, (off_t) 0, SEEK_CUR)) != -1
227 && (end_pos = lseek (fd, (off_t) 0, SEEK_END)) != -1)
229 /* Be careful here. The current position may actually be
230 beyond the end of the file. As in the example above. */
231 bytes = end_pos < current_pos ? 0 : end_pos - current_pos;
233 else
235 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
237 if (bytes_read == SAFE_READ_ERROR)
239 error (0, errno, "%s", file);
240 ok = false;
241 break;
243 bytes += bytes_read;
247 else if (!count_chars & !count_complicated)
249 /* Use a separate loop when counting only lines or lines and bytes --
250 but not chars or words. */
251 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
253 char *p = buf;
255 if (bytes_read == SAFE_READ_ERROR)
257 error (0, errno, "%s", file);
258 ok = false;
259 break;
262 while ((p = memchr (p, '\n', (buf + bytes_read) - p)))
264 ++p;
265 ++lines;
267 bytes += bytes_read;
270 #if HAVE_MBRTOWC && (MB_LEN_MAX > 1)
271 # define SUPPORT_OLD_MBRTOWC 1
272 else if (MB_CUR_MAX > 1)
274 bool in_word = false;
275 uintmax_t linepos = 0;
276 mbstate_t state = { 0, };
277 bool in_shift = false;
278 # if SUPPORT_OLD_MBRTOWC
279 /* Back-up the state before each multibyte character conversion and
280 move the last incomplete character of the buffer to the front
281 of the buffer. This is needed because we don't know whether
282 the `mbrtowc' function updates the state when it returns -2, -
283 this is the ISO C 99 and glibc-2.2 behaviour - or not - amended
284 ANSI C, glibc-2.1 and Solaris 5.7 behaviour. We don't have an
285 autoconf test for this, yet. */
286 size_t prev = 0; /* number of bytes carried over from previous round */
287 # else
288 const size_t prev = 0;
289 # endif
291 while ((bytes_read = safe_read (fd, buf + prev, BUFFER_SIZE - prev)) > 0)
293 const char *p;
294 # if SUPPORT_OLD_MBRTOWC
295 mbstate_t backup_state;
296 # endif
297 if (bytes_read == SAFE_READ_ERROR)
299 error (0, errno, "%s", file);
300 ok = false;
301 break;
304 bytes += bytes_read;
305 p = buf;
306 bytes_read += prev;
309 wchar_t wide_char;
310 size_t n;
312 if (!in_shift && is_basic (*p))
314 /* Handle most ASCII characters quickly, without calling
315 mbrtowc(). */
316 n = 1;
317 wide_char = *p;
319 else
321 in_shift = true;
322 # if SUPPORT_OLD_MBRTOWC
323 backup_state = state;
324 # endif
325 n = mbrtowc (&wide_char, p, bytes_read, &state);
326 if (n == (size_t) -2)
328 # if SUPPORT_OLD_MBRTOWC
329 state = backup_state;
330 # endif
331 break;
333 if (n == (size_t) -1)
335 /* Remember that we read a byte, but don't complain
336 about the error. Because of the decoding error,
337 this is a considered to be byte but not a
338 character (that is, chars is not incremented). */
339 p++;
340 bytes_read--;
341 continue;
343 if (mbsinit (&state))
344 in_shift = false;
345 if (n == 0)
347 wide_char = 0;
348 n = 1;
351 p += n;
352 bytes_read -= n;
353 chars++;
354 switch (wide_char)
356 case '\n':
357 lines++;
358 /* Fall through. */
359 case '\r':
360 case '\f':
361 if (linepos > linelength)
362 linelength = linepos;
363 linepos = 0;
364 goto mb_word_separator;
365 case '\t':
366 linepos += 8 - (linepos % 8);
367 goto mb_word_separator;
368 case ' ':
369 linepos++;
370 /* Fall through. */
371 case '\v':
372 mb_word_separator:
373 words += in_word;
374 in_word = false;
375 break;
376 default:
377 if (iswprint (wide_char))
379 int width = wcwidth (wide_char);
380 if (width > 0)
381 linepos += width;
382 if (iswspace (wide_char))
383 goto mb_word_separator;
384 in_word = true;
386 break;
389 while (bytes_read > 0);
391 # if SUPPORT_OLD_MBRTOWC
392 if (bytes_read > 0)
394 if (bytes_read == BUFFER_SIZE)
396 /* Encountered a very long redundant shift sequence. */
397 p++;
398 bytes_read--;
400 memmove (buf, p, bytes_read);
402 prev = bytes_read;
403 # endif
405 if (linepos > linelength)
406 linelength = linepos;
407 words += in_word;
409 #endif
410 else
412 bool in_word = false;
413 uintmax_t linepos = 0;
415 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
417 const char *p = buf;
418 if (bytes_read == SAFE_READ_ERROR)
420 error (0, errno, "%s", file);
421 ok = false;
422 break;
425 bytes += bytes_read;
428 switch (*p++)
430 case '\n':
431 lines++;
432 /* Fall through. */
433 case '\r':
434 case '\f':
435 if (linepos > linelength)
436 linelength = linepos;
437 linepos = 0;
438 goto word_separator;
439 case '\t':
440 linepos += 8 - (linepos % 8);
441 goto word_separator;
442 case ' ':
443 linepos++;
444 /* Fall through. */
445 case '\v':
446 word_separator:
447 words += in_word;
448 in_word = false;
449 break;
450 default:
451 if (isprint (to_uchar (p[-1])))
453 linepos++;
454 if (isspace (to_uchar (p[-1])))
455 goto word_separator;
456 in_word = true;
458 break;
461 while (--bytes_read);
463 if (linepos > linelength)
464 linelength = linepos;
465 words += in_word;
468 if (count_chars < print_chars)
469 chars = bytes;
471 write_counts (lines, words, chars, bytes, linelength, file_x);
472 total_lines += lines;
473 total_words += words;
474 total_chars += chars;
475 total_bytes += bytes;
476 if (linelength > max_line_length)
477 max_line_length = linelength;
479 return ok;
482 static bool
483 wc_file (char const *file, struct fstatus *fstatus)
485 if (! file || STREQ (file, "-"))
487 have_read_stdin = true;
488 if (O_BINARY && ! isatty (STDIN_FILENO))
489 freopen (NULL, "rb", stdin);
490 return wc (STDIN_FILENO, file, fstatus);
492 else
494 int fd = open (file, O_RDONLY | O_BINARY);
495 if (fd == -1)
497 error (0, errno, "%s", file);
498 return false;
500 else
502 bool ok = wc (fd, file, fstatus);
503 if (close (fd) != 0)
505 error (0, errno, "%s", file);
506 return false;
508 return ok;
513 /* Return the file status for the NFILES files addressed by FILE.
514 Optimize the case where only one number is printed, for just one
515 file; in that case we can use a print width of 1, so we don't need
516 to stat the file. */
518 static struct fstatus *
519 get_input_fstatus (int nfiles, char * const *file)
521 struct fstatus *fstatus = xnmalloc (nfiles, sizeof *fstatus);
523 if (nfiles == 1
524 && ((print_lines + print_words + print_chars
525 + print_bytes + print_linelength)
526 == 1))
527 fstatus[0].failed = 1;
528 else
530 int i;
532 for (i = 0; i < nfiles; i++)
533 fstatus[i].failed = (! file[i] || STREQ (file[i], "-")
534 ? fstat (STDIN_FILENO, &fstatus[i].st)
535 : stat (file[i], &fstatus[i].st));
538 return fstatus;
541 /* Return a print width suitable for the NFILES files whose status is
542 recorded in FSTATUS. Optimize the same special case that
543 get_input_fstatus optimizes. */
545 static int
546 compute_number_width (int nfiles, struct fstatus const *fstatus)
548 int width = 1;
550 if (0 < nfiles && fstatus[0].failed <= 0)
552 int minimum_width = 1;
553 uintmax_t regular_total = 0;
554 int i;
556 for (i = 0; i < nfiles; i++)
557 if (! fstatus[i].failed)
559 if (S_ISREG (fstatus[i].st.st_mode))
560 regular_total += fstatus[i].st.st_size;
561 else
562 minimum_width = 7;
565 for (; 10 <= regular_total; regular_total /= 10)
566 width++;
567 if (width < minimum_width)
568 width = minimum_width;
571 return width;
576 main (int argc, char **argv)
578 int i;
579 bool ok;
580 int optc;
581 int nfiles;
582 char **files;
583 char *files_from = NULL;
584 struct fstatus *fstatus;
585 struct Tokens tok;
587 initialize_main (&argc, &argv);
588 set_program_name (argv[0]);
589 setlocale (LC_ALL, "");
590 bindtextdomain (PACKAGE, LOCALEDIR);
591 textdomain (PACKAGE);
593 atexit (close_stdout);
595 print_lines = print_words = print_chars = print_bytes = false;
596 print_linelength = false;
597 total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
599 while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
600 switch (optc)
602 case 'c':
603 print_bytes = true;
604 break;
606 case 'm':
607 print_chars = true;
608 break;
610 case 'l':
611 print_lines = true;
612 break;
614 case 'w':
615 print_words = true;
616 break;
618 case 'L':
619 print_linelength = true;
620 break;
622 case FILES0_FROM_OPTION:
623 files_from = optarg;
624 break;
626 case_GETOPT_HELP_CHAR;
628 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
630 default:
631 usage (EXIT_FAILURE);
634 if (! (print_lines | print_words | print_chars | print_bytes
635 | print_linelength))
636 print_lines = print_words = print_bytes = true;
638 if (files_from)
640 FILE *stream;
642 /* When using --files0-from=F, you may not specify any files
643 on the command-line. */
644 if (optind < argc)
646 error (0, 0, _("extra operand %s"), quote (argv[optind]));
647 fprintf (stderr, "%s\n",
648 _("file operands cannot be combined with --files0-from"));
649 usage (EXIT_FAILURE);
652 if (STREQ (files_from, "-"))
653 stream = stdin;
654 else
656 stream = fopen (files_from, "r");
657 if (stream == NULL)
658 error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
659 quote (files_from));
662 readtokens0_init (&tok);
664 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
665 error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
666 quote (files_from));
668 files = tok.tok;
669 nfiles = tok.n_tok;
671 else
673 static char *stdin_only[2];
674 files = (optind < argc ? argv + optind : stdin_only);
675 nfiles = (optind < argc ? argc - optind : 1);
676 stdin_only[0] = NULL;
679 fstatus = get_input_fstatus (nfiles, files);
680 number_width = compute_number_width (nfiles, fstatus);
682 ok = true;
683 for (i = 0; i < nfiles; i++)
685 if (files[i])
687 if (files_from && STREQ (files_from, "-") && STREQ (files[i], "-"))
689 ok = false;
690 /* Give a better diagnostic in an unusual case:
691 printf - | wc --files0-from=- */
692 error (0, 0, _("when reading file names from stdin, "
693 "no file name of %s allowed"),
694 quote ("-"));
695 continue;
698 /* Diagnose a zero-length file name. When it's one
699 among many, knowing the record number may help. */
700 if (files[i][0] == '\0')
702 ok = false;
703 if (files_from)
705 /* Using the standard `filename:line-number:' prefix here is
706 not totally appropriate, since NUL is the separator, not NL,
707 but it might be better than nothing. */
708 unsigned long int file_number = i + 1;
709 error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
710 file_number, _("invalid zero-length file name"));
712 else
713 error (0, 0, "%s", _("invalid zero-length file name"));
714 continue;
718 ok &= wc_file (files[i], &fstatus[i]);
721 if (1 < nfiles)
722 write_counts (total_lines, total_words, total_chars, total_bytes,
723 max_line_length, _("total"));
725 free (fstatus);
727 if (have_read_stdin && close (STDIN_FILENO) != 0)
728 error (EXIT_FAILURE, errno, "-");
730 exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);