1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985-2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
25 #include <sys/types.h>
30 #include "argv-iter.h"
36 #include "readtokens0.h"
37 #include "safe-read.h"
38 #include "stat-size.h"
39 #include "xbinary-io.h"
41 #if !defined iswspace && !HAVE_ISWSPACE
42 # define iswspace(wc) \
43 ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
46 /* The official name of this program (e.g., no 'g' prefix). */
47 #define PROGRAM_NAME "wc"
50 proper_name ("Paul Rubin"), \
51 proper_name ("David MacKenzie")
/* Number of bytes requested from the input descriptor by each
   safe_read call; buffers are sized from this constant.  */
#define BUFFER_SIZE (16 * 1024)
/* Running totals accumulated over every file counted so far.  Each
   per-file result is added to these after it has been printed.  */
static uintmax_t total_lines = 0;   /* newline count */
static uintmax_t total_words = 0;   /* word count */
static uintmax_t total_chars = 0;   /* character count */
static uintmax_t total_bytes = 0;   /* byte count */

/* Unlike the totals above this is not a sum: it holds the largest
   line length seen in any file processed so far.  */
static uintmax_t max_line_length = 0;
/* Selection flags: each one, when true, requests that the
   corresponding count be printed.  */
static bool print_lines = false;        /* newline counts */
static bool print_words = false;        /* word counts */
static bool print_chars = false;        /* character counts */
static bool print_bytes = false;        /* byte counts */
static bool print_linelength = false;   /* maximum line length */
/* Field width used when printing each count.  */
static int number_width = 0;

/* Becomes true once standard input has been read, so that it can be
   closed and checked for errors before exiting.  */
static bool have_read_stdin = false;

/* System page size; consulted when deciding whether a file's size can
   be taken from its status without reading the file.  */
static size_t page_size = 0;
77 /* The result of calling fstat or stat on a file descriptor or file. */
80 /* If positive, fstat or stat has not been called yet. Otherwise,
81 this is the value returned from fstat or stat. */
84 /* If FAILED is zero, this is the file's status. */
88 /* For long options that have no equivalent short option, use a
89 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
92 FILES0_FROM_OPTION
= CHAR_MAX
+ 1
95 static struct option
const longopts
[] =
97 {"bytes", no_argument
, NULL
, 'c'},
98 {"chars", no_argument
, NULL
, 'm'},
99 {"lines", no_argument
, NULL
, 'l'},
100 {"words", no_argument
, NULL
, 'w'},
101 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
102 {"max-line-length", no_argument
, NULL
, 'L'},
103 {GETOPT_HELP_OPTION_DECL
},
104 {GETOPT_VERSION_OPTION_DECL
},
111 if (status
!= EXIT_SUCCESS
)
116 Usage: %s [OPTION]... [FILE]...\n\
117 or: %s [OPTION]... --files0-from=F\n\
119 program_name
, program_name
);
121 Print newline, word, and byte counts for each FILE, and a total line if\n\
122 more than one FILE is specified. A word is a non-zero-length sequence of\n\
123 characters delimited by white space.\n\
130 The options below may be used to select which counts are printed, always in\n\
131 the following order: newline, word, character, byte, maximum line length.\n\
132 -c, --bytes print the byte counts\n\
133 -m, --chars print the character counts\n\
134 -l, --lines print the newline counts\n\
137 --files0-from=F read input from the files specified by\n\
138 NUL-terminated names in file F;\n\
139 If F is - then read names from standard input\n\
140 -L, --max-line-length print the maximum display width\n\
141 -w, --words print the word counts\n\
143 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
144 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
145 emit_ancillary_info (PROGRAM_NAME
);
150 /* FILE is the name of the file (or NULL for standard input)
151 associated with the specified counters. */
153 write_counts (uintmax_t lines
,
157 uintmax_t linelength
,
160 static char const format_sp_int
[] = " %*s";
161 char const *format_int
= format_sp_int
+ 1;
162 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
166 printf (format_int
, number_width
, umaxtostr (lines
, buf
));
167 format_int
= format_sp_int
;
171 printf (format_int
, number_width
, umaxtostr (words
, buf
));
172 format_int
= format_sp_int
;
176 printf (format_int
, number_width
, umaxtostr (chars
, buf
));
177 format_int
= format_sp_int
;
181 printf (format_int
, number_width
, umaxtostr (bytes
, buf
));
182 format_int
= format_sp_int
;
184 if (print_linelength
)
186 printf (format_int
, number_width
, umaxtostr (linelength
, buf
));
189 printf (" %s", strchr (file
, '\n') ? quotef (file
) : file
);
193 /* Count words. FILE_X is the name of the file (or NULL for standard
194 input) that is open on descriptor FD. *FSTATUS is its status.
195 CURRENT_POS is the current file offset if known, negative if unknown.
196 Return true if successful. */
198 wc (int fd
, char const *file_x
, struct fstatus
*fstatus
, off_t current_pos
)
201 char buf
[BUFFER_SIZE
+ 1];
203 uintmax_t lines
, words
, chars
, bytes
, linelength
;
204 bool count_bytes
, count_chars
, count_complicated
;
205 char const *file
= file_x
? file_x
: _("standard input");
207 lines
= words
= chars
= bytes
= linelength
= 0;
209 /* If in the current locale, chars are equivalent to bytes, we prefer
210 counting bytes, because that's easier. */
214 count_bytes
= print_bytes
;
215 count_chars
= print_chars
;
220 count_bytes
= print_bytes
|| print_chars
;
223 count_complicated
= print_words
|| print_linelength
;
225 /* Advise the kernel of our access pattern only if we will read(). */
226 if (!count_bytes
|| count_chars
|| print_lines
|| count_complicated
)
227 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
229 /* When counting only bytes, save some line- and word-counting
230 overhead. If FD is a 'regular' Unix file, using lseek is enough
231 to get its 'size' in bytes. Otherwise, read blocks of BUFFER_SIZE
232 bytes at a time until EOF. Note that the 'size' (number of bytes)
233 that wc reports is smaller than stats.st_size when the file is not
234 positioned at its beginning. That's why the lseek calls below are
235 necessary. For example the command
236 '(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
237 should make wc report '0' bytes. */
239 if (count_bytes
&& !count_chars
&& !print_lines
&& !count_complicated
)
241 bool skip_read
= false;
243 if (0 < fstatus
->failed
)
244 fstatus
->failed
= fstat (fd
, &fstatus
->st
);
246 /* For sized files, seek to one st_blksize before EOF rather than to EOF.
247 This works better for files in proc-like file systems where
248 the size is only approximate. */
249 if (! fstatus
->failed
&& usable_st_size (&fstatus
->st
)
250 && 0 <= fstatus
->st
.st_size
)
252 size_t end_pos
= fstatus
->st
.st_size
;
254 current_pos
= lseek (fd
, 0, SEEK_CUR
);
256 if (end_pos
% page_size
)
/* We only need special handling of /proc and /sys files etc.
   when their size is a multiple of PAGE_SIZE.  In the common case
   for files with st_size not a multiple of PAGE_SIZE,
   it's more efficient and accurate to use st_size.

   Be careful here: the current position may actually be
   beyond the end of the file, as in the example above.  */
266 bytes
= end_pos
< current_pos
? 0 : end_pos
- current_pos
;
271 off_t hi_pos
= end_pos
- end_pos
% (ST_BLKSIZE (fstatus
->st
) + 1);
272 if (0 <= current_pos
&& current_pos
< hi_pos
273 && 0 <= lseek (fd
, hi_pos
, SEEK_CUR
))
274 bytes
= hi_pos
- current_pos
;
280 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
281 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
283 if (bytes_read
== SAFE_READ_ERROR
)
285 error (0, errno
, "%s", quotef (file
));
293 else if (!count_chars
&& !count_complicated
)
295 /* Use a separate loop when counting only lines or lines and bytes --
296 but not chars or words. */
297 bool long_lines
= false;
298 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
300 if (bytes_read
== SAFE_READ_ERROR
)
302 error (0, errno
, "%s", quotef (file
));
310 char *end
= p
+ bytes_read
;
311 uintmax_t plines
= lines
;
315 /* Avoid function call overhead for shorter lines. */
317 lines
+= *p
++ == '\n';
321 /* memchr is more efficient with longer lines. */
322 while ((p
= memchr (p
, '\n', end
- p
)))
329 /* If the average line length in the block is >= 15, then use
330 memchr for the next block, where system specific optimizations
331 may outweigh function call overhead.
332 FIXME: This line length was determined in 2015, on both
333 x86_64 and ppc64, but it's worth re-evaluating in future with
334 newer compilers, CPUs, or memchr() implementations etc. */
335 if (lines
- plines
<= bytes_read
/ 15)
342 # define SUPPORT_OLD_MBRTOWC 1
343 else if (MB_CUR_MAX
> 1)
345 bool in_word
= false;
346 uintmax_t linepos
= 0;
347 mbstate_t state
= { 0, };
348 bool in_shift
= false;
349 # if SUPPORT_OLD_MBRTOWC
/* Back up the state before each multibyte character conversion and
   move the last incomplete character of the buffer to the front
   of the buffer.  This is needed because we don't know whether
   the 'mbrtowc' function updates the state when it returns -2
   (the ISO C 99 and glibc-2.2 behaviour) or leaves it unchanged
   (the amended ANSI C, glibc-2.1 and Solaris 5.7 behaviour).
   We don't have an autoconf test for this, yet.  */
357 size_t prev
= 0; /* number of bytes carried over from previous round */
359 const size_t prev
= 0;
362 while ((bytes_read
= safe_read (fd
, buf
+ prev
, BUFFER_SIZE
- prev
)) > 0)
365 # if SUPPORT_OLD_MBRTOWC
366 mbstate_t backup_state
;
368 if (bytes_read
== SAFE_READ_ERROR
)
370 error (0, errno
, "%s", quotef (file
));
383 if (!in_shift
&& is_basic (*p
))
385 /* Handle most ASCII characters quickly, without calling
393 # if SUPPORT_OLD_MBRTOWC
394 backup_state
= state
;
396 n
= mbrtowc (&wide_char
, p
, bytes_read
, &state
);
397 if (n
== (size_t) -2)
399 # if SUPPORT_OLD_MBRTOWC
400 state
= backup_state
;
404 if (n
== (size_t) -1)
/* Remember that we read a byte, but don't complain
   about the error.  Because of the decoding error,
   this is considered to be a byte but not a
   character (that is, chars is not incremented).  */
414 if (mbsinit (&state
))
432 if (linepos
> linelength
)
433 linelength
= linepos
;
435 goto mb_word_separator
;
437 linepos
+= 8 - (linepos
% 8);
438 goto mb_word_separator
;
448 if (iswprint (wide_char
))
450 int width
= wcwidth (wide_char
);
453 if (iswspace (wide_char
))
454 goto mb_word_separator
;
460 while (bytes_read
> 0);
462 # if SUPPORT_OLD_MBRTOWC
465 if (bytes_read
== BUFFER_SIZE
)
467 /* Encountered a very long redundant shift sequence. */
471 memmove (buf
, p
, bytes_read
);
476 if (linepos
> linelength
)
477 linelength
= linepos
;
483 bool in_word
= false;
484 uintmax_t linepos
= 0;
486 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
489 if (bytes_read
== SAFE_READ_ERROR
)
491 error (0, errno
, "%s", quotef (file
));
506 if (linepos
> linelength
)
507 linelength
= linepos
;
511 linepos
+= 8 - (linepos
% 8);
522 if (isprint (to_uchar (p
[-1])))
525 if (isspace (to_uchar (p
[-1])))
532 while (--bytes_read
);
534 if (linepos
> linelength
)
535 linelength
= linepos
;
539 if (count_chars
< print_chars
)
542 write_counts (lines
, words
, chars
, bytes
, linelength
, file_x
);
543 total_lines
+= lines
;
544 total_words
+= words
;
545 total_chars
+= chars
;
546 total_bytes
+= bytes
;
547 if (linelength
> max_line_length
)
548 max_line_length
= linelength
;
554 wc_file (char const *file
, struct fstatus
*fstatus
)
556 if (! file
|| STREQ (file
, "-"))
558 have_read_stdin
= true;
559 xset_binary_mode (STDIN_FILENO
, O_BINARY
);
560 return wc (STDIN_FILENO
, file
, fstatus
, -1);
564 int fd
= open (file
, O_RDONLY
| O_BINARY
);
567 error (0, errno
, "%s", quotef (file
));
572 bool ok
= wc (fd
, file
, fstatus
, 0);
575 error (0, errno
, "%s", quotef (file
));
583 /* Return the file status for the NFILES files addressed by FILE.
584 Optimize the case where only one number is printed, for just one
585 file; in that case we can use a print width of 1, so we don't need
586 to stat the file. Handle the case of (nfiles == 0) in the same way;
587 that happens when we don't know how long the list of file names will be. */
589 static struct fstatus
*
590 get_input_fstatus (size_t nfiles
, char *const *file
)
592 struct fstatus
*fstatus
= xnmalloc (nfiles
? nfiles
: 1, sizeof *fstatus
);
596 && ((print_lines
+ print_words
+ print_chars
597 + print_bytes
+ print_linelength
)
599 fstatus
[0].failed
= 1;
602 for (size_t i
= 0; i
< nfiles
; i
++)
603 fstatus
[i
].failed
= (! file
[i
] || STREQ (file
[i
], "-")
604 ? fstat (STDIN_FILENO
, &fstatus
[i
].st
)
605 : stat (file
[i
], &fstatus
[i
].st
));
611 /* Return a print width suitable for the NFILES files whose status is
612 recorded in FSTATUS. Optimize the same special case that
613 get_input_fstatus optimizes. */
615 static int _GL_ATTRIBUTE_PURE
616 compute_number_width (size_t nfiles
, struct fstatus
const *fstatus
)
620 if (0 < nfiles
&& fstatus
[0].failed
<= 0)
622 int minimum_width
= 1;
623 uintmax_t regular_total
= 0;
625 for (size_t i
= 0; i
< nfiles
; i
++)
626 if (! fstatus
[i
].failed
)
628 if (S_ISREG (fstatus
[i
].st
.st_mode
))
629 regular_total
+= fstatus
[i
].st
.st_size
;
634 for (; 10 <= regular_total
; regular_total
/= 10)
636 if (width
< minimum_width
)
637 width
= minimum_width
;
645 main (int argc
, char **argv
)
651 char *files_from
= NULL
;
652 struct fstatus
*fstatus
;
655 initialize_main (&argc
, &argv
);
656 set_program_name (argv
[0]);
657 setlocale (LC_ALL
, "");
658 bindtextdomain (PACKAGE
, LOCALEDIR
);
659 textdomain (PACKAGE
);
661 atexit (close_stdout
);
663 page_size
= getpagesize ();
664 /* Line buffer stdout to ensure lines are written atomically and immediately
665 so that processes running in parallel do not intersperse their output. */
666 setvbuf (stdout
, NULL
, _IOLBF
, 0);
668 print_lines
= print_words
= print_chars
= print_bytes
= false;
669 print_linelength
= false;
670 total_lines
= total_words
= total_chars
= total_bytes
= max_line_length
= 0;
672 while ((optc
= getopt_long (argc
, argv
, "clLmw", longopts
, NULL
)) != -1)
692 print_linelength
= true;
695 case FILES0_FROM_OPTION
:
699 case_GETOPT_HELP_CHAR
;
701 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
704 usage (EXIT_FAILURE
);
707 if (! (print_lines
|| print_words
|| print_chars
|| print_bytes
708 || print_linelength
))
709 print_lines
= print_words
= print_bytes
= true;
711 bool read_tokens
= false;
712 struct argv_iterator
*ai
;
717 /* When using --files0-from=F, you may not specify any files
718 on the command-line. */
721 error (0, 0, _("extra operand %s"), quoteaf (argv
[optind
]));
722 fprintf (stderr
, "%s\n",
723 _("file operands cannot be combined with --files0-from"));
724 usage (EXIT_FAILURE
);
727 if (STREQ (files_from
, "-"))
731 stream
= fopen (files_from
, "r");
733 die (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
734 quoteaf (files_from
));
737 /* Read the file list into RAM if we can detect its size and that
738 size is reasonable. Otherwise, we'll read a name at a time. */
740 if (fstat (fileno (stream
), &st
) == 0
741 && S_ISREG (st
.st_mode
)
742 && st
.st_size
<= MIN (10 * 1024 * 1024, physmem_available () / 2))
745 readtokens0_init (&tok
);
746 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
747 die (EXIT_FAILURE
, 0, _("cannot read file names from %s"),
748 quoteaf (files_from
));
751 ai
= argv_iter_init_argv (files
);
757 ai
= argv_iter_init_stream (stream
);
762 static char *stdin_only
[] = { NULL
};
763 files
= (optind
< argc
? argv
+ optind
: stdin_only
);
764 nfiles
= (optind
< argc
? argc
- optind
: 1);
765 ai
= argv_iter_init_argv (files
);
771 fstatus
= get_input_fstatus (nfiles
, files
);
772 number_width
= compute_number_width (nfiles
, fstatus
);
775 for (int i
= 0; /* */; i
++)
777 bool skip_file
= false;
778 enum argv_iter_err ai_err
;
779 char *file_name
= argv_iter (ai
, &ai_err
);
787 error (0, errno
, _("%s: read error"),
788 quotef (files_from
));
794 assert (!"unexpected error code from argv_iter");
797 if (files_from
&& STREQ (files_from
, "-") && STREQ (file_name
, "-"))
799 /* Give a better diagnostic in an unusual case:
800 printf - | wc --files0-from=- */
801 error (0, 0, _("when reading file names from stdin, "
802 "no file name of %s allowed"),
803 quoteaf (file_name
));
809 /* Diagnose a zero-length file name. When it's one
810 among many, knowing the record number may help.
811 FIXME: currently print the record number only with
812 --files0-from=FILE. Maybe do it for argv, too? */
813 if (files_from
== NULL
)
814 error (0, 0, "%s", _("invalid zero-length file name"));
817 /* Using the standard 'filename:line-number:' prefix here is
818 not totally appropriate, since NUL is the separator, not NL,
819 but it might be better than nothing. */
820 unsigned long int file_number
= argv_iter_n_args (ai
);
821 error (0, 0, "%s:%lu: %s", quotef (files_from
),
822 file_number
, _("invalid zero-length file name"));
830 ok
&= wc_file (file_name
, &fstatus
[nfiles
? i
: 0]);
833 fstatus
[0].failed
= 1;
/* No arguments on the command line is fine.  That means read from stdin.
   However, when --files0-from supplies no file names, stdin is not used
   as a fallback: in that case nothing is read at all.  */
840 if (ok
&& !files_from
&& argv_iter_n_args (ai
) == 0)
841 ok
&= wc_file (NULL
, &fstatus
[0]);
844 readtokens0_free (&tok
);
846 if (1 < argv_iter_n_args (ai
))
847 write_counts (total_lines
, total_words
, total_chars
, total_bytes
,
848 max_line_length
, _("total"));
854 if (have_read_stdin
&& close (STDIN_FILENO
) != 0)
855 die (EXIT_FAILURE
, errno
, "-");
857 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;