1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985-2022 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
25 #include <sys/types.h>
30 #include "argv-iter.h"
36 #include "readtokens0.h"
37 #include "safe-read.h"
38 #include "stat-size.h"
39 #include "xbinary-io.h"
40 #ifdef USE_AVX2_WC_LINECOUNT
44 #if !defined iswspace && !HAVE_ISWSPACE
45 # define iswspace(wc) \
46 ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
49 /* The official name of this program (e.g., no 'g' prefix). */
50 #define PROGRAM_NAME "wc"
53 proper_name ("Paul Rubin"), \
54 proper_name ("David MacKenzie")
56 /* Size of atomic reads. */
57 #define BUFFER_SIZE (16 * 1024)
60 wc_lines (char const *file
, int fd
, uintmax_t *lines_out
,
61 uintmax_t *bytes_out
);
62 #ifdef USE_AVX2_WC_LINECOUNT
65 wc_lines_avx2 (char const *file
, int fd
, uintmax_t *lines_out
,
66 uintmax_t *bytes_out
);
69 (*wc_lines_p
) (char const *file
, int fd
, uintmax_t *lines_out
,
70 uintmax_t *bytes_out
) = wc_lines
;
74 /* Cumulative number of lines, words, chars and bytes in all files so far.
75 max_line_length is the maximum over all files processed so far. */
76 static uintmax_t total_lines
;
77 static uintmax_t total_words
;
78 static uintmax_t total_chars
;
79 static uintmax_t total_bytes
;
80 static uintmax_t max_line_length
;
82 /* Which counts to print. */
83 static bool print_lines
, print_words
, print_chars
, print_bytes
;
84 static bool print_linelength
;
86 /* The print width of each count. */
87 static int number_width
;
89 /* True if we have ever read the standard input. */
90 static bool have_read_stdin
;
92 /* Used to determine if file size can be determined without reading. */
93 static size_t page_size
;
95 /* If true, do _not_ treat a non-breaking space as a word separator. */
96 static bool posixly_correct
;
98 /* The result of calling fstat or stat on a file descriptor or file. */
101 /* If positive, fstat or stat has not been called yet. Otherwise,
102 this is the value returned from fstat or stat. */
105 /* If FAILED is zero, this is the file's status. */
109 /* For long options that have no equivalent short option, use a
110 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
113 DEBUG_PROGRAM_OPTION
= CHAR_MAX
+ 1,
117 static struct option
const longopts
[] =
119 {"bytes", no_argument
, NULL
, 'c'},
120 {"chars", no_argument
, NULL
, 'm'},
121 {"lines", no_argument
, NULL
, 'l'},
122 {"words", no_argument
, NULL
, 'w'},
123 {"debug", no_argument
, NULL
, DEBUG_PROGRAM_OPTION
},
124 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
125 {"max-line-length", no_argument
, NULL
, 'L'},
126 {GETOPT_HELP_OPTION_DECL
},
127 {GETOPT_VERSION_OPTION_DECL
},
131 #ifdef USE_AVX2_WC_LINECOUNT
133 avx2_supported (void)
135 unsigned int eax
= 0;
136 unsigned int ebx
= 0;
137 unsigned int ecx
= 0;
138 unsigned int edx
= 0;
139 bool getcpuid_ok
= false;
140 bool avx_enabled
= false;
142 if (__get_cpuid (1, &eax
, &ebx
, &ecx
, &edx
))
145 if (ecx
& bit_OSXSAVE
)
146 avx_enabled
= true; /* Support is not disabled. */
152 eax
= ebx
= ecx
= edx
= 0;
153 if (! __get_cpuid_count (7, 0, &eax
, &ebx
, &ecx
, &edx
))
157 if (! (ebx
& bit_AVX2
))
158 avx_enabled
= false; /* Hardware doesn't support it. */
166 error (0, 0, "%s", _("failed to get cpuid"));
169 else if (! avx_enabled
)
172 error (0, 0, "%s", _("avx2 support not detected"));
178 error (0, 0, "%s", _("using avx2 hardware support"));
187 if (status
!= EXIT_SUCCESS
)
192 Usage: %s [OPTION]... [FILE]...\n\
193 or: %s [OPTION]... --files0-from=F\n\
195 program_name
, program_name
);
197 Print newline, word, and byte counts for each FILE, and a total line if\n\
198 more than one FILE is specified. A word is a non-zero-length sequence of\n\
199 printable characters delimited by white space.\n\
206 The options below may be used to select which counts are printed, always in\n\
207 the following order: newline, word, character, byte, maximum line length.\n\
208 -c, --bytes print the byte counts\n\
209 -m, --chars print the character counts\n\
210 -l, --lines print the newline counts\n\
213 --files0-from=F read input from the files specified by\n\
214 NUL-terminated names in file F;\n\
215 If F is - then read names from standard input\n\
216 -L, --max-line-length print the maximum display width\n\
217 -w, --words print the word counts\n\
219 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
220 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
221 emit_ancillary_info (PROGRAM_NAME
);
226 /* Return nonzero if WC is a non-breaking space and posixly_correct is false. */
229 iswnbspace (wint_t wc
)
231 return ! posixly_correct
232 && (wc
== 0x00A0 || wc
== 0x2007
233 || wc
== 0x202F || wc
== 0x2060);
239 return iswnbspace (btowc (c
));
242 /* FILE is the name of the file (or NULL for standard input)
243 associated with the specified counters. */
245 write_counts (uintmax_t lines
,
249 uintmax_t linelength
,
252 static char const format_sp_int
[] = " %*s";
253 char const *format_int
= format_sp_int
+ 1;
254 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
258 printf (format_int
, number_width
, umaxtostr (lines
, buf
));
259 format_int
= format_sp_int
;
263 printf (format_int
, number_width
, umaxtostr (words
, buf
));
264 format_int
= format_sp_int
;
268 printf (format_int
, number_width
, umaxtostr (chars
, buf
));
269 format_int
= format_sp_int
;
273 printf (format_int
, number_width
, umaxtostr (bytes
, buf
));
274 format_int
= format_sp_int
;
276 if (print_linelength
)
278 printf (format_int
, number_width
, umaxtostr (linelength
, buf
));
281 printf (" %s", strchr (file
, '\n') ? quotef (file
) : file
);
286 wc_lines (char const *file
, int fd
, uintmax_t *lines_out
, uintmax_t *bytes_out
)
289 uintmax_t lines
, bytes
;
290 char buf
[BUFFER_SIZE
+ 1];
291 bool long_lines
= false;
293 if (!lines_out
|| !bytes_out
)
300 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
303 if (bytes_read
== SAFE_READ_ERROR
)
305 error (0, errno
, "%s", quotef (file
));
312 char *end
= buf
+ bytes_read
;
313 uintmax_t plines
= lines
;
317 /* Avoid function call overhead for shorter lines. */
319 lines
+= *p
++ == '\n';
323 /* rawmemchr is more efficient with longer lines. */
325 while ((p
= rawmemchr (p
, '\n')) < end
)
332 /* If the average line length in the block is >= 15, then use
333 memchr for the next block, where system specific optimizations
334 may outweigh function call overhead.
335 FIXME: This line length was determined in 2015, on both
336 x86_64 and ppc64, but it's worth re-evaluating in future with
337 newer compilers, CPUs, or memchr() implementations etc. */
338 if (lines
- plines
<= bytes_read
/ 15)
350 /* Count words. FILE_X is the name of the file (or NULL for standard
351 input) that is open on descriptor FD. *FSTATUS is its status.
352 CURRENT_POS is the current file offset if known, negative if unknown.
353 Return true if successful. */
355 wc (int fd
, char const *file_x
, struct fstatus
*fstatus
, off_t current_pos
)
358 char buf
[BUFFER_SIZE
+ 1];
360 uintmax_t lines
, words
, chars
, bytes
, linelength
;
361 bool count_bytes
, count_chars
, count_complicated
;
362 char const *file
= file_x
? file_x
: _("standard input");
364 lines
= words
= chars
= bytes
= linelength
= 0;
366 /* If in the current locale, chars are equivalent to bytes, we prefer
367 counting bytes, because that's easier. */
371 count_bytes
= print_bytes
;
372 count_chars
= print_chars
;
377 count_bytes
= print_bytes
|| print_chars
;
380 count_complicated
= print_words
|| print_linelength
;
382 /* Advise the kernel of our access pattern only if we will read(). */
383 if (!count_bytes
|| count_chars
|| print_lines
|| count_complicated
)
384 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
386 /* When counting only bytes, save some line- and word-counting
387 overhead. If FD is a 'regular' Unix file, using lseek is enough
388 to get its 'size' in bytes. Otherwise, read blocks of BUFFER_SIZE
389 bytes at a time until EOF. Note that the 'size' (number of bytes)
390 that wc reports is smaller than stats.st_size when the file is not
391 positioned at its beginning. That's why the lseek calls below are
392 necessary. For example the command
393 '(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
394 should make wc report '0' bytes. */
396 if (count_bytes
&& !count_chars
&& !print_lines
&& !count_complicated
)
398 bool skip_read
= false;
400 if (0 < fstatus
->failed
)
401 fstatus
->failed
= fstat (fd
, &fstatus
->st
);
403 /* For sized files, seek to one st_blksize before EOF rather than to EOF.
404 This works better for files in proc-like file systems where
405 the size is only approximate. */
406 if (! fstatus
->failed
&& usable_st_size (&fstatus
->st
)
407 && 0 <= fstatus
->st
.st_size
)
409 size_t end_pos
= fstatus
->st
.st_size
;
411 current_pos
= lseek (fd
, 0, SEEK_CUR
);
413 if (end_pos
% page_size
)
415 /* We only need special handling of /proc and /sys files etc.
416 when they're a multiple of PAGE_SIZE. In the common case
417 for files with st_size not a multiple of PAGE_SIZE,
418 it's more efficient and accurate to use st_size.
420 Be careful here. The current position may actually be
421 beyond the end of the file, as in the example above. */
423 bytes
= end_pos
< current_pos
? 0 : end_pos
- current_pos
;
428 off_t hi_pos
= end_pos
- end_pos
% (ST_BLKSIZE (fstatus
->st
) + 1);
429 if (0 <= current_pos
&& current_pos
< hi_pos
430 && 0 <= lseek (fd
, hi_pos
, SEEK_CUR
))
431 bytes
= hi_pos
- current_pos
;
437 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
438 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
440 if (bytes_read
== SAFE_READ_ERROR
)
442 error (0, errno
, "%s", quotef (file
));
450 else if (!count_chars
&& !count_complicated
)
452 #ifdef USE_AVX2_WC_LINECOUNT
453 if (avx2_supported ())
454 wc_lines_p
= wc_lines_avx2
;
457 /* Use a separate loop when counting only lines or lines and bytes --
458 but not chars or words. */
459 ok
= wc_lines_p (file
, fd
, &lines
, &bytes
);
462 # define SUPPORT_OLD_MBRTOWC 1
463 else if (MB_CUR_MAX
> 1)
465 bool in_word
= false;
466 uintmax_t linepos
= 0;
467 mbstate_t state
= { 0, };
468 bool in_shift
= false;
469 # if SUPPORT_OLD_MBRTOWC
470 /* Back up the state before each multibyte character conversion and
471 move the last incomplete character of the buffer to the front
472 of the buffer. This is needed because we don't know whether
473 the 'mbrtowc' function updates the state when it returns -2
474 (the ISO C 99 and glibc-2.2 behaviour) or not (the amended
475 ANSI C, glibc-2.1 and Solaris 5.7 behaviour). We don't have an
476 autoconf test for this, yet. */
477 size_t prev
= 0; /* number of bytes carried over from previous round */
479 const size_t prev
= 0;
482 while ((bytes_read
= safe_read (fd
, buf
+ prev
, BUFFER_SIZE
- prev
)) > 0)
485 # if SUPPORT_OLD_MBRTOWC
486 mbstate_t backup_state
;
488 if (bytes_read
== SAFE_READ_ERROR
)
490 error (0, errno
, "%s", quotef (file
));
504 if (!in_shift
&& is_basic (*p
))
506 /* Handle most ASCII characters quickly, without calling
515 # if SUPPORT_OLD_MBRTOWC
516 backup_state
= state
;
518 n
= mbrtowc (&wide_char
, p
, bytes_read
, &state
);
519 if (n
== (size_t) -2)
521 # if SUPPORT_OLD_MBRTOWC
522 state
= backup_state
;
526 if (n
== (size_t) -1)
528 /* Remember that we read a byte, but don't complain
529 about the error. Because of the decoding error,
530 this is considered to be a byte but not a
531 character (that is, chars is not incremented). */
536 if (mbsinit (&state
))
552 if (linepos
> linelength
)
553 linelength
= linepos
;
555 goto mb_word_separator
;
557 linepos
+= 8 - (linepos
% 8);
558 goto mb_word_separator
;
568 if (wide
&& iswprint (wide_char
))
570 /* wcwidth can be expensive on OSX for example,
571 so avoid it if unneeded. */
572 if (print_linelength
)
574 int width
= wcwidth (wide_char
);
578 if (iswspace (wide_char
) || iswnbspace (wide_char
))
579 goto mb_word_separator
;
582 else if (!wide
&& isprint (to_uchar (*p
)))
585 if (isspace (to_uchar (*p
)))
586 goto mb_word_separator
;
596 while (bytes_read
> 0);
598 # if SUPPORT_OLD_MBRTOWC
601 if (bytes_read
== BUFFER_SIZE
)
603 /* Encountered a very long redundant shift sequence. */
607 memmove (buf
, p
, bytes_read
);
612 if (linepos
> linelength
)
613 linelength
= linepos
;
619 bool in_word
= false;
620 uintmax_t linepos
= 0;
622 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
625 if (bytes_read
== SAFE_READ_ERROR
)
627 error (0, errno
, "%s", quotef (file
));
642 if (linepos
> linelength
)
643 linelength
= linepos
;
647 linepos
+= 8 - (linepos
% 8);
658 if (isprint (to_uchar (p
[-1])))
661 if (isspace (to_uchar (p
[-1]))
662 || isnbspace (to_uchar (p
[-1])))
669 while (--bytes_read
);
671 if (linepos
> linelength
)
672 linelength
= linepos
;
676 if (count_chars
< print_chars
)
679 write_counts (lines
, words
, chars
, bytes
, linelength
, file_x
);
680 total_lines
+= lines
;
681 total_words
+= words
;
682 total_chars
+= chars
;
683 total_bytes
+= bytes
;
684 if (linelength
> max_line_length
)
685 max_line_length
= linelength
;
691 wc_file (char const *file
, struct fstatus
*fstatus
)
693 if (! file
|| STREQ (file
, "-"))
695 have_read_stdin
= true;
696 xset_binary_mode (STDIN_FILENO
, O_BINARY
);
697 return wc (STDIN_FILENO
, file
, fstatus
, -1);
701 int fd
= open (file
, O_RDONLY
| O_BINARY
);
704 error (0, errno
, "%s", quotef (file
));
709 bool ok
= wc (fd
, file
, fstatus
, 0);
712 error (0, errno
, "%s", quotef (file
));
720 /* Return the file status for the NFILES files addressed by FILE.
721 Optimize the case where only one number is printed, for just one
722 file; in that case we can use a print width of 1, so we don't need
723 to stat the file. Handle the case of (nfiles == 0) in the same way;
724 that happens when we don't know how long the list of file names will be. */
726 static struct fstatus
*
727 get_input_fstatus (size_t nfiles
, char *const *file
)
729 struct fstatus
*fstatus
= xnmalloc (nfiles
? nfiles
: 1, sizeof *fstatus
);
733 && ((print_lines
+ print_words
+ print_chars
734 + print_bytes
+ print_linelength
)
736 fstatus
[0].failed
= 1;
739 for (size_t i
= 0; i
< nfiles
; i
++)
740 fstatus
[i
].failed
= (! file
[i
] || STREQ (file
[i
], "-")
741 ? fstat (STDIN_FILENO
, &fstatus
[i
].st
)
742 : stat (file
[i
], &fstatus
[i
].st
));
748 /* Return a print width suitable for the NFILES files whose status is
749 recorded in FSTATUS. Optimize the same special case that
750 get_input_fstatus optimizes. */
754 compute_number_width (size_t nfiles
, struct fstatus
const *fstatus
)
758 if (0 < nfiles
&& fstatus
[0].failed
<= 0)
760 int minimum_width
= 1;
761 uintmax_t regular_total
= 0;
763 for (size_t i
= 0; i
< nfiles
; i
++)
764 if (! fstatus
[i
].failed
)
766 if (S_ISREG (fstatus
[i
].st
.st_mode
))
767 regular_total
+= fstatus
[i
].st
.st_size
;
772 for (; 10 <= regular_total
; regular_total
/= 10)
774 if (width
< minimum_width
)
775 width
= minimum_width
;
783 main (int argc
, char **argv
)
789 char *files_from
= NULL
;
790 struct fstatus
*fstatus
;
793 initialize_main (&argc
, &argv
);
794 set_program_name (argv
[0]);
795 setlocale (LC_ALL
, "");
796 bindtextdomain (PACKAGE
, LOCALEDIR
);
797 textdomain (PACKAGE
);
799 atexit (close_stdout
);
801 page_size
= getpagesize ();
802 /* Line buffer stdout to ensure lines are written atomically and immediately
803 so that processes running in parallel do not intersperse their output. */
804 setvbuf (stdout
, NULL
, _IOLBF
, 0);
806 posixly_correct
= (getenv ("POSIXLY_CORRECT") != NULL
);
808 print_lines
= print_words
= print_chars
= print_bytes
= false;
809 print_linelength
= false;
810 total_lines
= total_words
= total_chars
= total_bytes
= max_line_length
= 0;
812 while ((optc
= getopt_long (argc
, argv
, "clLmw", longopts
, NULL
)) != -1)
832 print_linelength
= true;
835 case DEBUG_PROGRAM_OPTION
:
839 case FILES0_FROM_OPTION
:
843 case_GETOPT_HELP_CHAR
;
845 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
848 usage (EXIT_FAILURE
);
851 if (! (print_lines
|| print_words
|| print_chars
|| print_bytes
852 || print_linelength
))
853 print_lines
= print_words
= print_bytes
= true;
855 bool read_tokens
= false;
856 struct argv_iterator
*ai
;
861 /* When using --files0-from=F, you may not specify any files
862 on the command-line. */
865 error (0, 0, _("extra operand %s"), quoteaf (argv
[optind
]));
866 fprintf (stderr
, "%s\n",
867 _("file operands cannot be combined with --files0-from"));
868 usage (EXIT_FAILURE
);
871 if (STREQ (files_from
, "-"))
875 stream
= fopen (files_from
, "r");
877 die (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
878 quoteaf (files_from
));
881 /* Read the file list into RAM if we can detect its size and that
882 size is reasonable. Otherwise, we'll read a name at a time. */
884 if (fstat (fileno (stream
), &st
) == 0
885 && S_ISREG (st
.st_mode
)
886 && st
.st_size
<= MIN (10 * 1024 * 1024, physmem_available () / 2))
889 readtokens0_init (&tok
);
890 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
891 die (EXIT_FAILURE
, 0, _("cannot read file names from %s"),
892 quoteaf (files_from
));
895 ai
= argv_iter_init_argv (files
);
901 ai
= argv_iter_init_stream (stream
);
906 static char *stdin_only
[] = { NULL
};
907 files
= (optind
< argc
? argv
+ optind
: stdin_only
);
908 nfiles
= (optind
< argc
? argc
- optind
: 1);
909 ai
= argv_iter_init_argv (files
);
915 fstatus
= get_input_fstatus (nfiles
, files
);
916 number_width
= compute_number_width (nfiles
, fstatus
);
919 for (int i
= 0; /* */; i
++)
921 bool skip_file
= false;
922 enum argv_iter_err ai_err
;
923 char *file_name
= argv_iter (ai
, &ai_err
);
931 error (0, errno
, _("%s: read error"),
932 quotef (files_from
));
938 assert (!"unexpected error code from argv_iter");
941 if (files_from
&& STREQ (files_from
, "-") && STREQ (file_name
, "-"))
943 /* Give a better diagnostic in an unusual case:
944 printf - | wc --files0-from=- */
945 error (0, 0, _("when reading file names from stdin, "
946 "no file name of %s allowed"),
947 quoteaf (file_name
));
953 /* Diagnose a zero-length file name. When it's one
954 among many, knowing the record number may help.
955 FIXME: currently print the record number only with
956 --files0-from=FILE. Maybe do it for argv, too? */
957 if (files_from
== NULL
)
958 error (0, 0, "%s", _("invalid zero-length file name"));
961 /* Using the standard 'filename:line-number:' prefix here is
962 not totally appropriate, since NUL is the separator, not NL,
963 but it might be better than nothing. */
964 unsigned long int file_number
= argv_iter_n_args (ai
);
965 error (0, 0, "%s:%lu: %s", quotef (files_from
),
966 file_number
, _("invalid zero-length file name"));
974 ok
&= wc_file (file_name
, &fstatus
[nfiles
? i
: 0]);
977 fstatus
[0].failed
= 1;
981 /* No arguments on the command line is fine. That means read from stdin.
982 However, no arguments on the --files0-from input stream means
983 don't read anything. */
984 if (ok
&& !files_from
&& argv_iter_n_args (ai
) == 0)
985 ok
&= wc_file (NULL
, &fstatus
[0]);
988 readtokens0_free (&tok
);
990 if (1 < argv_iter_n_args (ai
))
991 write_counts (total_lines
, total_words
, total_chars
, total_bytes
,
992 max_line_length
, _("total"));
998 if (have_read_stdin
&& close (STDIN_FILENO
) != 0)
999 die (EXIT_FAILURE
, errno
, "-");
1001 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;