1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
25 #include <sys/types.h>
31 #include "argv-iter.h"
37 #include "readtokens0.h"
38 #include "safe-read.h"
39 #include "stat-size.h"
40 #include "xbinary-io.h"
41 #ifdef USE_AVX2_WC_LINECOUNT
45 #if !defined iswspace && !HAVE_ISWSPACE
46 # define iswspace(wc) \
47 ((wc) == to_uchar (wc) && isspace (to_uchar (wc)))
50 /* The official name of this program (e.g., no 'g' prefix). */
51 #define PROGRAM_NAME "wc"
54 proper_name ("Paul Rubin"), \
55 proper_name ("David MacKenzie")
57 /* Size of atomic reads. */
58 #define BUFFER_SIZE (16 * 1024)
61 wc_lines (char const *file
, int fd
, uintmax_t *lines_out
,
62 uintmax_t *bytes_out
);
63 #ifdef USE_AVX2_WC_LINECOUNT
66 wc_lines_avx2 (char const *file
, int fd
, uintmax_t *lines_out
,
67 uintmax_t *bytes_out
);
70 (*wc_lines_p
) (char const *file
, int fd
, uintmax_t *lines_out
,
71 uintmax_t *bytes_out
) = wc_lines
;
75 /* Cumulative number of lines, words, chars and bytes in all files so far.
76 max_line_length is the maximum over all files processed so far. */
77 static uintmax_t total_lines
;
78 static uintmax_t total_words
;
79 static uintmax_t total_chars
;
80 static uintmax_t total_bytes
;
81 static uintmax_t max_line_length
;
83 /* Which counts to print. */
84 static bool print_lines
, print_words
, print_chars
, print_bytes
;
85 static bool print_linelength
;
87 /* The print width of each count. */
88 static int number_width
;
90 /* True if we have ever read the standard input. */
91 static bool have_read_stdin
;
93 /* Used to determine if file size can be determined without reading. */
94 static size_t page_size
;
96 /* Enable to _not_ treat non breaking space as a word separator. */
97 static bool posixly_correct
;
99 /* The result of calling fstat or stat on a file descriptor or file. */
102 /* If positive, fstat or stat has not been called yet. Otherwise,
103 this is the value returned from fstat or stat. */
106 /* If FAILED is zero, this is the file's status. */
110 /* For long options that have no equivalent short option, use a
111 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
114 DEBUG_PROGRAM_OPTION
= CHAR_MAX
+ 1,
119 static struct option
const longopts
[] =
121 {"bytes", no_argument
, NULL
, 'c'},
122 {"chars", no_argument
, NULL
, 'm'},
123 {"lines", no_argument
, NULL
, 'l'},
124 {"words", no_argument
, NULL
, 'w'},
125 {"debug", no_argument
, NULL
, DEBUG_PROGRAM_OPTION
},
126 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
127 {"max-line-length", no_argument
, NULL
, 'L'},
128 {"total", required_argument
, NULL
, TOTAL_OPTION
},
129 {GETOPT_HELP_OPTION_DECL
},
130 {GETOPT_VERSION_OPTION_DECL
},
136 total_auto
, /* 0: default or --total=auto */
137 total_always
, /* 1: --total=always */
138 total_only
, /* 2: --total=only */
139 total_never
/* 3: --total=never */
141 static char const *const total_args
[] =
143 "auto", "always", "only", "never", NULL
145 static enum total_type
const total_types
[] =
147 total_auto
, total_always
, total_only
, total_never
149 ARGMATCH_VERIFY (total_args
, total_types
);
150 static enum total_type total_mode
= total_auto
;
152 #ifdef USE_AVX2_WC_LINECOUNT
/* Probe the CPU with cpuid to decide whether AVX2 can be used.
   Leaf 1 is queried for the OSXSAVE bit in ECX; leaf 7, subleaf 0 is
   queried for the AVX2 bit in EBX.  One of three diagnostics is
   reported through error (): cpuid failure, AVX2 not detected, or
   AVX2 in use.
   NOTE(review): the return statement and some of the conditions that
   guard these steps are not visible in this excerpt.  */
154 avx2_supported (void)
156 unsigned int eax
= 0;
157 unsigned int ebx
= 0;
158 unsigned int ecx
= 0;
159 unsigned int edx
= 0;
160 bool getcpuid_ok
= false;
161 bool avx_enabled
= false;
/* cpuid leaf 1: avx_enabled starts out false and is only turned on
   when the OSXSAVE bit is reported in ECX.  */
163 if (__get_cpuid (1, &eax
, &ebx
, &ecx
, &edx
))
166 if (ecx
& bit_OSXSAVE
)
167 avx_enabled
= true; /* Support is not disabled. */
/* Clear all four registers before querying leaf 7, subleaf 0.  */
173 eax
= ebx
= ecx
= edx
= 0;
174 if (! __get_cpuid_count (7, 0, &eax
, &ebx
, &ecx
, &edx
))
/* The AVX2 feature bit is looked up in EBX of leaf 7.  */
178 if (! (ebx
& bit_AVX2
))
179 avx_enabled
= false; /* Hardware doesn't support it. */
/* Report the outcome of the probe.  */
187 error (0, 0, "%s", _("failed to get cpuid"));
190 else if (! avx_enabled
)
193 error (0, 0, "%s", _("avx2 support not detected"));
199 error (0, 0, "%s", _("using avx2 hardware support"));
208 if (status
!= EXIT_SUCCESS
)
213 Usage: %s [OPTION]... [FILE]...\n\
214 or: %s [OPTION]... --files0-from=F\n\
216 program_name
, program_name
);
218 Print newline, word, and byte counts for each FILE, and a total line if\n\
219 more than one FILE is specified. A word is a non-zero-length sequence of\n\
220 printable characters delimited by white space.\n\
227 The options below may be used to select which counts are printed, always in\n\
228 the following order: newline, word, character, byte, maximum line length.\n\
229 -c, --bytes print the byte counts\n\
230 -m, --chars print the character counts\n\
231 -l, --lines print the newline counts\n\
234 --files0-from=F read input from the files specified by\n\
235 NUL-terminated names in file F;\n\
236 If F is - then read names from standard input\n\
237 -L, --max-line-length print the maximum display width\n\
238 -w, --words print the word counts\n\
241 --total=WHEN when to print a line with total counts;\n\
242 WHEN can be: auto, always, only, never\n\
244 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
245 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
246 emit_ancillary_info (PROGRAM_NAME
);
/* Tell whether the wide character WC is one of the non breaking
   space code points U+00A0, U+2007, U+202F or U+2060.
   When posixly_correct is set the result is always zero, so none of
   these code points is then reported as a non breaking space.  */
251 /* Return non zero if a non breaking space. */
254 iswnbspace (wint_t wc
)
256 return ! posixly_correct
257 && (wc
== 0x00A0 || wc
== 0x2007
258 || wc
== 0x202F || wc
== 0x2060);
264 return iswnbspace (btowc (c
));
/* Print the given counts with printf, each one converted by umaxtostr
   and padded to number_width, followed by the file name.
   NOTE(review): the parameters between LINES and LINELENGTH, and the
   tests guarding the first four printf calls, are not visible in this
   excerpt.  */
267 /* FILE is the name of the file (or NULL for standard input)
268 associated with the specified counters. */
270 write_counts (uintmax_t lines
,
274 uintmax_t linelength
,
/* format_sp_int is " %*s"; format_int initially points one character
   into it, i.e. at "%*s" without the leading space, so the first
   count printed is not preceded by a separator.  */
277 static char const format_sp_int
[] = " %*s";
278 char const *format_int
= format_sp_int
+ 1;
/* Scratch buffer that umaxtostr formats each count into.  */
279 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
283 printf (format_int
, number_width
, umaxtostr (lines
, buf
));
/* After a count has been printed, switch to the space-prefixed
   format so that the next count is separated from it.  */
284 format_int
= format_sp_int
;
288 printf (format_int
, number_width
, umaxtostr (words
, buf
));
289 format_int
= format_sp_int
;
293 printf (format_int
, number_width
, umaxtostr (chars
, buf
));
294 format_int
= format_sp_int
;
298 printf (format_int
, number_width
, umaxtostr (bytes
, buf
));
299 format_int
= format_sp_int
;
301 if (print_linelength
)
303 printf (format_int
, number_width
, umaxtostr (linelength
, buf
));
/* The name follows a single space; a name containing a newline is
   passed through quotef, any other name is printed as is.  */
306 printf (" %s", strchr (file
, '\n') ? quotef (file
) : file
);
/* Count the newlines in the data read from descriptor FD.  FILE is
   used in the diagnostic printed when a read fails.  LINES_OUT and
   BYTES_OUT are checked for NULL before any reading is done.
   Input is consumed in chunks of at most BUFFER_SIZE bytes, and each
   chunk is scanned with one of two strategies, selected by the
   newline density measured on the previous chunk (see long_lines).
   NOTE(review): the return statements, the updates of BYTES and the
   stores through LINES_OUT and BYTES_OUT are not visible in this
   excerpt.  */
311 wc_lines (char const *file
, int fd
, uintmax_t *lines_out
, uintmax_t *bytes_out
)
314 uintmax_t lines
, bytes
;
/* One byte larger than any read request below.  Presumably the spare
   byte holds a sentinel newline so that the rawmemchr scan always
   terminates -- TODO confirm, the store is not visible here.  */
315 char buf
[BUFFER_SIZE
+ 1];
/* Selects the scanning strategy for the next chunk; the first chunk
   is scanned with the short-line strategy.  */
316 bool long_lines
= false;
318 if (!lines_out
|| !bytes_out
)
325 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
/* The error value is tested inside the loop body, so it evidently
   satisfies the `> 0' loop condition above.  */
328 if (bytes_read
== SAFE_READ_ERROR
)
330 error (0, errno
, "%s", quotef (file
));
/* END is one past the last byte read in this chunk.  */
337 char *end
= buf
+ bytes_read
;
/* Line count before this chunk, used below to measure how many
   newlines the chunk contributed.  */
338 uintmax_t plines
= lines
;
342 /* Avoid function call overhead for shorter lines. */
344 lines
+= *p
++ == '\n';
348 /* rawmemchr is more efficient with longer lines. */
350 while ((p
= rawmemchr (p
, '\n')) < end
)
357 /* If the average line length in the block is >= 15, then use
358 memchr for the next block, where system specific optimizations
359 may outweigh function call overhead.
360 FIXME: This line length was determined in 2015, on both
361 x86_64 and ppc64, but it's worth re-evaluating in future with
362 newer compilers, CPUs, or memchr() implementations etc. */
/* True when this chunk held at most one newline per 15 bytes.  */
363 if (lines
- plines
<= bytes_read
/ 15)
375 /* Count words. FILE_X is the name of the file (or NULL for standard
376 input) that is open on descriptor FD. *FSTATUS is its status.
377 CURRENT_POS is the current file offset if known, negative if unknown.
378 Return true if successful. */
380 wc (int fd
, char const *file_x
, struct fstatus
*fstatus
, off_t current_pos
)
383 char buf
[BUFFER_SIZE
+ 1];
385 uintmax_t lines
, words
, chars
, bytes
, linelength
;
386 bool count_bytes
, count_chars
, count_complicated
;
387 char const *file
= file_x
? file_x
: _("standard input");
389 lines
= words
= chars
= bytes
= linelength
= 0;
391 /* If in the current locale, chars are equivalent to bytes, we prefer
392 counting bytes, because that's easier. */
396 count_bytes
= print_bytes
;
397 count_chars
= print_chars
;
402 count_bytes
= print_bytes
|| print_chars
;
405 count_complicated
= print_words
|| print_linelength
;
407 /* Advise the kernel of our access pattern only if we will read(). */
408 if (!count_bytes
|| count_chars
|| print_lines
|| count_complicated
)
409 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
411 /* When counting only bytes, save some line- and word-counting
412 overhead. If FD is a 'regular' Unix file, using lseek is enough
413 to get its 'size' in bytes. Otherwise, read blocks of BUFFER_SIZE
414 bytes at a time until EOF. Note that the 'size' (number of bytes)
415 that wc reports is smaller than stats.st_size when the file is not
416 positioned at its beginning. That's why the lseek calls below are
417 necessary. For example the command
418 '(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
419 should make wc report '0' bytes. */
421 if (count_bytes
&& !count_chars
&& !print_lines
&& !count_complicated
)
423 bool skip_read
= false;
425 if (0 < fstatus
->failed
)
426 fstatus
->failed
= fstat (fd
, &fstatus
->st
);
428 /* For sized files, seek to one st_blksize before EOF rather than to EOF.
429 This works better for files in proc-like file systems where
430 the size is only approximate. */
431 if (! fstatus
->failed
&& usable_st_size (&fstatus
->st
)
432 && 0 <= fstatus
->st
.st_size
)
434 off_t end_pos
= fstatus
->st
.st_size
;
436 current_pos
= lseek (fd
, 0, SEEK_CUR
);
438 if (end_pos
% page_size
)
440 /* We only need special handling of /proc and /sys files etc.
441 when they're a multiple of PAGE_SIZE. In the common case
442 for files with st_size not a multiple of PAGE_SIZE,
443 it's more efficient and accurate to use st_size.
445 Be careful here. The current position may actually be
446 beyond the end of the file. As in the example above. */
448 bytes
= end_pos
< current_pos
? 0 : end_pos
- current_pos
;
453 off_t hi_pos
= end_pos
- end_pos
% (ST_BLKSIZE (fstatus
->st
) + 1);
454 if (0 <= current_pos
&& current_pos
< hi_pos
455 && 0 <= lseek (fd
, hi_pos
, SEEK_CUR
))
456 bytes
= hi_pos
- current_pos
;
462 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
463 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
465 if (bytes_read
== SAFE_READ_ERROR
)
467 error (0, errno
, "%s", quotef (file
));
475 else if (!count_chars
&& !count_complicated
)
477 #ifdef USE_AVX2_WC_LINECOUNT
478 if (avx2_supported ())
479 wc_lines_p
= wc_lines_avx2
;
482 /* Use a separate loop when counting only lines or lines and bytes --
483 but not chars or words. */
484 ok
= wc_lines_p (file
, fd
, &lines
, &bytes
);
487 # define SUPPORT_OLD_MBRTOWC 1
488 else if (MB_CUR_MAX
> 1)
490 bool in_word
= false;
491 uintmax_t linepos
= 0;
492 mbstate_t state
= { 0, };
493 bool in_shift
= false;
494 # if SUPPORT_OLD_MBRTOWC
495 /* Back-up the state before each multibyte character conversion and
496 move the last incomplete character of the buffer to the front
497 of the buffer. This is needed because we don't know whether
498 the 'mbrtowc' function updates the state when it returns -2, --
499 this is the ISO C 99 and glibc-2.2 behaviour - or not - amended
500 ANSI C, glibc-2.1 and Solaris 5.7 behaviour. We don't have an
501 autoconf test for this, yet. */
502 size_t prev
= 0; /* number of bytes carried over from previous round */
504 const size_t prev
= 0;
507 while ((bytes_read
= safe_read (fd
, buf
+ prev
, BUFFER_SIZE
- prev
)) > 0)
510 # if SUPPORT_OLD_MBRTOWC
511 mbstate_t backup_state
;
513 if (bytes_read
== SAFE_READ_ERROR
)
515 error (0, errno
, "%s", quotef (file
));
529 if (!in_shift
&& is_basic (*p
))
531 /* Handle most ASCII characters quickly, without calling
540 # if SUPPORT_OLD_MBRTOWC
541 backup_state
= state
;
543 n
= mbrtowc (&wide_char
, p
, bytes_read
, &state
);
544 if (n
== (size_t) -2)
546 # if SUPPORT_OLD_MBRTOWC
547 state
= backup_state
;
551 if (n
== (size_t) -1)
553 /* Remember that we read a byte, but don't complain
554 about the error. Because of the decoding error,
555 this is considered to be a byte but not a
556 character (that is, chars is not incremented). */
561 if (mbsinit (&state
))
577 if (linepos
> linelength
)
578 linelength
= linepos
;
580 goto mb_word_separator
;
582 linepos
+= 8 - (linepos
% 8);
583 goto mb_word_separator
;
593 if (wide
&& iswprint (wide_char
))
595 /* wcwidth can be expensive on OSX for example,
596 so avoid if uneeded. */
597 if (print_linelength
)
599 int width
= wcwidth (wide_char
);
603 if (iswspace (wide_char
) || iswnbspace (wide_char
))
604 goto mb_word_separator
;
607 else if (!wide
&& isprint (to_uchar (*p
)))
610 if (isspace (to_uchar (*p
)))
611 goto mb_word_separator
;
621 while (bytes_read
> 0);
623 # if SUPPORT_OLD_MBRTOWC
626 if (bytes_read
== BUFFER_SIZE
)
628 /* Encountered a very long redundant shift sequence. */
632 memmove (buf
, p
, bytes_read
);
637 if (linepos
> linelength
)
638 linelength
= linepos
;
644 bool in_word
= false;
645 uintmax_t linepos
= 0;
647 while ((bytes_read
= safe_read (fd
, buf
, BUFFER_SIZE
)) > 0)
650 if (bytes_read
== SAFE_READ_ERROR
)
652 error (0, errno
, "%s", quotef (file
));
667 if (linepos
> linelength
)
668 linelength
= linepos
;
672 linepos
+= 8 - (linepos
% 8);
683 if (isprint (to_uchar (p
[-1])))
686 if (isspace (to_uchar (p
[-1]))
687 || isnbspace (to_uchar (p
[-1])))
694 while (--bytes_read
);
696 if (linepos
> linelength
)
697 linelength
= linepos
;
701 if (count_chars
< print_chars
)
704 if (total_mode
!= total_only
)
705 write_counts (lines
, words
, chars
, bytes
, linelength
, file_x
);
706 total_lines
+= lines
;
707 total_words
+= words
;
708 total_chars
+= chars
;
709 total_bytes
+= bytes
;
710 if (linelength
> max_line_length
)
711 max_line_length
= linelength
;
/* Count one input by handing it to wc ().
   If FILE is NULL or "-", standard input is counted: have_read_stdin
   is set, STDIN_FILENO is switched to binary mode, and the result of
   wc () is returned directly.  wc () is given -1 as the current
   position in that case.  Otherwise FILE is opened read-only in
   binary mode and wc () is given a current position of 0.
   FSTATUS is passed through to wc () unchanged.
   NOTE(review): the tests guarding the two error () calls (presumably
   failures to open and to close FILE) and the final return are not
   visible in this excerpt.  */
717 wc_file (char const *file
, struct fstatus
*fstatus
)
/* A missing name and the name "-" both select standard input.  */
719 if (! file
|| STREQ (file
, "-"))
721 have_read_stdin
= true;
722 xset_binary_mode (STDIN_FILENO
, O_BINARY
);
723 return wc (STDIN_FILENO
, file
, fstatus
, -1);
727 int fd
= open (file
, O_RDONLY
| O_BINARY
);
730 error (0, errno
, "%s", quotef (file
));
/* The descriptor was just opened here, so position 0 is passed.  */
735 bool ok
= wc (fd
, file
, fstatus
, 0);
738 error (0, errno
, "%s", quotef (file
));
746 /* Return the file status for the NFILES files addressed by FILE.
747 Optimize the case where only one number is printed, for just one
748 file; in that case we can use a print width of 1, so we don't need
749 to stat the file. Handle the case of (nfiles == 0) in the same way;
750 that happens when we don't know how long the list of file names will be. */
/* The returned array is allocated with xnmalloc.
   NOTE(review): the start of the condition selecting the special case
   and the return statement are not visible in this excerpt.  */
752 static struct fstatus
*
753 get_input_fstatus (size_t nfiles
, char *const *file
)
/* At least one element is allocated, even when NFILES is 0.  */
755 struct fstatus
*fstatus
= xnmalloc (nfiles
? nfiles
: 1, sizeof *fstatus
);
/* The print flags are summed, giving the number of counts selected.  */
759 && ((print_lines
+ print_words
+ print_chars
760 + print_bytes
+ print_linelength
)
/* Special case: store a positive FAILED in the first element instead
   of calling stat.  */
762 fstatus
[0].failed
= 1;
/* General case: record the stat result for every name.  A NULL or
   "-" entry is looked up with fstat on STDIN_FILENO, any other entry
   with stat on the name.  */
765 for (size_t i
= 0; i
< nfiles
; i
++)
766 fstatus
[i
].failed
= (! file
[i
] || STREQ (file
[i
], "-")
767 ? fstat (STDIN_FILENO
, &fstatus
[i
].st
)
768 : stat (file
[i
], &fstatus
[i
].st
));
774 /* Return a print width suitable for the NFILES files whose status is
775 recorded in FSTATUS. Optimize the same special case that
776 get_input_fstatus optimizes. */
/* NOTE(review): the declaration of WIDTH, the body of the digit
   counting loop, the branch taken for non-regular files and the
   return statement are not visible in this excerpt.  */
780 compute_number_width (size_t nfiles
, struct fstatus
const *fstatus
)
/* Only inspect the statuses when there is at least one file and the
   first element's FAILED is not positive.  */
784 if (0 < nfiles
&& fstatus
[0].failed
<= 0)
786 int minimum_width
= 1;
787 uintmax_t regular_total
= 0;
/* Add up the sizes of the regular files among the entries whose
   FAILED is zero.  */
789 for (size_t i
= 0; i
< nfiles
; i
++)
790 if (! fstatus
[i
].failed
)
792 if (S_ISREG (fstatus
[i
].st
.st_mode
))
793 regular_total
+= fstatus
[i
].st
.st_size
;
/* Iterates once per decimal digit of the total beyond the first.  */
798 for (; 10 <= regular_total
; regular_total
/= 10)
/* Never return less than minimum_width.  */
800 if (width
< minimum_width
)
801 width
= minimum_width
;
809 main (int argc
, char **argv
)
815 char *files_from
= NULL
;
816 struct fstatus
*fstatus
;
819 initialize_main (&argc
, &argv
);
820 set_program_name (argv
[0]);
821 setlocale (LC_ALL
, "");
822 bindtextdomain (PACKAGE
, LOCALEDIR
);
823 textdomain (PACKAGE
);
825 atexit (close_stdout
);
827 page_size
= getpagesize ();
828 /* Line buffer stdout to ensure lines are written atomically and immediately
829 so that processes running in parallel do not intersperse their output. */
830 setvbuf (stdout
, NULL
, _IOLBF
, 0);
832 posixly_correct
= (getenv ("POSIXLY_CORRECT") != NULL
);
834 print_lines
= print_words
= print_chars
= print_bytes
= false;
835 print_linelength
= false;
836 total_lines
= total_words
= total_chars
= total_bytes
= max_line_length
= 0;
838 while ((optc
= getopt_long (argc
, argv
, "clLmw", longopts
, NULL
)) != -1)
858 print_linelength
= true;
861 case DEBUG_PROGRAM_OPTION
:
865 case FILES0_FROM_OPTION
:
870 total_mode
= XARGMATCH ("--total", optarg
, total_args
, total_types
);
873 case_GETOPT_HELP_CHAR
;
875 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
878 usage (EXIT_FAILURE
);
881 if (! (print_lines
|| print_words
|| print_chars
|| print_bytes
882 || print_linelength
))
883 print_lines
= print_words
= print_bytes
= true;
885 bool read_tokens
= false;
886 struct argv_iterator
*ai
;
891 /* When using --files0-from=F, you may not specify any files
892 on the command-line. */
895 error (0, 0, _("extra operand %s"), quoteaf (argv
[optind
]));
896 fprintf (stderr
, "%s\n",
897 _("file operands cannot be combined with --files0-from"));
898 usage (EXIT_FAILURE
);
901 if (STREQ (files_from
, "-"))
905 stream
= fopen (files_from
, "r");
907 die (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
908 quoteaf (files_from
));
911 /* Read the file list into RAM if we can detect its size and that
912 size is reasonable. Otherwise, we'll read a name at a time. */
914 if (fstat (fileno (stream
), &st
) == 0
915 && S_ISREG (st
.st_mode
)
916 && st
.st_size
<= MIN (10 * 1024 * 1024, physmem_available () / 2))
919 readtokens0_init (&tok
);
920 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
921 die (EXIT_FAILURE
, 0, _("cannot read file names from %s"),
922 quoteaf (files_from
));
925 ai
= argv_iter_init_argv (files
);
931 ai
= argv_iter_init_stream (stream
);
936 static char *stdin_only
[] = { NULL
};
937 files
= (optind
< argc
? argv
+ optind
: stdin_only
);
938 nfiles
= (optind
< argc
? argc
- optind
: 1);
939 ai
= argv_iter_init_argv (files
);
945 fstatus
= get_input_fstatus (nfiles
, files
);
946 if (total_mode
== total_only
)
947 number_width
= 1; /* No extra padding, since no alignment requirement. */
949 number_width
= compute_number_width (nfiles
, fstatus
);
952 for (int i
= 0; /* */; i
++)
954 bool skip_file
= false;
955 enum argv_iter_err ai_err
;
956 char *file_name
= argv_iter (ai
, &ai_err
);
964 error (0, errno
, _("%s: read error"),
965 quotef (files_from
));
971 assert (!"unexpected error code from argv_iter");
974 if (files_from
&& STREQ (files_from
, "-") && STREQ (file_name
, "-"))
976 /* Give a better diagnostic in an unusual case:
977 printf - | wc --files0-from=- */
978 error (0, 0, _("when reading file names from stdin, "
979 "no file name of %s allowed"),
980 quoteaf (file_name
));
986 /* Diagnose a zero-length file name. When it's one
987 among many, knowing the record number may help.
988 FIXME: currently print the record number only with
989 --files0-from=FILE. Maybe do it for argv, too? */
990 if (files_from
== NULL
)
991 error (0, 0, "%s", _("invalid zero-length file name"));
994 /* Using the standard 'filename:line-number:' prefix here is
995 not totally appropriate, since NUL is the separator, not NL,
996 but it might be better than nothing. */
997 unsigned long int file_number
= argv_iter_n_args (ai
);
998 error (0, 0, "%s:%lu: %s", quotef (files_from
),
999 file_number
, _("invalid zero-length file name"));
1007 ok
&= wc_file (file_name
, &fstatus
[nfiles
? i
: 0]);
1010 fstatus
[0].failed
= 1;
1014 /* No arguments on the command line is fine. That means read from stdin.
1015 However, no arguments on the --files0-from input stream
1016 means don't read anything. */
1017 if (ok
&& !files_from
&& argv_iter_n_args (ai
) == 0)
1018 ok
&= wc_file (NULL
, &fstatus
[0]);
1021 readtokens0_free (&tok
);
1023 if (total_mode
!= total_never
1024 && (total_mode
!= total_auto
|| 1 < argv_iter_n_args (ai
)))
1025 write_counts (total_lines
, total_words
, total_chars
, total_bytes
,
1027 total_mode
!= total_only
? _("total") : NULL
);
1029 argv_iter_free (ai
);
1033 if (have_read_stdin
&& close (STDIN_FILENO
) != 0)
1034 die (EXIT_FAILURE
, errno
, "-");
1036 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;