3 # csv-check: Check validity of CSV file and report
4 # (m)'10 [10 Mar 2010] Copyright H.M.Brand 2007-2013
6 # This code requires the defined-or feature and PerlIO
12 use Encode qw( decode );
14 our $VERSION = "1.5"; # 2010-03-10
18 my $err = shift and select STDERR;
20 usage: csv-check [-s <sep>] [-q <quot>] [-e <esc>] [-u] [--pp] [file.csv]
21 -s <sep> use <sep> as separator char. Auto-detect, default = ','
22 The string "tab" is allowed.
23 -e <esc> use <esc> as escape char. Default = '"'
24 The string "undef" is allowed.
25 -q <quot> use <quot> as quotation char. Default = '"'
26 The string "undef" will disable quotation.
27 -u check if all fields are valid unicode
29 --pp use Text::CSV_PP instead (cross-check)
34 use Getopt::Long qw(:config bundling nopermute passthrough);
# Separator character. Declared without a value so that the later
# `unless ($sep)` test can tell that no separator was supplied.
35 my $sep; # Set after reading first line in a flurry attempt to auto-detect
# Help option: calls usage with a zero argument.
41 "help|?" => sub { usage (0); },
# Name of the CSV implementation class. It is used both to construct the
# parser and to report module name and version in the summary line.
51 my $csvmod = "Text::CSV_XS";
# NOTE(review): presumably only reached when --pp was given; the guarding
# condition is not visible in this excerpt - confirm.
54 $csvmod = "Text::CSV_PP";
# Load the complete input and, when no separator was supplied, guess the
# separator and the line ending from the data itself.

# File name used in the open-failure message: the first remaining
# command-line argument, or "-" when there is none.
61 my $fn = defined $ARGV[0] ? $ARGV[0] : "-";
# Slurp all input at once ($/ is undefined inside the do block).
# NOTE(review): the `or die` tests truth, not definedness, so input that
# consists solely of "0" is also reported as "No data" - confirm intended.
62 my $data = do { local $/; <> } or die "No data to analyze\n";
# $bin   - set to 1 once a field with the 0x0002 meta flag is seen
# $rows  - row counter shown in the final report
# $eol   - printable form of the detected line ending, undef until known
# %cols  - presumably field count => number of rows having that count;
#          it is filled outside this excerpt - verify
64 my ($bin, $rows, $eol, %cols) = (0, 0, undef);
65 unless ($sep) { # No sep char passed, try to auto-detect;
# First preference: a candidate separator that sits between quote or digit
# characters. Candidates are tried in the order comma, semicolon, tab.
66 $sep = $data =~ m/["\d],["\d,]/ ? "," :
67 $data =~ m/["\d];["\d;]/ ? ";" :
68 $data =~ m/["\d]\t["\d]/ ? "\t" :
69 # If neither, then for unquoted strings
70 $data =~ m/\w,[\w,]/ ? "," :
71 $data =~ m/\w;[\w;]/ ? ";" :
# Final fallback when nothing matched: comma.
72 $data =~ m/\w\t[\w]/ ? "\t" : ",";
# Capture the run of CR/LF characters at the very end of the data as the
# line ending. DDisplay is not defined in this excerpt; presumably it comes
# from Data::Peek and returns a printable rendering - confirm.
73 $data =~ m/([\r\n]+)\Z/ and $eol = DDisplay "$1";
# Construct the parser from the selected implementation class. The literal
# option strings "tab" and "undef" are translated here into a real TAB
# character and into undef respectively.
76 my $csv = $csvmod->new ({
77 sep_char => $sep eq "tab" ? "\t" : $sep,
78 quote_char => $quo eq "undef" ? undef : $quo,
79 escape_char => $esc eq "undef" ? undef : $esc,
# Final report: print what was checked, then either the OK summary or the
# parser diagnostics for the failure.

# $file is $ARGV, or "" when $ARGV is undefined. The substitution appends
# one space after a trailing non-blank character.
87 (my $file = defined $ARGV ? $ARGV : "") =~ s{(\S)$}{$1 };
# Program name without its directory part.
88 (my $prog = $0) =~ s{.*/}{};
89 print "Checked $file with $prog $VERSION using $csvmod @{[$csvmod->VERSION]}\n";
# error_diag in list context. As used below: [0] numeric code, [1] message,
# [2] 1-based position inside the record, [3] record number.
90 my @diag = $csv->error_diag;
# Code 2012 together with eof: the parser stopped because the input ended.
# (2012 is listed as the EOF diagnostic in the Text::CSV_XS documentation.)
91 if ($diag[0] == 2012 && $csv->eof) {
# Distinct field counts seen, in ascending numeric order.
92 my @coll = sort { $a <=> $b } keys %cols;
# A single count is shown bare, several are shown as a parenthesized list.
94 my $cols = @coll == 1 ? $coll[0] : "(@coll)";
# Fall back to the parser's own eol attribute, else a fixed placeholder.
95 defined $eol or $eol = $csv->eol || "--unknown--";
96 print "OK: rows: $rows, columns: $cols\n";
97 print " sep = <$sep>, quo = <$quo>, bin = <$bin>, eol = <$eol>\n";
# One output line per distinct field count. The singular form gets a
# trailing blank instead of "s" so the columns stay aligned.
99 print "WARN: multiple column lengths:\n";
100 printf " %6d line%s with %4d field%s\n",
101 $cols{$_}, $cols{$_} == 1 ? " " : "s",
102 $_, $_ == 1 ? "" : "s"
# Failure with a known position: show the location and the diagnostics.
109 print "$ARGV record $diag[3] at line $./$diag[2] - $diag[0] - $diag[1]\n";
110 my $ep = $diag[2] - 1; # diag[2] is 1-based
# Copy of the offending input, padded so the window below can run past the
# end of the text.
111 my $err = $csv->error_input . " ";
# Surround the offending character with "*" markers. The marker after the
# character is inserted first so the offset of the second insert stays valid.
112 substr $err, $ep + 1, 0, "*";
113 substr $err, $ep, 0, "*";
# Keep a 12-character window starting 5 characters before the error and
# strip trailing blanks.
# NOTE(review): for $ep < 5 the offset is negative, and substr then counts
# from the END of the string, so the window misses the error - confirm.
114 ($err = substr $err, $ep - 5, 12) =~ s/ +$//;
# Failure without a position: report only the line number and the message.
118 print "$ARGV line $. - $diag[1]\n";
# Per-row checks: binary-data flag and a per-field UTF-8 validity check.

# Set $bin as soon as any field of the current record has meta flag 0x0002.
# (The Text::CSV_XS documentation describes 0x0002 as the "binary" flag.)
127 grep { $_ & 0x0002 } $csv->meta_info and $bin = 1;
# Visit every field index of @r. @r is defined outside this excerpt;
# presumably it holds the fields of the current row - verify.
130 foreach my $x (0 .. $#r) {
# Temporary warning handler for the decode call below. It removes the
# " at /...Encode.pm ..." location suffix from the warning and prints row
# number, field index, a DPeek dump of the field and the message to STDERR.
# The format has no "\n": the pattern has no /s flag, so .* stops at a
# newline and any newline ending the warning text is kept.
131 local $SIG{__WARN__} = sub {
132 (my $msg = shift) =~ s{ at /\S+Encode.pm.*}{};
133 printf STDERR "Field %3d:%3d - '%s'\t- %s",
134 $rows, $x, DPeek ($r[$x]), $msg;
# Decode the field as UTF-8 with FB_WARN, so malformed data produces a
# warning (caught by the handler above) instead of an exception.
136 my $oct = decode ("utf-8", $r[$x], Encode::FB_WARN);
# Parse the slurped data through an in-memory file handle. $fn is used only
# in the failure message.
141 open my $fh, "<", \$data or die "$fn: $!\n";
# Read one record per iteration; the loop ends when getline returns false.
142 while (my $row = $csv->getline ($fh)) {