release of 0.09
[DataExtract-FixedWidth.git] / lib / DataExtract / FixedWidth.pm
blob9e4b7dc29c9f9a6013032178d969527a539bc48f
1 package DataExtract::FixedWidth;
2 use Moose;
3 use Carp;
5 our $VERSION = '0.09';
7 sub BUILD {
8 my $self = shift;
10 confess 'You must either send either a "header_row" or data for "heuristic"'
11 unless $self->has_header_row || $self->has_heuristic
13 confess 'You must send a "header_row" if you send "cols"'
14 if $self->has_cols && !$self->has_header_row && !$self->has_heuristic
19 has 'unpack_string' => (
20 isa => 'Str'
21 , is => 'rw'
22 , lazy_build => 1
25 has 'cols' => (
26 isa => 'ArrayRef'
27 , is => 'rw'
28 , auto_deref => 1
29 , lazy_build => 1
32 has 'colchar_map' => (
33 isa => 'HashRef'
34 , is => 'rw'
35 , lazy_build => 1
38 has 'header_row' => (
39 isa => 'Maybe[Str]'
40 , is => 'rw'
41 , predicate => 'has_header_row'
44 has 'first_col_zero' => (
45 isa => 'Bool'
46 , is => 'ro'
47 , default => 1
50 has 'fix_overlay' => (
51 isa => 'Bool'
52 , is => 'ro'
53 , default => 0
56 has 'trim_whitespace' => (
57 isa => 'Bool'
58 , is => 'ro'
59 , default => 1
62 has 'sorted_colstart' => (
63 isa => 'ArrayRef'
64 , is => 'ro'
65 , lazy_build => 1
66 , auto_deref => 1
69 has 'null_as_undef' => (
70 isa => 'Bool'
71 , is => 'ro'
72 , default => 0
75 has 'heuristic' => (
76 isa => 'ArrayRef'
77 , is => 'rw'
78 , predicate => 'has_heuristic'
79 , auto_deref => 1
80 , trigger => \&_heuristic_trigger
83 has 'skip_header_data' => (
84 isa => 'Bool'
85 , is => 'rw'
86 , default => 1
89 has 'verbose' => ( isa => 'Bool', 'is' => 'ro', default => 0 );
91 sub _heuristic_trigger {
92 my ( $self, $data ) = @_;
94 chomp @$data;
96 my $maxLength = 0;
97 for ( @$data ) {
98 $maxLength = length if length > $maxLength
101 $self->header_row( $data->[0] )
102 unless $self->has_header_row
106 my @unpack;
107 my $mask = ' ' x $maxLength;
108 $mask |= $_ for @$data;
110 ## The (?=\S) fixes a bug that creates null columns in the event any
111 ## one column has trailing whitespace (because you'll have '\S\s '
112 ## this was a bug revealed in the dataset NullFirstRow.txt
114 ## the ^\s+ makes it so that right alligned tables
115 ## spaces on the left of the first non-whitespace character in
116 ## the first col work
117 push @unpack, length($1)
118 while $mask =~ m/((?:^\s+)?\S+\s+(?=\S))/g
121 $self->unpack_string( $self->_helper_unpack( \@unpack ) );
126 sub _build_cols {
127 my $self = shift;
129 my @cols;
131 ## If we have the unpack string and the header_row parse it all out on our own
132 ## Here we have two conditionals because the unpack_string comes into existance in
133 ## build_unpack_string and not the heuristic_trigger
134 if (
135 ( $self->has_header_row && $self->has_unpack_string )
136 || ( $self->has_header_row && $self->has_heuristic )
138 my $skd = $self->skip_header_data;
139 $self->skip_header_data( 0 );
141 @cols = @{ $self->parse( $self->header_row ) };
143 $self->skip_header_data( $skd );
146 ## We only the header_row
147 elsif ( $self->header_row ) {
148 @cols = split ' ', $self->header_row;
151 else {
152 croak 'Need some method to calculate cols';
155 \@cols;
159 sub _build_colchar_map {
160 my $self = shift;
161 my $ccm = {};
163 ## If we can generate from heurisitic data and don't have a header_row
164 if (
165 $self->has_header_row
166 && !defined $self->header_row
167 && $self->has_heuristic
168 && $self->has_cols
170 my @cols = $self->cols;
171 foreach my $idx ( 0 .. $#cols ) {
172 $ccm->{$idx} = $cols[$idx];
176 ## Generate from header_row
177 else {
178 croak 'Can not render the map of columns to start-chars without the header_row'
179 unless defined $self->has_header_row
182 foreach my $col ( $self->cols ) {
184 my $pos = 0;
185 $pos = index( $self->header_row, $col, $pos );
187 croak "Failed to find a column '$col' in the header row"
188 unless defined $pos
191 unless ( exists $ccm->{ $pos } ) {
192 $ccm->{ $pos } = $col;
195 ## We have two like-named columns
196 else {
198 ## possible inf loop here
199 until ( not exists $ccm->{$pos} ) {
200 $pos = index( $self->header_row, $col, $pos+1 );
202 croak "Failed to find another column '$col' in the header row"
203 unless defined $pos
208 $ccm->{ $pos } = $col;
216 $ccm;
220 sub _build_unpack_string {
221 my $self = shift;
223 my @unpack;
224 my @startcols = $self->sorted_colstart;
225 $startcols[0] = 0 if $self->first_col_zero;
226 foreach my $idx ( 0 .. $#startcols ) {
228 if ( exists $startcols[$idx+1] ) {
229 push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
234 $self->_helper_unpack( \@unpack );
238 ## Takes ArrayRef of startcols and returns the unpack string.
239 sub _helper_unpack {
240 my ( $self, $startcols ) = @_;
242 my $format;
243 if ( @$startcols ) {
244 $format = 'a' . join 'a', @$startcols;
246 $format .= 'A*';
248 $format;
252 sub parse {
253 my ( $self, $data ) = @_;
255 return undef if !defined $data;
257 chomp $data;
259 ## skip_header_data
260 if (
261 $self->skip_header_data
262 && ( defined $self->header_row && $data eq $self->header_row )
264 warn "Skipping duplicate header row\n" if $self->verbose;
265 return undef
268 #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
270 my @cols = unpack ( $self->unpack_string, $data );
272 ## If we bleed over a bit we can fix that.
273 if ( $self->fix_overlay ) {
274 foreach my $idx ( 0 .. $#cols ) {
275 if (
276 $cols[$idx] =~ m/\S+$/
277 && exists $cols[$idx+1]
278 && $cols[$idx+1] =~ s/^(\S+)//
280 $cols[$idx] .= $1;
285 ## Get rid of whitespaces
286 if ( $self->trim_whitespace ) {
287 for ( @cols ) { s/^\s+//; s/\s+$//; }
290 ## Swithc nulls to undef
291 if ( $self->null_as_undef ) {
292 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
293 unless $self->trim_whitespace
295 for ( @cols ) { undef $_ unless length($_) }
298 \@cols;
302 sub parse_hash {
303 my ( $self, $data ) = @_;
305 my $row = $self->parse( $data );
307 my $colstarts = $self->sorted_colstart;
309 my $results;
310 foreach my $idx ( 0 .. $#$row ) {
311 my $col = $self->colchar_map->{ $colstarts->[$idx] };
312 $results->{ $col } = $row->[$idx];
315 $results;
319 sub _build_sorted_colstart {
320 my $self = shift;
322 my @startcols = map { $_->[0] }
323 sort { $a->[1] <=> $b->[1] }
324 map { [$_, sprintf( "%10d", $_ ) ] }
325 keys %{ $self->colchar_map }
328 \@startcols;
332 no Moose;
333 __PACKAGE__->meta->make_immutable;
337 __END__
339 =head1 NAME
341 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
343 =head1 SYNOPSIS
345 ## We assume the columns have no spaces in the header.
346 my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
348 ## We explicitly tell what column names to pick out of the header.
349 my $de = DataExtract::FixedWidth->new({
350 header_row => $header_row
351 cols => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
354 ## We supply data to heuristic and assume
355 ## * first row is the header (to avoid this assumption
356 ## set the header_row to undef. )
357 ## * heurisitic's unpack_string is correct
358 ## * unpack_string applied to header_row will tell us the columns
359 my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
361 ## We supply data to heuristic, say we have no header, and the set columns
362 ## just like the above except ->parse_hash will be be indexed by the
363 ## provided columns and no row is designated as the header.
364 my $de = DataExtract::FixedWidth->new({
365 heuristic => \@datarows
366 , header_row => undef
367 , columns => [qw/ foo bar baz/]
370 ## We supply data to heuristic, and we explicitly add the header_row
371 ## with this method it doesn't have to occur in the data.
372 ## The unpack string rendered will be applied to the first row to get
373 ## the columns
374 my $de = DataExtract::FixedWidth->new({
375 heuristic => \@datarows
376 , header_row => $header_row
379 ## We explicitly add the header_row, with this method it doesn't have
380 ## to occur in the data. The unpack string rendered will be applied
381 ## to the provided header_row to get the columns
382 my $de = DataExtract::FixedWidth->new({
383 unpack_string => $template
384 , header_row => $header_row
387 $de->parse( $data_row );
389 $de->parse_hash( $data_row );
391 =head1 DESCRIPTION
393 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
395 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
397 SAMPLE FILE
398 HEADER: 'COL1NAME COL2NAME COL3NAMEEEEE'
399 DATA1: 'FOOBARBAZ THIS IS TEXT ANHER COL '
400 DATA2: 'FOOBAR FOOBAR IS TEXT ANOTHER COL '
402 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
403 $de->parse('FOOBARBAZ THIS IS TEXT ANOTHER COL');
405 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
407 =head2 Constructor
409 The class constructor, C<-E<gt>new>, has numerious forms. Some options it has are:
411 =over 12
413 =item heuristics => \@lines
415 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
417 =item cols => \@cols
419 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
421 =item header_row => $string
423 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
425 =item verbose => 1|0
427 Right now, it simply display's warnings when it does something that might at first seem awkward. Like returning undef when it encouters a duplicate copy of a header row.
429 =back
431 =head2 Methods
433 B<An astrisk, (*) in the option means that is the default.>
435 =over 12
437 =item ->parse( $data_line )
439 Parses the data and returns an ArrayRef
441 =item ->parse_hash( $data_line )
443 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
445 =item ->first_col_zero(1*|0)
447 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
449 CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
450 HEADER ROW : | |S|T|O|C|K| |V|I|N
452 =item ->trim_whitespace(*1|0)
454 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
456 =item ->fix_overlay(1|0*)
458 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heurisitic because the unpack string makes the assumption the data is not mangeled.
460 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
462 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
464 =item ->null_as_undef(1|0*)
466 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
468 =item ->skip_header_data(1*|0)
470 Skips duplicate copies of the header_row if found in the data.
472 =item ->colchar_map
474 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
476 =item ->unpack_string
478 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
480 =back
482 =head1 AVAILABILITY
484 CPAN.org
486 Git repo at L<http://repo.or.cz/w/DataExtract-FixedWidth.git>
488 =head1 COPYRIGHT & LICENSE
490 Copyright 2008 Evan, all rights reserved.
492 This program is free software; you can redistribute it and/or modify it
493 under the same terms as Perl itself.
496 =head1 AUTHOR
498 Evan Carroll <me at evancarroll.com>
499 System Lord of the Internets
501 =head1 BUGS
503 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
504 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>. I will be notified, and then you'll
505 automatically be notified of progress on your bug as I make changes.
507 =cut