Fixed a long lasting bug which caused empty rows to be added in the event of trailing...
[DataExtract-FixedWidth.git] / lib / DataExtract / FixedWidth.pm
blobc729340490e4d9048cff682c6b8ee36a9c62e30c
1 package DataExtract::FixedWidth;
2 use Moose;
3 use Carp;
5 our $VERSION = '0.07';
7 sub BUILD {
8 my $self = shift;
10 confess 'You must either send either a "header_row" or data for "heuristic"'
11 unless $self->has_header_row || $self->has_heuristic
13 confess 'You must send a "header_row" if you send "cols"'
14 if $self->has_cols && !$self->has_header_row && !$self->has_heuristic
19 has 'unpack_string' => (
20 isa => 'Str'
21 , is => 'rw'
22 , lazy_build => 1
25 has 'cols' => (
26 isa => 'ArrayRef'
27 , is => 'rw'
28 , auto_deref => 1
29 , lazy_build => 1
32 has 'colchar_map' => (
33 isa => 'HashRef'
34 , is => 'rw'
35 , lazy_build => 1
38 has 'header_row' => (
39 isa => 'Maybe[Str]'
40 , is => 'rw'
41 , predicate => 'has_header_row'
44 has 'first_col_zero' => (
45 isa => 'Bool'
46 , is => 'ro'
47 , default => 1
50 has 'fix_overlay' => (
51 isa => 'Bool'
52 , is => 'ro'
53 , default => 0
56 has 'trim_whitespace' => (
57 isa => 'Bool'
58 , is => 'ro'
59 , default => 1
62 has 'sorted_colstart' => (
63 isa => 'ArrayRef'
64 , is => 'ro'
65 , lazy_build => 1
66 , auto_deref => 1
69 has 'null_as_undef' => (
70 isa => 'Bool'
71 , is => 'ro'
72 , default => 0
75 has 'heuristic' => (
76 isa => 'ArrayRef'
77 , is => 'rw'
78 , predicate => 'has_heuristic'
79 , auto_deref => 1
80 , trigger => \&_heuristic_trigger
83 has 'skip_header_data' => (
84 isa => 'Bool'
85 , is => 'rw'
86 , default => 1
89 has 'verbose' => ( isa => 'Bool', 'is' => 'ro', default => 0 );
91 sub _heuristic_trigger {
92 my ( $self, $data ) = @_;
94 chomp @$data;
96 my $maxLength = 0;
97 for ( @$data ) {
98 $maxLength = length if length > $maxLength
101 $self->header_row( $data->[0] )
102 unless $self->has_header_row
106 my @unpack;
107 my $mask = ' ' x $maxLength;
108 $mask |= $_ for @$data;
110 ## The (?=\S) fixes a bug that creates null columns in the event any
111 ## one column has trailing whitespace (because you'll have '\S\s '
112 ## this was a bug revealed in the dataset NullFirstRow.txt
113 push @unpack, length($1)
114 while $mask =~ m/(\S+\s+(?=\S)|$)/g
117 ## Remove last row, (to be replaced with A*)
118 pop @unpack;
120 $self->unpack_string( $self->_helper_unpack( \@unpack ) );
125 sub _build_cols {
126 my $self = shift;
128 my @cols;
130 ## If we have the unpack string and the header_row parse it all out on our own
131 ## Here we have two conditionals because the unpack_string comes into existance in
132 ## build_unpack_string and not the heuristic_trigger
133 if (
134 ( $self->has_header_row && $self->has_unpack_string )
135 || ( $self->has_header_row && $self->has_heuristic )
137 my $skd = $self->skip_header_data;
138 $self->skip_header_data( 0 );
140 @cols = @{ $self->parse( $self->header_row ) };
142 $self->skip_header_data( $skd );
145 ## We only the header_row
146 elsif ( $self->header_row ) {
147 @cols = split ' ', $self->header_row;
150 else {
151 croak 'Need some method to calculate cols';
154 \@cols;
158 sub _build_colchar_map {
159 my $self = shift;
160 my $ccm = {};
162 ## If we can generate from heurisitic data and don't have a header_row
163 if (
164 $self->has_header_row
165 && !defined $self->header_row
166 && $self->has_heuristic
167 && $self->has_cols
169 my @cols = $self->cols;
170 foreach my $idx ( 0 .. $#cols ) {
171 $ccm->{$idx} = $cols[$idx];
175 ## Generate from header_row
176 else {
177 croak 'Can not render the map of columns to start-chars without the header_row'
178 unless defined $self->has_header_row
181 foreach my $col ( $self->cols ) {
183 my $pos = 0;
184 $pos = index( $self->header_row, $col, $pos );
186 croak "Failed to find a column '$col' in the header row"
187 unless defined $pos
190 unless ( exists $ccm->{ $pos } ) {
191 $ccm->{ $pos } = $col;
194 ## We have two like-named columns
195 else {
197 ## possible inf loop here
198 until ( not exists $ccm->{$pos} ) {
199 $pos = index( $self->header_row, $col, $pos+1 );
201 croak "Failed to find another column '$col' in the header row"
202 unless defined $pos
207 $ccm->{ $pos } = $col;
215 $ccm;
219 sub _build_unpack_string {
220 my $self = shift;
222 my @unpack;
223 my @startcols = $self->sorted_colstart;
224 $startcols[0] = 0 if $self->first_col_zero;
225 foreach my $idx ( 0 .. $#startcols ) {
227 if ( exists $startcols[$idx+1] ) {
228 push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
233 $self->_helper_unpack( \@unpack );
237 ## Takes ArrayRef of startcols and returns the unpack string.
238 sub _helper_unpack {
239 my ( $self, $startcols ) = @_;
241 my $format;
242 if ( @$startcols ) {
243 $format = 'a' . join 'a', @$startcols;
245 $format .= 'A*';
247 $format;
251 sub parse {
252 my ( $self, $data ) = @_;
254 return undef if !defined $data;
256 chomp $data;
258 ## skip_header_data
259 if (
260 $self->skip_header_data
261 && ( defined $self->header_row && $data eq $self->header_row )
263 warn "Skipping duplicate header row\n" if $self->verbose;
264 return undef
267 #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
269 my @cols = unpack ( $self->unpack_string, $data );
271 ## If we bleed over a bit we can fix that.
272 if ( $self->fix_overlay ) {
273 foreach my $idx ( 0 .. $#cols ) {
274 if (
275 $cols[$idx] =~ m/\S+$/
276 && exists $cols[$idx+1]
277 && $cols[$idx+1] =~ s/^(\S+)//
279 $cols[$idx] .= $1;
284 ## Get rid of whitespaces
285 if ( $self->trim_whitespace ) {
286 for ( @cols ) { s/^\s+//; s/\s+$//; }
289 ## Swithc nulls to undef
290 if ( $self->null_as_undef ) {
291 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
292 unless $self->trim_whitespace
294 for ( @cols ) { undef $_ unless length($_) }
297 \@cols;
301 sub parse_hash {
302 my ( $self, $data ) = @_;
304 my $row = $self->parse( $data );
306 my $colstarts = $self->sorted_colstart;
308 my $results;
309 foreach my $idx ( 0 .. $#$row ) {
310 my $col = $self->colchar_map->{ $colstarts->[$idx] };
311 $results->{ $col } = $row->[$idx];
314 $results;
318 sub _build_sorted_colstart {
319 my $self = shift;
321 my @startcols = map { $_->[0] }
322 sort { $a->[1] <=> $b->[1] }
323 map { [$_, sprintf( "%10d", $_ ) ] }
324 keys %{ $self->colchar_map }
327 \@startcols;
331 no Moose;
332 __PACKAGE__->meta->make_immutable;
336 __END__
338 =head1 NAME
340 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
342 =head1 SYNOPSIS
344 ## We assume the columns have no spaces in the header.
345 my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
347 ## We explicitly tell what column names to pick out of the header.
348 my $de = DataExtract::FixedWidth->new({
349 header_row => $header_row
350 cols => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
353 ## We supply data to heuristic and assume
354 ## * first row is the header (to avoid this assumption
355 ## set the header_row to undef. )
356 ## * heurisitic's unpack_string is correct
357 ## * unpack_string applied to header_row will tell us the columns
358 my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
360 ## We supply data to heuristic, say we have no header, and the set columns
361 ## just like the above except ->parse_hash will be be indexed by the
362 ## provided columns and no row is designated as the header.
363 my $de = DataExtract::FixedWidth->new({
364 heuristic => \@datarows
365 , header_row => undef
366 , columns => [qw/ foo bar baz/]
369 ## We supply data to heuristic, and we explicitly add the header_row
370 ## with this method it doesn't have to occur in the data.
371 ## The unpack string rendered will be applied to the first row to get
372 ## the columns
373 my $de = DataExtract::FixedWidth->new({
374 heuristic => \@datarows
375 , header_row => $header_row
378 ## We explicitly add the header_row, with this method it doesn't have
379 ## to occur in the data. The unpack string rendered will be applied
380 ## to the provided header_row to get the columns
381 my $de = DataExtract::FixedWidth->new({
382 unpack_string => $template
383 , header_row => $header_row
386 $de->parse( $data_row );
388 $de->parse_hash( $data_row );
390 =head1 DESCRIPTION
392 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
394 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
396 SAMPLE FILE
397 HEADER: 'COL1NAME COL2NAME COL3NAMEEEEE'
398 DATA1: 'FOOBARBAZ THIS IS TEXT ANHER COL '
399 DATA2: 'FOOBAR FOOBAR IS TEXT ANOTHER COL '
401 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
402 $de->parse('FOOBARBAZ THIS IS TEXT ANOTHER COL');
404 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
406 =head2 Constructor
408 The class constructor, C<-E<gt>new>, has numerious forms. Some options it has are:
410 =over 12
412 =item heuristics => \@lines
414 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
416 =item cols => \@cols
418 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
420 =item header_row => $string
422 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
424 =item verbose => 1|0
426 Right now, it simply display's warnings when it does something that might at first seem awkward. Like returning undef when it encouters a duplicate copy of a header row.
428 =back
430 =head2 Methods
432 B<An astrisk, (*) in the option means that is the default.>
434 =over 12
436 =item ->parse( $data_line )
438 Parses the data and returns an ArrayRef
440 =item ->parse_hash( $data_line )
442 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
444 =item ->first_col_zero(1*|0)
446 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
448 CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
449 HEADER ROW : | |S|T|O|C|K| |V|I|N
451 =item ->trim_whitespace(*1|0)
453 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
455 =item ->fix_overlay(1|0*)
457 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heurisitic because the unpack string makes the assumption the data is not mangeled.
459 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
461 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
463 =item ->null_as_undef(1|0*)
465 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
467 =item ->skip_header_data(1*|0)
469 Skips duplicate copies of the header_row if found in the data.
471 =item ->colchar_map
473 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
475 =item ->unpack_string
477 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
479 =back
481 =head1 AVAILABILITY
483 CPAN.org
485 =head1 COPYRIGHT & LICENSE
487 Copyright 2008 Evan, all rights reserved.
489 This program is free software; you can redistribute it and/or modify it
490 under the same terms as Perl itself.
493 =head1 AUTHOR
495 Evan Carroll <me at evancarroll.com>
496 System Lord of the Internets
498 =head1 BUGS
500 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
501 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>. I will be notified, and then you'll
502 automatically be notified of progress on your bug as I make changes.
504 =cut