pod changes
[DataExtract-FixedWidth.git] / lib / DataExtract / FixedWidth.pm
blobebd54b99bc684264d76f8f8f7871b014c73285ee
1 package DataExtract::FixedWidth;
2 use Moose;
3 use Carp;
5 our $VERSION = '0.05';
7 sub BUILD {
8 my $self = shift;
10 confess 'You must either send either a "header_row" or data for "heuristic"'
11 unless $self->has_header_row || $self->has_heuristic
13 confess 'You must send a "header_row" if you send "cols"'
14 if $self->has_cols && !$self->has_header_row
19 has 'unpack_string' => (
20 isa => 'Str'
21 , is => 'rw'
22 , lazy_build => 1
25 has 'cols' => (
26 isa => 'ArrayRef'
27 , is => 'rw'
28 , auto_deref => 1
29 , lazy_build => 1
32 has 'colchar_map' => (
33 isa => 'HashRef'
34 , is => 'rw'
35 , lazy_build => 1
38 has 'header_row' => (
39 isa => 'Maybe[Str]'
40 , is => 'rw'
41 , lazy_build => 1
44 has 'first_col_zero' => (
45 isa => 'Bool'
46 , is => 'ro'
47 , default => 1
50 has 'fix_overlay' => (
51 isa => 'Bool'
52 , is => 'ro'
53 , default => 0
56 has 'trim_whitespace' => (
57 isa => 'Bool'
58 , is => 'ro'
59 , default => 1
62 has 'sorted_colstart' => (
63 isa => 'ArrayRef'
64 , is => 'ro'
65 , lazy_build => 1
66 , auto_deref => 1
69 has 'null_as_undef' => (
70 isa => 'Bool'
71 , is => 'ro'
72 , default => 0
75 has 'heuristic' => (
76 isa => 'ArrayRef'
77 , is => 'rw'
78 , predicate => 'has_heuristic'
79 , auto_deref => 1
80 , trigger => sub { chomp @{$_[1]} }
83 has 'skip_header_data' => (
84 isa => 'Bool'
85 , is => 'rw'
86 , default => 1
89 sub _build_header_row {
90 my $self = shift;
92 $self->has_heuristic
93 ? return ${$self->heuristic}[0]
94 : undef
99 sub _build_cols {
100 my $self = shift;
102 my @cols;
104 ## If we have the unpack string and the header_row parse it all out on our own
105 if (
106 ( $self->header_row && $self->has_unpack_string )
107 || ( $self->header_row && $self->has_heuristic )
109 my $skd = $self->skip_header_data;
110 $self->skip_header_data( 0 );
112 @cols = @{ $self->parse( $self->header_row ) };
114 $self->skip_header_data( $skd );
117 ## We only the header_row
118 elsif ( $self->header_row ) {
119 @cols = split ' ', $self->header_row;
122 else {
123 croak 'Need some method to calculate cols';
126 \@cols;
130 sub _build_colchar_map {
131 my $self = shift;
133 croak 'Can not render the map of columns to start-chars without the header_row'
134 unless defined $self->header_row
137 my $ccm = {};
138 foreach my $col ( $self->cols ) {
140 my $pos = 0;
141 $pos = index( $self->header_row, $col, $pos );
143 croak "Failed to find a column '$col' in the header row"
144 unless defined $pos
147 unless ( exists $ccm->{ $pos } ) {
148 $ccm->{ $pos } = $col;
151 ## We have two like-named columns
152 else {
154 until ( not exists $ccm->{$pos} ) {
155 $pos = index( $self->header_row, $col, $pos+1 );
157 croak "Failed to find another column '$col' in the header row"
158 unless defined $pos
163 $ccm->{ $pos } = $col;
169 $ccm;
173 sub _build_unpack_string {
174 my $self = shift;
176 my @unpack;
177 if ( $self->has_heuristic ) {
178 my @lines = $self->heuristic;
180 my $mask = ' ' x length $lines[ 0 ];
182 $mask |= $_ for @lines;
184 push @unpack, length($1)
185 while $mask =~ m/(\S+\s+|$)/g
188 ## Remove last row, (to be replaced with A*)
189 pop @unpack;
192 else {
193 my @startcols = $self->sorted_colstart;
194 $startcols[0] = 0 if $self->first_col_zero;
195 foreach my $idx ( 0 .. $#startcols ) {
197 if ( exists $startcols[$idx+1] ) {
198 push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
204 my $unpack;
205 if ( @unpack ) {
206 $unpack = 'a' . join 'a', @unpack;
208 $unpack .= 'A*';
210 $unpack;
214 sub parse {
215 my ( $self, $data ) = @_;
217 return undef if !defined $data;
219 chomp $data;
221 ## skip_header_data
222 return undef
223 if $self->skip_header_data
224 && ( defined $self->header_row && $data eq $self->header_row )
227 #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
229 my @cols = unpack ( $self->unpack_string, $data );
231 ## If we bleed over a bit we can fix that.
232 if ( $self->fix_overlay ) {
233 foreach my $idx ( 0 .. $#cols ) {
234 if (
235 $cols[$idx] =~ m/\S+$/
236 && exists $cols[$idx+1]
237 && $cols[$idx+1] =~ s/^(\S+)//
239 $cols[$idx] .= $1;
244 ## Get rid of whitespaces
245 if ( $self->trim_whitespace ) {
246 for ( @cols ) { s/^\s+//; s/\s+$//; }
249 ## Swithc nulls to undef
250 if ( $self->null_as_undef ) {
251 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
252 unless $self->trim_whitespace
254 for ( @cols ) { undef $_ unless length($_) }
257 \@cols;
261 sub parse_hash {
262 my ( $self, $data ) = @_;
264 my $row = $self->parse( $data );
266 my $colstarts = $self->sorted_colstart;
268 my $results;
269 foreach my $idx ( 0 .. $#$row ) {
270 my $col = $self->colchar_map->{ $colstarts->[$idx] };
271 $results->{ $col } = $row->[$idx];
274 $results;
278 sub _build_sorted_colstart {
279 my $self = shift;
281 my @startcols = map { $_->[0] }
282 sort { $a->[1] <=> $b->[1] }
283 map { [$_, sprintf( "%10d", $_ ) ] }
284 keys %{ $self->colchar_map }
287 \@startcols;
293 __END__
295 =head1 NAME
297 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
299 =head1 SYNOPSIS
301 ## We assume the columns have no spaces in the header.
302 my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
304 ## We explicitly tell what column names to pick out of the header.
305 my $de = DataExtract::FixedWidth->new({
306 header_row => $header_row
307 cols => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
310 ## We supply data to heuristically determine header. Here we assume the first
311 ## row is the header (if we need the first row to avoid this possible assumption set
312 ## the header_row to undef. And the result of the heurisitic applied to the first row
313 ## is the columns
314 my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
316 $de->parse( $data_row );
318 $de->parse_hash( $data_row );
320 =head1 DESCRIPTION
322 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
325 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck.
327 SAMPLE FILE
328 HEADER: 'COL1NAME COL2NAME COL3NAMEEEEE'
329 DATA1: 'FOOBARBAZ THIS IS TEXT ANHER COL '
330 DATA2: 'FOOBAR FOOBAR IS TEXT ANOTHER COL '
332 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
333 $de->parse('FOOBARBAZ THIS IS TEXT ANOTHER COL');
335 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column header
337 =head2 Constructor
339 The class constructor -- C<-E<gt>new> -- provides numerious features. Some options it has are:
341 =over 12
343 =item heuristics => \@lines
345 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
347 =item cols => \@cols
349 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
351 =item header_row => $string
353 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
355 =back
357 =head2 Methods
359 B<An astrisk, (*) in the option means that is the default.>
361 =over 12
363 =item ->parse( $data_line )
365 Parses the data and returns an ArrayRef
367 =item ->parse_hash( $data_line )
369 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
371 =item ->first_col_zero(1*|0)
373 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
375 CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
376 HEADER ROW : | |S|T|O|C|K| |V|I|N
378 =item ->trim_whitespace(*1|0)
380 Trim the whitespace for the elements that ->parse() outputs
382 =item ->fix_overlay(1|0*)
384 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column.
386 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
388 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
390 =item ->null_as_undef(1|0*)
392 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>
394 =item ->skip_header_data(1*|0)
396 Skips duplicate copies of the header_row if found in the data
398 =item ->colchar_map
400 Returns a hash ref that sisplays the results of each column header and the character position the column starts at.
402 =item ->unpack_string
404 Returns the CORE::unpack() template string that will be used internally by ->parse()
406 =back
408 =head1 AVAILABILITY
410 CPAN.org
412 =head1 COPYRIGHT & LICENSE
414 Copyright 2008 Evan, all rights reserved.
416 This program is free software; you can redistribute it and/or modify it
417 under the same terms as Perl itself.
420 =head1 AUTHOR
422 Evan Carroll <me at evancarroll.com>
423 System Lord of the Internets
425 =head1 BUGS
427 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
428 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>. I will be notified, and then you'll
429 automatically be notified of progress on your bug as I make changes.
431 =cut