pod and new test new version 06
[DataExtract-FixedWidth.git] / lib / DataExtract / FixedWidth.pm
blob76a2da53f44fcac89372813438dea06efcc25703
1 package DataExtract::FixedWidth;
2 use Moose;
3 use Carp;
5 our $VERSION = '0.06';
7 sub BUILD {
8 my $self = shift;
10 confess 'You must either send either a "header_row" or data for "heuristic"'
11 unless $self->has_header_row || $self->has_heuristic
13 confess 'You must send a "header_row" if you send "cols"'
14 if $self->has_cols && !$self->has_header_row && !$self->has_heuristic
19 has 'unpack_string' => (
20 isa => 'Str'
21 , is => 'rw'
22 , lazy_build => 1
25 has 'cols' => (
26 isa => 'ArrayRef'
27 , is => 'rw'
28 , auto_deref => 1
29 , lazy_build => 1
32 has 'colchar_map' => (
33 isa => 'HashRef'
34 , is => 'rw'
35 , lazy_build => 1
38 has 'header_row' => (
39 isa => 'Maybe[Str]'
40 , is => 'rw'
41 , predicate => 'has_header_row'
44 has 'first_col_zero' => (
45 isa => 'Bool'
46 , is => 'ro'
47 , default => 1
50 has 'fix_overlay' => (
51 isa => 'Bool'
52 , is => 'ro'
53 , default => 0
56 has 'trim_whitespace' => (
57 isa => 'Bool'
58 , is => 'ro'
59 , default => 1
62 has 'sorted_colstart' => (
63 isa => 'ArrayRef'
64 , is => 'ro'
65 , lazy_build => 1
66 , auto_deref => 1
69 has 'null_as_undef' => (
70 isa => 'Bool'
71 , is => 'ro'
72 , default => 0
75 has 'heuristic' => (
76 isa => 'ArrayRef'
77 , is => 'rw'
78 , predicate => 'has_heuristic'
79 , auto_deref => 1
80 , trigger => \&_heuristic_trigger
83 has 'skip_header_data' => (
84 isa => 'Bool'
85 , is => 'rw'
86 , default => 1
89 sub _heuristic_trigger {
90 my ( $self, $data ) = @_;
92 chomp @$data;
94 $self->header_row( $data->[0] )
95 unless $self->has_header_row
99 my @unpack;
100 my $mask = ' ' x length $data->[0];
101 $mask |= $_ for @$data;
103 push @unpack, length($1)
104 while $mask =~ m/(\S+\s+|$)/g
107 ## Remove last row, (to be replaced with A*)
108 pop @unpack;
110 $self->unpack_string( $self->_helper_unpack( \@unpack ) );
115 sub _build_cols {
116 my $self = shift;
118 my @cols;
120 ## If we have the unpack string and the header_row parse it all out on our own
121 ## Here we have two conditionals because the unpack_string comes into existance in
122 ## build_unpack_string and not the heuristic_trigger
123 if (
124 ( $self->has_header_row && $self->has_unpack_string )
125 || ( $self->has_header_row && $self->has_heuristic )
127 my $skd = $self->skip_header_data;
128 $self->skip_header_data( 0 );
130 @cols = @{ $self->parse( $self->header_row ) };
132 $self->skip_header_data( $skd );
135 ## We only the header_row
136 elsif ( $self->header_row ) {
137 @cols = split ' ', $self->header_row;
140 else {
141 croak 'Need some method to calculate cols';
144 \@cols;
148 sub _build_colchar_map {
149 my $self = shift;
150 my $ccm = {};
152 ## If we can generate from heurisitic data and don't have a header_row
153 if (
154 $self->has_header_row
155 && !defined $self->header_row
156 && $self->has_heuristic
157 && $self->has_cols
159 my @cols = $self->cols;
160 foreach my $idx ( 0 .. $#cols ) {
161 $ccm->{$idx} = $cols[$idx];
165 ## Generate from header_row
166 else {
167 croak 'Can not render the map of columns to start-chars without the header_row'
168 unless defined $self->has_header_row
171 foreach my $col ( $self->cols ) {
173 my $pos = 0;
174 $pos = index( $self->header_row, $col, $pos );
176 croak "Failed to find a column '$col' in the header row"
177 unless defined $pos
180 unless ( exists $ccm->{ $pos } ) {
181 $ccm->{ $pos } = $col;
184 ## We have two like-named columns
185 else {
187 ## possible inf loop here
188 until ( not exists $ccm->{$pos} ) {
189 $pos = index( $self->header_row, $col, $pos+1 );
191 croak "Failed to find another column '$col' in the header row"
192 unless defined $pos
197 $ccm->{ $pos } = $col;
205 $ccm;
209 sub _build_unpack_string {
210 my $self = shift;
212 my @unpack;
213 my @startcols = $self->sorted_colstart;
214 $startcols[0] = 0 if $self->first_col_zero;
215 foreach my $idx ( 0 .. $#startcols ) {
217 if ( exists $startcols[$idx+1] ) {
218 push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
223 $self->_helper_unpack( \@unpack );
227 ## Takes ArrayRef of startcols and returns the unpack string.
228 sub _helper_unpack {
229 my ( $self, $startcols ) = @_;
231 my $format;
232 if ( @$startcols ) {
233 $format = 'a' . join 'a', @$startcols;
235 $format .= 'A*';
237 $format;
241 sub parse {
242 my ( $self, $data ) = @_;
244 return undef if !defined $data;
246 chomp $data;
248 ## skip_header_data
249 return undef
250 if $self->skip_header_data
251 && ( defined $self->header_row && $data eq $self->header_row )
254 #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
256 my @cols = unpack ( $self->unpack_string, $data );
258 ## If we bleed over a bit we can fix that.
259 if ( $self->fix_overlay ) {
260 foreach my $idx ( 0 .. $#cols ) {
261 if (
262 $cols[$idx] =~ m/\S+$/
263 && exists $cols[$idx+1]
264 && $cols[$idx+1] =~ s/^(\S+)//
266 $cols[$idx] .= $1;
271 ## Get rid of whitespaces
272 if ( $self->trim_whitespace ) {
273 for ( @cols ) { s/^\s+//; s/\s+$//; }
276 ## Swithc nulls to undef
277 if ( $self->null_as_undef ) {
278 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
279 unless $self->trim_whitespace
281 for ( @cols ) { undef $_ unless length($_) }
284 \@cols;
288 sub parse_hash {
289 my ( $self, $data ) = @_;
291 my $row = $self->parse( $data );
293 my $colstarts = $self->sorted_colstart;
295 my $results;
296 foreach my $idx ( 0 .. $#$row ) {
297 my $col = $self->colchar_map->{ $colstarts->[$idx] };
298 $results->{ $col } = $row->[$idx];
301 $results;
305 sub _build_sorted_colstart {
306 my $self = shift;
308 my @startcols = map { $_->[0] }
309 sort { $a->[1] <=> $b->[1] }
310 map { [$_, sprintf( "%10d", $_ ) ] }
311 keys %{ $self->colchar_map }
314 \@startcols;
318 no Moose;
322 __END__
324 =head1 NAME
326 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
328 =head1 SYNOPSIS
330 ## We assume the columns have no spaces in the header.
331 my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
333 ## We explicitly tell what column names to pick out of the header.
334 my $de = DataExtract::FixedWidth->new({
335 header_row => $header_row
336 cols => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
339 ## We supply data to heuristic and assume
340 ## * first row is the header (to avoid this assumption
341 ## set the header_row to undef. )
342 ## * heurisitic's unpack_string is correct
343 ## * unpack_string applied to header_row will tell us the columns
344 my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
346 ## We supply data to heuristic, say we have no header, and the set columns
347 ## just like the above except ->parse_hash will be be indexed by the
348 ## provided columns and no row is designated as the header.
349 my $de = DataExtract::FixedWidth->new({
350 heuristic => \@datarows
351 , header_row => undef
352 , columns => [qw/ foo bar baz/]
355 ## We supply data to heuristic, and we explicitly add the header_row
356 ## with this method it doesn't have to occur in the data.
357 ## The unpack string rendered will be applied to the first row to get
358 ## the columns
359 my $de = DataExtract::FixedWidth->new({
360 heuristic => \@datarows
361 , header_row => $header_row
364 ## We explicitly add the header_row, with this method it doesn't have
365 ## to occur in the data. The unpack string rendered will be applied
366 ## to the provided header_row to get the columns
367 my $de = DataExtract::FixedWidth->new({
368 unpack_string => $template
369 , header_row => $header_row
372 $de->parse( $data_row );
374 $de->parse_hash( $data_row );
376 =head1 DESCRIPTION
378 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
380 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
382 SAMPLE FILE
383 HEADER: 'COL1NAME COL2NAME COL3NAMEEEEE'
384 DATA1: 'FOOBARBAZ THIS IS TEXT ANHER COL '
385 DATA2: 'FOOBAR FOOBAR IS TEXT ANOTHER COL '
387 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
388 $de->parse('FOOBARBAZ THIS IS TEXT ANOTHER COL');
390 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
392 =head2 Constructor
394 The class constructor, C<-E<gt>new>, has numerious forms. Some options it has are:
396 =over 12
398 =item heuristics => \@lines
400 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
402 =item cols => \@cols
404 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
406 =item header_row => $string
408 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
410 =back
412 =head2 Methods
414 B<An astrisk, (*) in the option means that is the default.>
416 =over 12
418 =item ->parse( $data_line )
420 Parses the data and returns an ArrayRef
422 =item ->parse_hash( $data_line )
424 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
426 =item ->first_col_zero(1*|0)
428 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
430 CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
431 HEADER ROW : | |S|T|O|C|K| |V|I|N
433 =item ->trim_whitespace(*1|0)
435 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
437 =item ->fix_overlay(1|0*)
439 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heurisitic because the unpack string makes the assumption the data is not mangeled.
441 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
443 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
445 =item ->null_as_undef(1|0*)
447 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
449 =item ->skip_header_data(1*|0)
451 Skips duplicate copies of the header_row if found in the data.
453 =item ->colchar_map
455 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
457 =item ->unpack_string
459 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
461 =back
463 =head1 AVAILABILITY
465 CPAN.org
467 =head1 COPYRIGHT & LICENSE
469 Copyright 2008 Evan, all rights reserved.
471 This program is free software; you can redistribute it and/or modify it
472 under the same terms as Perl itself.
475 =head1 AUTHOR
477 Evan Carroll <me at evancarroll.com>
478 System Lord of the Internets
480 =head1 BUGS
482 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
483 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>. I will be notified, and then you'll
484 automatically be notified of progress on your bug as I make changes.
486 =cut