lib/DataExtract/FixedWidth.pm

   1 package DataExtract::FixedWidth;
   2 use Moose;
   3 use Carp;
   4
   5 our $VERSION = '0.07';
   6
   7 sub BUILD {
   8         my $self = shift;
   9
  10         confess 'You must either send either a "header_row" or data for "heuristic"'
  11                 unless $self->has_header_row || $self->has_heuristic
  12         ;
  13         confess 'You must send a "header_row" if you send "cols"'
  14                 if $self->has_cols && !$self->has_header_row && !$self->has_heuristic
  15         ;
  16
  17 }
  18
  19 has 'unpack_string' => (
  20         isa          => 'Str'
  21         , is         => 'rw'
  22         , lazy_build => 1
  23 );
  24
  25 has 'cols' => (
  26         isa            => 'ArrayRef'
  27         , is           => 'rw'
  28         , auto_deref   => 1
  29         , lazy_build   => 1
  30 );
  31
  32 has 'colchar_map' => (
  33         isa          => 'HashRef'
  34         , is         => 'rw'
  35         , lazy_build => 1
  36 );
  37
  38 has 'header_row' => (
  39         isa          => 'Maybe[Str]'
  40         , is         => 'rw'
  41         , predicate  => 'has_header_row'
  42 );
  43
  44 has 'first_col_zero' => (
  45         isa       => 'Bool'
  46         , is      => 'ro'
  47         , default => 1
  48 );
  49
  50 has 'fix_overlay' => (
  51         isa       => 'Bool'
  52         , is      => 'ro'
  53         , default => 0
  54 );
  55
  56 has 'trim_whitespace' => (
  57         isa       => 'Bool'
  58         , is      => 'ro'
  59         , default => 1
  60 );
  61
  62 has 'sorted_colstart' => (
  63         isa          => 'ArrayRef'
  64         , is         => 'ro'
  65         , lazy_build => 1
  66         , auto_deref => 1
  67 );
  68
  69 has 'null_as_undef' => (
  70         isa       => 'Bool'
  71         , is      => 'ro'
  72         , default => 0
  73 );
  74
  75 has 'heuristic' => (
  76         isa          => 'ArrayRef'
  77         , is         => 'rw'
  78         , predicate  => 'has_heuristic'
  79         , auto_deref => 1
  80         , trigger    => \&_heuristic_trigger
  81 );
  82
  83 has 'skip_header_data' => (
  84         isa       => 'Bool'
  85         , is      => 'rw'
  86         , default => 1
  87 );
  88
  89 has 'verbose' => ( isa => 'Bool', 'is' => 'ro', default => 0 );
  90
  91 sub _heuristic_trigger {
  92         my ( $self, $data ) = @_;
  93
  94         chomp @$data;
  95
  96         my $maxLength = 0;
  97         for ( @$data ) {
  98                 $maxLength = length if length > $maxLength
  99         }
 100
 101         $self->header_row( $data->[0] )
 102                 unless $self->has_header_row
 103         ;
 104
 105         {
 106                 my @unpack;
 107                 my $mask = ' ' x $maxLength;
 108                 $mask |= $_ for @$data;
 109
 110                 ## The (?=\S) fixes a bug that creates null columns in the event any
 111                 ## one column has trailing whitespace (because you'll have '\S\s  '
 112                 ## this was a bug revealed in the dataset NullFirstRow.txt
 113                 push @unpack, length($1)
 114                         while $mask =~ m/(\S+\s+(?=\S)|$)/g
 115                 ;
 116
 117                 ## Remove last row, (to be replaced with A*)
 118                 pop @unpack;
 119
 120                 $self->unpack_string( $self->_helper_unpack( \@unpack ) );
 121         }
 122
 123 }
 124
 125 sub _build_cols {
 126         my $self = shift;
 127
 128         my @cols;
 129
 130         ## If we have the unpack string and the header_row parse it all out on our own
 131         ## Here we have two conditionals because the unpack_string comes into existance in
 132         ## build_unpack_string and not the heuristic_trigger
 133         if (
 134                 ( $self->has_header_row && $self->has_unpack_string )
 135                 || ( $self->has_header_row && $self->has_heuristic )
 136         ) {
 137                 my $skd = $self->skip_header_data;
 138                 $self->skip_header_data( 0 );
 139
 140                 @cols = @{ $self->parse( $self->header_row ) };
 141
 142                 $self->skip_header_data( $skd );
 143         }
 144
 145         ## We only the header_row
 146         elsif ( $self->header_row ) {
 147                 @cols = split ' ', $self->header_row;
 148         }
 149
 150         else {
 151                 croak 'Need some method to calculate cols';
 152         }
 153
 154         \@cols;
 155
 156 }
 157
 158 sub _build_colchar_map {
 159         my $self = shift;
 160         my $ccm = {};
 161
 162         ## If we can generate from heurisitic data and don't have a header_row
 163         if (
 164                 $self->has_header_row
 165                 && !defined $self->header_row
 166                 && $self->has_heuristic
 167                 && $self->has_cols
 168         ) {
 169                 my @cols = $self->cols;
 170                 foreach my $idx ( 0 .. $#cols ) {
 171                         $ccm->{$idx} = $cols[$idx];
 172                 }
 173         }
 174
 175         ## Generate from header_row
 176         else {
 177                 croak 'Can not render the map of columns to start-chars without the header_row'
 178                         unless defined $self->has_header_row
 179                 ;
 180
 181                 foreach my $col ( $self->cols ) {
 182
 183                         my $pos = 0;
 184                         $pos = index( $self->header_row, $col, $pos );
 185
 186                         croak "Failed to find a column '$col' in the header row"
 187                                 unless defined $pos
 188                         ;
 189
 190                         unless ( exists $ccm->{ $pos } ) {
 191                                 $ccm->{ $pos } = $col;
 192                         }
 193
 194                         ## We have two like-named columns
 195                         else {
 196
 197                                 ## possible inf loop here
 198                                 until ( not exists $ccm->{$pos} ) {
 199                                         $pos = index( $self->header_row, $col, $pos+1 );
 200
 201                                         croak "Failed to find another column '$col' in the header row"
 202                                                 unless defined $pos
 203                                         ;
 204
 205                                 }
 206
 207                                 $ccm->{ $pos } = $col;
 208
 209                         }
 210
 211                 }
 212
 213         }
 214
 215         $ccm;
 216
 217 }
 218
 219 sub _build_unpack_string {
 220         my $self = shift;
 221
 222         my @unpack;
 223         my @startcols = $self->sorted_colstart;
 224         $startcols[0] = 0 if $self->first_col_zero;
 225         foreach my $idx ( 0 .. $#startcols ) {
 226
 227                 if ( exists $startcols[$idx+1] ) {
 228                         push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
 229                 }
 230
 231         }
 232
 233         $self->_helper_unpack( \@unpack );
 234
 235 }
 236
 237 ## Takes ArrayRef of startcols and returns the unpack string.
 238 sub _helper_unpack {
 239         my ( $self, $startcols ) = @_;
 240
 241         my $format;
 242         if ( @$startcols ) {
 243                 $format = 'a' . join 'a', @$startcols;
 244         }
 245         $format .= 'A*';
 246
 247         $format;
 248
 249 }
 250
 251 sub parse {
 252         my ( $self, $data ) = @_;
 253
 254         return undef if !defined $data;
 255
 256         chomp $data;
 257
 258         ## skip_header_data
 259         if (
 260                 $self->skip_header_data
 261                 && ( defined $self->header_row && $data eq $self->header_row )
 262         ) {
 263                 warn "Skipping duplicate header row\n" if $self->verbose;
 264                 return undef
 265         }
 266
 267         #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
 268
 269         my @cols = unpack ( $self->unpack_string, $data );
 270
 271         ## If we bleed over a bit we can fix that.
 272         if ( $self->fix_overlay ) {
 273                 foreach my $idx ( 0 .. $#cols ) {
 274                         if (
 275                                 $cols[$idx] =~ m/\S+$/
 276                                 && exists $cols[$idx+1]
 277                                 && $cols[$idx+1] =~ s/^(\S+)//
 278                         ) {
 279                                         $cols[$idx] .= $1;
 280                         }
 281                 }
 282         }
 283
 284         ## Get rid of whitespaces
 285         if ( $self->trim_whitespace ) {
 286                 for ( @cols ) { s/^\s+//; s/\s+$//; }
 287         }
 288
 289         ## Swithc nulls to undef
 290         if ( $self->null_as_undef ) {
 291                 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
 292                         unless $self->trim_whitespace
 293                 ;
 294                 for ( @cols ) { undef $_ unless length($_) }
 295         }
 296
 297         \@cols;
 298
 299 }
 300
 301 sub parse_hash {
 302         my ( $self, $data ) = @_;
 303
 304         my $row = $self->parse( $data );
 305
 306         my $colstarts = $self->sorted_colstart;
 307
 308         my $results;
 309         foreach my $idx ( 0 .. $#$row ) {
 310                 my $col = $self->colchar_map->{ $colstarts->[$idx] };
 311                 $results->{ $col } = $row->[$idx];
 312         }
 313
 314         $results;
 315
 316 }
 317
 318 sub _build_sorted_colstart {
 319         my $self = shift;
 320
 321         my @startcols = map { $_->[0] }
 322                 sort { $a->[1] <=> $b->[1] }
 323                 map { [$_, sprintf( "%10d", $_ ) ] }
 324                 keys %{ $self->colchar_map }
 325         ;
 326
 327         \@startcols;
 328
 329 }
 330
 331 no Moose;
 332 __PACKAGE__->meta->make_immutable;
 333
 334 1;
 335
 336 __END__
 337
 338 =head1 NAME
 339
 340 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
 341
 342 =head1 SYNOPSIS
 343
 344         ## We assume the columns have no spaces in the header.
 345         my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
 346
 347         ## We explicitly tell what column names to pick out of the header.
 348         my $de = DataExtract::FixedWidth->new({
 349                 header_row => $header_row
 350                 cols       => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
 351         });
 352
 353         ## We supply data to heuristic and assume
 354         ## * first row is the header (to avoid this assumption
 355         ##   set the header_row to undef. )
 356         ## * heurisitic's unpack_string is correct
 357         ## * unpack_string applied to header_row will tell us the columns
 358         my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
 359
 360         ## We supply data to heuristic, say we have no header, and the set columns
 361         ## just like the above except ->parse_hash will be be indexed by the
 362         ## provided columns and no row is designated as the header.
 363         my $de = DataExtract::FixedWidth->new({
 364                 heuristic    => \@datarows
 365                 , header_row => undef
 366                 , columns    => [qw/ foo bar baz/]
 367         });
 368
 369         ## We supply data to heuristic, and we explicitly add the header_row
 370         ## with this method it doesn't have to occur in the data.
 371         ## The unpack string rendered will be applied to the first row to get
 372         ## the columns
 373         my $de = DataExtract::FixedWidth->new({
 374                 heuristic    => \@datarows
 375                 , header_row => $header_row
 376         });
 377
 378         ## We explicitly add the header_row, with this method it doesn't have
 379         ## to occur in the data. The unpack string rendered will be applied
 380         ## to the provided header_row to get the columns
 381         my $de = DataExtract::FixedWidth->new({
 382                 unpack_string => $template
 383                 , header_row  => $header_row
 384         });
 385
 386         $de->parse( $data_row );
 387
 388         $de->parse_hash( $data_row );
 389
 390 =head1 DESCRIPTION
 391
 392 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
 393
 394 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
 395
 396         SAMPLE FILE
 397         HEADER:  'COL1NAME       COL2NAME       COL3NAMEEEEE'
 398         DATA1:   'FOOBARBAZ      THIS IS TEXT   ANHER COL   '
 399         DATA2:   'FOOBAR FOOBAR  IS TEXT        ANOTHER COL '
 400
 401 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
 402         $de->parse('FOOBARBAZ THIS IS TEXT    ANOTHER COL');
 403
 404 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
 405
 406 =head2 Constructor
 407
 408 The class constructor, C<-E<gt>new>, has numerious forms. Some options it has are:
 409
 410 =over 12
 411
 412 =item heuristics => \@lines
 413
 414 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
 415
 416 =item cols => \@cols
 417
 418 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
 419
 420 =item header_row => $string
 421
 422 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
 423
 424 =item verbose => 1|0
 425
 426 Right now, it simply display's warnings when it does something that might at first seem awkward. Like returning undef when it encouters a duplicate copy of a header row.
 427
 428 =back
 429
 430 =head2 Methods
 431
 432 B<An astrisk, (*) in the option means that is the default.>
 433
 434 =over 12
 435
 436 =item ->parse( $data_line )
 437
 438 Parses the data and returns an ArrayRef
 439
 440 =item ->parse_hash( $data_line )
 441
 442 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
 443
 444 =item ->first_col_zero(1*|0)
 445
 446 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
 447
 448         CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
 449         HEADER ROW  : | |S|T|O|C|K| |V|I|N
 450
 451 =item ->trim_whitespace(*1|0)
 452
 453 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
 454
 455 =item ->fix_overlay(1|0*)
 456
 457 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heurisitic because the unpack string makes the assumption the data is not mangeled.
 458
 459 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
 460
 461 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
 462
 463 =item ->null_as_undef(1|0*)
 464
 465 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
 466
 467 =item ->skip_header_data(1*|0)
 468
 469 Skips duplicate copies of the header_row if found in the data.
 470
 471 =item ->colchar_map
 472
 473 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
 474
 475 =item ->unpack_string
 476
 477 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
 478
 479 =back
 480
 481 =head1 AVAILABILITY
 482
 483 CPAN.org
 484
 485 =head1 COPYRIGHT & LICENSE
 486
 487 Copyright 2008 Evan, all rights reserved.
 488
 489 This program is free software; you can redistribute it and/or modify it
 490 under the same terms as Perl itself.
 491
 492
 493 =head1 AUTHOR
 494
 495         Evan Carroll <me at evancarroll.com>
 496         System Lord of the Internets
 497
 498 =head1 BUGS
 499
 500 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
 501 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>.  I will be notified, and then you'll
 502 automatically be notified of progress on your bug as I make changes.
 503
 504 =cut