lib/DataExtract/FixedWidth.pm

   1 package DataExtract::FixedWidth;
   2 use Moose;
   3 use Carp;
   4
   5 our $VERSION = '0.08';
   6
   ## Moose post-construction hook: validates the combination of constructor
   ## arguments.
   ##
   ## Dies (via confess) unless at least one source of column geometry was
   ## supplied: a "header_row", data for "heuristic", or -- for the parse-only
   ## usage described in the POD -- an explicit "unpack_string".
   ##
   ## Also dies when "cols" is supplied without a "header_row" or "heuristic",
   ## because there is then nothing to map the column names onto.
   sub BUILD {
           my $self = shift;

           ## True when the object can work out the layout from a header or data.
           my $has_layout_source = $self->has_header_row || $self->has_heuristic;

           confess 'You must send either a "header_row", data for "heuristic", or an "unpack_string"'
                   unless $has_layout_source || $self->has_unpack_string
           ;

           confess 'You must send a "header_row" if you send "cols"'
                   if $self->has_cols && !$has_layout_source
           ;

   }
  18
  ## Template handed to CORE::unpack() by ->parse. It is either supplied by the
  ## caller, set by the heuristic trigger, or built lazily from the sorted
  ## column start positions.
  has unpack_string => (
          is         => 'rw',
          isa        => 'Str',
          lazy_build => 1,
  );

  ## Column names. Supplied by the caller or derived lazily from the header_row.
  has cols => (
          is         => 'rw',
          isa        => 'ArrayRef',
          auto_deref => 1,
          lazy_build => 1,
  );

  ## HashRef of column start (or ordinal, for header-less heuristic use) to
  ## column name.
  has colchar_map => (
          is         => 'rw',
          isa        => 'HashRef',
          lazy_build => 1,
  );

  ## The header line. An explicit undef is allowed by the type; the builders
  ## check both the predicate and definedness to tell the cases apart.
  has header_row => (
          is        => 'rw',
          isa       => 'Maybe[Str]',
          predicate => 'has_header_row',
  );

  ## When true, the first column is forced to start at character zero when the
  ## unpack string is built.
  has first_col_zero => (
          is      => 'ro',
          isa     => 'Bool',
          default => 1,
  );

  ## When true, ->parse moves leading non-whitespace of a field back onto the
  ## previous field if that field ends in non-whitespace.
  has fix_overlay => (
          is      => 'ro',
          isa     => 'Bool',
          default => 0,
  );

  ## When true, ->parse strips leading and trailing whitespace from each field.
  has trim_whitespace => (
          is      => 'ro',
          isa     => 'Bool',
          default => 1,
  );

  ## Keys of colchar_map in ascending numeric order.
  has sorted_colstart => (
          is         => 'ro',
          isa        => 'ArrayRef',
          lazy_build => 1,
          auto_deref => 1,
  );

  ## When true, ->parse turns zero-length fields into undef (requires
  ## trim_whitespace).
  has null_as_undef => (
          is      => 'ro',
          isa     => 'Bool',
          default => 0,
  );

  ## Sample rows used to deduce the unpack string; setting it fires
  ## _heuristic_trigger.
  has heuristic => (
          is         => 'rw',
          isa        => 'ArrayRef',
          predicate  => 'has_heuristic',
          auto_deref => 1,
          trigger    => \&_heuristic_trigger,
  );

  ## When true, ->parse returns undef for a row identical to the header_row.
  ## Writable because _build_cols switches it off while parsing the header.
  has skip_header_data => (
          is      => 'rw',
          isa     => 'Bool',
          default => 1,
  );

  ## When true, ->parse warns when it skips a duplicate header row.
  has verbose => (
          is      => 'ro',
          isa     => 'Bool',
          default => 0,
  );
  90
  ## Trigger for the "heuristic" attribute: deduces the unpack string from
  ## sample rows.
  ##
  ## Arguments: the object and the ArrayRef of sample rows just assigned.
  ##
  ## Side effects:
  ##   * sets header_row to the first (chomped) row unless one was already set
  ##   * sets unpack_string from the column boundaries found in the rows
  ##
  ## The rows are copied before chomping so the caller's array is left intact.
  sub _heuristic_trigger {
          my ( $self, $data ) = @_;

          my @rows = @$data;
          chomp @rows;

          $self->header_row( $rows[0] )
                  unless $self->has_header_row
          ;

          ## Mark every character position that holds non-whitespace in at
          ## least one row. Testing with \S (rather than OR-ing the raw
          ## strings together) keeps "\r" and "\t" from being mistaken for
          ## data and works for any character, not just single bytes.
          my @occupied;
          for my $row ( @rows ) {
                  next unless defined $row;
                  $occupied[ $-[0] ] = 1 while $row =~ m/\S/g;
          }
          my $mask = join '', map { $_ ? 'x' : ' ' } @occupied;

          ## The (?=\S) fixes a bug that creates null columns in the event any
          ## one column has trailing whitespace (because you'll have '\S\s  ');
          ## this was a bug revealed in the dataset NullFirstRow.txt
          my @widths;
          push @widths, length($1)
                  while $mask =~ m/(\S+\s+(?=\S))/g
          ;

          $self->unpack_string( $self->_helper_unpack( \@widths ) );

  }
 121
 ## Lazy builder for "cols": works out the column names from the header_row.
 ##
 ## * With a defined header_row and either an unpack_string or heuristic data,
 ##   the header is run through ->parse and the resulting fields are the names.
 ##   Two conditions are needed because the unpack_string comes into existence
 ##   in _build_unpack_string and not in the heuristic trigger.
 ## * With only a header_row, the header is split on whitespace.
 ## * Otherwise croaks with 'Need some method to calculate cols'.
 ##
 ## Returns an ArrayRef of column names.
 sub _build_cols {
         my $self = shift;

         ## has_header_row is true even for an explicit header_row => undef,
         ## so the value itself has to be checked.
         my $header     = $self->header_row;
         my $can_unpack = $self->has_unpack_string || $self->has_heuristic;

         if ( defined $header && $can_unpack ) {

                 ## The header must not be skipped as "duplicate header data"
                 ## while it is being parsed for the names.
                 my $skd = $self->skip_header_data;
                 $self->skip_header_data( 0 );

                 my $parsed;
                 my $ok    = eval { $parsed = $self->parse( $header ); 1 };
                 my $error = $@;

                 ## Put the caller's setting back even when parse died.
                 $self->skip_header_data( $skd );
                 die $error unless $ok;

                 return [ @$parsed ];
         }

         ## We only have the header_row
         if ( defined $header && length $header ) {
                 return [ split ' ', $header ];
         }

         croak 'Need some method to calculate cols';

 }
 154
 ## Lazy builder for "colchar_map".
 ##
 ## Returns a HashRef whose values are the column names and whose keys are
 ##   * the ordinal of the column, when the header_row was explicitly set to
 ##     undef and both heuristic data and cols were supplied; or
 ##   * the character offset at which the column name starts in the header_row.
 ##
 ## Croaks when there is no defined header_row to search, or when a column
 ## name (or a further occurrence of a repeated name) is not in the header.
 sub _build_colchar_map {
         my $self = shift;
         my %colchar_map;

         ## If we can generate from heuristic data and don't have a header_row
         if (
                 $self->has_header_row
                 && !defined $self->header_row
                 && $self->has_heuristic
                 && $self->has_cols
         ) {
                 my @cols = $self->cols;
                 @colchar_map{ 0 .. $#cols } = @cols;
                 return \%colchar_map;
         }

         ## Generate from header_row
         my $header = $self->header_row;

         croak 'Can not render the map of columns to start-chars without the header_row'
                 unless defined $header
         ;

         ## NOTE(review): index() matches substrings, so a name contained in an
         ## earlier header word resolves to that earlier position.
         foreach my $col ( $self->cols ) {

                 ## index() reports "not found" as -1, never as undef.
                 my $pos = index( $header, $col );

                 croak "Failed to find a column '$col' in the header row"
                         if $pos < 0
                 ;

                 ## We have two like-named columns: take the next occurrence
                 ## that has not been claimed yet. The -1 check guarantees the
                 ## loop ends.
                 while ( exists $colchar_map{$pos} ) {
                         $pos = index( $header, $col, $pos + 1 );

                         croak "Failed to find another column '$col' in the header row"
                                 if $pos < 0
                         ;
                 }

                 $colchar_map{$pos} = $col;

         }

         \%colchar_map;

 }
 215
 ## Lazy builder for "unpack_string".
 ##
 ## Takes the sorted column start positions, optionally pins the first one to
 ## zero (first_col_zero), and turns the gaps between neighbouring starts into
 ## field widths. The widths are handed to _helper_unpack, whose return value
 ## is returned.
 sub _build_unpack_string {
         my $self = shift;

         my @starts = $self->sorted_colstart;
         $starts[0] = 0 if $self->first_col_zero;

         ## Width of a column = start of the next column - its own start.
         ## The last column has no successor and therefore no fixed width.
         my @widths = map { $starts[$_] - $starts[ $_ - 1 ] } 1 .. $#starts;

         return $self->_helper_unpack( \@widths );

 }
 233
 ## Takes an ArrayRef of field widths (one per column except the last) and
 ## returns the unpack template: an "a<width>" item for each width followed by
 ## "A*" for the final column.
 sub _helper_unpack {
         my ( $self, $widths ) = @_;

         ## No fixed-width columns: the whole line is the single last column.
         return 'A*' unless @$widths;

         my $fixed = join '', map { "a$_" } @$widths;

         return $fixed . 'A*';

 }
 247
 ## Splits one line of data into its fields.
 ##
 ## Arguments: the line to parse (a trailing newline is chomped).
 ##
 ## Returns an ArrayRef of fields, or undef when the line is undef or when
 ## skip_header_data is on and the line is identical to the header_row.
 ##
 ## Croaks when null_as_undef is on but trim_whitespace is off.
 sub parse {
         my ( $self, $data ) = @_;

         defined $data or return undef;

         chomp $data;

         ## skip_header_data
         if ( $self->skip_header_data ) {
                 my $header = $self->header_row;
                 if ( defined $header && $data eq $header ) {
                         warn "Skipping duplicate header row\n" if $self->verbose;
                         return undef;
                 }
         }

         my @fields = unpack $self->unpack_string, $data;

         ## If a field bleeds over into the next one, move the leading
         ## non-whitespace of the next field back onto it.
         if ( $self->fix_overlay ) {
                 for my $next ( 1 .. $#fields ) {
                         my $prev = $next - 1;
                         next unless $fields[$prev] =~ m/\S+$/;
                         $fields[$prev] .= $1 if $fields[$next] =~ s/^(\S+)//;
                 }
         }

         ## Get rid of surrounding whitespace
         if ( $self->trim_whitespace ) {
                 foreach my $field ( @fields ) {
                         $field =~ s/^\s+//;
                         $field =~ s/\s+$//;
                 }
         }

         ## Switch nulls to undef
         if ( $self->null_as_undef ) {
                 $self->trim_whitespace
                         or croak 'This ->null_as_undef option mandates ->trim_whitespace be true';

                 foreach my $field ( @fields ) {
                         $field = undef unless length $field;
                 }
         }

         return \@fields;

 }
 297
 ## Parses one line of data into a HashRef keyed by column name.
 ##
 ## The line is split with ->parse; field N is stored under the name that
 ## colchar_map holds for the N-th entry of sorted_colstart.
 ##
 ## Returns undef when ->parse returns undef (undef input or a skipped
 ## duplicate header row). In that case the column maps are not consulted.
 sub parse_hash {
         my ( $self, $data ) = @_;

         my $row = $self->parse( $data );
         return undef unless defined $row;

         ## Fetch both lookups once rather than once per field.
         my $colstarts   = $self->sorted_colstart;
         my $colchar_map = $self->colchar_map;

         my $results;
         foreach my $idx ( 0 .. $#$row ) {
                 my $col = $colchar_map->{ $colstarts->[$idx] };
                 $results->{ $col } = $row->[$idx];
         }

         $results;

 }
 314
 ## Lazy builder for "sorted_colstart".
 ##
 ## Returns an ArrayRef holding the keys of colchar_map (column start
 ## positions, or ordinals) in ascending numeric order.
 sub _build_sorted_colstart {
         my $self = shift;

         ## The keys are plain integers, so a numeric sort is all that is
         ## needed; no per-key formatting or decorate/undecorate step.
         my @startcols = sort { $a <=> $b } keys %{ $self->colchar_map };

         \@startcols;

 }
 327
 328 no Moose;
 329 __PACKAGE__->meta->make_immutable;
 330
 331 1;
 332
 333 __END__
 334
 335 =head1 NAME
 336
 337 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
 338
 339 =head1 SYNOPSIS
 340
 341         ## We assume the columns have no spaces in the header.
 342         my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
 343
 344         ## We explicitly tell what column names to pick out of the header.
 345         my $de = DataExtract::FixedWidth->new({
  346                 header_row => $header_row
  347                 , cols     => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
 348         });
 349
 350         ## We supply data to heuristic and assume
 351         ## * first row is the header (to avoid this assumption
 352         ##   set the header_row to undef. )
  353         ## * heuristic's unpack_string is correct
 354         ## * unpack_string applied to header_row will tell us the columns
 355         my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
 356
  357         ## We supply data to heuristic, say we have no header, and set the columns.
  358         ## This is just like the above, except ->parse_hash will be indexed by the
  359         ## provided columns and no row is designated as the header.
 360         my $de = DataExtract::FixedWidth->new({
 361                 heuristic    => \@datarows
 362                 , header_row => undef
  363                 , cols       => [qw/ foo bar baz/]
 364         });
 365
 366         ## We supply data to heuristic, and we explicitly add the header_row
 367         ## with this method it doesn't have to occur in the data.
 368         ## The unpack string rendered will be applied to the first row to get
 369         ## the columns
 370         my $de = DataExtract::FixedWidth->new({
 371                 heuristic    => \@datarows
 372                 , header_row => $header_row
 373         });
 374
 375         ## We explicitly add the header_row, with this method it doesn't have
 376         ## to occur in the data. The unpack string rendered will be applied
 377         ## to the provided header_row to get the columns
 378         my $de = DataExtract::FixedWidth->new({
 379                 unpack_string => $template
 380                 , header_row  => $header_row
 381         });
 382
 383         $de->parse( $data_row );
 384
 385         $de->parse_hash( $data_row );
 386
 387 =head1 DESCRIPTION
 388
  389 This module parses any type of fixed width table -- these types of tables are often output by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module uses Moose, so all attributes can be specified in the constructor.
 390
 391 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
 392
 393         SAMPLE FILE
 394         HEADER:  'COL1NAME       COL2NAME       COL3NAMEEEEE'
 395         DATA1:   'FOOBARBAZ      THIS IS TEXT   ANHER COL   '
 396         DATA2:   'FOOBAR FOOBAR  IS TEXT        ANOTHER COL '
 397
  398 After you have constructed the object, you can call C<-E<gt>parse>, which will return an ArrayRef
 399         $de->parse('FOOBARBAZ THIS IS TEXT    ANOTHER COL');
 400
 401 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
 402
 403 =head2 Constructor
 404
  405 The class constructor, C<-E<gt>new>, has numerous forms. Some options it has are:
 406
 407 =over 12
 408
  409 =item heuristic => \@lines
 410
  411 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heuristic is assumed to be the header_row. The unpack_string that results from the heuristic is applied to the header_row to determine the columns.
 412
 413 =item cols => \@cols
 414
 415 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
 416
 417 =item header_row => $string
 418
 419 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
 420
 421 =item verbose => 1|0
 422
  423 Right now, it simply displays warnings when it does something that might at first seem awkward, like returning undef when it encounters a duplicate copy of a header row.
 424
 425 =back
 426
 427 =head2 Methods
 428
  429 B<An asterisk (*) in the option means that is the default.>
 430
 431 =over 12
 432
 433 =item ->parse( $data_line )
 434
 435 Parses the data and returns an ArrayRef
 436
 437 =item ->parse_hash( $data_line )
 438
 439 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
 440
 441 =item ->first_col_zero(1*|0)
 442
 443 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
 444
 445         CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
 446         HEADER ROW  : | |S|T|O|C|K| |V|I|N
 447
 448 =item ->trim_whitespace(*1|0)
 449
 450 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
 451
 452 =item ->fix_overlay(1|0*)
 453
  454 Fixes columns that bleed into other columns: all non-whitespace characters preceding the first whitespace of the next column are moved over. This does not work with heuristic because the unpack string makes the assumption the data is not mangled.
 455
  456 So if ColumnA is 'foob' and ColumnB is 'ar Hello world'
 457
 458 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
 459
 460 =item ->null_as_undef(1|0*)
 461
 462 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
 463
 464 =item ->skip_header_data(1*|0)
 465
 466 Skips duplicate copies of the header_row if found in the data.
 467
 468 =item ->colchar_map
 469
 470 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
 471
 472 =item ->unpack_string
 473
 474 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
 475
 476 =back
 477
 478 =head1 AVAILABILITY
 479
 480 CPAN.org
 481
 482 Git repo at L<http://repo.or.cz/w/DataExtract-FixedWidth.git>
 483
 484 =head1 COPYRIGHT & LICENSE
 485
 486 Copyright 2008 Evan, all rights reserved.
 487
 488 This program is free software; you can redistribute it and/or modify it
 489 under the same terms as Perl itself.
 490
 491
 492 =head1 AUTHOR
 493
 494         Evan Carroll <me at evancarroll.com>
 495         System Lord of the Internets
 496
 497 =head1 BUGS
 498
  499 Please report any bugs or feature requests to C<bug-dataextract-fixedwidth at rt.cpan.org>, or through
 500 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>.  I will be notified, and then you'll
 501 automatically be notified of progress on your bug as I make changes.
 502
 503 =cut