lib/DataExtract/FixedWidth.pm

   1 package DataExtract::FixedWidth;
   2 use Moose;
   3 use Carp;
   4
   5 our $VERSION = '0.09';
   6
   7 sub BUILD {
   8         my $self = shift;
   9
  10         confess 'You must either send either a "header_row" or data for "heuristic"'
  11                 unless $self->has_header_row || $self->has_heuristic
  12         ;
  13         confess 'You must send a "header_row" if you send "cols"'
  14                 if $self->has_cols && !$self->has_header_row && !$self->has_heuristic
  15         ;
  16
  17 }
  18
  19 has 'unpack_string' => (
  20         isa          => 'Str'
  21         , is         => 'rw'
  22         , lazy_build => 1
  23 );
  24
  25 has 'cols' => (
  26         isa            => 'ArrayRef'
  27         , is           => 'rw'
  28         , auto_deref   => 1
  29         , lazy_build   => 1
  30 );
  31
  32 has 'colchar_map' => (
  33         isa          => 'HashRef'
  34         , is         => 'rw'
  35         , lazy_build => 1
  36 );
  37
  38 has 'header_row' => (
  39         isa          => 'Maybe[Str]'
  40         , is         => 'rw'
  41         , predicate  => 'has_header_row'
  42 );
  43
  44 has 'first_col_zero' => (
  45         isa       => 'Bool'
  46         , is      => 'ro'
  47         , default => 1
  48 );
  49
  50 has 'fix_overlay' => (
  51         isa       => 'Bool'
  52         , is      => 'ro'
  53         , default => 0
  54 );
  55
  56 has 'trim_whitespace' => (
  57         isa       => 'Bool'
  58         , is      => 'ro'
  59         , default => 1
  60 );
  61
  62 has 'sorted_colstart' => (
  63         isa          => 'ArrayRef'
  64         , is         => 'ro'
  65         , lazy_build => 1
  66         , auto_deref => 1
  67 );
  68
  69 has 'null_as_undef' => (
  70         isa       => 'Bool'
  71         , is      => 'ro'
  72         , default => 0
  73 );
  74
  75 has 'heuristic' => (
  76         isa          => 'ArrayRef'
  77         , is         => 'rw'
  78         , predicate  => 'has_heuristic'
  79         , auto_deref => 1
  80         , trigger    => \&_heuristic_trigger
  81 );
  82
  83 has 'skip_header_data' => (
  84         isa       => 'Bool'
  85         , is      => 'rw'
  86         , default => 1
  87 );
  88
  89 has 'verbose' => ( isa => 'Bool', 'is' => 'ro', default => 0 );
  90
  91 sub _heuristic_trigger {
  92         my ( $self, $data ) = @_;
  93
  94         chomp @$data;
  95
  96         my $maxLength = 0;
  97         for ( @$data ) {
  98                 $maxLength = length if length > $maxLength
  99         }
 100
 101         $self->header_row( $data->[0] )
 102                 unless $self->has_header_row
 103         ;
 104
 105         {
 106                 my @unpack;
 107                 my $mask = ' ' x $maxLength;
 108                 $mask |= $_ for @$data;
 109
 110                 ## The (?=\S) fixes a bug that creates null columns in the event any
 111                 ## one column has trailing whitespace (because you'll have '\S\s  '
 112                 ## this was a bug revealed in the dataset NullFirstRow.txt
 113                 #
 114                 ## the ^\s+ makes it so that right alligned tables
 115                 ## spaces on the left of the first non-whitespace character in
 116                 ## the first col work
 117                 push @unpack, length($1)
 118                         while $mask =~ m/((?:^\s+)?\S+\s+(?=\S))/g
 119                 ;
 120
 121                 $self->unpack_string( $self->_helper_unpack( \@unpack ) );
 122         }
 123
 124 }
 125
 126 sub _build_cols {
 127         my $self = shift;
 128
 129         my @cols;
 130
 131         ## If we have the unpack string and the header_row parse it all out on our own
 132         ## Here we have two conditionals because the unpack_string comes into existance in
 133         ## build_unpack_string and not the heuristic_trigger
 134         if (
 135                 ( $self->has_header_row && $self->has_unpack_string )
 136                 || ( $self->has_header_row && $self->has_heuristic )
 137         ) {
 138                 my $skd = $self->skip_header_data;
 139                 $self->skip_header_data( 0 );
 140
 141                 @cols = @{ $self->parse( $self->header_row ) };
 142
 143                 $self->skip_header_data( $skd );
 144         }
 145
 146         ## We only the header_row
 147         elsif ( $self->header_row ) {
 148                 @cols = split ' ', $self->header_row;
 149         }
 150
 151         else {
 152                 croak 'Need some method to calculate cols';
 153         }
 154
 155         \@cols;
 156
 157 }
 158
 159 sub _build_colchar_map {
 160         my $self = shift;
 161         my $ccm = {};
 162
 163         ## If we can generate from heurisitic data and don't have a header_row
 164         if (
 165                 $self->has_header_row
 166                 && !defined $self->header_row
 167                 && $self->has_heuristic
 168                 && $self->has_cols
 169         ) {
 170                 my @cols = $self->cols;
 171                 foreach my $idx ( 0 .. $#cols ) {
 172                         $ccm->{$idx} = $cols[$idx];
 173                 }
 174         }
 175
 176         ## Generate from header_row
 177         else {
 178                 croak 'Can not render the map of columns to start-chars without the header_row'
 179                         unless defined $self->has_header_row
 180                 ;
 181
 182                 foreach my $col ( $self->cols ) {
 183
 184                         my $pos = 0;
 185                         $pos = index( $self->header_row, $col, $pos );
 186
 187                         croak "Failed to find a column '$col' in the header row"
 188                                 unless defined $pos
 189                         ;
 190
 191                         unless ( exists $ccm->{ $pos } ) {
 192                                 $ccm->{ $pos } = $col;
 193                         }
 194
 195                         ## We have two like-named columns
 196                         else {
 197
 198                                 ## possible inf loop here
 199                                 until ( not exists $ccm->{$pos} ) {
 200                                         $pos = index( $self->header_row, $col, $pos+1 );
 201
 202                                         croak "Failed to find another column '$col' in the header row"
 203                                                 unless defined $pos
 204                                         ;
 205
 206                                 }
 207
 208                                 $ccm->{ $pos } = $col;
 209
 210                         }
 211
 212                 }
 213
 214         }
 215
 216         $ccm;
 217
 218 }
 219
 220 sub _build_unpack_string {
 221         my $self = shift;
 222
 223         my @unpack;
 224         my @startcols = $self->sorted_colstart;
 225         $startcols[0] = 0 if $self->first_col_zero;
 226         foreach my $idx ( 0 .. $#startcols ) {
 227
 228                 if ( exists $startcols[$idx+1] ) {
 229                         push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
 230                 }
 231
 232         }
 233
 234         $self->_helper_unpack( \@unpack );
 235
 236 }
 237
 238 ## Takes ArrayRef of startcols and returns the unpack string.
 239 sub _helper_unpack {
 240         my ( $self, $startcols ) = @_;
 241
 242         my $format;
 243         if ( @$startcols ) {
 244                 $format = 'a' . join 'a', @$startcols;
 245         }
 246         $format .= 'A*';
 247
 248         $format;
 249
 250 }
 251
 252 sub parse {
 253         my ( $self, $data ) = @_;
 254
 255         return undef if !defined $data;
 256
 257         chomp $data;
 258
 259         ## skip_header_data
 260         if (
 261                 $self->skip_header_data
 262                 && ( defined $self->header_row && $data eq $self->header_row )
 263         ) {
 264                 warn "Skipping duplicate header row\n" if $self->verbose;
 265                 return undef
 266         }
 267
 268         #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
 269
 270         my @cols = unpack ( $self->unpack_string, $data );
 271
 272         ## If we bleed over a bit we can fix that.
 273         if ( $self->fix_overlay ) {
 274                 foreach my $idx ( 0 .. $#cols ) {
 275                         if (
 276                                 $cols[$idx] =~ m/\S+$/
 277                                 && exists $cols[$idx+1]
 278                                 && $cols[$idx+1] =~ s/^(\S+)//
 279                         ) {
 280                                         $cols[$idx] .= $1;
 281                         }
 282                 }
 283         }
 284
 285         ## Get rid of whitespaces
 286         if ( $self->trim_whitespace ) {
 287                 for ( @cols ) { s/^\s+//; s/\s+$//; }
 288         }
 289
 290         ## Swithc nulls to undef
 291         if ( $self->null_as_undef ) {
 292                 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
 293                         unless $self->trim_whitespace
 294                 ;
 295                 for ( @cols ) { undef $_ unless length($_) }
 296         }
 297
 298         \@cols;
 299
 300 }
 301
 302 sub parse_hash {
 303         my ( $self, $data ) = @_;
 304
 305         my $row = $self->parse( $data );
 306
 307         my $colstarts = $self->sorted_colstart;
 308
 309         my $results;
 310         foreach my $idx ( 0 .. $#$row ) {
 311                 my $col = $self->colchar_map->{ $colstarts->[$idx] };
 312                 $results->{ $col } = $row->[$idx];
 313         }
 314
 315         $results;
 316
 317 }
 318
 319 sub _build_sorted_colstart {
 320         my $self = shift;
 321
 322         my @startcols = map { $_->[0] }
 323                 sort { $a->[1] <=> $b->[1] }
 324                 map { [$_, sprintf( "%10d", $_ ) ] }
 325                 keys %{ $self->colchar_map }
 326         ;
 327
 328         \@startcols;
 329
 330 }
 331
 332 no Moose;
 333 __PACKAGE__->meta->make_immutable;
 334
 335 1;
 336
 337 __END__
 338
 339 =head1 NAME
 340
 341 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
 342
 343 =head1 SYNOPSIS
 344
 345         ## We assume the columns have no spaces in the header.
 346         my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
 347
 348         ## We explicitly tell what column names to pick out of the header.
 349         my $de = DataExtract::FixedWidth->new({
 350                 header_row => $header_row
 351                 cols       => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
 352         });
 353
 354         ## We supply data to heuristic and assume
 355         ## * first row is the header (to avoid this assumption
 356         ##   set the header_row to undef. )
 357         ## * heurisitic's unpack_string is correct
 358         ## * unpack_string applied to header_row will tell us the columns
 359         my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
 360
 361         ## We supply data to heuristic, say we have no header, and the set columns
 362         ## just like the above except ->parse_hash will be be indexed by the
 363         ## provided columns and no row is designated as the header.
 364         my $de = DataExtract::FixedWidth->new({
 365                 heuristic    => \@datarows
 366                 , header_row => undef
 367                 , columns    => [qw/ foo bar baz/]
 368         });
 369
 370         ## We supply data to heuristic, and we explicitly add the header_row
 371         ## with this method it doesn't have to occur in the data.
 372         ## The unpack string rendered will be applied to the first row to get
 373         ## the columns
 374         my $de = DataExtract::FixedWidth->new({
 375                 heuristic    => \@datarows
 376                 , header_row => $header_row
 377         });
 378
 379         ## We explicitly add the header_row, with this method it doesn't have
 380         ## to occur in the data. The unpack string rendered will be applied
 381         ## to the provided header_row to get the columns
 382         my $de = DataExtract::FixedWidth->new({
 383                 unpack_string => $template
 384                 , header_row  => $header_row
 385         });
 386
 387         $de->parse( $data_row );
 388
 389         $de->parse_hash( $data_row );
 390
 391 =head1 DESCRIPTION
 392
 393 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
 394
 395 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
 396
 397         SAMPLE FILE
 398         HEADER:  'COL1NAME       COL2NAME       COL3NAMEEEEE'
 399         DATA1:   'FOOBARBAZ      THIS IS TEXT   ANHER COL   '
 400         DATA2:   'FOOBAR FOOBAR  IS TEXT        ANOTHER COL '
 401
 402 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
 403         $de->parse('FOOBARBAZ THIS IS TEXT    ANOTHER COL');
 404
 405 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
 406
 407 =head2 Constructor
 408
 409 The class constructor, C<-E<gt>new>, has numerious forms. Some options it has are:
 410
 411 =over 12
 412
 413 =item heuristics => \@lines
 414
 415 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
 416
 417 =item cols => \@cols
 418
 419 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
 420
 421 =item header_row => $string
 422
 423 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
 424
 425 =item verbose => 1|0
 426
 427 Right now, it simply display's warnings when it does something that might at first seem awkward. Like returning undef when it encouters a duplicate copy of a header row.
 428
 429 =back
 430
 431 =head2 Methods
 432
 433 B<An astrisk, (*) in the option means that is the default.>
 434
 435 =over 12
 436
 437 =item ->parse( $data_line )
 438
 439 Parses the data and returns an ArrayRef
 440
 441 =item ->parse_hash( $data_line )
 442
 443 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
 444
 445 =item ->first_col_zero(1*|0)
 446
 447 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
 448
 449         CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
 450         HEADER ROW  : | |S|T|O|C|K| |V|I|N
 451
 452 =item ->trim_whitespace(*1|0)
 453
 454 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
 455
 456 =item ->fix_overlay(1|0*)
 457
 458 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heurisitic because the unpack string makes the assumption the data is not mangeled.
 459
 460 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
 461
 462 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
 463
 464 =item ->null_as_undef(1|0*)
 465
 466 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
 467
 468 =item ->skip_header_data(1*|0)
 469
 470 Skips duplicate copies of the header_row if found in the data.
 471
 472 =item ->colchar_map
 473
 474 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
 475
 476 =item ->unpack_string
 477
 478 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
 479
 480 =back
 481
 482 =head1 AVAILABILITY
 483
 484 CPAN.org
 485
 486 Git repo at L<http://repo.or.cz/w/DataExtract-FixedWidth.git>
 487
 488 =head1 COPYRIGHT & LICENSE
 489
 490 Copyright 2008 Evan, all rights reserved.
 491
 492 This program is free software; you can redistribute it and/or modify it
 493 under the same terms as Perl itself.
 494
 495
 496 =head1 AUTHOR
 497
 498         Evan Carroll <me at evancarroll.com>
 499         System Lord of the Internets
 500
 501 =head1 BUGS
 502
 503 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
 504 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>.  I will be notified, and then you'll
 505 automatically be notified of progress on your bug as I make changes.
 506
 507 =cut