lib/DataExtract/FixedWidth.pm

   1 package DataExtract::FixedWidth;
   2 use Moose;
   3 use Carp;
   4
   5 our $VERSION = '0.07';
   6
   # Moose calls BUILD once the object has been constructed; it is used here to
   # reject constructor argument combinations the parser cannot work with.
   #
   # Dies (confess, so with a stack trace) when
   #   * none of "header_row", "heuristic" or "unpack_string" was supplied, or
   #   * "cols" was supplied with neither a "header_row" nor "heuristic" data.
   sub BUILD {
       my $self = shift;

       # A bare unpack_string is sufficient for ->parse(), which only needs the
       # template; it is therefore accepted alongside header_row and heuristic.
       confess 'You must send either a "header_row", data for "heuristic", or an "unpack_string"'
           unless $self->has_header_row
               || $self->has_heuristic
               || $self->has_unpack_string;

       # Column names alone carry no position information: they have to be
       # located in a header row (or be paired with heuristic data).
       confess 'You must send a "header_row" if you send "cols"'
           if $self->has_cols
               && !$self->has_header_row
               && !$self->has_heuristic;

       return;
   }
  18
  # unpack() template that ->parse applies to each line. Built on demand by
  # _build_unpack_string unless a value has been supplied or set beforehand.
  has 'unpack_string' => (
      is         => 'rw',
      isa        => 'Str',
      lazy_build => 1,
  );

  # Column names. Built on demand by _build_cols; auto_deref makes the
  # accessor return a list in list context.
  has 'cols' => (
      is         => 'rw',
      isa        => 'ArrayRef',
      auto_deref => 1,
      lazy_build => 1,
  );

  # HashRef mapping a column's start key to its name (see _build_colchar_map).
  has 'colchar_map' => (
      is         => 'rw',
      isa        => 'HashRef',
      lazy_build => 1,
  );

  # The header line. May be set to undef explicitly; has_header_row reports
  # whether any value (including undef) has been set.
  has 'header_row' => (
      is        => 'rw',
      isa       => 'Maybe[Str]',
      predicate => 'has_header_row',
  );

  # When true, _build_unpack_string treats the first column as starting at
  # character 0.
  has 'first_col_zero' => (
      is      => 'ro',
      isa     => 'Bool',
      default => 1,
  );

  # When true, ->parse moves leading non-whitespace of a field back onto the
  # preceding field if that field ends in non-whitespace.
  has 'fix_overlay' => (
      is      => 'ro',
      isa     => 'Bool',
      default => 0,
  );

  # When true, ->parse strips leading and trailing whitespace from each field.
  has 'trim_whitespace' => (
      is      => 'ro',
      isa     => 'Bool',
      default => 1,
  );

  # The keys of colchar_map in ascending numeric order
  # (see _build_sorted_colstart).
  has 'sorted_colstart' => (
      is         => 'ro',
      isa        => 'ArrayRef',
      lazy_build => 1,
      auto_deref => 1,
  );

  # When true, ->parse turns zero-length fields into undef.
  has 'null_as_undef' => (
      is      => 'ro',
      isa     => 'Bool',
      default => 0,
  );

  # Sample rows; setting them runs _heuristic_trigger.
  has 'heuristic' => (
      is         => 'rw',
      isa        => 'ArrayRef',
      predicate  => 'has_heuristic',
      auto_deref => 1,
      trigger    => \&_heuristic_trigger,
  );

  # When true, ->parse returns undef for a line equal to the header_row.
  has 'skip_header_data' => (
      is      => 'rw',
      isa     => 'Bool',
      default => 1,
  );
  88
  # Trigger for the "heuristic" attribute: derives the unpack template from
  # the sample rows and stores it in ->unpack_string.
  #
  #   $data - ArrayRef of sample lines; the lines are chomped in place.
  #
  # Unless a header_row has already been set (an explicit undef counts as set),
  # the first sample line becomes the header_row.
  sub _heuristic_trigger {
      my ( $self, $data ) = @_;

      chomp @$data;

      my $max_length = 0;
      for my $line ( @$data ) {
          my $line_length = length $line;
          $max_length = $line_length if $line_length > $max_length;
      }

      $self->header_row( $data->[0] )
          unless $self->has_header_row;

      # Bitwise-OR every row onto a line of spaces. A position is still a
      # space afterwards only if no row contributed anything but a space (or
      # nothing at all) there, i.e. it is a gap shared by all rows. Each row
      # is stringified so that "|" is the string operator even if a caller
      # handed in a numeric scalar.
      my $mask = ' ' x $max_length;
      $mask |= "$_" for @$data;

      # A column is a run of non-whitespace plus the gap that follows it. The
      # optional leading \s* can only match at the very start of the mask and
      # folds any indentation shared by all rows into the first column;
      # without it the first width would be measured from the first
      # non-blank character while unpack() counts from character 0, shifting
      # every column.
      my @widths;
      push @widths, length $1
          while $mask =~ m/(\s*\S+\s+|$)/g;

      # The last successful match is the empty end-of-string alternative, so
      # the final entry is dropped; _helper_unpack appends 'A*' to take
      # whatever remains of the line.
      pop @widths;

      $self->unpack_string( $self->_helper_unpack( \@widths ) );

      return;
  }
 119
 # Lazy builder for "cols": works out the column names from the header row.
 #
 # With a defined header_row and an unpack template available (supplied
 # directly or by heuristic data), the header is run through ->parse so the
 # names are cut at the same positions as the data. With only a header_row,
 # the header is split on whitespace.
 #
 # Returns an ArrayRef of names. Croaks when there is no usable header_row,
 # and re-throws anything ->parse dies with.
 sub _build_cols {
     my $self = shift;

     my $header = $self->header_row;
     my @cols;

     # "defined", not has_header_row: the predicate is also true for an
     # explicit header_row => undef, which ->parse answers with undef.
     if ( defined $header
         && ( $self->has_unpack_string || $self->has_heuristic ) )
     {
         # ->parse would otherwise skip the header line itself.
         my $skip_setting = $self->skip_header_data;
         $self->skip_header_data( 0 );

         my $fields;
         my $parsed_ok = eval { $fields = $self->parse( $header ); 1 };
         my $error     = $@;

         # Restore the caller's setting even when ->parse died.
         $self->skip_header_data( $skip_setting );

         die $error unless $parsed_ok;
         croak 'Need some method to calculate cols' unless ref $fields;

         @cols = @$fields;
     }

     ## We only have the header_row ("0" is a valid one-column header)
     elsif ( defined $header && length $header ) {
         @cols = split ' ', $header;
     }

     else {
         croak 'Need some method to calculate cols';
     }

     \@cols;
 }
 152
 # Lazy builder for "colchar_map": a HashRef of start key => column name.
 #
 # * header_row explicitly undef, with heuristic data and cols supplied:
 #   the keys are the ordinals 0 .. n-1 of the supplied cols.
 # * otherwise: the keys are the character offsets at which each column name
 #   is found in the header_row.
 #
 # Croaks when there is no defined header_row to search, or when a column
 # name (or a further occurrence of a repeated name) cannot be found in it.
 sub _build_colchar_map {
     my $self = shift;
     my %name_at;

     ## If we can generate from heuristic data and don't have a header_row
     if (   $self->has_header_row
         && !defined $self->header_row
         && $self->has_heuristic
         && $self->has_cols )
     {
         my @cols = $self->cols;
         @name_at{ 0 .. $#cols } = @cols;
         return \%name_at;
     }

     ## Generate from header_row
     my $header = $self->header_row;
     croak 'Can not render the map of columns to start-chars without the header_row'
         unless defined $header;

     # NOTE: index() matches substrings, so a name that also occurs inside an
     # earlier header word is located there.
     foreach my $col ( $self->cols ) {

         # index() reports "not found" as -1, never undef.
         my $pos = index $header, $col;
         croak "Failed to find a column '$col' in the header row"
             if $pos < 0;

         ## We have two like-named columns: take the next free occurrence
         while ( exists $name_at{$pos} ) {
             my $next = index $header, $col, $pos + 1;

             # $next <= $pos covers both "not found" (-1) and a search that
             # did not advance, so the loop always terminates.
             croak "Failed to find another column '$col' in the header row"
                 if $next <= $pos;

             $pos = $next;
         }

         $name_at{$pos} = $col;
     }

     \%name_at;
 }
 213
 # Lazy builder for "unpack_string": converts the sorted column start
 # positions into field widths and hands them to _helper_unpack.
 #
 # With first_col_zero set, the first start position is replaced by 0 so the
 # first field also takes any characters left of the first header word.
 #
 # NOTE(review): with first_col_zero off and a first column that starts past
 # character 0, the widths are still applied from character 0 -- confirm
 # whether a leading skip is intended.
 sub _build_unpack_string {
     my $self = shift;

     my @starts = $self->sorted_colstart;
     $starts[0] = 0 if $self->first_col_zero;

     # One width per adjacent pair of starts; the last column has no width
     # and is covered by the template's trailing 'A*'.
     my @widths = map { $starts[$_] - $starts[ $_ - 1 ] } 1 .. $#starts;

     $self->_helper_unpack( \@widths );
 }
 231
 ## Takes an ArrayRef of field widths and returns the unpack template: one
 ## "a<width>" per entry, followed by "A*" for the rest of the line.
 sub _helper_unpack {
     my ( $self, $widths ) = @_;

     return join '', ( map { "a$_" } @$widths ), 'A*';
 }
 245
 # Splits one line of data into its fields using ->unpack_string.
 #
 #   $data - the line to parse; a trailing newline is removed first.
 #
 # Returns an ArrayRef of fields, or undef when $data is undefined or when
 # skip_header_data is on and the line is the header row. "return undef" is
 # kept (rather than a bare return) so that list-context callers still get
 # exactly one value back.
 #
 # Croaks when null_as_undef is on while trim_whitespace is off.
 sub parse {
     my ( $self, $data ) = @_;

     return undef if !defined $data;

     chomp $data;

     ## skip_header_data
     if ( $self->skip_header_data ) {
         my $header = $self->header_row;
         if ( defined $header ) {
             # Compare like with like: the data line was chomped, so a header
             # that was handed over with its newline must be chomped as well.
             chomp $header;
             return undef if $data eq $header;
         }
     }

     my @cols = unpack $self->unpack_string, $data;

     ## If we bleed over a bit we can fix that: when a field ends in
     ## non-whitespace, the leading non-whitespace of the next field is
     ## taken to be the rest of the same value and moved back.
     if ( $self->fix_overlay ) {
         foreach my $idx ( 0 .. $#cols - 1 ) {
             next unless $cols[$idx] =~ m/\S+$/;
             $cols[$idx] .= $1
                 if $cols[ $idx + 1 ] =~ s/^(\S+)//;
         }
     }

     ## Get rid of whitespaces
     if ( $self->trim_whitespace ) {
         for my $field ( @cols ) {
             $field =~ s/^\s+//;
             $field =~ s/\s+$//;
         }
     }

     ## Switch nulls to undef
     if ( $self->null_as_undef ) {
         croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
             unless $self->trim_whitespace;

         for my $field ( @cols ) {
             undef $field unless length $field;
         }
     }

     \@cols;
 }
 292
 # Parses one line and returns its fields keyed by column name.
 #
 #   $data - the line to parse (handed to ->parse unchanged).
 #
 # Field N is named after the column whose start key is entry N of
 # ->sorted_colstart. Fields for which no column is known are dropped
 # instead of being stored under an undefined key.
 #
 # Returns a HashRef, or undef when ->parse returned undef (undefined input
 # or a skipped header line).
 sub parse_hash {
     my ( $self, $data ) = @_;

     my $row = $self->parse( $data );

     # Explicit guard instead of relying on $#$row autovivifying an array.
     return undef unless defined $row;

     my $colstarts = $self->sorted_colstart;
     my $name_at   = $self->colchar_map;

     my $last = $#$row < $#$colstarts ? $#$row : $#$colstarts;

     my %results;
     foreach my $idx ( 0 .. $last ) {
         my $col = $name_at->{ $colstarts->[$idx] };
         $results{$col} = $row->[$idx];
     }

     \%results;
 }
 309
 # Lazy builder for "sorted_colstart": the keys of ->colchar_map in ascending
 # numeric order, returned as an ArrayRef.
 sub _build_sorted_colstart {
     my $self = shift;

     my $map     = $self->colchar_map;
     my @ordered = sort { $a <=> $b } keys %$map;

     return \@ordered;
 }
 322
 323 no Moose;
 324
 325 1;
 326
 327 __END__
 328
 329 =head1 NAME
 330
 331 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
 332
 333 =head1 SYNOPSIS
 334
 335         ## We assume the columns have no spaces in the header.
 336         my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
 337
 338         ## We explicitly tell what column names to pick out of the header.
 339         my $de = DataExtract::FixedWidth->new({
 340                 header_row => $header_row
 341                 , cols     => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
 342         });
 343
 344         ## We supply data to heuristic and assume
 345         ## * first row is the header (to avoid this assumption
 346         ##   set the header_row to undef. )
 347         ## * heuristic's unpack_string is correct
 348         ## * unpack_string applied to header_row will tell us the columns
 349         my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
 350
 351         ## We supply data to heuristic, say we have no header, and set the columns
 352         ## just like the above, except ->parse_hash will be indexed by the
 353         ## provided columns and no row is designated as the header.
 354         my $de = DataExtract::FixedWidth->new({
 355                 heuristic    => \@datarows
 356                 , header_row => undef
 357                 , cols       => [qw/ foo bar baz/]
 358         });
 359
 360         ## We supply data to heuristic, and we explicitly add the header_row
 361         ## with this method it doesn't have to occur in the data.
 362         ## The unpack string rendered will be applied to the first row to get
 363         ## the columns
 364         my $de = DataExtract::FixedWidth->new({
 365                 heuristic    => \@datarows
 366                 , header_row => $header_row
 367         });
 368
 369         ## We explicitly add the header_row, with this method it doesn't have
 370         ## to occur in the data. The unpack string rendered will be applied
 371         ## to the provided header_row to get the columns
 372         my $de = DataExtract::FixedWidth->new({
 373                 unpack_string => $template
 374                 , header_row  => $header_row
 375         });
 376
 377         $de->parse( $data_row );
 378
 379         $de->parse_hash( $data_row );
 380
 381 =head1 DESCRIPTION
 382
 383 This module parses any type of fixed width table -- these types of tables are often output by ghostscript, printf() displays with string padding (e.g. %-20s %20s etc), and most screen capture mechanisms. This module uses Moose, so all attributes can be specified in the constructor.
 384
 385 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
 386
 387         SAMPLE FILE
 388         HEADER:  'COL1NAME       COL2NAME       COL3NAMEEEEE'
 389         DATA1:   'FOOBARBAZ      THIS IS TEXT   ANHER COL   '
 390         DATA2:   'FOOBAR FOOBAR  IS TEXT        ANOTHER COL '
 391
 392 After you have constructed the object, you can call C<-E<gt>parse>, which will return an ArrayRef:
 393         $de->parse('FOOBARBAZ THIS IS TEXT    ANOTHER COL');
 394
 395 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
 396
 397 =head2 Constructor
 398
 399 The class constructor, C<-E<gt>new>, has numerous forms. Some options it has are:
 400
 401 =over 12
 402
 403 =item heuristic => \@lines
 404
 405 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heuristic data is assumed to be the header_row. The unpack_string that results from the heuristic is applied to the header_row to determine the columns.
 406
 407 =item cols => \@cols
 408
 409 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
 410
 411 =item header_row => $string
 412
 413 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
 414
 415 =back
 416
 417 =head2 Methods
 418
 419 B<An asterisk (*) in the option means that it is the default.>
 420
 421 =over 12
 422
 423 =item ->parse( $data_line )
 424
 425 Parses the data and returns an ArrayRef
 426
 427 =item ->parse_hash( $data_line )
 428
 429 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
 430
 431 =item ->first_col_zero(1*|0)
 432
 433 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
 434
 435         CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
 436         HEADER ROW  : | |S|T|O|C|K| |V|I|N
 437
 438 =item ->trim_whitespace(1*|0)
 439
 440 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
 441
 442 =item ->fix_overlay(1|0*)
 443
 444 Fixes columns that bleed into other columns by moving over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heuristic because the unpack string makes the assumption the data is not mangled.
 445
 446 So if ColumnA is 'foob' and ColumnB is 'ar Hello world'
 447
 448 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
 449
 450 =item ->null_as_undef(1|0*)
 451
 452 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
 453
 454 =item ->skip_header_data(1*|0)
 455
 456 Skips duplicate copies of the header_row if found in the data.
 457
 458 =item ->colchar_map
 459
 460 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
 461
 462 =item ->unpack_string
 463
 464 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
 465
 466 =back
 467
 468 =head1 AVAILABILITY
 469
 470 CPAN.org
 471
 472 =head1 COPYRIGHT & LICENSE
 473
 474 Copyright 2008 Evan, all rights reserved.
 475
 476 This program is free software; you can redistribute it and/or modify it
 477 under the same terms as Perl itself.
 478
 479
 480 =head1 AUTHOR
 481
 482         Evan Carroll <me at evancarroll.com>
 483         System Lord of the Internets
 484
 485 =head1 BUGS
 486
 487 Please report any bugs or feature requests to C<bug-dataextract-fixedwidth at rt.cpan.org>, or through
 488 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>.  I will be notified, and then you'll
 489 automatically be notified of progress on your bug as I make changes.
 490
 491 =cut