lib/DataExtract/FixedWidth.pm

   1 package DataExtract::FixedWidth;
   2 use Moose;
   3 use Carp;
   4
   5 our $VERSION = '0.06';
   6
   7 sub BUILD {
   8         my $self = shift;
   9
  10         confess 'You must either send either a "header_row" or data for "heuristic"'
  11                 unless $self->has_header_row || $self->has_heuristic
  12         ;
  13         confess 'You must send a "header_row" if you send "cols"'
  14                 if $self->has_cols && !$self->has_header_row && !$self->has_heuristic
  15         ;
  16
  17 }
  18
  19 has 'unpack_string' => (
  20         isa          => 'Str'
  21         , is         => 'rw'
  22         , lazy_build => 1
  23 );
  24
  25 has 'cols' => (
  26         isa            => 'ArrayRef'
  27         , is           => 'rw'
  28         , auto_deref   => 1
  29         , lazy_build   => 1
  30 );
  31
  32 has 'colchar_map' => (
  33         isa          => 'HashRef'
  34         , is         => 'rw'
  35         , lazy_build => 1
  36 );
  37
  38 has 'header_row' => (
  39         isa          => 'Maybe[Str]'
  40         , is         => 'rw'
  41         , predicate  => 'has_header_row'
  42 );
  43
  44 has 'first_col_zero' => (
  45         isa       => 'Bool'
  46         , is      => 'ro'
  47         , default => 1
  48 );
  49
  50 has 'fix_overlay' => (
  51         isa       => 'Bool'
  52         , is      => 'ro'
  53         , default => 0
  54 );
  55
  56 has 'trim_whitespace' => (
  57         isa       => 'Bool'
  58         , is      => 'ro'
  59         , default => 1
  60 );
  61
  62 has 'sorted_colstart' => (
  63         isa          => 'ArrayRef'
  64         , is         => 'ro'
  65         , lazy_build => 1
  66         , auto_deref => 1
  67 );
  68
  69 has 'null_as_undef' => (
  70         isa       => 'Bool'
  71         , is      => 'ro'
  72         , default => 0
  73 );
  74
  75 has 'heuristic' => (
  76         isa          => 'ArrayRef'
  77         , is         => 'rw'
  78         , predicate  => 'has_heuristic'
  79         , auto_deref => 1
  80         , trigger    => \&_heuristic_trigger
  81 );
  82
  83 has 'skip_header_data' => (
  84         isa       => 'Bool'
  85         , is      => 'rw'
  86         , default => 1
  87 );
  88
  89 sub _heuristic_trigger {
  90         my ( $self, $data ) = @_;
  91
  92         chomp @$data;
  93
  94         $self->header_row( $data->[0] )
  95                 unless $self->has_header_row
  96         ;
  97
  98         {
  99                 my @unpack;
 100                 my $mask = ' ' x length $data->[0];
 101                 $mask |= $_ for @$data;
 102
 103                 push @unpack, length($1)
 104                         while $mask =~ m/(\S+\s+|$)/g
 105                 ;
 106
 107                 ## Remove last row, (to be replaced with A*)
 108                 pop @unpack;
 109
 110                 $self->unpack_string( $self->_helper_unpack( \@unpack ) );
 111         }
 112
 113 }
 114
 115 sub _build_cols {
 116         my $self = shift;
 117
 118         my @cols;
 119
 120         ## If we have the unpack string and the header_row parse it all out on our own
 121         ## Here we have two conditionals because the unpack_string comes into existance in
 122         ## build_unpack_string and not the heuristic_trigger
 123         if (
 124                 ( $self->has_header_row && $self->has_unpack_string )
 125                 || ( $self->has_header_row && $self->has_heuristic )
 126         ) {
 127                 my $skd = $self->skip_header_data;
 128                 $self->skip_header_data( 0 );
 129
 130                 @cols = @{ $self->parse( $self->header_row ) };
 131
 132                 $self->skip_header_data( $skd );
 133         }
 134
 135         ## We only the header_row
 136         elsif ( $self->header_row ) {
 137                 @cols = split ' ', $self->header_row;
 138         }
 139
 140         else {
 141                 croak 'Need some method to calculate cols';
 142         }
 143
 144         \@cols;
 145
 146 }
 147
 148 sub _build_colchar_map {
 149         my $self = shift;
 150         my $ccm = {};
 151
 152         ## If we can generate from heurisitic data and don't have a header_row
 153         if (
 154                 $self->has_header_row
 155                 && !defined $self->header_row
 156                 && $self->has_heuristic
 157                 && $self->has_cols
 158         ) {
 159                 my @cols = $self->cols;
 160                 foreach my $idx ( 0 .. $#cols ) {
 161                         $ccm->{$idx} = $cols[$idx];
 162                 }
 163         }
 164
 165         ## Generate from header_row
 166         else {
 167                 croak 'Can not render the map of columns to start-chars without the header_row'
 168                         unless defined $self->has_header_row
 169                 ;
 170
 171                 foreach my $col ( $self->cols ) {
 172
 173                         my $pos = 0;
 174                         $pos = index( $self->header_row, $col, $pos );
 175
 176                         croak "Failed to find a column '$col' in the header row"
 177                                 unless defined $pos
 178                         ;
 179
 180                         unless ( exists $ccm->{ $pos } ) {
 181                                 $ccm->{ $pos } = $col;
 182                         }
 183
 184                         ## We have two like-named columns
 185                         else {
 186
 187                                 ## possible inf loop here
 188                                 until ( not exists $ccm->{$pos} ) {
 189                                         $pos = index( $self->header_row, $col, $pos+1 );
 190
 191                                         croak "Failed to find another column '$col' in the header row"
 192                                                 unless defined $pos
 193                                         ;
 194
 195                                 }
 196
 197                                 $ccm->{ $pos } = $col;
 198
 199                         }
 200
 201                 }
 202
 203         }
 204
 205         $ccm;
 206
 207 }
 208
 209 sub _build_unpack_string {
 210         my $self = shift;
 211
 212         my @unpack;
 213         my @startcols = $self->sorted_colstart;
 214         $startcols[0] = 0 if $self->first_col_zero;
 215         foreach my $idx ( 0 .. $#startcols ) {
 216
 217                 if ( exists $startcols[$idx+1] ) {
 218                         push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
 219                 }
 220
 221         }
 222
 223         $self->_helper_unpack( \@unpack );
 224
 225 }
 226
 227 ## Takes ArrayRef of startcols and returns the unpack string.
 228 sub _helper_unpack {
 229         my ( $self, $startcols ) = @_;
 230
 231         my $format;
 232         if ( @$startcols ) {
 233                 $format = 'a' . join 'a', @$startcols;
 234         }
 235         $format .= 'A*';
 236
 237         $format;
 238
 239 }
 240
 241 sub parse {
 242         my ( $self, $data ) = @_;
 243
 244         return undef if !defined $data;
 245
 246         chomp $data;
 247
 248         ## skip_header_data
 249         return undef
 250                 if $self->skip_header_data
 251                 && ( defined $self->header_row && $data eq $self->header_row )
 252         ;
 253
 254         #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
 255
 256         my @cols = unpack ( $self->unpack_string, $data );
 257
 258         ## If we bleed over a bit we can fix that.
 259         if ( $self->fix_overlay ) {
 260                 foreach my $idx ( 0 .. $#cols ) {
 261                         if (
 262                                 $cols[$idx] =~ m/\S+$/
 263                                 && exists $cols[$idx+1]
 264                                 && $cols[$idx+1] =~ s/^(\S+)//
 265                         ) {
 266                                         $cols[$idx] .= $1;
 267                         }
 268                 }
 269         }
 270
 271         ## Get rid of whitespaces
 272         if ( $self->trim_whitespace ) {
 273                 for ( @cols ) { s/^\s+//; s/\s+$//; }
 274         }
 275
 276         ## Swithc nulls to undef
 277         if ( $self->null_as_undef ) {
 278                 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
 279                         unless $self->trim_whitespace
 280                 ;
 281                 for ( @cols ) { undef $_ unless length($_) }
 282         }
 283
 284         \@cols;
 285
 286 }
 287
 288 sub parse_hash {
 289         my ( $self, $data ) = @_;
 290
 291         my $row = $self->parse( $data );
 292
 293         my $colstarts = $self->sorted_colstart;
 294
 295         my $results;
 296         foreach my $idx ( 0 .. $#$row ) {
 297                 my $col = $self->colchar_map->{ $colstarts->[$idx] };
 298                 $results->{ $col } = $row->[$idx];
 299         }
 300
 301         $results;
 302
 303 }
 304
 305 sub _build_sorted_colstart {
 306         my $self = shift;
 307
 308         my @startcols = map { $_->[0] }
 309                 sort { $a->[1] <=> $b->[1] }
 310                 map { [$_, sprintf( "%10d", $_ ) ] }
 311                 keys %{ $self->colchar_map }
 312         ;
 313
 314         \@startcols;
 315
 316 }
 317
 318 no Moose;
 319
 320 1;
 321
 322 __END__
 323
 324 =head1 NAME
 325
 326 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
 327
 328 =head1 SYNOPSIS
 329
 330         ## We assume the columns have no spaces in the header.
 331         my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
 332
 333         ## We explicitly tell what column names to pick out of the header.
 334         my $de = DataExtract::FixedWidth->new({
 335                 header_row => $header_row
 336                 cols       => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
 337         });
 338
 339         ## We supply data to heuristic and assume
 340         ## * first row is the header (to avoid this assumption
 341         ##   set the header_row to undef. )
 342         ## * heurisitic's unpack_string is correct
 343         ## * unpack_string applied to header_row will tell us the columns
 344         my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
 345
 346         ## We supply data to heuristic, say we have no header, and the set columns
 347         ## just like the above except ->parse_hash will be be indexed by the
 348         ## provided columns and no row is designated as the header.
 349         my $de = DataExtract::FixedWidth->new({
 350                 heuristic    => \@datarows
 351                 , header_row => undef
 352                 , columns    => [qw/ foo bar baz/]
 353         });
 354
 355         ## We supply data to heuristic, and we explicitly add the header_row
 356         ## with this method it doesn't have to occur in the data.
 357         ## The unpack string rendered will be applied to the first row to get
 358         ## the columns
 359         my $de = DataExtract::FixedWidth->new({
 360                 heuristic    => \@datarows
 361                 , header_row => $header_row
 362         });
 363
 364         ## We explicitly add the header_row, with this method it doesn't have
 365         ## to occur in the data. The unpack string rendered will be applied
 366         ## to the provided header_row to get the columns
 367         my $de = DataExtract::FixedWidth->new({
 368                 unpack_string => $template
 369                 , header_row  => $header_row
 370         });
 371
 372         $de->parse( $data_row );
 373
 374         $de->parse_hash( $data_row );
 375
 376 =head1 DESCRIPTION
 377
 378 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
 379
 380 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if all of the information is not provided.
 381
 382         SAMPLE FILE
 383         HEADER:  'COL1NAME       COL2NAME       COL3NAMEEEEE'
 384         DATA1:   'FOOBARBAZ      THIS IS TEXT   ANHER COL   '
 385         DATA2:   'FOOBAR FOOBAR  IS TEXT        ANOTHER COL '
 386
 387 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
 388         $de->parse('FOOBARBAZ THIS IS TEXT    ANOTHER COL');
 389
 390 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide.
 391
 392 =head2 Constructor
 393
 394 The class constructor, C<-E<gt>new>, has numerious forms. Some options it has are:
 395
 396 =over 12
 397
 398 =item heuristics => \@lines
 399
 400 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
 401
 402 =item cols => \@cols
 403
 404 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
 405
 406 =item header_row => $string
 407
 408 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
 409
 410 =back
 411
 412 =head2 Methods
 413
 414 B<An astrisk, (*) in the option means that is the default.>
 415
 416 =over 12
 417
 418 =item ->parse( $data_line )
 419
 420 Parses the data and returns an ArrayRef
 421
 422 =item ->parse_hash( $data_line )
 423
 424 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
 425
 426 =item ->first_col_zero(1*|0)
 427
 428 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
 429
 430         CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
 431         HEADER ROW  : | |S|T|O|C|K| |V|I|N
 432
 433 =item ->trim_whitespace(*1|0)
 434
 435 Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs.
 436
 437 =item ->fix_overlay(1|0*)
 438
 439 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heurisitic because the unpack string makes the assumption the data is not mangeled.
 440
 441 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
 442
 443 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
 444
 445 =item ->null_as_undef(1|0*)
 446
 447 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>.
 448
 449 =item ->skip_header_data(1*|0)
 450
 451 Skips duplicate copies of the header_row if found in the data.
 452
 453 =item ->colchar_map
 454
 455 Returns a HashRef that displays the results of each column header and relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position.
 456
 457 =item ->unpack_string
 458
 459 Returns the C<CORE::unpack()> template string that will be used internally by C<-E<gt>parse($line)>
 460
 461 =back
 462
 463 =head1 AVAILABILITY
 464
 465 CPAN.org
 466
 467 =head1 COPYRIGHT & LICENSE
 468
 469 Copyright 2008 Evan, all rights reserved.
 470
 471 This program is free software; you can redistribute it and/or modify it
 472 under the same terms as Perl itself.
 473
 474
 475 =head1 AUTHOR
 476
 477         Evan Carroll <me at evancarroll.com>
 478         System Lord of the Internets
 479
 480 =head1 BUGS
 481
 482 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
 483 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>.  I will be notified, and then you'll
 484 automatically be notified of progress on your bug as I make changes.
 485
 486 =cut