lib/DataExtract/FixedWidth.pm

   1 package DataExtract::FixedWidth;
   2 use Moose;
   3 use Carp;
   4
   5 our $VERSION = '0.05';
   6
   7 sub BUILD {
   8         my $self = shift;
   9
  10         confess 'You must either send either a "header_row" or data for "heuristic"'
  11                 unless $self->has_header_row || $self->has_heuristic
  12         ;
  13         confess 'You must send a "header_row" if you send "cols"'
  14                 if $self->has_cols && !$self->has_header_row
  15         ;
  16
  17 }
  18
  19 has 'unpack_string' => (
  20         isa          => 'Str'
  21         , is         => 'rw'
  22         , lazy_build => 1
  23 );
  24
  25 has 'cols' => (
  26         isa            => 'ArrayRef'
  27         , is           => 'rw'
  28         , auto_deref   => 1
  29         , lazy_build   => 1
  30 );
  31
  32 has 'colchar_map' => (
  33         isa          => 'HashRef'
  34         , is         => 'rw'
  35         , lazy_build => 1
  36 );
  37
  38 has 'header_row' => (
  39         isa          => 'Maybe[Str]'
  40         , is         => 'rw'
  41         , lazy_build => 1
  42 );
  43
  44 has 'first_col_zero' => (
  45         isa       => 'Bool'
  46         , is      => 'ro'
  47         , default => 1
  48 );
  49
  50 has 'fix_overlay' => (
  51         isa       => 'Bool'
  52         , is      => 'ro'
  53         , default => 0
  54 );
  55
  56 has 'trim_whitespace' => (
  57         isa       => 'Bool'
  58         , is      => 'ro'
  59         , default => 1
  60 );
  61
  62 has 'sorted_colstart' => (
  63         isa          => 'ArrayRef'
  64         , is         => 'ro'
  65         , lazy_build => 1
  66         , auto_deref => 1
  67 );
  68
  69 has 'null_as_undef' => (
  70         isa       => 'Bool'
  71         , is      => 'ro'
  72         , default => 0
  73 );
  74
  75 has 'heuristic' => (
  76         isa          => 'ArrayRef'
  77         , is         => 'rw'
  78         , predicate  => 'has_heuristic'
  79         , auto_deref => 1
  80         , trigger    => sub { chomp @{$_[1]} }
  81 );
  82
  83 has 'skip_header_data' => (
  84         isa       => 'Bool'
  85         , is      => 'rw'
  86         , default => 1
  87 );
  88
  89 sub _build_header_row {
  90         my $self = shift;
  91
  92         $self->has_heuristic
  93                 ? return ${$self->heuristic}[0]
  94                 : undef
  95         ;
  96
  97 }
  98
  99 sub _build_cols {
 100         my $self = shift;
 101
 102         my @cols;
 103
 104         ## If we have the unpack string and the header_row parse it all out on our own
 105         if (
 106                 ( $self->header_row && $self->has_unpack_string )
 107                 || ( $self->header_row && $self->has_heuristic )
 108         ) {
 109                 my $skd = $self->skip_header_data;
 110                 $self->skip_header_data( 0 );
 111
 112                 @cols = @{ $self->parse( $self->header_row ) };
 113
 114                 $self->skip_header_data( $skd );
 115         }
 116
 117         ## We only the header_row
 118         elsif ( $self->header_row ) {
 119                 @cols = split ' ', $self->header_row;
 120         }
 121
 122         else {
 123                 croak 'Need some method to calculate cols';
 124         }
 125
 126         \@cols;
 127
 128 }
 129
 130 sub _build_colchar_map {
 131         my $self = shift;
 132
 133         croak 'Can not render the map of columns to start-chars without the header_row'
 134                 unless defined $self->header_row
 135         ;
 136
 137         my $ccm = {};
 138         foreach my $col ( $self->cols ) {
 139
 140                 my $pos = 0;
 141                 $pos = index( $self->header_row, $col, $pos );
 142
 143                 croak "Failed to find a column '$col' in the header row"
 144                         unless defined $pos
 145                 ;
 146
 147                 unless ( exists $ccm->{ $pos } ) {
 148                         $ccm->{ $pos } = $col;
 149                 }
 150
 151                 ## We have two like-named columns
 152                 else {
 153
 154                         until ( not exists $ccm->{$pos} ) {
 155                                 $pos = index( $self->header_row, $col, $pos+1 );
 156
 157                                 croak "Failed to find another column '$col' in the header row"
 158                                         unless defined $pos
 159                                 ;
 160
 161                         }
 162
 163                         $ccm->{ $pos } = $col;
 164
 165                 }
 166
 167         }
 168
 169         $ccm;
 170
 171 }
 172
 173 sub _build_unpack_string {
 174         my $self = shift;
 175
 176         my @unpack;
 177         if ( $self->has_heuristic ) {
 178                 my @lines = $self->heuristic;
 179
 180                 my $mask = ' ' x length $lines[ 0 ];
 181
 182                 $mask |= $_ for @lines;
 183
 184                 push @unpack, length($1)
 185                         while $mask =~ m/(\S+\s+|$)/g
 186                 ;
 187
 188                 ## Remove last row, (to be replaced with A*)
 189                 pop @unpack;
 190
 191         }
 192         else {
 193                 my @startcols = $self->sorted_colstart;
 194                 $startcols[0] = 0 if $self->first_col_zero;
 195                 foreach my $idx ( 0 .. $#startcols ) {
 196
 197                         if ( exists $startcols[$idx+1] ) {
 198                                 push @unpack, ( $startcols[$idx+1] - $startcols[$idx] );
 199                         }
 200
 201                 }
 202         }
 203
 204         my $unpack;
 205         if ( @unpack ) {
 206                 $unpack = 'a' . join 'a', @unpack;
 207         }
 208         $unpack .= 'A*';
 209
 210         $unpack;
 211
 212 }
 213
 214 sub parse {
 215         my ( $self, $data ) = @_;
 216
 217         return undef if !defined $data;
 218
 219         chomp $data;
 220
 221         ## skip_header_data
 222         return undef
 223                 if $self->skip_header_data
 224                 && ( defined $self->header_row && $data eq $self->header_row )
 225         ;
 226
 227         #printf "\nData:|%s|\tHeader:|%s|", $data, $self->header_row;
 228
 229         my @cols = unpack ( $self->unpack_string, $data );
 230
 231         ## If we bleed over a bit we can fix that.
 232         if ( $self->fix_overlay ) {
 233                 foreach my $idx ( 0 .. $#cols ) {
 234                         if (
 235                                 $cols[$idx] =~ m/\S+$/
 236                                 && exists $cols[$idx+1]
 237                                 && $cols[$idx+1] =~ s/^(\S+)//
 238                         ) {
 239                                         $cols[$idx] .= $1;
 240                         }
 241                 }
 242         }
 243
 244         ## Get rid of whitespaces
 245         if ( $self->trim_whitespace ) {
 246                 for ( @cols ) { s/^\s+//; s/\s+$//; }
 247         }
 248
 249         ## Swithc nulls to undef
 250         if ( $self->null_as_undef ) {
 251                 croak 'This ->null_as_undef option mandates ->trim_whitespace be true'
 252                         unless $self->trim_whitespace
 253                 ;
 254                 for ( @cols ) { undef $_ unless length($_) }
 255         }
 256
 257         \@cols;
 258
 259 }
 260
 261 sub parse_hash {
 262         my ( $self, $data ) = @_;
 263
 264         my $row = $self->parse( $data );
 265
 266         my $colstarts = $self->sorted_colstart;
 267
 268         my $results;
 269         foreach my $idx ( 0 .. $#$row ) {
 270                 my $col = $self->colchar_map->{ $colstarts->[$idx] };
 271                 $results->{ $col } = $row->[$idx];
 272         }
 273
 274         $results;
 275
 276 }
 277
 278 sub _build_sorted_colstart {
 279         my $self = shift;
 280
 281         my @startcols = map { $_->[0] }
 282                 sort { $a->[1] <=> $b->[1] }
 283                 map { [$_, sprintf( "%10d", $_ ) ] }
 284                 keys %{ $self->colchar_map }
 285         ;
 286
 287         \@startcols;
 288
 289 }
 290
 291 1;
 292
 293 __END__
 294
 295 =head1 NAME
 296
 297 DataExtract::FixedWidth - The one stop shop for parsing static column width text tables!
 298
 299 =head1 SYNOPSIS
 300
 301         ## We assume the columns have no spaces in the header.
 302         my $de = DataExtract::FixedWidth->new({ header_row => $header_row });
 303
 304         ## We explicitly tell what column names to pick out of the header.
 305         my $de = DataExtract::FixedWidth->new({
 306                 header_row => $header_row
 307                 cols       => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME']
 308         });
 309
 310         ## We supply data to heuristically determine header. Here we assume the first
 311         ## row is the header (if we need the first row to avoid this possible assumption set
 312         ## the header_row to undef. And the result of the heurisitic applied to the first row
 313         ## is the columns
 314         my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows });
 315
 316         $de->parse( $data_row );
 317
 318         $de->parse_hash( $data_row );
 319
 320 =head1 DESCRIPTION
 321
 322 This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor.
 323
 324
 325 In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck.
 326
 327         SAMPLE FILE
 328         HEADER:  'COL1NAME       COL2NAME       COL3NAMEEEEE'
 329         DATA1:   'FOOBARBAZ      THIS IS TEXT   ANHER COL   '
 330         DATA2:   'FOOBAR FOOBAR  IS TEXT        ANOTHER COL '
 331
 332 After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef
 333         $de->parse('FOOBARBAZ THIS IS TEXT    ANOTHER COL');
 334
 335 Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column header
 336
 337 =head2 Constructor
 338
 339 The class constructor -- C<-E<gt>new> -- provides numerious features. Some options it has are:
 340
 341 =over 12
 342
 343 =item heuristics => \@lines
 344
 345 This will deduce the unpack format string from data. If you opt to use this method, and need parse_hash, the first row of the heurisitic is assumed to be the header_row. The unpack_string that results for the heuristic is applied to the header_row to determine the columns.
 346
 347 =item cols => \@cols
 348
 349 This will permit you to explicitly list the columns in the header row. This is especially handy if you have spaces in the column header. This option will make the C<header_row> mandatory.
 350
 351 =item header_row => $string
 352
 353 If a C<cols> option is not provided the assumption is that there are no spaces in the column header. The module can take care of the rest. The only way this column can be avoided is if we deduce the header from heuristics, or if you explicitly supply the unpack string and only use C<-E<gt>parse($line)>. If you are not going to supply a header, and you do not want to waste the first line on a header assumption, set the C<header_row =E<gt> undef> in the constructor.
 354
 355 =back
 356
 357 =head2 Methods
 358
 359 B<An astrisk, (*) in the option means that is the default.>
 360
 361 =over 12
 362
 363 =item ->parse( $data_line )
 364
 365 Parses the data and returns an ArrayRef
 366
 367 =item ->parse_hash( $data_line )
 368
 369 Parses the data and returns a HashRef, indexed by the I<cols> (headers)
 370
 371 =item ->first_col_zero(1*|0)
 372
 373 This option forces the unpack string to make the first column assume the characters to the left of the header column. So, in the below example the first column also includes the first char of the row, even though the word stock begins at the second character.
 374
 375         CHAR NUMBERS: |1|2|3|4|5|6|7|8|9|10
 376         HEADER ROW  : | |S|T|O|C|K| |V|I|N
 377
 378 =item ->trim_whitespace(*1|0)
 379
 380 Trim the whitespace for the elements that ->parse() outputs
 381
 382 =item ->fix_overlay(1|0*)
 383
 384 Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column.
 385
 386 So if ColumnA as is 'foob' and ColumnB is 'ar Hello world'
 387
 388 * ColumnA becomes 'foobar', and ColumnB becomes 'Hello world'
 389
 390 =item ->null_as_undef(1|0*)
 391
 392 Simply undef all elements that return C<length(element) = 0>, requires C<-E<gt>trim_whitespace>
 393
 394 =item ->skip_header_data(1*|0)
 395
 396 Skips duplicate copies of the header_row if found in the data
 397
 398 =item ->colchar_map
 399
 400 Returns a hash ref that sisplays the results of each column header and the character position the column starts at.
 401
 402 =item ->unpack_string
 403
 404 Returns the CORE::unpack() template string that will be used internally by ->parse()
 405
 406 =back
 407
 408 =head1 AVAILABILITY
 409
 410 CPAN.org
 411
 412 =head1 COPYRIGHT & LICENSE
 413
 414 Copyright 2008 Evan, all rights reserved.
 415
 416 This program is free software; you can redistribute it and/or modify it
 417 under the same terms as Perl itself.
 418
 419
 420 =head1 AUTHOR
 421
 422         Evan Carroll <me at evancarroll.com>
 423         System Lord of the Internets
 424
 425 =head1 BUGS
 426
 427 Please report any bugs or feature requests to C<bug-dataexract-fixedwidth at rt.cpan.org>, or through
 428 the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataExtract-FixedWidth>.  I will be notified, and then you'll
 429 automatically be notified of progress on your bug as I make changes.
 430
 431 =cut