From 7d6f6080f300311295131115733c694ee14a5bda Mon Sep 17 00:00:00 2001 From: Evan Carroll Date: Thu, 22 May 2008 15:54:58 -0500 Subject: [PATCH] pod and new test new version 06 --- MANIFEST | 2 ++ META.yml | 2 +- lib/DataExtract/FixedWidth.pm | 48 +++++++++++++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/MANIFEST b/MANIFEST index 5f5a94f..e1d913d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -30,8 +30,10 @@ t/07-Heurisitic_ParseHash.t t/08-SingleColumn.t t/09-01-pslA.t t/09-02-pslA.t +t/10-Heuristic-woHeader-wCols.t t/boilerplate.t t/data/BrowserUK.txt +t/data/Dealermade.txt t/data/Fix-Overlay.txt t/data/Jhourcle.txt t/data/Nulls.txt diff --git a/META.yml b/META.yml index 7e7162f..dcf636b 100644 --- a/META.yml +++ b/META.yml @@ -18,4 +18,4 @@ no_index: requires: Moose: 0 perl: 5.10.0 -version: 0.05 +version: 0.06 diff --git a/lib/DataExtract/FixedWidth.pm b/lib/DataExtract/FixedWidth.pm index b7ee2ea..76a2da5 100644 --- a/lib/DataExtract/FixedWidth.pm +++ b/lib/DataExtract/FixedWidth.pm @@ -2,7 +2,7 @@ package DataExtract::FixedWidth; use Moose; use Carp; -our $VERSION = '0.05'; +our $VERSION = '0.06'; sub BUILD { my $self = shift; @@ -336,11 +336,38 @@ DataExtract::FixedWidth - The one stop shop for parsing static column width text cols => [qw/COL1NAME COL2NAME COL3NAME/, 'COL WITH SPACE IN NAME'] }); - ## We supply data to heuristically determine header. Here we assume the first - ## row is the header (if we need the first row to avoid this possible assumption set - ## the header_row to undef. And the result of the heurisitic applied to the first row - ## is the columns + ## We supply data to heuristic and assume + ## * first row is the header (to avoid this assumption + ## set the header_row to undef. 
) + ## * heuristic's unpack_string is correct + ## * unpack_string applied to header_row will tell us the columns my $de = DataExtract::FixedWidth->new({ heuristic => \@datarows }); + + ## We supply data to heuristic, say we have no header, and set the columns + ## just like the above except ->parse_hash will be indexed by the + ## provided columns and no row is designated as the header. + my $de = DataExtract::FixedWidth->new({ + heuristic => \@datarows + , header_row => undef + , columns => [qw/ foo bar baz/] + }); + + ## We supply data to heuristic, and we explicitly add the header_row + ## with this method it doesn't have to occur in the data. + ## The unpack string rendered will be applied to the first row to get + ## the columns + my $de = DataExtract::FixedWidth->new({ + heuristic => \@datarows + , header_row => $header_row + }); + + ## We explicitly add the header_row, with this method it doesn't have + ## to occur in the data. The unpack string rendered will be applied + ## to the provided header_row to get the columns + my $de = DataExtract::FixedWidth->new({ + unpack_string => $template + , header_row => $header_row + }); $de->parse( $data_row ); @@ -350,8 +377,7 @@ DataExtract::FixedWidth - The one stop shop for parsing static column width text This module parses any type of fixed width table -- these types of tables are often outputed by ghostscript, printf() displays with string padding (i.e. %-20s %20s etc), and most screen capture mechanisms. This module is using Moose all methods can be specified in the constructor. - -In the below example, this module can discern the column names from the header. Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. +In the below example, this module can discern the column names from the header. 
Or, you can supply them explicitly in the constructor; or, you can supply the rows in an ArrayRef to heuristic and pray for the best luck. This module is pretty abstracted and will deduce what it doesn't know in a decent fashion if not all of the information is provided. SAMPLE FILE HEADER: 'COL1NAME COL2NAME COL3NAMEEEEE' @@ -361,11 +387,11 @@ In the below example, this module can discern the column names from the header. After you have constructed, you can C<-E<gt>parse> which will return an ArrayRef $de->parse('FOOBARBAZ THIS IS TEXT ANOTHER COL'); -Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column header +Or, you can use C<-E<gt>parse_hash()> which returns a HashRef of the data indexed by the column headers. They can be determined in many ways with the data you provide. =head2 Constructor -The class constructor -- C<-E<gt>new> -- provides numerious features. Some options it has are: +The class constructor, C<-E<gt>new>, has numerous forms. Some options it has are: =over 12 @@ -410,7 +436,7 @@ Trim the whitespace for the elements that C<-E<gt>parse($line)> outputs. =item ->fix_overlay(1|0*) -Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. +Fixes columns that bleed into other columns, move over all non-whitespace characters preceding the first whitespace of the next column. This does not work with heuristic because the unpack string makes the assumption the data is not mangled. So if ColumnA as is 'foob' and ColumnB is 'ar Hello world' @@ -426,7 +452,7 @@ Skips duplicate copies of the header_row if found in the data. =item ->colchar_map -Returns a HashRef that displays the results of each column header and the character position the column starts at. This is not guarrentted to be true as internally numerious things can happen that cause this to be fudged. What it does always tell is the column names, and the order. 
+Returns a HashRef that displays the results of each column header and the relative character position the column starts at. In the case of heuristic this is a simple ordinal number. In the case of non-heuristic provided data it is currently a cardinal character position. =item ->unpack_string -- 2.11.4.GIT