3 # BioPerl module for Bio::SearchIO::fasta
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::SearchIO::fasta - A SearchIO parser for FASTA results
21 # Do not use this object directly, use it through the SearchIO system
23 my $searchio = Bio::SearchIO->new(-format => 'fasta',
24 -file => 'report.FASTA');
25 while( my $result = $searchio->next_result ) {
26 # ... do what you would normally doi with Bio::SearchIO.
31 This object contains the event based parsing code for FASTA format
32 reports. It creates L<Bio::Search::HSP::FastaHSP> objects instead of
33 L<Bio::Search::HSP::GenericHSP> for the HSP objects.
35 This module will parse -m 9 -d 0 output as well as default m 1 output
36 from FASTA as well as SSEARCH.
38 Also see the SearchIO HOWTO:
39 L<http://bioperl.open-bio.org/wiki/HOWTO:SearchIO>.
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to
47 the Bioperl mailing list. Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 Please direct usage questions or support issues to the mailing list:
56 I<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
65 Report bugs to the Bioperl bug tracking system to help us keep track
66 of the bugs and their resolution. Bug reports can be submitted via the
69 http://bugzilla.open-bio.org/
71 =head1 AUTHOR - Jason Stajich, Aaron Mackey
73 Email jason-at-bioperl.org
77 The rest of the documentation details each of the object methods.
78 Internal methods are usually preceded with a _
82 # Let the code begin...
84 package Bio
::SearchIO
::fasta
;
85 use vars
qw(%MODEMAP %MAPPING $IDLENGTH);
88 # Object preamble - inherits from Bio::Root::RootI
90 use Bio::Factory::ObjectFactory;
94 # Set IDLENGTH to a new value if you have
95 # compile FASTA with a different ID length
96 # (actually newest FASTA allows the setting of this
97 # via -C parameter, default is 6)
100 # mapping of NCBI Blast terms to Bioperl hash keys
102 'FastaOutput' => 'result',
107 # This should really be done more intelligently, like with
111 'Hsp_bit-score' => 'HSP-bits',
112 'Hsp_score' => 'HSP-score',
113 'Hsp_sw-score' => 'HSP-swscore',
114 'Hsp_evalue' => 'HSP-evalue',
115 'Hsp_query-from' => 'HSP-query_start',
116 'Hsp_query-to' => 'HSP-query_end',
117 'Hsp_hit-from' => 'HSP-hit_start',
118 'Hsp_hit-to' => 'HSP-hit_end',
119 'Hsp_positive' => 'HSP-conserved',
120 'Hsp_identity' => 'HSP-identical',
121 'Hsp_gaps' => 'HSP-hsp_gaps',
122 'Hsp_hitgaps' => 'HSP-hit_gaps',
123 'Hsp_querygaps' => 'HSP-query_gaps',
124 'Hsp_qseq' => 'HSP-query_seq',
125 'Hsp_hseq' => 'HSP-hit_seq',
126 'Hsp_midline' => 'HSP-homology_seq',
127 'Hsp_align-len' => 'HSP-hsp_length',
128 'Hsp_query-frame' => 'HSP-query_frame',
129 'Hsp_hit-frame' => 'HSP-hit_frame',
131 'Hit_id' => 'HIT-name',
132 'Hit_len' => 'HIT-length',
133 'Hit_accession' => 'HIT-accession',
134 'Hit_def' => 'HIT-description',
135 'Hit_signif' => 'HIT-significance',
136 'Hit_score' => 'HIT-score',
138 'FastaOutput_program' => 'RESULT-algorithm_name',
139 'FastaOutput_version' => 'RESULT-algorithm_version',
140 'FastaOutput_query-def' => 'RESULT-query_name',
141 'FastaOutput_querydesc' => 'RESULT-query_description',
142 'FastaOutput_query-len' => 'RESULT-query_length',
143 'FastaOutput_db' => 'RESULT-database_name',
144 'FastaOutput_db-len' => 'RESULT-database_entries',
145 'FastaOutput_db-let' => 'RESULT-database_letters',
147 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix' },
148 'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
149 'Parameters_include' => { 'RESULT-parameters' => 'include' },
150 'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
151 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
152 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
153 'Parameters_gap-ext' => { 'RESULT-parameters' => 'gapext' },
154 'Parameters_word-size' => { 'RESULT-parameters' => 'wordsize' },
155 'Parameters_ktup' => { 'RESULT-parameters' => 'ktup' },
156 'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
157 'Statistics_db-num' => { 'RESULT-statistics' => 'dbentries' },
158 'Statistics_db-len' => { 'RESULT-statistics' => 'dbletters' },
159 'Statistics_hsp-len' => { 'RESULT-statistics' => 'hsplength' },
160 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace' },
161 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' },
162 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' },
163 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy' },
167 use base qw(Bio::SearchIO);
172 Usage : my $obj = Bio::SearchIO::fasta->new();
173 Function: Builds a new Bio::SearchIO::fasta object
174 Returns : Bio::SearchIO::fasta
175 Args : -idlength - set ID length to something other
176 than the default (7), this is only
177 necessary if you have compiled FASTA
178 with a new default id length to display
179 in the HSP alignment blocks
184 my ( $self, @args ) = @_;
185 $self->SUPER::_initialize
(@args);
187 my ($idlength) = $self->_rearrange( [qw(IDLENGTH)], @args );
188 $self->idlength( $idlength || $IDLENGTH );
189 $self->_eventHandler->register_factory(
191 Bio
::Factory
::ObjectFactory
->new(
192 -type
=> 'Bio::Search::HSP::FastaHSP',
193 -interface
=> 'Bio::Search::HSP::HSPI'
202 Usage : my $hit = $searchio->next_result;
203 Function: Returns the next Result from a search
204 Returns : Bio::Search::Result::ResultI object
217 $self->start_document();
219 while ( defined( $_ = $self->_readline ) ) {
220 next if ( !$self->in_element('hsp')
221 && /^\s+$/ ); # skip empty lines
223 m/(\S+)\s+searches\s+a\s+(protein\s+or\s+DNA\s+)?sequence/oxi
224 || /(\S+)\s+compares\s+a/
226 && ( $_ = $self->_readline )
227 && /(\S+)\s+searches\s+a\s+(protein\s+or\s+DNA\s+)?sequence/oxi
228 || /(\S+)\s+compares\s+a/ )
232 $self->_pushback($_);
233 $self->end_element( { 'Name' => 'FastaOutput' } );
234 return $self->end_document();
236 $self->{'_reporttype'} = $1;
237 $self->start_element( { 'Name' => 'FastaOutput' } );
238 $self->{'_result_count'}++;
240 #$self->debug( "reporttype is " . $self->{'_reporttype'} . "\n" );
243 'Name' => 'FastaOutput_program',
244 'Data' => $self->{'_reporttype'}
247 $_ = $self->_readline();
248 my ($version) = (/version\s+(\S+)/);
249 $version = '' unless defined $version;
250 $self->{'_version'} = $version;
253 'Name' => 'FastaOutput_version',
258 my ( $last, $leadin, $type, $querylen, $querytype, $querydef );
260 while ( defined( $_ = $self->_readline() ) ) {
263 (?
:\s
+>) | # fa33 lead-in
264 (?
:\s
*\d
+\s
*>>>) # fa34 mlib lead-in
270 ( $leadin, $querydef ) = ( $1, $2 );
271 if ( $leadin =~ m/>>>/ ) {
273 /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt).*$/o )
275 ( $querydef, $querylen, $querytype ) =
281 if ( $last =~ /(\S+)[:,]\s*(\d+)\s+(aa|nt)/ ) {
282 ( $querylen, $querytype ) = ( $2, $3 );
288 elsif (m/^\s*vs\s+\S+/o) {
289 if ( $last =~ /(\S+)[,:]\s+(\d+)\s+(aa|nt)/o ) {
290 ( $querydef, $querylen, $querytype ) = ( $1, $2, $3 );
296 if ( $self->{'_reporttype'}
297 && $self->{'_reporttype'} eq 'FASTA' )
299 if ( $querytype eq 'nt' ) {
300 $self->{'_reporttype'} = 'FASTN';
302 elsif ( $querytype eq 'aa' ) {
303 $self->{'_reporttype'} = 'FASTP';
306 my ( $name, $descr ) = $querydef =~ m/^(\S+)\s*(.*?)\s*$/o;
309 'Name' => 'FastaOutput_query-def',
315 'Name' => 'FastaOutput_querydesc',
322 'Name' => 'FastaOutput_query-len',
328 $self->warn("unable to find and set query length");
331 $last =~ /^\s*vs\s+(\S+)/
332 || ( $last =~ /^searching\s+(\S+)\s+library/ )
333 || ( $last =~ /^Library:\s+(\S+)\s+/ )
336 && ( /^\s*vs\s+(\S+)/
337 || /^Library:\s+(\S+)\s+/ )
339 || ( defined( $_ = $self->_readline() )
340 && ( /^\s*vs\s+(\S+)/ || /^Library:\s+(\S+)/ ) )
345 'Name' => 'FastaOutput_db',
350 elsif (m/^\s+opt(?:\s+E\(\))?$/o) {
352 # histogram ... read over it more rapidly than the larger outer loop:
353 while ( defined( $_ = $self->_readline ) ) {
358 elsif (/(\d+)\s+residues\s+in\s+(\d+)\s+(?:library\s+)?sequences/) {
361 'Name' => 'FastaOutput_db-let',
367 'Name' => 'FastaOutput_db-len',
373 'Name' => 'Statistics_db-len',
379 'Name' => 'Statistics_db-num',
384 elsif (/Lambda=\s*(\S+)/) {
387 'Name' => 'Statistics_lambda',
392 elsif (/K=\s*(\S+)/) {
395 'Name' => 'Statistics_kappa',
400 elsif (/^\s*(Smith-Waterman).+(\S+)\s*matrix [^\]]*?(xS)?\]/) {
403 'Name' => 'Parameters_matrix',
409 'Name' => 'Parameters_filter',
410 'Data' => defined $3 ?
1 : 0,
413 $self->{'_reporttype'} = $1;
417 'Name' => 'FastaOutput_program',
418 'Data' => $self->{'_reporttype'}
422 elsif (/The best( related| unrelated)? scores are:/) {
426 if ( $_ =~ m/^E\((\d+)\)$/o )
429 { 'Name' => 'Statistics_eff-space', 'Data' => $1 } );
435 } @labels[ $rel ?
5 : 4 .. $#labels ];
437 while ( defined( $_ = $self->_readline() )
442 if ( $line[-1] =~ m/\=/o && $labels[-1] eq 'fs' ) {
444 # unlabelled alignment hit;
445 push @labels, "aln_code";
449 @data{@labels} = splice( @line, @line - @labels );
450 if ( $line[-1] =~ m/\[([1-6rf])\]/o ) {
454 ?
( $fr <= 3 ?
"+$fr" : "-@{[$fr-3]}" )
455 : ( $fr eq 'f' ?
'+1' : '-1' )
463 if ( $line[-1] =~ m/^\(?(\d+)\)$/ ) {
466 if ( $line[-1] =~ m/^\($/ ) {
474 # rebuild the first part of the line, preserving spaces:
475 ($_) = m/^(\S+(?:\s+\S+){$#line})/;
477 my ( $id, $desc ) = split( /\s+/, $_, 2 );
478 my @pieces = split( /\|/, $id );
479 my $acc = pop @pieces;
482 @data{qw(id desc acc)} = ( $id, $desc, $acc );
484 push @hit_signifs, \
%data;
488 /^\s*([T]?FAST[XYAF]).+,\s*(\S+)\s*matrix[^\]]+?(xS)?\]\s*ktup:\s*(\d+)/
494 'Name' => 'Parameters_matrix',
500 'Name' => 'Parameters_filter',
501 'Data' => defined $3 ?
1 : 0,
506 'Name' => 'Parameters_ktup',
510 $self->{'_reporttype'} = $1
511 if ( $self->{'_reporttype'} !~ /FAST[PN]/i );
515 'Name' => 'FastaOutput_program',
516 'Data' => $self->{'_reporttype'}
520 elsif (/^Algorithm:\s+(\S+)\s+\(([^)]+)\)\s+(\S+)/) {
521 $self->{'_reporttype'} = $1
522 if ( $self->{'_reporttype'} !~ /FAST[PN]/i );
525 /^Parameters:\s+(\S+)\s*matrix\s*(?:\(([^(]+?)\))?\s*ktup:\s*(\d+)/)
529 'Name' => 'Parameters_matrix',
535 'Name' => 'Parameters_filter',
536 'Data' => defined $2 ?
$2 : 0,
541 'Name' => 'Parameters_ktup',
547 /(?:gap\-pen|open\/ext
):\s
+([\
-\
+]?\d
+)\s
*\
/\s*([\-\+]?\d+).+width:\s+(\d+)/
552 'Name' => 'Parameters_gap-open',
558 'Name' => 'Parameters_gap-ext',
564 'Name' => 'Parameters_word-size',
569 elsif (/^>>(?!>)(.+?)\s+(?:\((\d+)\s*(aa|nt)\))?$/) {
570 my ($hit_id, $len, $alphabet) = ($1, $2, $3);
571 if (!$len || !$alphabet) {
573 while (defined($_ = $self->_readline)) {
574 if (/(.*?)\s+\((\d+)\s*(aa|nt)\)/) {
575 ($len, $alphabet) = ($2, $3);
576 $hit_id .= $1 ?
" ".$1 : '';
579 if (/^>>(?!>)/) { # too far, throw
580 $self->throw("Couldn't find length, bailing");
584 if ( $self->in_element('hsp') ) {
585 $self->end_element( { 'Name' => 'Hsp' } );
587 if ( $self->in_element('hit') ) {
588 $self->end_element( { 'Name' => 'Hit' } );
591 $self->start_element( { 'Name' => 'Hit' } );
598 my ( $id, $desc ) = split( /\s+/, $hit_id, 2 );
606 #$self->debug("Hit ID is $id\n");
607 my @pieces = split( /\|/, $id );
608 my $acc = pop @pieces;
612 'Name' => 'Hit_accession',
623 $_ = $self->_readline();
624 my ( $score, $bits, $e ) = /Z
-score
: \s
* (\S
+) \s
*
625 (?
: bits
: \s
* (\S
+) \s
+ )?
626 (?
: E
|expect
) \s
* \
(\
) :? \s
*(\S
+)/ox
;
627 $bits = $score unless defined $bits;
629 my $v = shift @hit_signifs;
631 @
{$v}{qw(evalue bits z-sc)} = ( $e, $bits, $score );
635 'Name' => 'Hit_signif',
636 'Data' => $v ?
$v->{evalue
} : $e
641 'Name' => 'Hit_score',
642 'Data' => $v ?
$v->{bits
} : $bits
645 $self->start_element( { 'Name' => 'Hsp' } );
649 'Name' => 'Hsp_score',
650 'Data' => $v ?
$v->{'z-sc'} : $score
655 'Name' => 'Hsp_evalue',
656 'Data' => $v ?
$v->{evalue
} : $e
661 'Name' => 'Hsp_bit-score',
662 'Data' => $v ?
$v->{bits
} : $bits
665 $_ = $self->_readline();
667 if (s/Smith-Waterman score:\s*(\d+)\;?//) {
670 'Name' => 'Hsp_sw-score',
676 / (\d
*\
.?\d
+)\
% \s
* identity
677 (?
:\s
* \
(\s
*(\S
+)\
% \s
* (?
:ungapped
|similar
) \
) )?
678 \s
* in \s
* (\d
+) \s
+ (?
:aa
|nt
) \s
+ overlap \s
*
679 \
( (\d
+) \
- (\d
+) : (\d
+) \
- (\d
+) \
)
683 my ( $identper, $gapper, $len, $querystart, $queryend,
685 = ( $1, $2, $3, $4, $5, $6, $7 );
686 my $ident = sprintf( "%.0f", ( $identper / 100 ) * $len );
687 my $positive = sprintf( "%.0f", ( $gapper / 100 ) * $len );
691 'Name' => 'Hsp_identity',
697 'Name' => 'Hsp_positive',
703 'Name' => 'Hsp_align-len',
710 'Name' => 'Hsp_query-from',
711 'Data' => $querystart
716 'Name' => 'Hsp_query-to',
722 'Name' => 'Hsp_hit-from',
728 'Name' => 'Hsp_hit-to',
737 { 'Name' => 'Hsp_querygaps', 'Data' => $v->{qgaps
} } )
738 if exists $v->{qgaps
};
740 { 'Name' => 'Hsp_hitgaps', 'Data' => $v->{lgaps
} } )
741 if exists $v->{lgaps
};
743 if ( $self->{'_reporttype'} =~ m/^FAST[NXY]$/o ) {
744 if ( 8 == scalar grep { exists $v->{$_} }
745 qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) )
747 if ( $v->{ax0
} < $v->{an0
} ) {
750 'Name' => 'Hsp_query-frame',
752 "-@{[(($v->{px0} - $v->{ax0}) % 3) + 1]}"
759 'Name' => 'Hsp_query-frame',
761 "+@{[(($v->{an0} - $v->{pn0}) % 3) + 1]}"
765 if ( $v->{ax1
} < $v->{an1
} ) {
768 'Name' => 'Hsp_hit-frame',
770 "-@{[(($v->{px1} - $v->{ax1}) % 3) + 1]}"
777 'Name' => 'Hsp_hit-frame',
779 "+@{[(($v->{an1} - $v->{pn1}) % 3) + 1]}"
787 'Name' => 'Hsp_query-frame',
788 'Data' => $v->{lframe
}
792 { 'Name' => 'Hsp_hit-frame', 'Data' => 0 } );
797 { 'Name' => 'Hsp_query-frame', 'Data' => 0 } );
799 { 'Name' => 'Hsp_hit-frame', 'Data' => $v->{lframe
} } );
804 $self->warn("unable to parse FASTA score line: $_");
807 elsif (/\d+\s*residues\s*in\s*\d+\s*query\s*sequences/) {
808 if ( $self->in_element('hsp') ) {
809 $self->end_element( { 'Name' => 'Hsp' } );
811 if ( $self->in_element('hit') ) {
812 $self->end_element( { 'Name' => 'Hit' } );
815 # $_ = $self->_readline();
816 # my ( $liblen,$libsize) = /(\d+)\s+residues\s*in(\d+)\s*library/;
817 # fast forward to the end of the file as there is
818 # nothing else left to do with this file and want to be sure and
820 while ( defined( $_ = $self->_readline() ) ) {
821 last if (/^Function used was/);
823 /(\S
+)\s
+searches\s
+a\s
+(protein\s
+or\s
+DNA\s
+)?
824 sequence
/oxi || /(\S
+)\s
+compares\s
+a
/oi
827 $self->_pushback($_);
833 # process remaining best hits
834 for my $h (@hit_signifs) {
836 # Hsp_score Hsp_evalue Hsp_bit-score
837 # Hsp_sw-score Hsp_gaps Hsp_identity Hsp_positive
838 # Hsp_align-len Hsp_query-from Hsp_query-to
839 # Hsp_hit-from Hsp_hit-to Hsp_qseq Hsp_midline
841 $self->start_element( { 'Name' => 'Hit' } );
845 'Data' => $h->{hit_len
}
847 ) if exists $h->{hit_len
};
853 ) if exists $h->{id
};
856 'Name' => 'Hit_accession',
859 ) if exists $h->{acc
};
865 ) if exists $h->{desc
};
868 'Name' => 'Hit_signif',
869 'Data' => $h->{evalue
}
871 ) if exists $h->{evalue
};
874 'Name' => 'Hit_score',
877 ) if exists $h->{bits
};
879 $self->start_element( { 'Name' => 'Hsp' } );
881 { 'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} } )
882 if exists $h->{'z-sc'};
884 { 'Name' => 'Hsp_evalue', 'Data' => $h->{evalue
} } )
885 if exists $h->{evalue
};
887 { 'Name' => 'Hsp_bit-score', 'Data' => $h->{bits
} } )
888 if exists $h->{bits
};
890 { 'Name' => 'Hsp_sw-score', 'Data' => $h->{sw
} } )
893 { 'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} } )
894 if exists $h->{'%_gid'};
897 'Name' => 'Hsp_identity',
899 sprintf( "%.0f", $h->{'%_id'} * $h->{alen
} )
901 ) if ( exists $h->{'%_id'} && exists $h->{alen
} );
903 if ( exists $h->{'%_gid'} ) {
906 'Name' => 'Hsp_positive',
908 sprintf( "%.0f", $h->{'%_gid'} * $h->{alen
} )
910 ) if exists $h->{'%_gid'} && exists $h->{alen
};
915 'Name' => 'Hsp_positive',
917 sprintf( "%.0f", $h->{'%_id'} * $h->{alen
} )
919 ) if ( exists $h->{'%_id'} && exists $h->{alen
} );
922 { 'Name' => 'Hsp_align-len', 'Data' => $h->{alen
} } )
923 if exists $h->{alen
};
925 { 'Name' => 'Hsp_query-from', 'Data' => $h->{an0
} } )
928 { 'Name' => 'Hsp_query-to', 'Data' => $h->{ax0
} } )
931 { 'Name' => 'Hsp_hit-from', 'Data' => $h->{an1
} } )
934 { 'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1
} } )
938 { 'Name' => 'Hsp_querygaps', 'Data' => $h->{qgaps
} } )
939 if exists $h->{qgaps
};
941 { 'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps
} } )
942 if exists $h->{lgaps
};
944 if ( $self->{'_reporttype'} =~ m/^FAST[NXY]$/o ) {
945 if ( 8 == scalar grep { exists $h->{$_} }
946 qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) )
948 if ( $h->{ax0
} < $h->{an0
} ) {
951 'Name' => 'Hsp_query-frame',
953 "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}"
960 'Name' => 'Hsp_query-frame',
962 "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}"
966 if ( $h->{ax1
} < $h->{an1
} ) {
969 'Name' => 'Hsp_hit-frame',
971 "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}"
978 'Name' => 'Hsp_hit-frame',
980 "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}"
988 'Name' => 'Hsp_query-frame',
989 'Data' => $h->{lframe
}
993 { 'Name' => 'Hsp_hit-frame', 'Data' => 0 } );
998 { 'Name' => 'Hsp_query-frame', 'Data' => 0 } );
1001 'Name' => 'Hsp_hit-frame',
1002 'Data' => $h->{lframe
}
1007 $self->end_element( { 'Name' => 'Hsp' } );
1008 $self->end_element( { 'Name' => 'Hit' } );
1011 $self->end_element( { 'Name' => 'FastaOutput' } );
1012 return $self->end_document();
1014 elsif (/^\s*\d+\s*>>>/) {
1015 if ( $self->within_element('FastaOutput') ) {
1016 if ( $self->in_element('hsp') ) {
1017 $self->end_element( { 'Name' => 'Hsp' } );
1019 if ( $self->in_element('hit') ) {
1020 $self->end_element( { 'Name' => 'Hit' } );
1025 # process remaining best hits
1026 for my $h (@hit_signifs) {
1027 $self->start_element( { 'Name' => 'Hit' } );
1030 'Name' => 'Hit_len',
1031 'Data' => $h->{hit_len
}
1033 ) if exists $h->{hit_len
};
1039 ) if exists $h->{id
};
1042 'Name' => 'Hit_accession',
1045 ) if exists $h->{acc
};
1048 'Name' => 'Hit_def',
1049 'Data' => $h->{desc
}
1051 ) if exists $h->{desc
};
1054 'Name' => 'Hit_signif',
1055 'Data' => $h->{evalue
}
1057 ) if exists $h->{evalue
};
1060 'Name' => 'Hit_score',
1061 'Data' => $h->{bits
}
1063 ) if exists $h->{bits
};
1065 $self->start_element( { 'Name' => 'Hsp' } );
1067 { 'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} } )
1068 if exists $h->{'z-sc'};
1070 { 'Name' => 'Hsp_evalue', 'Data' => $h->{evalue
} } )
1071 if exists $h->{evalue
};
1073 { 'Name' => 'Hsp_bit-score', 'Data' => $h->{bits
} }
1074 ) if exists $h->{bits
};
1076 { 'Name' => 'Hsp_sw-score', 'Data' => $h->{sw
} } )
1079 { 'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} } )
1080 if exists $h->{'%_gid'};
1083 'Name' => 'Hsp_identity',
1085 sprintf( "%.0f", $h->{'%_id'} * $h->{alen
} )
1087 ) if ( exists $h->{'%_id'} && exists $h->{alen
} );
1089 if ( exists $h->{'%_gid'} ) {
1092 'Name' => 'Hsp_positive',
1093 'Data' => sprintf( "%.0f",
1094 $h->{'%_gid'} * $h->{alen
} )
1096 ) if exists $h->{'%_gid'} && exists $h->{alen
};
1101 'Name' => 'Hsp_positive',
1102 'Data' => sprintf( "%.0f",
1103 $h->{'%_id'} * $h->{alen
} )
1105 ) if ( exists $h->{'%_id'} && exists $h->{alen
} );
1108 { 'Name' => 'Hsp_align-len', 'Data' => $h->{alen
} }
1109 ) if exists $h->{alen
};
1111 { 'Name' => 'Hsp_query-from', 'Data' => $h->{an0
} }
1112 ) if exists $h->{an0
};
1114 { 'Name' => 'Hsp_query-to', 'Data' => $h->{ax0
} } )
1115 if exists $h->{ax0
};
1117 { 'Name' => 'Hsp_hit-from', 'Data' => $h->{an1
} } )
1118 if exists $h->{an1
};
1120 { 'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1
} } )
1121 if exists $h->{ax1
};
1125 'Name' => 'Hsp_querygaps',
1126 'Data' => $h->{qgaps
}
1128 ) if exists $h->{qgaps
};
1130 { 'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps
} } )
1131 if exists $h->{lgaps
};
1133 if ( $self->{'_reporttype'} =~ m/^FAST[NXY]$/o ) {
1134 if ( 8 == scalar grep { exists $h->{$_} }
1135 qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) )
1137 if ( $h->{ax0
} < $h->{an0
} ) {
1140 'Name' => 'Hsp_query-frame',
1141 'Data' => "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}"
1148 'Name' => 'Hsp_query-frame',
1149 'Data' => "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}"
1153 if ( $h->{ax1
} < $h->{an1
} ) {
1156 'Name' => 'Hsp_hit-frame',
1157 'Data' => "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}"
1164 'Name' => 'Hsp_hit-frame',
1165 'Data' => "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}"
1173 'Name' => 'Hsp_query-frame',
1174 'Data' => $h->{lframe
}
1178 { 'Name' => 'Hsp_hit-frame', 'Data' => 0 }
1184 { 'Name' => 'Hsp_query-frame', 'Data' => 0 } );
1187 'Name' => 'Hsp_hit-frame',
1188 'Data' => $h->{lframe
}
1193 $self->end_element( { 'Name' => 'Hsp' } );
1194 $self->end_element( { 'Name' => 'Hit' } );
1197 $self->end_element( { 'Name' => 'FastaOutput' } );
1198 $self->_pushback($_);
1199 return $self->end_document();
1202 $self->start_element( { 'Name' => 'FastaOutput' } );
1203 $self->{'_result_count'}++;
1207 'Name' => 'FastaOutput_program',
1208 'Data' => $self->{'_reporttype'}
1213 'Name' => 'FastaOutput_version',
1214 'Data' => $self->{'_version'}
1218 my ( $type, $querylen, $querytype, $querydef );
1220 if (/^\s*\d+\s*>>>(.*)/) {
1222 if ( $querydef =~ /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt).*$/o )
1224 ( $querydef, $querylen, $querytype ) = ( $1, $2, $3 );
1228 if ( $self->{'_reporttype'}
1229 && $self->{'_reporttype'} eq 'FASTA' )
1231 if ( $querytype eq 'nt' ) {
1232 $self->{'_reporttype'} = 'FASTN';
1234 elsif ( $querytype eq 'aa' ) {
1235 $self->{'_reporttype'} = 'FASTP';
1238 my ( $name, $descr ) =
1239 ( $querydef =~ m/^(\S+)(?:\s+(.*))?\s*$/o );
1242 'Name' => 'FastaOutput_query-def',
1248 'Name' => 'FastaOutput_querydesc',
1255 'Name' => 'FastaOutput_query-len',
1261 $self->warn("unable to find and set query length");
1263 if ( defined( $_ = $self->_readline() )
1264 && ( /^\s*vs\s+(\S+)/ || /^Library:\s+(\S+)/ ) )
1268 'Name' => 'FastaOutput_db',
1276 elsif ( $self->in_element('hsp') ) {
1277 my @data = ( [], [], [] );
1279 my $len = $self->idlength + 1;
1281 while ( defined($_) ) {
1283 #$self->debug("$count $_\n");
1284 if (/residues in \d+\s+query\s+sequences/o) {
1285 $self->_pushback($_);
1288 elsif (/^>>>\*\*\*/o) {
1289 $self->end_element( { Name
=> "Hsp" } );
1293 $self->_pushback($_);
1296 elsif (/^\s*\d+\s*>>>/o) {
1297 $self->_pushback($_);
1300 if ( $count == 0 ) {
1302 $self->_pushback($_);
1305 elsif ( /^\s+\d+/ || /^\s+$/ ) {
1307 # do nothing, this is really a 0 line
1309 elsif ( length($_) == 0 ) {
1313 $self->_pushback($_);
1317 elsif ( $count == 1 || $count == 3 ) {
1319 $len = CORE
::length($1) if $len < CORE
::length($1);
1320 s/\s+$//; # trim trailing spaces,we don't want them
1321 push @
{ $data[ $count - 1 ] }, substr( $_, $len );
1323 elsif (/^\s+(\d+)/) {
1325 $self->_pushback($_);
1327 elsif ( /^\s+$/ || length($_) == 0 ) {
1330 # going to skip these
1334 "Unrecognized alignment line ($count) '$_'");
1337 elsif ( $count == 2 ) {
1339 $self->warn("$_\n") if $self->verbose > 0;
1341 # we are on a Subject part of the alignment
1342 # but we THOUGHT we were on the Query
1343 # move that last line to the proper place
1344 push @
{ $data[2] }, pop @
{ $data[0] };
1349 # toss the first IDLENGTH characters of the line
1350 if ( length($_) >= $len ) {
1351 push @
{ $data[ $count - 1 ] }, substr( $_, $len );
1355 last if ( $count++ >= 5 );
1356 $_ = $self->_readline();
1358 if ( @
{ $data[0] } || @
{ $data[2] } ) {
1361 'Name' => 'Hsp_qseq',
1362 'Data' => join( '', @
{ $data[0] } )
1367 'Name' => 'Hsp_midline',
1368 'Data' => join( '', @
{ $data[1] } )
1373 'Name' => 'Hsp_hseq',
1374 'Data' => join( '', @
{ $data[2] } )
1382 #$self->warn("unrecognized FASTA Family report file!");
1387 if ( $self->in_element('result') ) {
1388 if ( $self->in_element('hsp') ) {
1389 $self->end_element( { 'Name' => 'Hsp' } );
1391 if ( $self->in_element('hit') ) {
1392 $self->end_element( { 'Name' => 'Hit' } );
1394 $self->end_element( { 'Name' => 'FastaOutput' } );
1396 return $self->end_document();
1399 =head2 start_element
1401 Title : start_element
1402 Usage : $eventgenerator->start_element
1403 Function: Handles a start element event
1405 Args : hashref with at least 2 keys 'Data' and 'Name'
1411 my ( $self, $data ) = @_;
1413 # we currently don't care about attributes
1414 my $nm = $data->{'Name'};
1415 if ( my $type = $MODEMAP{$nm} ) {
1416 $self->_mode($type);
1417 if ( my $handler = $self->_will_handle($type) ) {
1418 my $func = sprintf( "start_%s", lc $type );
1419 $handler->$func( $data->{'Attributes'} );
1421 unshift @
{ $self->{'_elements'} }, $type;
1423 if ( $nm eq 'FastaOutput' ) {
1424 $self->{'_values'} = {};
1425 $self->{'_result'} = undef;
1426 $self->{'_mode'} = '';
1433 Title : start_element
1434 Usage : $eventgenerator->end_element
1435 Function: Handles an end element event
1437 Args : hashref with at least 2 keys 'Data' and 'Name'
1443 my ( $self, $data ) = @_;
1444 my $nm = $data->{'Name'};
1447 # Hsp are sort of weird, in that they end when another
1448 # object begins so have to detect this in end_element for now
1449 if ( $nm eq 'Hsp' ) {
1450 foreach (qw(Hsp_qseq Hsp_midline Hsp_hseq)) {
1454 'Data' => $self->{'_last_hspdata'}->{$_}
1458 $self->{'_last_hspdata'} = {};
1461 if ( my $type = $MODEMAP{$nm} ) {
1462 if ( my $handler = $self->_will_handle($type) ) {
1463 my $func = sprintf( "end_%s", lc $type );
1464 $rc = $handler->$func( $self->{'_reporttype'}, $self->{'_values'} );
1466 shift @
{ $self->{'_elements'} };
1469 elsif ( $MAPPING{$nm} ) {
1470 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
1471 my $key = ( keys %{ $MAPPING{$nm} } )[0];
1472 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
1473 $self->{'_last_data'};
1476 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
1480 $self->warn("unknown nm $nm, ignoring\n");
1482 $self->{'_last_data'} = ''; # remove read data if we are at
1484 $self->{'_result'} = $rc if ( $nm eq 'FastaOutput' );
1492 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
1493 Function: Convience method that calls start_element, characters, end_element
1495 Args : Hash ref with the keys 'Name' and 'Data'
1501 my ( $self, $data ) = @_;
1502 $self->start_element($data);
1503 $self->characters($data);
1504 $self->end_element($data);
1510 Usage : $eventgenerator->characters($str)
1511 Function: Send a character events
1519 my ( $self, $data ) = @_;
1521 return unless ( defined $data->{'Data'} );
1522 if ( $data->{'Data'} =~ /^\s+$/ ) {
1523 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
1526 if ( $self->in_element('hsp')
1527 && $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ )
1530 $self->{'_last_hspdata'}->{ $data->{'Name'} } .= $data->{'Data'};
1533 $self->{'_last_data'} = $data->{'Data'};
1539 Usage : $obj->_mode($newval)
1542 Returns : value of _mode
1543 Args : newvalue (optional)
1549 my ( $self, $value ) = @_;
1550 if ( defined $value ) {
1551 $self->{'_mode'} = $value;
1553 return $self->{'_mode'};
1556 =head2 within_element
1558 Title : within_element
1559 Usage : if( $eventgenerator->within_element($element) ) {}
1560 Function: Test if we are within a particular element
1561 This is different than 'in' because within can be tested
1564 Args : string element name
1569 sub within_element
{
1570 my ( $self, $name ) = @_;
1572 if (!defined $name && !defined $self->{'_elements'}
1573 || scalar @
{ $self->{'_elements'} } == 0 );
1574 foreach ( @
{ $self->{'_elements'} } ) {
1575 if ( $_ eq $name || $_ eq $MODEMAP{$name} ) {
1585 Usage : if( $eventgenerator->in_element($element) ) {}
1586 Function: Test if we are in a particular element
1587 This is different than 'in' because within can be tested
1590 Args : string element name
1596 my ( $self, $name ) = @_;
1597 return 0 if !defined $self->{'_elements'}->[0];
1599 $self->{'_elements'}->[0] eq $name
1600 || ( exists $MODEMAP{$name}
1601 && $self->{'_elements'}->[0] eq $MODEMAP{$name} )
1605 =head2 start_document
1607 Title : start_document
1608 Usage : $eventgenerator->start_document
1609 Function: Handles a start document event
1616 sub start_document
{
1618 $self->{'_lasttype'} = '';
1619 $self->{'_values'} = {};
1620 $self->{'_result'} = undef;
1621 $self->{'_mode'} = '';
1622 $self->{'_elements'} = [];
1627 Title : end_document
1628 Usage : $eventgenerator->end_document
1629 Function: Handles an end document event
1630 Returns : Bio::Search::Result::ResultI object
1637 my ( $self, @args ) = @_;
1638 return $self->{'_result'};
1644 Usage : $obj->idlength($newval)
1645 Function: Internal storage of the length of the ID desc
1646 in the HSP alignment blocks. Defaults to
1647 $IDLENGTH class variable value
1648 Returns : value of idlength
1649 Args : newvalue (optional)
1655 my ( $self, $value ) = @_;
1656 if ( defined $value ) {
1657 $self->{'_idlength'} = $value;
1659 return $self->{'_idlength'} || $IDLENGTH;
1664 Title : result_count
1665 Usage : my $count = $searchio->result_count
1666 Function: Returns the number of results we have processed
1674 return $self->{'_result_count'};
1677 sub attach_EventHandler
{
1678 my ( $self, $handler ) = @_;
1680 $self->SUPER::attach_EventHandler
($handler);
1682 # Optimization: caching the EventHandler since it is used a lot
1685 $self->{'_handler_cache'} = $handler;
1691 Title : _will_handle
1692 Usage : Private method. For internal use only.
1693 if( $self->_will_handle($type) ) { ... }
1694 Function: Provides an optimized way to check whether or not an element of a
1695 given type is to be handled.
1696 Returns : Reference to EventHandler object if the element type is to be handled.
1697 undef if the element type is not to be handled.
1698 Args : string containing type of element.
1706 Using the cached pointer to the EventHandler to minimize repeated
1711 Caching the will_handle status for each type that is encountered so
1712 that it only need be checked by calling
1713 handler-E<gt>will_handle($type) once.
1717 This does not lead to a major savings by itself (only 5-10%). In
1718 combination with other optimizations, or for large parse jobs, the
1719 savings good be significant.
1721 To test against the unoptimized version, remove the parentheses from
1722 around the third term in the ternary " ? : " operator and add two
1723 calls to $self-E<gt>_eventHandler().
1728 my ( $self, $type ) = @_;
1729 my $handler = $self->{'_handler_cache'};
1731 defined( $self->{'_will_handle_cache'}->{$type} )
1732 ?
$self->{'_will_handle_cache'}->{$type}
1733 : ( $self->{'_will_handle_cache'}->{$type} =
1734 $handler->will_handle($type) );
1736 return $will_handle ?
$handler : undef;