2 # BioPerl module for Bio::SearchIO::psl
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO::psl - A parser for PSL output (UCSC)
21 my $parser = Bio::SearchIO->new(-file => 'file.psl',
23 while( my $result = $parser->next_result ) {
28 This is a SearchIO driver for PSL format.
29 PSL format is documented here:
30 http://genome.ucsc.edu/goldenPath/help/customTrack.html#PSL
32 By default it assumes PSL output came from BLAT you can override that
33 by specifying -program_name =E<gt> 'BLASTZ' when initializing the
40 User feedback is an integral part of the evolution of this and other
41 Bioperl modules. Send your comments and suggestions preferably to
42 the Bioperl mailing list. Your participation is much appreciated.
44 bioperl-l@bioperl.org - General discussion
45 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
49 Please direct usage questions or support issues to the mailing list:
51 I<bioperl-l@bioperl.org>
53 rather than to the module maintainer directly. Many experienced and
54 reponsive experts will be able look at the problem and quickly
55 address it. Please include a thorough description of the problem
56 with code and data examples if at all possible.
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 of the bugs and their resolution. Bug reports can be submitted via the
64 https://github.com/bioperl/bioperl-live/issues
66 =head1 AUTHOR - Jason Stajich
68 Email jason-at-bioperl-dot-org
72 The rest of the documentation details each of the object methods.
73 Internal methods are usually preceded with a _
77 # Let the code begin...
79 package Bio
::SearchIO
::psl
;
80 use vars
qw(%MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $DefaultProgramName);
83 use Bio::Search::HSP::HSPFactory;
84 use Bio::Search::Hit::HitFactory;
85 use Bio::Search::Result::ResultFactory;
87 $DefaultProgramName = 'BLAT';
88 $DEFAULT_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
90 # mapping of terms to Bioperl hash keys
92 'PSLOutput' => 'result',
99 'Hsp_bit-score' => 'HSP-bits',
100 'Hsp_score' => 'HSP-score',
101 'Hsp_evalue' => 'HSP-evalue',
102 'Hsp_query-from' => 'HSP-query_start',
103 'Hsp_query-to' => 'HSP-query_end',
104 'Hsp_hit-from' => 'HSP-hit_start',
105 'Hsp_hit-to' => 'HSP-hit_end',
106 'Hsp_positive' => 'HSP-conserved',
107 'Hsp_identity' => 'HSP-identical',
108 'Hsp_mismatches' => 'HSP-mismatches',
109 'Hsp_qgapblocks' => 'HSP-query_gapblocks',
110 'Hsp_hgapblocks' => 'HSP-hit_gapblocks',
111 'Hsp_gaps' => 'HSP-hsp_gaps',
112 'Hsp_hitgaps' => 'HSP-hit_gaps',
113 'Hsp_querygaps' => 'HSP-query_gaps',
114 'Hsp_align-len' => 'HSP-hsp_length',
115 'Hsp_query-frame' => 'HSP-query_frame',
116 'Hsp_hit-frame' => 'HSP-hit_frame',
118 'Hit_id' => 'HIT-name',
119 'Hit_len' => 'HIT-length',
120 'Hit_accession' => 'HIT-accession',
121 'Hit_def' => 'HIT-description',
122 'Hit_signif' => 'HIT-significance',
123 'Hit_score' => 'HIT-score',
124 'Hit_bits' => 'HIT-bits',
126 'PSLOutput_program' => 'RESULT-algorithm_name',
127 'PSLOutput_version' => 'RESULT-algorithm_version',
128 'PSLOutput_query-def' => 'RESULT-query_name',
129 'PSLOutput_query-len' => 'RESULT-query_length',
130 'PSLOutput_query-acc' => 'RESULT-query_accession',
131 'PSLOutput_querydesc' => 'RESULT-query_description',
132 'PSLOutput_db' => 'RESULT-database_name',
133 'PSLOutput_db-len' => 'RESULT-database_entries',
134 'PSLOutput_db-let' => 'RESULT-database_letters',
137 use base qw(Bio::SearchIO);
142 Usage : my $obj = Bio::SearchIO::psl->new();
143 Function: Builds a new Bio::SearchIO::psl object
144 Returns : an instance of Bio::SearchIO::psl
151 my ( $self, @args ) = @_;
152 $self->SUPER::_initialize
(@args);
153 my ($pname) = $self->_rearrange( [qw(PROGRAM_NAME)], @args );
154 $self->program_name( $pname || $DefaultProgramName );
155 $self->_eventHandler->register_factory(
157 Bio
::Search
::Result
::ResultFactory
->new(
158 -type
=> 'Bio::Search::Result::GenericResult'
162 $self->_eventHandler->register_factory(
164 Bio
::Search
::Hit
::HitFactory
->new(
165 -type
=> 'Bio::Search::Hit::GenericHit'
168 $self->_eventHandler->register_factory(
170 Bio
::Search
::HSP
::HSPFactory
->new(
171 -type
=> 'Bio::Search::HSP::PSLHSP'
179 Usage : my $result = $parser->next_result
180 Function: Parse the next result from the data stream
181 Returns : L<Bio::Search::Result::ResultI> or undef if no more results
188 my ( $lastquery, $lasthit );
192 # skip over any header lines
193 while( defined($_ = $self->_readline) and ! /^\d+\s+\d+\s+/ ) {}
194 $self->_pushback($_);
196 # now start the main parsing loop
197 while ( defined( $_ = $self->_readline ) ) {
199 $matches, $mismatches, $rep_matches, $n_count,
200 $q_num_insert, $q_base_insert, $t_num_insert, $t_base_insert,
201 $strand, $q_name, $q_length, $q_start,
202 $q_end, $t_name, $t_length, $t_start,
203 $t_end, $block_count, $block_sizes, $q_starts,
207 $q_length > 0 or $self->throw("parse error, invalid query length '$q_length'");
208 my $score = sprintf( "%.2f", 100 * ( $matches + $mismatches + $rep_matches ) / $q_length );
210 # this is overall percent identity...
211 my $match_total = $matches + $mismatches + $rep_matches;
213 or $self->throw("parse error, matches + mismatches + rep_matches must be > 0!");
214 my $percent_id = sprintf("%.2f", 100 * ( $matches + $rep_matches ) / $match_total );
216 # Remember Jim's code is 0 based
217 if ( defined $lastquery
218 && $lastquery ne $q_name )
220 $self->end_element( { 'Name' => 'Hit' } );
221 $self->end_element( { 'Name' => 'PSLOutput' } );
222 $self->_pushback($_);
223 return $self->end_document;
225 elsif ( !defined $lastquery ) {
226 $self->{'_result_count'}++;
227 $self->start_element( { 'Name' => 'PSLOutput' } );
230 'Name' => 'PSLOutput_program',
231 'Data' => $self->program_name
236 'Name' => 'PSLOutput_query-def',
242 'Name' => 'PSLOutput_query-len',
246 $self->start_element( { 'Name' => 'Hit' } );
261 'Name' => 'Hit_score',
266 elsif ( $lasthit ne $t_name ) {
267 $self->end_element( { 'Name' => 'Hit' } );
268 $self->start_element( { 'Name' => 'Hit' } );
283 'Name' => 'Hit_score',
289 my $identical = $matches + $rep_matches;
290 $self->start_element( { 'Name' => 'Hsp' } );
293 'Name' => 'Hsp_score',
299 'Name' => 'Hsp_identity',
305 'Name' => 'Hsp_positive',
311 'Name' => 'Hsp_mismatches',
312 'Data' => $mismatches
317 'Name' => 'Hsp_gaps',
318 'Data' => $q_base_insert + $t_base_insert
322 # query gaps are the number of target inserts and vice-versa
325 'Name' => 'Hsp_querygaps',
326 'Data' => $t_base_insert
331 'Name' => 'Hsp_hitgaps',
332 'Data' => $q_base_insert
335 if ( $strand eq '+' ) {
338 'Name' => 'Hsp_query-from',
339 'Data' => $q_start + 1
344 'Name' => 'Hsp_query-to',
352 'Name' => 'Hsp_query-to',
353 'Data' => $q_start + 1
358 'Name' => 'Hsp_query-from',
367 abs( $t_end - $t_start ) +
368 abs( $q_end - $q_start ))/2;
372 'Name' => 'Hsp_hit-from',
373 'Data' => $t_start + 1
378 'Name' => 'Hsp_hit-to',
384 'Name' => 'Hsp_align-len',
389 # cleanup trailing commas in some output
390 $block_sizes =~ s/\,$//;
391 $q_starts =~ s/\,$//;
392 $t_starts =~ s/\,$//;
393 my @blocksizes = split( /,/, $block_sizes ); # block sizes
394 my @qstarts = split( /,/, $q_starts ); # starting position of each block
396 my @tstarts = split( /,/, $t_starts ); # starting position of each block
398 my ( @qgapblocks, @hgapblocks );
400 for ( my $i = 0 ; $i < $block_count ; $i++ ) {
401 if ( $strand eq '+' ) {
402 push @qgapblocks, [ $qstarts[$i] + 1, $blocksizes[$i] ];
405 push @qgapblocks, [ $q_length - $qstarts[$i], $blocksizes[$i] ];
407 push @hgapblocks, [ $tstarts[$i] + 1, $blocksizes[$i] ];
411 'Name' => 'Hsp_qgapblocks',
412 'Data' => \
@qgapblocks
417 'Name' => 'Hsp_hgapblocks',
418 'Data' => \
@hgapblocks
421 $self->end_element( { 'Name' => 'Hsp' } );
422 $lastquery = $q_name;
425 if ( defined $lasthit || defined $lastquery ) {
426 $self->end_element( { 'Name' => 'Hit' } );
427 $self->end_element( { 'Name' => 'Result' } );
428 return $self->end_document;
434 Title : start_element
435 Usage : $eventgenerator->start_element
436 Function: Handles a start element event
438 Args : hashref with at least 2 keys 'Data' and 'Name'
444 my ( $self, $data ) = @_;
446 # we currently don't care about attributes
447 my $nm = $data->{'Name'};
448 if ( my $type = $MODEMAP{$nm} ) {
450 if ( $self->_eventHandler->will_handle($type) ) {
451 my $func = 'start_'.lc $type;
452 $self->_eventHandler->$func( $data->{'Attributes'} );
454 unshift @
{ $self->{'_elements'} }, $type;
456 if ( $nm eq 'PSLOutput' ) {
457 $self->{'_values'} = {};
458 $self->{'_result'} = undef;
459 $self->{'_mode'} = '';
467 Usage : $eventgenerator->end_element
468 Function: Handles an end element event
469 Returns : return value from the associated end_$type event handler
470 Args : hashref with at least 2 keys 'Data' and 'Name'
476 my ( $self, $data ) = @_;
477 my $nm = $data->{'Name'};
480 # Hsp are sort of weird, in that they end when another
481 # object begins so have to detect this in end_element for now
483 if ( my $type = $MODEMAP{$nm} ) {
484 if ( $self->_eventHandler->will_handle($type) ) {
485 my $func = 'end_'.lc $type;
486 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
487 $self->{'_values'} );
489 shift @
{ $self->{'_elements'} };
492 elsif ( $MAPPING{$nm} ) {
493 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
494 my $key = ( keys %{ $MAPPING{$nm} } )[0];
495 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
496 $self->{'_last_data'};
499 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
504 __PACKAGE__
. "::end_element: unknown nm '$nm', ignoring\n" );
506 $self->{'_last_data'} = ''; # remove read data if we are at
508 $self->{'_result'} = $rc
510 && defined $MODEMAP{$nm}
511 && $MODEMAP{$nm} eq 'result' );
519 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
520 Function: Convience method that calls start_element, characters, end_element
522 Args : Hash ref with the keys 'Name' and 'Data'
528 my ( $self, $data ) = @_;
529 $self->start_element($data);
530 $self->characters($data);
531 $self->end_element($data);
537 Usage : $eventgenerator->characters($str)
538 Function: Send a character events
546 my ( $self, $data ) = @_;
548 return unless ( defined $data->{'Data'} );
549 if ( $data->{'Data'} =~ /^\s+$/ ) {
550 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
553 if ( $self->in_element('hsp')
554 && $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ )
557 $self->{'_last_hspdata'}->{ $data->{'Name'} } .= $data->{'Data'};
560 $self->{'_last_data'} = $data->{'Data'};
566 Usage : $obj->_mode($newval)
569 Returns : value of _mode
570 Args : newvalue (optional)
576 my ( $self, $value ) = @_;
577 if ( defined $value ) {
578 $self->{'_mode'} = $value;
580 return $self->{'_mode'};
583 =head2 within_element
585 Title : within_element
586 Usage : if( $eventgenerator->within_element($element) ) {}
587 Function: Test if we are within a particular element
588 This is different than 'in' because within can be tested
591 Args : string element name
597 my ( $self, $name ) = @_;
599 if (!defined $name && !defined $self->{'_elements'}
600 || scalar @
{ $self->{'_elements'} } == 0 );
601 foreach ( @
{ $self->{'_elements'} } ) {
612 Usage : if( $eventgenerator->in_element($element) ) {}
613 Function: Test if we are in a particular element
614 This is different than 'in' because within can be tested
617 Args : string element name
623 my ( $self, $name ) = @_;
624 return 0 if !defined $self->{'_elements'}->[0];
625 return ( $self->{'_elements'}->[0] eq $name );
628 =head2 start_document
630 Title : start_document
631 Usage : $eventgenerator->start_document
632 Function: Handles a start document event
641 $self->{'_lasttype'} = '';
642 $self->{'_values'} = {};
643 $self->{'_result'} = undef;
644 $self->{'_mode'} = '';
645 $self->{'_elements'} = [];
651 Usage : $eventgenerator->end_document
652 Function: Handles an end document event
653 Returns : Bio::Search::Result::ResultI object
660 my ( $self, @args ) = @_;
661 return $self->{'_result'};
667 Usage : my $count = $searchio->result_count
668 Function: Returns the number of results we have processed
677 return $self->{'_result_count'};
680 sub report_count
{ shift->result_count }
685 Usage : $obj->program_name($newval)
686 Function: Get/Set the program name
687 Returns : value of program_name (a scalar)
688 Args : on set, new value (a scalar or undef, optional)
696 $self->{'program_name'} = shift if @_;
697 return $self->{'program_name'} || $DefaultProgramName;