3 # BioPerl module for Bio::SearchIO::psl
5 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO::psl - A parser for PSL output (UCSC)
20 my $parser = Bio::SearchIO->new(-file => 'file.psl',
22 while( my $result = $parser->next_result ) {
27 This is a SearchIO driver for PSL format.
28 PSL format is documented here:
29 http://genome.ucsc.edu/goldenPath/help/customTrack.html#PSL
31 By default it assumes PSL output came from BLAT you can override that
32 by specifying -program_name =E<gt> 'BLASTZ' when initializing the
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Report bugs to the Bioperl bug tracking system to help us keep track
49 of the bugs and their resolution. Bug reports can be submitted via the
52 http://bugzilla.open-bio.org/
54 =head1 AUTHOR - Jason Stajich
56 Email jason-at-bioperl-dot-org
60 The rest of the documentation details each of the object methods.
61 Internal methods are usually preceded with a _
66 # Let the code begin...
69 package Bio
::SearchIO
::psl
;
70 use vars
qw(%MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $DefaultProgramName);
73 use Bio::Search::HSP::HSPFactory;
74 use Bio::Search::Hit::HitFactory;
75 use Bio::Search::Result::ResultFactory;
77 $DefaultProgramName = 'BLAT';
78 $DEFAULT_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
80 # mapping of terms to Bioperl hash keys
82 'PSLOutput' => 'result',
89 'Hsp_bit-score' => 'HSP-bits',
90 'Hsp_score' => 'HSP-score',
91 'Hsp_evalue' => 'HSP-evalue',
92 'Hsp_query-from' => 'HSP-query_start',
93 'Hsp_query-to' => 'HSP-query_end',
94 'Hsp_hit-from' => 'HSP-hit_start',
95 'Hsp_hit-to' => 'HSP-hit_end',
96 'Hsp_positive' => 'HSP-conserved',
97 'Hsp_identity' => 'HSP-identical',
98 'Hsp_mismatches' => 'HSP-mismatches',
99 'Hsp_qgapblocks' => 'HSP-query_gapblocks',
100 'Hsp_hgapblocks' => 'HSP-hit_gapblocks',
101 'Hsp_gaps' => 'HSP-hsp_gaps',
102 'Hsp_hitgaps' => 'HSP-hit_gaps',
103 'Hsp_querygaps' => 'HSP-query_gaps',
104 'Hsp_align-len' => 'HSP-hsp_length',
105 'Hsp_query-frame'=> 'HSP-query_frame',
106 'Hsp_hit-frame' => 'HSP-hit_frame',
108 'Hit_id' => 'HIT-name',
109 'Hit_len' => 'HIT-length',
110 'Hit_accession' => 'HIT-accession',
111 'Hit_def' => 'HIT-description',
112 'Hit_signif' => 'HIT-significance',
113 'Hit_score' => 'HIT-score',
114 'Hit_bits' => 'HIT-bits',
116 'PSLOutput_program' => 'RESULT-algorithm_name',
117 'PSLOutput_version' => 'RESULT-algorithm_version',
118 'PSLOutput_query-def'=> 'RESULT-query_name',
119 'PSLOutput_query-len'=> 'RESULT-query_length',
120 'PSLOutput_query-acc'=> 'RESULT-query_accession',
121 'PSLOutput_querydesc'=> 'RESULT-query_description',
122 'PSLOutput_db' => 'RESULT-database_name',
123 'PSLOutput_db-len' => 'RESULT-database_entries',
124 'PSLOutput_db-let' => 'RESULT-database_letters',
127 use base qw(Bio::SearchIO);
132 Usage : my $obj = Bio::SearchIO::psl->new();
133 Function: Builds a new Bio::SearchIO::psl object
134 Returns : an instance of Bio::SearchIO::psl
141 my ($self,@args) = @_;
142 $self->SUPER::_initialize
(@args);
143 my ($pname) = $self->_rearrange([qw(PROGRAM_NAME)],
145 $self->program_name($pname || $DefaultProgramName);
146 $self->_eventHandler->register_factory('result', Bio
::Search
::Result
::ResultFactory
->new(-type
=> 'Bio::Search::Result::GenericResult'));
148 $self->_eventHandler->register_factory('hit', Bio
::Search
::Hit
::HitFactory
->new(-type
=> 'Bio::Search::Hit::GenericHit'));
149 $self->_eventHandler->register_factory('hsp', Bio
::Search
::HSP
::HSPFactory
->new(-type
=> 'Bio::Search::HSP::PSLHSP'));
156 Usage : my $result = $parser->next_result
157 Function: Parse the next result from the data stream
158 Returns : L<Bio::Search::Result::ResultI>
166 my ($lastquery,$lasthit);
170 while( defined ($_ = $self->_readline) ) {
171 #clear header if exists
173 #pass over header lines lines
174 while(!/^\d+\s+\d+\s+/) {
175 $_ = $self->_readline;
178 my ( $matches,$mismatches,$rep_matches,$n_count,
179 $q_num_insert,$q_base_insert,
180 $t_num_insert, $t_base_insert,
181 $strand, $q_name, $q_length, $q_start,
182 $q_end, $t_name, $t_length,$t_start, $t_end, $block_count,
183 $block_sizes, $q_starts, $t_starts
186 my $score = sprintf "%.2f", ( 100 * ( $matches + $mismatches + $rep_matches ) / $q_length );
188 # this is overall percent identity...
189 my $percent_id = sprintf "%.2f", ( 100 * ($matches + $rep_matches)/( $matches + $mismatches + $rep_matches )
192 # Remember Jim's code is 0 based
193 if( defined $lastquery &&
194 $lastquery ne $q_name ) {
195 $self->end_element({'Name' => 'Hit'});
196 $self->end_element({'Name' => 'PSLOutput'});
197 $self->_pushback($_);
198 return $self->end_document;
199 } elsif( ! defined $lastquery ) {
200 $self->{'_result_count'}++;
201 $self->start_element({'Name' => 'PSLOutput'});
202 $self->element({'Name' => 'PSLOutput_program',
203 'Data' => $self->program_name});
204 $self->element({'Name' => 'PSLOutput_query-def',
206 $self->element({'Name' => 'PSLOutput_query-len',
207 'Data' => $q_length});
208 $self->start_element({'Name' => 'Hit'});
209 $self->element({'Name' => 'Hit_id',
211 $self->element({'Name' => 'Hit_len',
212 'Data' => $t_length});
213 $self->element({'Name' => 'Hit_score',
215 } elsif( $lasthit ne $t_name ) {
216 $self->end_element({'Name' => 'Hit'});
217 $self->start_element({'Name' => 'Hit'});
218 $self->element({'Name' => 'Hit_id',
220 $self->element({'Name' => 'Hit_len',
221 'Data' => $t_length});
222 $self->element({'Name' => 'Hit_score',
226 my $identical = $matches + $rep_matches;
227 $self->start_element({'Name' => 'Hsp'});
228 $self->element({'Name' => 'Hsp_score',
230 $self->element({'Name' => 'Hsp_identity',
231 'Data' => $identical});
232 $self->element({'Name' => 'Hsp_positive',
233 'Data' => $identical});
234 $self->element({'Name' => 'Hsp_mismatches',
235 'Data' => $mismatches});
236 $self->element({'Name' => 'Hsp_gaps',
237 'Data' => $q_base_insert + $t_base_insert});
238 # query gaps are the number of target inserts and vice-versa
239 $self->element({'Name' => 'Hsp_querygaps',
240 'Data' => $t_base_insert});
241 $self->element({'Name' => 'Hsp_hitgaps',
242 'Data' => $q_base_insert});
243 if( $strand eq '+' ) {
244 $self->element({'Name' => 'Hsp_query-from',
245 'Data' => $q_start + 1});
246 $self->element({'Name' => 'Hsp_query-to',
249 $self->element({'Name' => 'Hsp_query-to',
250 'Data' => $q_start + 1});
251 $self->element({'Name' => 'Hsp_query-from',
254 my $hsplen = $q_base_insert + $t_base_insert +
255 abs( $t_end - $t_start) + abs( $q_end - $q_start);
256 $self->element({'Name' => 'Hsp_hit-from',
257 'Data' => $t_start + 1 });
258 $self->element({'Name' => 'Hsp_hit-to',
260 $self->element({'Name' => 'Hsp_align-len',
262 # cleanup trailing commas in some output
263 $block_sizes =~ s/\,$//;
264 $q_starts =~ s/\,$//;
265 $t_starts =~ s/\,$//;
266 my @blocksizes = split(/,/,$block_sizes); # block sizes
267 my @qstarts = split(/,/,$q_starts); # starting position of each block
269 my @tstarts = split(/,/,$t_starts); # starting position of each block
271 my (@qgapblocks,@hgapblocks);
272 for( my $i = 0; $i < $block_count; $i++) {
273 if( $strand eq '+' ) {
274 push @qgapblocks, [ $qstarts[$i] + 1, $blocksizes[$i]];
276 push @qgapblocks, [ $q_length - $qstarts[$i], $blocksizes[$i]];
278 push @hgapblocks, [ $tstarts[$i] + 1, $blocksizes[$i]];
280 $self->element({'Name' => 'Hsp_qgapblocks',
281 'Data' => \
@qgapblocks});
282 $self->element({'Name' => 'Hsp_hgapblocks',
283 'Data' => \
@hgapblocks});
284 $self->end_element({'Name' => 'Hsp'});
285 $lastquery = $q_name;
288 if( defined $lasthit || defined $lastquery ) {
289 $self->end_element({'Name' => 'Hit'});
290 $self->end_element({'Name' => 'Result'});
291 return $self->end_document;
297 Title : start_element
298 Usage : $eventgenerator->start_element
299 Function: Handles a start element event
301 Args : hashref with at least 2 keys 'Data' and 'Name'
307 my ($self,$data) = @_;
308 # we currently don't care about attributes
309 my $nm = $data->{'Name'};
310 if( my $type = $MODEMAP{$nm} ) {
312 if( $self->_eventHandler->will_handle($type) ) {
313 my $func = sprintf("start_%s",lc $type);
314 $self->_eventHandler->$func($data->{'Attributes'});
316 unshift @
{$self->{'_elements'}}, $type;
318 if($nm eq 'PSLOutput') {
319 $self->{'_values'} = {};
320 $self->{'_result'}= undef;
321 $self->{'_mode'} = '';
328 Title : start_element
329 Usage : $eventgenerator->end_element
330 Function: Handles an end element event
332 Args : hashref with at least 2 keys 'Data' and 'Name'
338 my ($self,$data) = @_;
339 my $nm = $data->{'Name'};
341 # Hsp are sort of weird, in that they end when another
342 # object begins so have to detect this in end_element for now
344 if( my $type = $MODEMAP{$nm} ) {
345 if( $self->_eventHandler->will_handle($type) ) {
346 my $func = sprintf("end_%s",lc $type);
347 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
350 shift @
{$self->{'_elements'}};
352 } elsif( $MAPPING{$nm} ) {
353 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
354 my $key = (keys %{$MAPPING{$nm}})[0];
355 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
357 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
360 $self->warn( __PACKAGE__
."::end_element: unknown nm '$nm', ignoring\n");
362 $self->{'_last_data'} = ''; # remove read data if we are at
364 $self->{'_result'} = $rc if( defined $nm &&
365 defined $MODEMAP{$nm} &&
366 $MODEMAP{$nm} eq 'result' );
374 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
375 Function: Convience method that calls start_element, characters, end_element
377 Args : Hash ref with the keys 'Name' and 'Data'
383 my ($self,$data) = @_;
384 $self->start_element($data);
385 $self->characters($data);
386 $self->end_element($data);
393 Usage : $eventgenerator->characters($str)
394 Function: Send a character events
402 my ($self,$data) = @_;
404 return unless ( defined $data->{'Data'} );
405 if( $data->{'Data'} =~ /^\s+$/ ) {
406 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
409 if( $self->in_element('hsp') &&
410 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
412 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
415 $self->{'_last_data'} = $data->{'Data'};
421 Usage : $obj->_mode($newval)
424 Returns : value of _mode
425 Args : newvalue (optional)
431 my ($self,$value) = @_;
432 if( defined $value) {
433 $self->{'_mode'} = $value;
435 return $self->{'_mode'};
438 =head2 within_element
440 Title : within_element
441 Usage : if( $eventgenerator->within_element($element) ) {}
442 Function: Test if we are within a particular element
443 This is different than 'in' because within can be tested
446 Args : string element name
452 my ($self,$name) = @_;
453 return 0 if ( ! defined $name &&
454 ! defined $self->{'_elements'} ||
455 scalar @
{$self->{'_elements'}} == 0) ;
456 foreach ( @
{$self->{'_elements'}} ) {
467 Usage : if( $eventgenerator->in_element($element) ) {}
468 Function: Test if we are in a particular element
469 This is different than 'in' because within can be tested
472 Args : string element name
478 my ($self,$name) = @_;
479 return 0 if ! defined $self->{'_elements'}->[0];
480 return ( $self->{'_elements'}->[0] eq $name)
484 =head2 start_document
486 Title : start_document
487 Usage : $eventgenerator->start_document
488 Function: Handles a start document event
497 $self->{'_lasttype'} = '';
498 $self->{'_values'} = {};
499 $self->{'_result'}= undef;
500 $self->{'_mode'} = '';
501 $self->{'_elements'} = [];
508 Usage : $eventgenerator->end_document
509 Function: Handles an end document event
510 Returns : Bio::Search::Result::ResultI object
517 my ($self,@args) = @_;
518 return $self->{'_result'};
524 Usage : my $count = $searchio->result_count
525 Function: Returns the number of results we have processed
534 return $self->{'_result_count'};
537 sub report_count
{ shift->result_count }
543 Usage : $obj->program_name($newval)
544 Function: Get/Set the program name
545 Returns : value of program_name (a scalar)
546 Args : on set, new value (a scalar or undef, optional)
554 $self->{'program_name'} = shift if @_;
555 return $self->{'program_name'} || $DefaultProgramName;