bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO / psl.pm
blobe9b40dd8478dd124fa11c87b91ede92575a8f944
1 # $Id$
3 # BioPerl module for Bio::SearchIO::psl
5 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::psl - A parser for PSL output (UCSC)
17 =head1 SYNOPSIS
19 use Bio::SearchIO;
20 my $parser = Bio::SearchIO->new(-file => 'file.psl',
21 -format => 'psl');
22 while( my $result = $parser->next_result ) {
25 =head1 DESCRIPTION
27 This is a SearchIO driver for PSL format.
28 PSL format is documented here:
29 http://genome.ucsc.edu/goldenPath/help/customTrack.html#PSL
31 By default it assumes PSL output came from BLAT you can override that
32 by specifying -program_name =E<gt> 'BLASTZ' when initializing the
33 SearchIO object.
35 =head1 FEEDBACK
37 =head2 Mailing Lists
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Reporting Bugs
48 Report bugs to the Bioperl bug tracking system to help us keep track
49 of the bugs and their resolution. Bug reports can be submitted via the
50 web:
52 http://bugzilla.open-bio.org/
54 =head1 AUTHOR - Jason Stajich
56 Email jason-at-bioperl-dot-org
58 =head1 APPENDIX
60 The rest of the documentation details each of the object methods.
61 Internal methods are usually preceded with a _
63 =cut
66 # Let the code begin...
69 package Bio::SearchIO::psl;
70 use vars qw(%MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $DefaultProgramName);
72 use strict;
73 use Bio::Search::HSP::HSPFactory;
74 use Bio::Search::Hit::HitFactory;
75 use Bio::Search::Result::ResultFactory;
77 $DefaultProgramName = 'BLAT';
78 $DEFAULT_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
80 # mapping of terms to Bioperl hash keys
81 %MODEMAP = (
82 'PSLOutput' => 'result',
83 'Result' => 'result',
84 'Hit' => 'hit',
85 'Hsp' => 'hsp'
88 %MAPPING = (
89 'Hsp_bit-score' => 'HSP-bits',
90 'Hsp_score' => 'HSP-score',
91 'Hsp_evalue' => 'HSP-evalue',
92 'Hsp_query-from' => 'HSP-query_start',
93 'Hsp_query-to' => 'HSP-query_end',
94 'Hsp_hit-from' => 'HSP-hit_start',
95 'Hsp_hit-to' => 'HSP-hit_end',
96 'Hsp_positive' => 'HSP-conserved',
97 'Hsp_identity' => 'HSP-identical',
98 'Hsp_mismatches' => 'HSP-mismatches',
99 'Hsp_qgapblocks' => 'HSP-query_gapblocks',
100 'Hsp_hgapblocks' => 'HSP-hit_gapblocks',
101 'Hsp_gaps' => 'HSP-hsp_gaps',
102 'Hsp_hitgaps' => 'HSP-hit_gaps',
103 'Hsp_querygaps' => 'HSP-query_gaps',
104 'Hsp_align-len' => 'HSP-hsp_length',
105 'Hsp_query-frame'=> 'HSP-query_frame',
106 'Hsp_hit-frame' => 'HSP-hit_frame',
108 'Hit_id' => 'HIT-name',
109 'Hit_len' => 'HIT-length',
110 'Hit_accession' => 'HIT-accession',
111 'Hit_def' => 'HIT-description',
112 'Hit_signif' => 'HIT-significance',
113 'Hit_score' => 'HIT-score',
114 'Hit_bits' => 'HIT-bits',
116 'PSLOutput_program' => 'RESULT-algorithm_name',
117 'PSLOutput_version' => 'RESULT-algorithm_version',
118 'PSLOutput_query-def'=> 'RESULT-query_name',
119 'PSLOutput_query-len'=> 'RESULT-query_length',
120 'PSLOutput_query-acc'=> 'RESULT-query_accession',
121 'PSLOutput_querydesc'=> 'RESULT-query_description',
122 'PSLOutput_db' => 'RESULT-database_name',
123 'PSLOutput_db-len' => 'RESULT-database_entries',
124 'PSLOutput_db-let' => 'RESULT-database_letters',
127 use base qw(Bio::SearchIO);
129 =head2 new
131 Title : new
132 Usage : my $obj = Bio::SearchIO::psl->new();
133 Function: Builds a new Bio::SearchIO::psl object
134 Returns : an instance of Bio::SearchIO::psl
135 Args :
138 =cut
140 sub _initialize {
141 my ($self,@args) = @_;
142 $self->SUPER::_initialize(@args);
143 my ($pname) = $self->_rearrange([qw(PROGRAM_NAME)],
144 @args);
145 $self->program_name($pname || $DefaultProgramName);
146 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::GenericResult'));
148 $self->_eventHandler->register_factory('hit', Bio::Search::Hit::HitFactory->new(-type => 'Bio::Search::Hit::GenericHit'));
149 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::PSLHSP'));
153 =head2 next_result
155 Title : next_result
156 Usage : my $result = $parser->next_result
157 Function: Parse the next result from the data stream
158 Returns : L<Bio::Search::Result::ResultI>
159 Args : none
162 =cut
164 sub next_result{
165 my ($self) = @_;
166 my ($lastquery,$lasthit);
167 local $/ = "\n";
168 local $_;
170 while( defined ($_ = $self->_readline) ) {
171 #clear header if exists
172 if(/^psLayout/){
173 #pass over header lines lines
174 while(!/^\d+\s+\d+\s+/) {
175 $_ = $self->_readline;
178 my ( $matches,$mismatches,$rep_matches,$n_count,
179 $q_num_insert,$q_base_insert,
180 $t_num_insert, $t_base_insert,
181 $strand, $q_name, $q_length, $q_start,
182 $q_end, $t_name, $t_length,$t_start, $t_end, $block_count,
183 $block_sizes, $q_starts, $t_starts
184 ) = split;
186 my $score = sprintf "%.2f", ( 100 * ( $matches + $mismatches + $rep_matches ) / $q_length );
188 # this is overall percent identity...
189 my $percent_id = sprintf "%.2f", ( 100 * ($matches + $rep_matches)/( $matches + $mismatches + $rep_matches )
192 # Remember Jim's code is 0 based
193 if( defined $lastquery &&
194 $lastquery ne $q_name ) {
195 $self->end_element({'Name' => 'Hit'});
196 $self->end_element({'Name' => 'PSLOutput'});
197 $self->_pushback($_);
198 return $self->end_document;
199 } elsif( ! defined $lastquery ) {
200 $self->{'_result_count'}++;
201 $self->start_element({'Name' => 'PSLOutput'});
202 $self->element({'Name' => 'PSLOutput_program',
203 'Data' => $self->program_name});
204 $self->element({'Name' => 'PSLOutput_query-def',
205 'Data' => $q_name});
206 $self->element({'Name' => 'PSLOutput_query-len',
207 'Data' => $q_length});
208 $self->start_element({'Name' => 'Hit'});
209 $self->element({'Name' => 'Hit_id',
210 'Data' => $t_name});
211 $self->element({'Name' => 'Hit_len',
212 'Data' => $t_length});
213 $self->element({'Name' => 'Hit_score',
214 'Data' => $score});
215 } elsif( $lasthit ne $t_name ) {
216 $self->end_element({'Name' => 'Hit'});
217 $self->start_element({'Name' => 'Hit'});
218 $self->element({'Name' => 'Hit_id',
219 'Data' => $t_name});
220 $self->element({'Name' => 'Hit_len',
221 'Data' => $t_length});
222 $self->element({'Name' => 'Hit_score',
223 'Data' => $score});
226 my $identical = $matches + $rep_matches;
227 $self->start_element({'Name' => 'Hsp'});
228 $self->element({'Name' => 'Hsp_score',
229 'Data' => $score});
230 $self->element({'Name' => 'Hsp_identity',
231 'Data' => $identical});
232 $self->element({'Name' => 'Hsp_positive',
233 'Data' => $identical});
234 $self->element({'Name' => 'Hsp_mismatches',
235 'Data' => $mismatches});
236 $self->element({'Name' => 'Hsp_gaps',
237 'Data' => $q_base_insert + $t_base_insert});
238 # query gaps are the number of target inserts and vice-versa
239 $self->element({'Name' => 'Hsp_querygaps',
240 'Data' => $t_base_insert});
241 $self->element({'Name' => 'Hsp_hitgaps',
242 'Data' => $q_base_insert});
243 if( $strand eq '+' ) {
244 $self->element({'Name' => 'Hsp_query-from',
245 'Data' => $q_start + 1});
246 $self->element({'Name' => 'Hsp_query-to',
247 'Data' => $q_end});
248 } else {
249 $self->element({'Name' => 'Hsp_query-to',
250 'Data' => $q_start + 1});
251 $self->element({'Name' => 'Hsp_query-from',
252 'Data' => $q_end});
254 my $hsplen = $q_base_insert + $t_base_insert +
255 abs( $t_end - $t_start) + abs( $q_end - $q_start);
256 $self->element({'Name' => 'Hsp_hit-from',
257 'Data' => $t_start + 1 });
258 $self->element({'Name' => 'Hsp_hit-to',
259 'Data' => $t_end});
260 $self->element({'Name' => 'Hsp_align-len',
261 'Data' => $hsplen});
262 # cleanup trailing commas in some output
263 $block_sizes =~ s/\,$//;
264 $q_starts =~ s/\,$//;
265 $t_starts =~ s/\,$//;
266 my @blocksizes = split(/,/,$block_sizes); # block sizes
267 my @qstarts = split(/,/,$q_starts); # starting position of each block
268 # in query
269 my @tstarts = split(/,/,$t_starts); # starting position of each block
270 # in target
271 my (@qgapblocks,@hgapblocks);
272 for( my $i = 0; $i < $block_count; $i++) {
273 if( $strand eq '+' ) {
274 push @qgapblocks, [ $qstarts[$i] + 1, $blocksizes[$i]];
275 } else {
276 push @qgapblocks, [ $q_length - $qstarts[$i], $blocksizes[$i]];
278 push @hgapblocks, [ $tstarts[$i] + 1, $blocksizes[$i]];
280 $self->element({'Name' => 'Hsp_qgapblocks',
281 'Data' => \@qgapblocks});
282 $self->element({'Name' => 'Hsp_hgapblocks',
283 'Data' => \@hgapblocks});
284 $self->end_element({'Name' => 'Hsp'});
285 $lastquery = $q_name;
286 $lasthit = $t_name;
288 if( defined $lasthit || defined $lastquery ) {
289 $self->end_element({'Name' => 'Hit'});
290 $self->end_element({'Name' => 'Result'});
291 return $self->end_document;
295 =head2 start_element
297 Title : start_element
298 Usage : $eventgenerator->start_element
299 Function: Handles a start element event
300 Returns : none
301 Args : hashref with at least 2 keys 'Data' and 'Name'
304 =cut
306 sub start_element{
307 my ($self,$data) = @_;
308 # we currently don't care about attributes
309 my $nm = $data->{'Name'};
310 if( my $type = $MODEMAP{$nm} ) {
311 $self->_mode($type);
312 if( $self->_eventHandler->will_handle($type) ) {
313 my $func = sprintf("start_%s",lc $type);
314 $self->_eventHandler->$func($data->{'Attributes'});
316 unshift @{$self->{'_elements'}}, $type;
318 if($nm eq 'PSLOutput') {
319 $self->{'_values'} = {};
320 $self->{'_result'}= undef;
321 $self->{'_mode'} = '';
326 =head2 end_element
328 Title : start_element
329 Usage : $eventgenerator->end_element
330 Function: Handles an end element event
331 Returns : none
332 Args : hashref with at least 2 keys 'Data' and 'Name'
335 =cut
337 sub end_element {
338 my ($self,$data) = @_;
339 my $nm = $data->{'Name'};
340 my $rc;
341 # Hsp are sort of weird, in that they end when another
342 # object begins so have to detect this in end_element for now
344 if( my $type = $MODEMAP{$nm} ) {
345 if( $self->_eventHandler->will_handle($type) ) {
346 my $func = sprintf("end_%s",lc $type);
347 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
348 $self->{'_values'});
350 shift @{$self->{'_elements'}};
352 } elsif( $MAPPING{$nm} ) {
353 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
354 my $key = (keys %{$MAPPING{$nm}})[0];
355 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
356 } else {
357 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
359 } else {
360 $self->warn( __PACKAGE__."::end_element: unknown nm '$nm', ignoring\n");
362 $self->{'_last_data'} = ''; # remove read data if we are at
363 # end of an element
364 $self->{'_result'} = $rc if( defined $nm &&
365 defined $MODEMAP{$nm} &&
366 $MODEMAP{$nm} eq 'result' );
367 return $rc;
371 =head2 element
373 Title : element
374 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
375 Function: Convience method that calls start_element, characters, end_element
376 Returns : none
377 Args : Hash ref with the keys 'Name' and 'Data'
380 =cut
382 sub element{
383 my ($self,$data) = @_;
384 $self->start_element($data);
385 $self->characters($data);
386 $self->end_element($data);
390 =head2 characters
392 Title : characters
393 Usage : $eventgenerator->characters($str)
394 Function: Send a character events
395 Returns : none
396 Args : string
399 =cut
401 sub characters{
402 my ($self,$data) = @_;
404 return unless ( defined $data->{'Data'} );
405 if( $data->{'Data'} =~ /^\s+$/ ) {
406 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
409 if( $self->in_element('hsp') &&
410 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
412 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
415 $self->{'_last_data'} = $data->{'Data'};
418 =head2 _mode
420 Title : _mode
421 Usage : $obj->_mode($newval)
422 Function:
423 Example :
424 Returns : value of _mode
425 Args : newvalue (optional)
428 =cut
430 sub _mode{
431 my ($self,$value) = @_;
432 if( defined $value) {
433 $self->{'_mode'} = $value;
435 return $self->{'_mode'};
438 =head2 within_element
440 Title : within_element
441 Usage : if( $eventgenerator->within_element($element) ) {}
442 Function: Test if we are within a particular element
443 This is different than 'in' because within can be tested
444 for a whole block.
445 Returns : boolean
446 Args : string element name
449 =cut
451 sub within_element{
452 my ($self,$name) = @_;
453 return 0 if ( ! defined $name &&
454 ! defined $self->{'_elements'} ||
455 scalar @{$self->{'_elements'}} == 0) ;
456 foreach ( @{$self->{'_elements'}} ) {
457 if( $_ eq $name ) {
458 return 1;
461 return 0;
464 =head2 in_element
466 Title : in_element
467 Usage : if( $eventgenerator->in_element($element) ) {}
468 Function: Test if we are in a particular element
469 This is different than 'in' because within can be tested
470 for a whole block.
471 Returns : boolean
472 Args : string element name
475 =cut
477 sub in_element{
478 my ($self,$name) = @_;
479 return 0 if ! defined $self->{'_elements'}->[0];
480 return ( $self->{'_elements'}->[0] eq $name)
484 =head2 start_document
486 Title : start_document
487 Usage : $eventgenerator->start_document
488 Function: Handles a start document event
489 Returns : none
490 Args : none
493 =cut
495 sub start_document{
496 my ($self) = @_;
497 $self->{'_lasttype'} = '';
498 $self->{'_values'} = {};
499 $self->{'_result'}= undef;
500 $self->{'_mode'} = '';
501 $self->{'_elements'} = [];
505 =head2 end_document
507 Title : end_document
508 Usage : $eventgenerator->end_document
509 Function: Handles an end document event
510 Returns : Bio::Search::Result::ResultI object
511 Args : none
514 =cut
516 sub end_document{
517 my ($self,@args) = @_;
518 return $self->{'_result'};
521 =head2 result_count
523 Title : result_count
524 Usage : my $count = $searchio->result_count
525 Function: Returns the number of results we have processed
526 Returns : integer
527 Args : none
530 =cut
532 sub result_count {
533 my $self = shift;
534 return $self->{'_result_count'};
537 sub report_count { shift->result_count }
540 =head2 program_name
542 Title : program_name
543 Usage : $obj->program_name($newval)
544 Function: Get/Set the program name
545 Returns : value of program_name (a scalar)
546 Args : on set, new value (a scalar or undef, optional)
549 =cut
551 sub program_name{
552 my $self = shift;
554 $self->{'program_name'} = shift if @_;
555 return $self->{'program_name'} || $DefaultProgramName;