sync w/ main trunk
[bioperl-live.git] / Bio / SearchIO / Writer / GbrowseGFF.pm
blob3e9eb5fd08728aeda002964a5756583e9feffdb2
1 #-----------------------------------------------------------------
2 # $Id$
4 # BioPerl module Bio::SearchIO::Writer::GbrowseGFF.pm
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Mark Wilkinson <markw@illuminae.com>
10 # You may distribute this module under the same terms as perl itself
11 #-----------------------------------------------------------------
13 =head1 NAME
15 Bio::SearchIO::Writer::GbrowseGFF - Interface for outputting parsed search results in Gbrowse GFF format
17 =head1 SYNOPSIS
19 use Bio::SearchIO;
20 my $in = Bio::SearchIO->new(-file => 'result.blast',
21 -format => 'blast');
22 my $out = Bio::SearchIO->new(-output_format => 'GbrowseGFF',
23 -output_cigar => 1,
24 -output_signif => 1,
25 -file => ">result.gff");
26 while( my $r = $in->next_result ) {
27 $out->write_result($r);
30 =head1 DESCRIPTION
32 This writer produces Gbrowse flavour GFF from a Search::Result object.
34 =head1 AUTHOR Mark Wilkinson
36 Email markw-at-illuminae-dot-com
38 =head1 CONTRIBUTORS
40 Susan Miller sjmiller at email-DOT-arizon-DOT-edu
41 Jason Stajich jason at bioperl-dot-org
43 =head1 FEEDBACK
45 =head2 Mailing Lists
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 L<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 of the bugs and their resolution. Bug reports can be submitted via
69 the web:
71 http://bugzilla.open-bio.org/
73 =head1 APPENDIX
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
78 =cut
81 package Bio::SearchIO::Writer::GbrowseGFF;
82 use vars qw(%Defaults);
83 use strict;
85 $Defaults{'Prefix'} = 'EST';
86 $Defaults{'HSPTag'} = 'HSP';
87 $Defaults{'MatchTag'} = 'match';
89 use base qw(Bio::Root::Root Bio::SearchIO::SearchWriterI);
92 =head2 new
94 Title : new
95 Usage : my $obj = Bio::SearchIO::Writer::GbrowseGFF->new(@args);
96 Function: Builds a new Bio::SearchIO::Writer::GbrowseGFF object
97 Returns : an instance of Bio::SearchIO::Writer::GbrowseGFF
98 Args : -e_value => 10 : set e_value parsing cutoff (default undef)
99 (note the -e_value flag is deprecated.)
101 =cut
103 sub new {
104 my($class,@args) = @_;
106 my $self = $class->SUPER::new(@args);
107 ($self->{'_evalue'},
108 $self->{'_cigar'},
109 $self->{'_prefix'},
110 $self->{'signif'} ) = $self->_rearrange([qw(E_VALUE OUTPUT_CIGAR PREFIX
111 OUTPUT_SIGNIF)], @args);
112 $self->{'_evalue'} && warn( "Use of the -e_value argument is deprecated.\nIn future, use \$writer->filter(\"type\", \&code) instead.\n\tparsing will proceed correctly with this e_value\n");
113 $self->{Gbrowse_HSPID} = 0;
114 $self->{Gbrowse_HITID} = 0;
115 $self->{'_prefix'} ||= $Defaults{'Prefix'};
116 return $self;
119 sub _incrementHSP {
120 my ($self) = @_;
121 return ++$self->{Gbrowse_HSPID};
124 sub _incrementHIT {
125 my ($self) = @_;
126 return ++$self->{Gbrowse_HITID}
128 # according to the GFF3 spec:
129 #"match". In addition to the generic "match"
130 #type, there are the subclasses "cDNA_match," "EST_match,"
131 #"translated_nucleotide_match," "nucleotide_to_protein_match," and
132 #"nucleotide_motif."
134 =head2 to_string
136 Purpose : Produce the Gbrowse format GFF lines for a Result
137 Usage : print $writer->to_string( $result_obj, @args);
138 Argument : $result_obj = A Bio::Search::Result::ResultI object
139 -version => 1|2|2.5|3 ; the GFF format you want to output (default 3)
140 -match_tag => match|cDNA_match|EST_match|translated_nucleotide_match
141 nucleotide_to_protein_match|nucleotide_motif
142 This is the SO term to be placed in GFF column 3.
143 -prefix => String to prefix the group by, default is EST
144 (see %Defaults class variable) A default can also
145 be set on object init
146 Returns : String containing data for each search Result or any of its
147 : sub-objects (Hits and HSPs).
148 Throws : n/a
150 =cut
152 #-reference => 'hit'|'query' ; whether the hit sequence name or the
153 # query sequence name is used as the
154 # 'reference' sequence (GFF column 1)
156 sub to_string {
157 my ($self, $result, @args) = @_;
158 my ($format, $reference,
159 $match_tag,$hsp_tag,
160 $prefix) = $self->_rearrange([qw
161 (VERSION
162 REFERENCE
163 MATCH_TAG HSP_TAG
164 PREFIX)], @args);
165 $self->warn($reference) if $reference;
166 $reference ||='hit'; # default is that the hit sequence (db sequence) becomes the reference sequence. I think this is fairly typical...
167 $match_tag ||= $Defaults{'MatchTag'}; # default is the generic 'match' tag.
168 $hsp_tag ||= $Defaults{'HSPTag'}; # default is the generic 'hsp' tag.
169 $prefix ||= $self->{'_prefix'};
170 $self->throw("$reference must be one of 'query', or 'hit'\n") unless $reference;
172 #************* THIS IS WHERE I STOPPED ****************
173 # *****************************************************
174 #*************************************************
176 $format ||='3';
177 my $gffio = Bio::Tools::GFF->new(-gff_version => $format); # try to set it
179 # just in case that behaviour changes (at the moment, an invalid format throws an exception, but it might return undef in the future
180 return "" unless defined $gffio; # be kind and don't return undef in case the person is putting teh output directly into a printstatement without testing it
181 # now $gffio is either false, or a valid GFF formatter
183 my ($GFF,$cigar,$score);
184 my ($resultfilter,$hitfilter,$hspfilter) = (
185 $self->filter('RESULT'),
186 $self->filter('HIT'),
187 $self->filter('HSP'));
188 $result->can('rewind') && $result->rewind(); # ensure we're at the beginning
189 next if (defined $resultfilter && ! (&{$resultfilter}($result)) );
191 while( my $hit = $result->next_hit ) {
193 if (defined $self->{_evalue}){
194 next unless ($hit->significance < $self->{_evalue});
196 next if( defined $hitfilter && ! &{$hitfilter}($hit) ); # test against filter code
198 my $refseq = $reference eq 'hit' ? $hit->name : $result->query_name;
199 my $seqname = $reference eq 'hit' ? $result->query_name : $hit->name; # hopefully this will be a simple identifier without a full description line!!
200 if ($self->{_signif}) {
201 $score = $hit->significance;
202 } else {
203 $score = $hit->raw_score;
205 $self->throw("No reference sequence name found in hit; required for GFF (this may not be your fault if your report type does not include reference sequence names)\n") unless $refseq;
206 my $source = $hit->algorithm;
207 $self->throw("No algorithm name found in hit; required for GFF (this may not be your fault if your report type does not include algorithm names)\n") unless $refseq;
208 $self->throw("This module only works on BLASTN reports at this time. Sorry.\n") unless $source eq "BLASTN";
210 my @plus_hsps;
211 my @minus_hsps;
213 # pre-process the HSP's because we later need to know
214 # the extents of the plus and munus strand
215 # on both the subject and query strands individually
216 my ($qpmin, $qpmax, $qmmin, $qmmax, $spmin, $spmax, $smmin, $smmax); # variables for the plus/minus strand min start and max end to know the full extents of the hit
217 while( my $hsp = $hit->next_hsp ) {
218 if ( defined $self->{_evalue} ) {
219 # for backward compatibility only
220 next unless ($hsp->significance < $self->{_evalue});
222 next if( defined $hspfilter && ! &{$hspfilter}($hsp) ); # test against HSP filter
223 if ($hsp->hit->strand >= 0 ){
224 push @plus_hsps, $hsp;
225 if (defined $qpmin){ # set or reset the minimum and maximum extent of the plus-strand hit
226 $qpmin = $hsp->query->start if $hsp->query->start < $qpmin;
227 $qpmax = $hsp->query->end if $hsp->query->end > $qpmax;
228 $spmin = $hsp->hit->start if $hsp->hit->start < $spmin;
229 $spmax = $hsp->hit->end if $hsp->hit->end > $spmax;
230 } else {
231 $qpmin = $hsp->query->start;
232 $qpmax = $hsp->query->end;
233 $spmin = $hsp->hit->start;
234 $spmax = $hsp->hit->end;
237 if ($hsp->hit->strand < 0 ){
238 push @minus_hsps, $hsp;
239 if (defined $qmmin){ # set or reset the minimum and maximum extent of the minus-strand hit
240 $qmmin = $hsp->query->start if $hsp->query->start < $qmmin;
241 $qmmax = $hsp->query->end if $hsp->query->end > $qmmax;
242 $smmin = $hsp->hit->start if $hsp->hit->start < $smmin;
243 $smmax = $hsp->hit->end if $hsp->hit->end > $smmax;
244 } else {
245 $qmmin = $hsp->query->start;
246 $qmmax = $hsp->query->end;
247 $smmin = $hsp->hit->start;
248 $smmax = $hsp->hit->end;
251 #else next if there is no strand, but that makes no sense..??
253 next unless (scalar(@plus_hsps) + scalar(@minus_hsps)); # next if no hsps (??)
254 my $ID = $self->_incrementHIT();
255 # okay, write out the index line for the entire hit before
256 # processing HSP's
257 # unfortunately (or not??), HitI objects do not implement
258 # SeqFeatureI, so we can't just call ->gff_string
259 # as a result, this module is quite brittle to changes
260 # in the GFF format since we are hard-coding the GFF output here :-(
262 if (scalar(@plus_hsps)){
263 my %tags = ( 'ID' => "match_sequence$ID");
265 if ($format==2.5) {
266 $tags{'Target'} = "$prefix:$seqname";
267 $tags{'tstart'} = $qmmin;
268 $tags{'tend'} = $qmmax;
269 } else {
270 $tags{'Target'} = "$prefix:$seqname $qpmin $qpmax";
272 if ( $self->{'_cigar'} ) {
273 $tags{'Gap'} = $cigar;
275 my $feat = Bio::SeqFeature::Generic->new(
276 -seq_id => $refseq,
277 -source_tag => $source,
278 -primary_tag => $match_tag,
279 -start => $spmin,
280 -end => $spmax,
281 -score => $score,
282 -strand => '+',
283 -frame => '.',
284 -tag => \%tags
288 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
289 $GFF .= $feat->gff_string($formatter)."\n";
291 if (scalar(@minus_hsps)){
292 my %tags = ( 'ID' => "match_sequence$ID");
294 if ($format==2.5) {
295 $tags{'Target'} = "$prefix:$seqname";
296 $tags{'tstart'} = $qpmax;
297 $tags{'tend'} = $qpmin;
299 else {
300 $tags{'Target'} = "$prefix:$seqname $qpmax $qpmin";
302 my $feat = Bio::SeqFeature::Generic->new(
303 -seq_id => $refseq,
304 -source_tag => $source,
305 -primary_tag => $match_tag,
306 -start => $smmin,
307 -end => $smmax,
308 -score => $score,
309 -strand => '-',
310 -frame => '.',
311 -tag => \%tags
314 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
315 $GFF .= $feat->gff_string($formatter)."\n";
318 # process + strand hsps
319 foreach my $hsp (@plus_hsps){
320 my $hspID = $self->_incrementHSP();
321 my $qstart = $hsp->query->start;
322 my $qend = $hsp->query->end;
323 my $sstart = $hsp->hit->start;
324 my $send = $hsp->hit->end;
325 my $score = $hsp->score;
327 my %tags = ( 'ID' => "match_hsp$hspID",
328 'Parent' => "match_sequence$ID" );
330 if ($format==2.5) {
331 $tags{'Target'} = "$prefix:$seqname";
332 $tags{'tstart'} = $qstart;
333 $tags{'tend'} = $qend;
335 else {
336 $tags{'Target'} = "$prefix:$seqname $qstart $qend";
338 if ( $self->{'_cigar'} ) {
339 $tags{'Gap'} = $hsp->cigar_string;
342 my $feat = Bio::SeqFeature::Generic->new(
343 -seq_id => $refseq,
344 -source_tag => $source,
345 -primary_tag => $hsp_tag,
346 -start => $sstart,
347 -end => $send,
348 -score => $score,
349 -strand => '+',
350 -frame => '.',
351 -tag => \%tags
354 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
355 $GFF .= $feat->gff_string($formatter)."\n";
358 foreach my $hsp (@minus_hsps) {
359 my $hspID = $self->_incrementHSP();
360 my $qstart = $hsp->query->start;
361 my $qend = $hsp->query->end;
362 my $sstart = $hsp->hit->start;
363 my $send = $hsp->hit->end;
364 my $score = $hsp->score;
366 my %tags = ( 'ID' => "match_hsp$hspID",
367 'Parent' => "match_sequence$ID" );
369 if ($format==2.5) {
370 $tags{'Target'} = "$prefix:$seqname";
371 $tags{'tstart'} = $qend;
372 $tags{'tend'} = $qstart;
374 else {
375 $tags{'Target'} = "$prefix:$seqname $qend $qstart";
377 if ( $self->{'_cigar'} ) {
378 $tags{'Gap'} = $hsp->cigar_string;
381 my $feat = Bio::SeqFeature::Generic->new(
382 -seq_id => $refseq,
383 -source_tag => $source,
384 -primary_tag => $hsp_tag,
385 -start => $sstart,
386 -end => $send,
387 -score => $score,
388 -strand => '-',
389 -frame => '.',
390 -tag => \%tags
393 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
394 $GFF .= $feat->gff_string($formatter) ."\n";
397 return $GFF;
400 sub significance_filter {
401 my ($self,$method,$code) = @_;
402 return unless $method;
403 $method = uc($method);
404 if( $method ne 'HSP' &&
405 $method ne 'HIT' &&
406 $method ne 'RESULT' ) {
407 $self->warn("Unknown method $method");
408 return;
410 if( $code ) {
411 $self->throw("Must provide a valid code reference") unless ref($code) =~ /CODE/;
412 $self->{$method} = $code;
414 return $self->{$method};
417 =head2 start_report
419 Title : start_report
420 Usage : $self->start_report()
421 Function: has no function, returns nothing
422 Returns : empty string
423 Args : none
425 =cut
427 sub start_report { return '' }
429 =head2 end_report
431 Title : end_report
432 Usage : $self->end_report()
433 Function: has no function, returns nothing
434 Returns : empty string
435 Args : none
438 =cut
440 sub end_report { return '' }
442 =head2 filter
444 Title : filter
445 Usage : $writer->filter('hsp', \&hsp_filter);
446 Function: Filter out either at HSP,Hit,or Result level
447 Returns : none
448 Args : string => data type,
449 CODE reference
450 Note : GbrowseGFF.pm makes no changes to the default filter code
453 =cut