1 #-----------------------------------------------------------------
4 # BioPerl module Bio::SearchIO::Writer::GbrowseGFF.pm
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Mark Wilkinson <markw@illuminae.com>
10 # You may distribute this module under the same terms as perl itself
11 #-----------------------------------------------------------------
15 Bio::SearchIO::Writer::GbrowseGFF - Interface for outputting parsed search results in Gbrowse GFF format
20 my $in = Bio::SearchIO->new(-file => 'result.blast',
22 my $out = Bio::SearchIO->new(-output_format => 'GbrowseGFF',
25 -file => ">result.gff");
26 while( my $r = $in->next_result ) {
27 $out->write_result($r);
32 This writer produces Gbrowse flavour GFF from a Search::Result object.
34 =head1 AUTHOR Mark Wilkinson
36 Email markw-at-illuminae-dot-com
40 Susan Miller sjmiller at email-DOT-arizon-DOT-edu
41 Jason Stajich jason at bioperl-dot-org
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
56 Please direct usage questions or support issues to the mailing list:
58 L<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 of the bugs and their resolution. Bug reports can be submitted via
71 http://bugzilla.open-bio.org/
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
81 package Bio
::SearchIO
::Writer
::GbrowseGFF
;
82 use vars
qw(%Defaults);
85 $Defaults{'Prefix'} = 'EST';
86 $Defaults{'HSPTag'} = 'HSP';
87 $Defaults{'MatchTag'} = 'match';
89 use base qw(Bio::Root::Root Bio::SearchIO::SearchWriterI);
95 Usage : my $obj = Bio::SearchIO::Writer::GbrowseGFF->new(@args);
96 Function: Builds a new Bio::SearchIO::Writer::GbrowseGFF object
97 Returns : an instance of Bio::SearchIO::Writer::GbrowseGFF
98 Args : -e_value => 10 : set e_value parsing cutoff (default undef)
99 (note the -e_value flag is deprecated.)
104 my($class,@args) = @_;
106 my $self = $class->SUPER::new
(@args);
110 $self->{'signif'} ) = $self->_rearrange([qw(E_VALUE OUTPUT_CIGAR PREFIX
111 OUTPUT_SIGNIF)], @args);
112 $self->{'_evalue'} && warn( "Use of the -e_value argument is deprecated.\nIn future, use \$writer->filter(\"type\", \&code) instead.\n\tparsing will proceed correctly with this e_value\n");
113 $self->{Gbrowse_HSPID
} = 0;
114 $self->{Gbrowse_HITID
} = 0;
115 $self->{'_prefix'} ||= $Defaults{'Prefix'};
121 return ++$self->{Gbrowse_HSPID
};
126 return ++$self->{Gbrowse_HITID
}
128 # according to the GFF3 spec:
129 #"match". In addition to the generic "match"
130 #type, there are the subclasses "cDNA_match," "EST_match,"
131 #"translated_nucleotide_match," "nucleotide_to_protein_match," and
136 Purpose : Produce the Gbrowse format GFF lines for a Result
137 Usage : print $writer->to_string( $result_obj, @args);
138 Argument : $result_obj = A Bio::Search::Result::ResultI object
139 -version => 1|2|2.5|3 ; the GFF format you want to output (default 3)
140 -match_tag => match|cDNA_match|EST_match|translated_nucleotide_match
141 nucleotide_to_protein_match|nucleotide_motif
142 This is the SO term to be placed in GFF column 3.
143 -prefix => String to prefix the group by, default is EST
144 (see %Defaults class variable) A default can also
145 be set on object init
146 Returns : String containing data for each search Result or any of its
147 : sub-objects (Hits and HSPs).
152 #-reference => 'hit'|'query' ; whether the hit sequence name or the
153 # query sequence name is used as the
154 # 'reference' sequence (GFF column 1)
157 my ($self, $result, @args) = @_;
158 my ($format, $reference,
160 $prefix) = $self->_rearrange([qw
165 $self->warn($reference) if $reference;
166 $reference ||='hit'; # default is that the hit sequence (db sequence) becomes the reference sequence. I think this is fairly typical...
167 $match_tag ||= $Defaults{'MatchTag'}; # default is the generic 'match' tag.
168 $hsp_tag ||= $Defaults{'HSPTag'}; # default is the generic 'hsp' tag.
169 $prefix ||= $self->{'_prefix'};
170 $self->throw("$reference must be one of 'query', or 'hit'\n") unless $reference;
172 #************* THIS IS WHERE I STOPPED ****************
173 # *****************************************************
174 #*************************************************
177 my $gffio = Bio
::Tools
::GFF
->new(-gff_version
=> $format); # try to set it
179 # just in case that behaviour changes (at the moment, an invalid format throws an exception, but it might return undef in the future
180 return "" unless defined $gffio; # be kind and don't return undef in case the person is putting teh output directly into a printstatement without testing it
181 # now $gffio is either false, or a valid GFF formatter
183 my ($GFF,$cigar,$score);
184 my ($resultfilter,$hitfilter,$hspfilter) = (
185 $self->filter('RESULT'),
186 $self->filter('HIT'),
187 $self->filter('HSP'));
188 $result->can('rewind') && $result->rewind(); # ensure we're at the beginning
189 next if (defined $resultfilter && ! (&{$resultfilter}($result)) );
191 while( my $hit = $result->next_hit ) {
193 if (defined $self->{_evalue
}){
194 next unless ($hit->significance < $self->{_evalue
});
196 next if( defined $hitfilter && ! &{$hitfilter}($hit) ); # test against filter code
198 my $refseq = $reference eq 'hit' ?
$hit->name : $result->query_name;
199 my $seqname = $reference eq 'hit' ?
$result->query_name : $hit->name; # hopefully this will be a simple identifier without a full description line!!
200 if ($self->{_signif
}) {
201 $score = $hit->significance;
203 $score = $hit->raw_score;
205 $self->throw("No reference sequence name found in hit; required for GFF (this may not be your fault if your report type does not include reference sequence names)\n") unless $refseq;
206 my $source = $hit->algorithm;
207 $self->throw("No algorithm name found in hit; required for GFF (this may not be your fault if your report type does not include algorithm names)\n") unless $refseq;
208 $self->throw("This module only works on BLASTN reports at this time. Sorry.\n") unless $source eq "BLASTN";
213 # pre-process the HSP's because we later need to know
214 # the extents of the plus and munus strand
215 # on both the subject and query strands individually
216 my ($qpmin, $qpmax, $qmmin, $qmmax, $spmin, $spmax, $smmin, $smmax); # variables for the plus/minus strand min start and max end to know the full extents of the hit
217 while( my $hsp = $hit->next_hsp ) {
218 if ( defined $self->{_evalue
} ) {
219 # for backward compatibility only
220 next unless ($hsp->significance < $self->{_evalue
});
222 next if( defined $hspfilter && ! &{$hspfilter}($hsp) ); # test against HSP filter
223 if ($hsp->hit->strand >= 0 ){
224 push @plus_hsps, $hsp;
225 if (defined $qpmin){ # set or reset the minimum and maximum extent of the plus-strand hit
226 $qpmin = $hsp->query->start if $hsp->query->start < $qpmin;
227 $qpmax = $hsp->query->end if $hsp->query->end > $qpmax;
228 $spmin = $hsp->hit->start if $hsp->hit->start < $spmin;
229 $spmax = $hsp->hit->end if $hsp->hit->end > $spmax;
231 $qpmin = $hsp->query->start;
232 $qpmax = $hsp->query->end;
233 $spmin = $hsp->hit->start;
234 $spmax = $hsp->hit->end;
237 if ($hsp->hit->strand < 0 ){
238 push @minus_hsps, $hsp;
239 if (defined $qmmin){ # set or reset the minimum and maximum extent of the minus-strand hit
240 $qmmin = $hsp->query->start if $hsp->query->start < $qmmin;
241 $qmmax = $hsp->query->end if $hsp->query->end > $qmmax;
242 $smmin = $hsp->hit->start if $hsp->hit->start < $smmin;
243 $smmax = $hsp->hit->end if $hsp->hit->end > $smmax;
245 $qmmin = $hsp->query->start;
246 $qmmax = $hsp->query->end;
247 $smmin = $hsp->hit->start;
248 $smmax = $hsp->hit->end;
251 #else next if there is no strand, but that makes no sense..??
253 next unless (scalar(@plus_hsps) + scalar(@minus_hsps)); # next if no hsps (??)
254 my $ID = $self->_incrementHIT();
255 # okay, write out the index line for the entire hit before
257 # unfortunately (or not??), HitI objects do not implement
258 # SeqFeatureI, so we can't just call ->gff_string
259 # as a result, this module is quite brittle to changes
260 # in the GFF format since we are hard-coding the GFF output here :-(
262 if (scalar(@plus_hsps)){
263 my %tags = ( 'ID' => "match_sequence$ID");
266 $tags{'Target'} = "$prefix:$seqname";
267 $tags{'tstart'} = $qmmin;
268 $tags{'tend'} = $qmmax;
270 $tags{'Target'} = "$prefix:$seqname $qpmin $qpmax";
272 if ( $self->{'_cigar'} ) {
273 $tags{'Gap'} = $cigar;
275 my $feat = Bio
::SeqFeature
::Generic
->new(
277 -source_tag
=> $source,
278 -primary_tag
=> $match_tag,
288 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
289 $GFF .= $feat->gff_string($formatter)."\n";
291 if (scalar(@minus_hsps)){
292 my %tags = ( 'ID' => "match_sequence$ID");
295 $tags{'Target'} = "$prefix:$seqname";
296 $tags{'tstart'} = $qpmax;
297 $tags{'tend'} = $qpmin;
300 $tags{'Target'} = "$prefix:$seqname $qpmax $qpmin";
302 my $feat = Bio
::SeqFeature
::Generic
->new(
304 -source_tag
=> $source,
305 -primary_tag
=> $match_tag,
314 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
315 $GFF .= $feat->gff_string($formatter)."\n";
318 # process + strand hsps
319 foreach my $hsp (@plus_hsps){
320 my $hspID = $self->_incrementHSP();
321 my $qstart = $hsp->query->start;
322 my $qend = $hsp->query->end;
323 my $sstart = $hsp->hit->start;
324 my $send = $hsp->hit->end;
325 my $score = $hsp->score;
327 my %tags = ( 'ID' => "match_hsp$hspID",
328 'Parent' => "match_sequence$ID" );
331 $tags{'Target'} = "$prefix:$seqname";
332 $tags{'tstart'} = $qstart;
333 $tags{'tend'} = $qend;
336 $tags{'Target'} = "$prefix:$seqname $qstart $qend";
338 if ( $self->{'_cigar'} ) {
339 $tags{'Gap'} = $hsp->cigar_string;
342 my $feat = Bio
::SeqFeature
::Generic
->new(
344 -source_tag
=> $source,
345 -primary_tag
=> $hsp_tag,
354 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
355 $GFF .= $feat->gff_string($formatter)."\n";
358 foreach my $hsp (@minus_hsps) {
359 my $hspID = $self->_incrementHSP();
360 my $qstart = $hsp->query->start;
361 my $qend = $hsp->query->end;
362 my $sstart = $hsp->hit->start;
363 my $send = $hsp->hit->end;
364 my $score = $hsp->score;
366 my %tags = ( 'ID' => "match_hsp$hspID",
367 'Parent' => "match_sequence$ID" );
370 $tags{'Target'} = "$prefix:$seqname";
371 $tags{'tstart'} = $qend;
372 $tags{'tend'} = $qstart;
375 $tags{'Target'} = "$prefix:$seqname $qend $qstart";
377 if ( $self->{'_cigar'} ) {
378 $tags{'Gap'} = $hsp->cigar_string;
381 my $feat = Bio
::SeqFeature
::Generic
->new(
383 -source_tag
=> $source,
384 -primary_tag
=> $hsp_tag,
393 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
394 $GFF .= $feat->gff_string($formatter) ."\n";
400 sub significance_filter
{
401 my ($self,$method,$code) = @_;
402 return unless $method;
403 $method = uc($method);
404 if( $method ne 'HSP' &&
406 $method ne 'RESULT' ) {
407 $self->warn("Unknown method $method");
411 $self->throw("Must provide a valid code reference") unless ref($code) =~ /CODE/;
412 $self->{$method} = $code;
414 return $self->{$method};
420 Usage : $self->start_report()
421 Function: has no function, returns nothing
422 Returns : empty string
427 sub start_report
{ return '' }
432 Usage : $self->end_report()
433 Function: has no function, returns nothing
434 Returns : empty string
440 sub end_report
{ return '' }
445 Usage : $writer->filter('hsp', \&hsp_filter);
446 Function: Filter out either at HSP,Hit,or Result level
448 Args : string => data type,
450 Note : GbrowseGFF.pm makes no changes to the default filter code