[bug 2663]
[bioperl-live.git] / Bio / Tools / GFF.pm
blobed3ca866bc5ebadcba362ffacec4ad1f5c75e26c
1 # $Id$
3 # BioPerl module for Bio::Tools::GFF
5 # Cared for by the Bioperl core team
7 # Copyright Matthew Pocock
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
17 =head1 SYNOPSIS
19 use Bio::Tools::GFF;
21 # specify input via -fh or -file
22 my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
23 my $feature;
24 # loop over the input stream
25 while($feature = $gffio->next_feature()) {
26 # do something with feature
28 $gffio->close();
30 # you can also obtain a GFF parser as a SeqAnalasisParserI in
31 # HT analysis pipelines (see Bio::SeqAnalysisParserI and
32 # Bio::Factory::SeqAnalysisParserFactory)
33 my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
34 my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
35 while($feature = $parser->next_feature()) {
36 # do something with feature
39 =head1 DESCRIPTION
41 This class provides a simple GFF parser and writer. In the sense of a
42 SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
43 objects, but is not in any way specific to a particular analysis
44 program and the output that program produces.
46 That is, if you can get your analysis program spit out GFF, here is
47 your result parser.
49 =head1 GFF3 AND SEQUENCE DATA
51 [added by cjm 2004/07/09]
53 GFF3 supports sequence data; see
54 http://song.sourceforge.net/gff3-jan04.shtml
56 There are a number of ways to deal with this -
58 If you call
60 $gffio->ignore_sequence(1)
62 prior to parsing the sequence data is ignored; this is useful if you
63 just want the features. It avoids the memory overhead in building and
64 caching sequences
66 Alternatively, you can call either
68 $gffio->get_seqs()
72 $gffio->seq_id_by_h()
74 At the B<end> of parsing to get either a list or hashref of Bio::Seq
75 objects (see the documentation for each of these methods)
77 Note that these objects will not have the features attached - you have
78 to do this yourself, OR call
80 $gffio->features_attached_to_seqs(1)
82 PRIOR to parsing; this will ensure that the Seqs have the features
83 attached; ie you will then be able to call
85 $seq->get_SeqFeatures();
87 And use Bio::SeqIO methods
89 Note that auto-attaching the features to seqs will incur a higher
90 memory overhead as the features must be cached until the sequence data
91 is found
93 =head1 TODO
95 Make a Bio::SeqIO class specifically for GFF3 with sequence data
97 =head1 FEEDBACK
99 =head2 Mailing Lists
101 User feedback is an integral part of the evolution of this and other
102 Bioperl modules. Send your comments and suggestions preferably to one
103 of the Bioperl mailing lists. Your participation is much appreciated.
105 bioperl-l@bioperl.org - General discussion
106 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108 =head2 Reporting Bugs
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 the bugs and their resolution. Bug reports can be submitted the web:
113 http://bugzilla.open-bio.org/
115 =head1 AUTHOR - Matthew Pocock
117 Email mrp-at-sanger.ac.uk
119 =head1 CONTRIBUTORS
121 Jason Stajich, jason-at-biperl-dot-org
122 Chris Mungall, cjm-at-fruitfly-dot-org
123 Steffen Grossmann [SG], grossman at molgen.mpg.de
124 Malcolm Cook, mec-at-stowers-institute.org
126 =head1 APPENDIX
128 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
130 =cut
132 # Let the code begin...
134 package Bio::Tools::GFF;
136 use vars qw($HAS_HTML_ENTITIES);
137 use strict;
139 use Bio::Seq::SeqFactory;
140 use Bio::LocatableSeq;
141 use Bio::SeqFeature::Generic;
143 use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
145 my $i = 0;
146 my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target);
148 =head2 new
150 Title : new
151 Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2,
152 -file => "filename.gff");
154 my $writer = Bio::Tools::GFF->new(-gff_version => 3,
155 -file => ">filename.gff3");
156 Function: Creates a new instance. Recognized named parameters are -file, -fh,
157 and -gff_version.
158 Returns : a new object
159 Args : named parameters
160 -gff_version => [1,2,3]
162 =cut
165 { # make a class variable such that we can generate unique ID's over
166 # a session, no matter how many instances of GFF.pm we make
167 # since these have to be unique within the scope of a GFF file.
169 my $gff3_featureID = 0;
171 sub _incrementGFF3ID {
172 my ($self) = @_;
173 return ++ $gff3_featureID;
178 sub new {
179 my ($class, @args) = @_;
180 my $self = $class->SUPER::new(@args);
182 my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args);
184 # initialize IO
185 $self->_initialize_io(@args);
186 $self->_parse_header() unless $noparse;
188 $gff_version ||= 2;
189 if( ! $self->gff_version($gff_version) ) {
190 $self->throw("Can't build a GFF object with the unknown version ".
191 $gff_version);
193 $self->{'_first'} = 1;
194 return $self;
197 =head2 _parse_header
199 Title : _parse_header
200 Usage : $gffio->_parse_header()
201 Function: used to turn parse GFF header lines. currently
202 produces Bio::LocatableSeq objects from ##sequence-region
203 lines
204 Returns : 1 on success
205 Args : none
208 =cut
210 sub _parse_header{
211 my ($self) = @_;
213 my @unhandled;
214 local $^W = 0; # hide warnings when we try and parse from a file opened
215 # for writing - there isn't really a better way to do
216 # AFAIK - cannot detech if a FH is read or write.
217 while(my $line = $self->_readline()){
218 my $handled = 0;
219 next if /^\s+$/;
220 if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){
221 my($seqid,$start,$end) = ($1,$2,$3);
222 push @{ $self->{'segments'} }, Bio::LocatableSeq->new
224 -id => unescape($seqid),
225 -start => $start,
226 -end => $end,
227 -length => ($end - $start + 1), ## make the length explicit
229 $handled = 1;
230 } elsif($line =~ /^(\#\#feature-ontology)/) {
231 #to be implemented
232 $self->warn("$1 header tag parsing unimplemented");
233 } elsif($line =~ /^(\#\#attribute-ontology)/) {
234 #to be implemented
235 $self->warn("$1 header tag parsing unimplemented");
236 } elsif($line =~ /^(\#\#source-ontology)/) {
237 #to be implemented
238 $self->warn("$1 header tag parsing unimplemented");
239 } elsif($line =~ /^(\#\#\#)/) {
240 #to be implemented
241 $self->warn("$1 header tag parsing unimplemented");
242 } elsif($line =~ /^(\#\#FASTA)/) {
243 # initial ##FASTA is optional - artemis does not use it
244 $line = $self->_readline();
245 if ($line !~ /^\>(\S+)/) {
246 $self->throw("##FASTA directive must be followed by fasta header, not: $line");
248 } else {
251 if ($line =~ /^\>(.*)/) {
252 # seq data can be at header or footer
253 my $seq = $self->_parse_sequence($line);
254 if ($seq) {
255 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
260 if(!$handled){
261 push @unhandled, $line
264 #looks like the header is over!
265 last unless $line =~ /^\#/;
268 foreach my $line (@unhandled){
269 $self->_pushback($line);
272 return 1;
275 sub _parse_sequence {
276 my ($self, $line) = @_;
278 if ($line =~ /^\>(.*)/) {
280 my $seqid = $1;
281 $seqid =~ s/\s+$//;
282 my $desc = '';
283 if ($seqid =~ /(\S+)\s+(.*)/) {
284 ($seqid, $desc) = ($1,$2);
286 my $res = '';
287 while (my $line = $self->_readline) {
288 if ($line =~ /^\#/) {
289 last;
291 if ($line =~ /^\>/) {
292 $self->_pushback($line);
293 last;
295 $line =~ s/\s//g;
296 $res .= $line;
298 return if $self->ignore_sequence;
300 my $seqfactory = Bio::Seq::SeqFactory->new('Bio::Seq');
301 my $seq = $seqfactory->create(-seq=>$res,
302 -id=>$seqid,
303 -desc=>$desc);
304 $seq->accession_number($seqid);
305 if ($self->features_attached_to_seqs) {
306 my @feats =
307 @{$self->_feature_idx_by_seq_id->{$seqid}};
308 $seq->add_SeqFeature($_) foreach @feats;
309 @{$self->_feature_idx_by_seq_id->{$seqid}} = ();
311 return $seq;
313 else {
314 $self->throw("expected fasta header, not: $line");
319 =head2 next_segment
321 Title : next_segment
322 Usage : my $seq = $gffio->next_segment;
323 Function: Returns a Bio::LocatableSeq object corresponding to a
324 GFF "##sequence-region" header line.
325 Example :
326 Returns : A Bio::LocatableSeq object, or undef if
327 there are no more sequences.
328 Args : none
331 =cut
333 sub next_segment{
334 my ($self,@args) = @_;
335 return shift @{ $self->{'segments'} } if defined $self->{'segments'};
336 return;
339 =head2 next_feature
341 Title : next_feature
342 Usage : $seqfeature = $gffio->next_feature();
343 Function: Returns the next feature available in the input file or stream, or
344 undef if there are no more features.
345 Example :
346 Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
347 more features.
348 Args : none
350 =cut
352 sub next_feature {
353 my ($self) = @_;
355 my $gff_string;
357 # be graceful about empty lines or comments, and make sure we return undef
358 # if the input's consumed
359 while(($gff_string = $self->_readline()) && defined($gff_string)) {
360 if ($gff_string =~ /^\#\#\#/) {
361 # all forward refs have been seen; TODO
363 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
364 $gff_string =~ m{^//});
366 while ($gff_string =~ /^\>(.+)/) {
367 # fasta can be in header or footer
368 my $seq = $self->_parse_sequence($gff_string);
369 if ($seq) {
370 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
371 $gff_string = $self->_readline;
372 last unless $gff_string;
375 last;
377 return unless $gff_string;
379 my $feat = Bio::SeqFeature::Generic->new();
380 $self->from_gff_string($feat, $gff_string);
382 if ($self->features_attached_to_seqs) {
383 push(@{$self->_feature_idx_by_seq_id->{$feat->seq_id}},
384 $feat);
387 return $feat;
390 sub _feature_idx_by_seq_id {
391 my $self = shift;
392 $self->{__feature_idx_by_seq_id} = shift if @_;
393 $self->{__feature_idx_by_seq_id} = {}
394 unless $self->{__feature_idx_by_seq_id};
395 return $self->{__feature_idx_by_seq_id};
399 =head2 from_gff_string
401 Title : from_gff_string
402 Usage : $gff->from_gff_string($feature, $gff_string);
403 Function: Sets properties of a SeqFeatureI object from a GFF-formatted
404 string. Interpretation of the string depends on the version
405 that has been specified at initialization.
407 This method is used by next_feature(). It actually dispatches to
408 one of the version-specific (private) methods.
409 Example :
410 Returns : void
411 Args : A Bio::SeqFeatureI implementing object to be initialized
412 The GFF-formatted string to initialize it from
414 =cut
416 sub from_gff_string {
417 my ($self, $feat, $gff_string) = @_;
419 if($self->gff_version() == 1) {
420 return $self->_from_gff1_string($feat, $gff_string);
421 } elsif( $self->gff_version() == 3 ) {
422 return $self->_from_gff3_string($feat, $gff_string);
423 } else {
424 return $self->_from_gff2_string($feat, $gff_string);
428 =head2 _from_gff1_string
430 Title : _from_gff1_string
431 Usage :
432 Function:
433 Example :
434 Returns : void
435 Args : A Bio::SeqFeatureI implementing object to be initialized
436 The GFF-formatted string to initialize it from
438 =cut
440 sub _from_gff1_string {
441 my ($gff, $feat, $string) = @_;
442 chomp $string;
443 my ($seqname, $source, $primary, $start, $end, $score,
444 $strand, $frame, @group) = split(/\t/, $string);
446 if ( !defined $frame ) {
447 $feat->throw("[$string] does not look like GFF to me");
449 $frame = 0 unless( $frame =~ /^\d+$/);
450 $feat->seq_id($seqname);
451 $feat->source_tag($source);
452 $feat->primary_tag($primary);
453 $feat->start($start);
454 $feat->end($end);
455 $feat->frame($frame);
456 if ( $score eq '.' ) {
457 #$feat->score(undef);
458 } else {
459 $feat->score($score);
461 if ( $strand eq '-' ) { $feat->strand(-1); }
462 if ( $strand eq '+' ) { $feat->strand(1); }
463 if ( $strand eq '.' ) { $feat->strand(0); }
464 foreach my $g ( @group ) {
465 if ( $g =~ /(\S+)=(\S+)/ ) {
466 my $tag = $1;
467 my $value = $2;
468 $feat->add_tag_value($1, $2);
469 } else {
470 $feat->add_tag_value('group', $g);
475 =head2 _from_gff2_string
477 Title : _from_gff2_string
478 Usage :
479 Function:
480 Example :
481 Returns : void
482 Args : A Bio::SeqFeatureI implementing object to be initialized
483 The GFF2-formatted string to initialize it from
486 =cut
488 sub _from_gff2_string {
489 my ($gff, $feat, $string) = @_;
490 chomp($string);
492 # according to the Sanger website, GFF2 should be single-tab
493 # separated elements, and the free-text at the end should contain
494 # text-translated tab symbols but no "real" tabs, so splitting on
495 # \t is safe, and $attribs gets the entire attributes field to be
496 # parsed later
498 # sendu: but the tag value pair can (should?) be separated by a tab. The
499 # 'no tabs' thing seems to apply only to the free text that is allowed for
500 # the value
502 my ($seqname, $source, $primary, $start,
503 $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
504 my $attribs = join ' ', @attribs;
506 if ( !defined $frame ) {
507 $feat->throw("[$string] does not look like GFF2 to me");
509 $feat->seq_id($seqname);
510 $feat->source_tag($source);
511 $feat->primary_tag($primary);
512 $feat->start($start);
513 $feat->end($end);
514 $feat->frame($frame);
515 if ( $score eq '.' ) {
516 # $feat->score(undef);
517 } else {
518 $feat->score($score);
520 if ( $strand eq '-' ) { $feat->strand(-1); }
521 if ( $strand eq '+' ) { $feat->strand(1); }
522 if ( $strand eq '.' ) { $feat->strand(0); }
525 # <Begin Inefficient Code from Mark Wilkinson>
526 # this routine is necessay to allow the presence of semicolons in
527 # quoted text Semicolons are the delimiting character for new
528 # tag/value attributes. it is more or less a "state" machine, with
529 # the "quoted" flag going up and down as we pass thorugh quotes to
530 # distinguish free-text semicolon and hash symbols from GFF control
531 # characters
534 my $flag = 0; # this could be changed to a bit and just be twiddled
535 my @parsed;
537 # run through each character one at a time and check it
538 # NOTE: changed to foreach loop which is more efficient in perl
539 # --jasons
540 for my $a ( split //, $attribs ) {
541 # flag up on entering quoted text, down on leaving it
542 if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 }
543 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
544 elsif( $a eq '#' && ! $flag ) { last }
545 push @parsed, $a;
547 $attribs = join "", @parsed; # rejoin into a single string
549 # <End Inefficient Code>
550 # Please feel free to fix this and make it more "perlish"
552 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
554 foreach my $pair ( @key_vals ) {
555 # replace semicolons that were removed from free-text above.
556 $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
558 # separate the key from the value
559 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
562 if( defined $values ) {
563 my @values;
564 # free text is quoted, so match each free-text block
565 # and remove it from the $values string
566 while ($values =~ s/"(.*?)"//){
567 # and push it on to the list of values (tags may have
568 # more than one value... and the value may be undef)
569 push @values, $1;
572 # and what is left over should be space-separated
573 # non-free-text values
575 my @othervals = split /\s+/, $values;
576 foreach my $othervalue(@othervals){
577 # get rid of any empty strings which might
578 # result from the split
579 if (CORE::length($othervalue) > 0) {push @values, $othervalue}
582 foreach my $value(@values){
583 $feat->add_tag_value($key, $value);
590 sub _from_gff3_string {
591 my ($gff, $feat, $string) = @_;
592 chomp($string);
594 # according to the now nearly final GFF3 spec, columns should
595 # be tab separated, allowing unescaped spaces to occur in
596 # column 9
598 my ($seqname, $source, $primary, $start, $end,
599 $score, $strand, $frame, $groups) = split(/\t/, $string);
601 if ( ! defined $frame ) {
602 $feat->throw("[$string] does not look like GFF3 to me");
604 $feat->seq_id($seqname);
605 $feat->source_tag($source);
606 $feat->primary_tag($primary);
607 $feat->start($start);
608 $feat->end($end);
609 $feat->frame($frame);
610 if ( $score eq '.' ) {
611 #$feat->score(undef);
612 } else {
613 $feat->score($score);
615 if ( $strand eq '-' ) { $feat->strand(-1); }
616 if ( $strand eq '+' ) { $feat->strand(1); }
617 if ( $strand eq '.' ) { $feat->strand(0); }
618 my @groups = split(/\s*;\s*/, $groups);
620 for my $group (@groups) {
621 my ($tag,$value) = split /=/,$group;
622 $tag = unescape($tag);
623 my @values = map {unescape($_)} split /,/,$value;
624 for my $v ( @values ) { $feat->add_tag_value($tag,$v); }
628 # taken from Bio::DB::GFF
629 sub unescape {
630 my $v = shift;
631 $v =~ tr/+/ /;
632 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
633 return $v;
636 =head2 write_feature
638 Title : write_feature
639 Usage : $gffio->write_feature($feature);
640 Function: Writes the specified SeqFeatureI object in GFF format to the stream
641 associated with this instance.
642 Returns : none
643 Args : An array of Bio::SeqFeatureI implementing objects to be serialized
645 =cut
647 sub write_feature {
648 my ($self, @features) = @_;
649 return unless @features;
650 if( $self->{'_first'} && $self->gff_version() == 3 ) {
651 $self->_print("##gff-version 3\n");
653 $self->{'_first'} = 0;
654 foreach my $feature ( @features ) {
655 $self->_print($self->gff_string($feature)."\n");
659 =head2 gff_string
661 Title : gff_string
662 Usage : $gffstr = $gffio->gff_string($feature);
663 Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
664 The formatting depends on the version specified at initialization.
666 This method is used by write_feature(). It actually dispatches to
667 one of the version-specific (private) methods.
668 Example :
669 Returns : A GFF-formatted string representation of the SeqFeature
670 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
672 =cut
674 sub gff_string{
675 my ($self, $feature) = @_;
677 if($self->gff_version() == 1) {
678 return $self->_gff1_string($feature);
679 } elsif( $self->gff_version() == 3 ) {
680 return $self->_gff3_string($feature);
681 } elsif( $self->gff_version() == 2.5 ) {
682 return $self->_gff25_string($feature);
683 } else {
684 return $self->_gff2_string($feature);
688 =head2 _gff1_string
690 Title : _gff1_string
691 Usage : $gffstr = $gffio->_gff1_string
692 Function:
693 Example :
694 Returns : A GFF1-formatted string representation of the SeqFeature
695 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
697 =cut
699 sub _gff1_string{
700 my ($gff, $feat) = @_;
701 my ($str,$score,$frame,$name,$strand);
703 if( $feat->can('score') ) {
704 $score = $feat->score();
706 $score = '.' unless defined $score;
708 if( $feat->can('frame') ) {
709 $frame = $feat->frame();
711 $frame = '.' unless defined $frame;
713 $strand = $feat->strand();
714 if(! $strand) {
715 $strand = ".";
716 } elsif( $strand == 1 ) {
717 $strand = '+';
718 } elsif ( $feat->strand == -1 ) {
719 $strand = '-';
722 if( $feat->can('seqname') ) {
723 $name = $feat->seq_id();
724 $name ||= 'SEQ';
725 } else {
726 $name = 'SEQ';
730 $str = join("\t",
731 $name,
732 $feat->source_tag(),
733 $feat->primary_tag(),
734 $feat->start(),
735 $feat->end(),
736 $score,
737 $strand,
738 $frame);
740 foreach my $tag ( $feat->all_tags ) {
741 foreach my $value ( $feat->each_tag_value($tag) ) {
742 $str .= " $tag=$value" if $value;
747 return $str;
750 =head2 _gff2_string
752 Title : _gff2_string
753 Usage : $gffstr = $gffio->_gff2_string
754 Function:
755 Example :
756 Returns : A GFF2-formatted string representation of the SeqFeature
757 Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
759 =cut
761 sub _gff2_string{
762 my ($gff, $origfeat) = @_;
763 my $feat;
764 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
765 $feat = $origfeat->feature2;
766 } else {
767 $feat = $origfeat;
769 my ($str1, $str2,$score,$frame,$name,$strand);
771 if( $feat->can('score') ) {
772 $score = $feat->score();
774 $score = '.' unless defined $score;
776 if( $feat->can('frame') ) {
777 $frame = $feat->frame();
779 $frame = '.' unless defined $frame;
781 $strand = $feat->strand();
782 if(! $strand) {
783 $strand = ".";
784 } elsif( $strand == 1 ) {
785 $strand = '+';
786 } elsif ( $feat->strand == -1 ) {
787 $strand = '-';
790 if( $feat->can('seqname') ) {
791 $name = $feat->seq_id();
792 $name ||= 'SEQ';
793 } else {
794 $name = 'SEQ';
796 $str1 = join("\t",
797 $name,
798 $feat->source_tag(),
799 $feat->primary_tag(),
800 $feat->start(),
801 $feat->end(),
802 $score,
803 $strand,
804 $frame);
805 # the routine below is the only modification I made to the original
806 # ->gff_string routine (above) as on November 17th, 2000, the
807 # Sanger webpage describing GFF2 format reads: "From version 2
808 # onwards, the attribute field must have a tag value structure
809 # following the syntax used within objects in a .ace file,
810 # flattened onto one line by semicolon separators. Tags must be
811 # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
812 # must be quoted with double quotes".
814 # MW
817 my @all_tags = $feat->all_tags;
818 my @group;
819 if (@all_tags) { # only play this game if it is worth playing...
820 foreach my $tag ( @all_tags ) {
821 my @v;
822 foreach my $value ( $feat->each_tag_value($tag) ) {
823 unless( defined $value && length($value) ) {
824 $value = '""';
825 } elsif ($value =~ /[^A-Za-z0-9_]/){
826 $value =~ s/\t/\\t/g; # substitute tab and newline
827 # characters
828 $value =~ s/\n/\\n/g; # to their UNIX equivalents
829 $value = '"' . $value . '" ';
830 } # if the value contains
831 # anything other than valid
832 # tag/value characters, then
833 # quote it
834 push @v, $value;
835 # for this tag (allowed in GFF2 and .ace format)
837 push @group, "$tag ".join(" ", @v);
840 $str2 .= join(' ; ', @group);
841 # Add Target information for Feature Pairs
842 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
843 ! $feat->has_tag('Group') &&
844 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
845 $str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id,
846 ( $origfeat->feature1->strand < 0 ?
847 ( $origfeat->feature1->end,
848 $origfeat->feature1->start) :
849 ( $origfeat->feature1->start,
850 $origfeat->feature1->end)
851 )) . ($str2?" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw
853 return $str1."\t".$str2;
858 =head2 _gff25_string
860 Title : _gff25_string
861 Usage : $gffstr = $gffio->_gff2_string
862 Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF
863 Example :
864 Returns : A GFF2.5-formatted string representation of the SeqFeature
865 Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified
867 =cut
869 sub _gff25_string {
870 my ($gff, $origfeat) = @_;
871 my $feat;
872 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
873 $feat = $origfeat->feature2;
874 } else {
875 $feat = $origfeat;
877 my ($str1, $str2,$score,$frame,$name,$strand);
879 if( $feat->can('score') ) {
880 $score = $feat->score();
882 $score = '.' unless defined $score;
884 if( $feat->can('frame') ) {
885 $frame = $feat->frame();
887 $frame = '.' unless defined $frame;
889 $strand = $feat->strand();
890 if(! $strand) {
891 $strand = ".";
892 } elsif( $strand == 1 ) {
893 $strand = '+';
894 } elsif ( $feat->strand == -1 ) {
895 $strand = '-';
898 if( $feat->can('seqname') ) {
899 $name = $feat->seq_id();
900 $name ||= 'SEQ';
901 } else {
902 $name = 'SEQ';
904 $str1 = join("\t",
905 $name,
906 $feat->source_tag(),
907 $feat->primary_tag(),
908 $feat->start(),
909 $feat->end(),
910 $score,
911 $strand,
912 $frame);
914 my @all_tags = $feat->all_tags;
915 my @group; my @firstgroup;
916 if (@all_tags) { # only play this game if it is worth playing...
917 foreach my $tag ( @all_tags ) {
918 my @v;
919 foreach my $value ( $feat->each_tag_value($tag) ) {
920 unless( defined $value && length($value) ) {
921 $value = '""';
922 } elsif ($value =~ /[^A-Za-z0-9_]/){
923 $value =~ s/\t/\\t/g; # substitute tab and newline
924 # characters
925 $value =~ s/\n/\\n/g; # to their UNIX equivalents
926 $value = '"' . $value . '" ';
927 } # if the value contains
928 # anything other than valid
929 # tag/value characters, then
930 # quote it
931 push @v, $value;
932 # for this tag (allowed in GFF2 and .ace format)
934 if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we wont get both...
935 push @firstgroup, "$tag ".join(" ", @v);
936 } else {
937 push @group, "$tag ".join(" ", @v);
941 $str2 = join(' ; ', (@firstgroup, @group));
942 # Add Target information for Feature Pairs
943 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
944 ! $feat->has_tag('Group') &&
945 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
946 $str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id,
947 ( $origfeat->feature1->strand < 0 ?
948 ( $origfeat->feature1->end,
949 $origfeat->feature1->start) :
950 ( $origfeat->feature1->start,
951 $origfeat->feature1->end)
952 )) . ($str2?" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw
954 return $str1 . "\t". $str2;
958 =head2 _gff3_string
960 Title : _gff3_string
961 Usage : $gffstr = $gffio->_gff3_string
962 Function:
963 Example :
964 Returns : A GFF3-formatted string representation of the SeqFeature
965 Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified
967 =cut
969 sub _gff3_string {
970 my ($gff, $origfeat) = @_;
971 my $feat;
972 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
973 $feat = $origfeat->feature2;
974 } else {
975 $feat = $origfeat;
978 my $ID = $gff->_incrementGFF3ID();
980 my ($score,$frame,$name,$strand);
982 if( $feat->can('score') ) {
983 $score = $feat->score();
985 $score = '.' unless defined $score;
987 if( $feat->can('frame') ) {
988 $frame = $feat->frame();
990 $frame = '.' unless defined $frame;
992 $strand = $feat->strand();
994 if(! $strand) {
995 $strand = ".";
996 } elsif( $strand == 1 ) {
997 $strand = '+';
998 } elsif ( $feat->strand == -1 ) {
999 $strand = '-';
1002 if( $feat->can('seqname') ) {
1003 $name = $feat->seq_id();
1004 $name ||= 'SEQ';
1005 } else {
1006 $name = 'SEQ';
1009 my @groups;
1011 # force leading ID and Parent tags
1012 my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags;
1013 for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} }
1014 keys %GFF3_ID_Tags ) {
1015 unshift @all_tags, $t if $feat->has_tag($t);
1018 for my $tag ( @all_tags ) {
1019 # next if $tag eq 'Target';
1020 if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){
1021 # simple Target,start,stop
1022 my($target_id, $b,$e,$strand) = $feat->get_tag_values($tag);
1023 next unless(defined($e) && defined($b) && $target_id);
1024 ($b,$e)= ($e,$b) if(defined $strand && $strand<0);
1025 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1026 push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e);
1027 next;
1030 my $valuestr;
1031 # a string which will hold one or more values
1032 # for this tag, with quoted free text and
1033 # space-separated individual values.
1034 my @v;
1035 for my $value ( $feat->each_tag_value($tag) ) {
1036 if( defined $value && length($value) ) {
1037 #$value =~ tr/ /+/; #spaces are allowed now
1038 if ( ref $value eq 'Bio::Annotation::Comment') {
1039 $value = $value->text;
1042 if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) {
1043 $value =~ s/\t/\\t/g; # substitute tab and newline
1044 # characters
1045 $value =~ s/\n/\\n/g; # to their UNIX equivalents
1047 # Unescaped quotes are not allowed in GFF3
1048 # $value = '"' . $value . '"';
1050 $value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1051 } else {
1052 # if it is completely empty,
1053 # then just make empty double
1054 # quotes
1055 $value = '""';
1057 push @v, $value;
1059 # can we figure out how to improve this?
1060 $tag= lcfirst($tag) unless ($tag
1061 =~ /^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/);
1063 push @groups, "$tag=".join(",",@v);
1065 # Add Target information for Feature Pairs
1066 if( $feat->has_tag('Target') &&
1067 ! $feat->has_tag('Group') &&
1068 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
1070 my $target_id = $origfeat->feature1->seq_id;
1071 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1073 push @groups, sprintf("Target=%s %d %d",
1074 $target_id,
1075 ( $origfeat->feature1->strand < 0 ?
1076 ( $origfeat->feature1->end,
1077 $origfeat->feature1->start) :
1078 ( $origfeat->feature1->start,
1079 $origfeat->feature1->end)
1083 # unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID'));
1084 if ( $feat->can('name') && defined($feat->name) ) {
1085 # such as might be for Bio::DB::SeqFeature
1086 unshift @groups, 'Name=' . $feat->name;
1089 my $gff_string = "";
1090 if ($feat->location->isa("Bio::Location::SplitLocationI")) {
1091 my @locs = $feat->location->each_Location;
1092 foreach my $loc (@locs) {
1093 $gff_string .= join("\t",
1094 $name,
1095 $feat->source_tag() || '.',
1096 $feat->primary_tag(),
1097 $loc->start(),
1098 $loc->end(),
1099 $score,
1100 $strand,
1101 $frame,
1102 join(';', @groups)) . "\n";
1104 chop $gff_string;
1105 return $gff_string;
1106 } else {
1107 $gff_string = join("\t",
1108 $name,
1109 $feat->source_tag() || '.',
1110 $feat->primary_tag(),
1111 $feat->start(),
1112 $feat->end(),
1113 $score,
1114 $strand,
1115 $frame,
1116 join(';', @groups));
1118 return $gff_string;
1121 =head2 gff_version
1123 Title : _gff_version
1124 Usage : $gffversion = $gffio->gff_version
1125 Function:
1126 Example :
1127 Returns : The GFF version this parser will accept and emit.
1128 Args : none
1130 =cut
1132 sub gff_version {
1133 my ($self, $value) = @_;
1134 if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) {
1135 $self->{'GFF_VERSION'} = $value;
1137 return $self->{'GFF_VERSION'};
1140 # Make filehandles
1142 =head2 newFh
1144 Title : newFh
1145 Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1146 Function: does a new() followed by an fh()
1147 Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1148 $feature = <$fh>; # read a feature object
1149 print $fh $feature; # write a feature object
1150 Returns : filehandle tied to the Bio::Tools::GFF class
1151 Args :
1154 =cut
1156 sub newFh {
1157 my $class = shift;
1158 return unless my $self = $class->new(@_);
1159 return $self->fh;
1162 =head2 fh
1164 Title : fh
1165 Usage : $obj->fh
1166 Function:
1167 Example : $fh = $obj->fh; # make a tied filehandle
1168 $feature = <$fh>; # read a feature object
1169 print $fh $feature; # write a feature object
1170 Returns : filehandle tied to Bio::Tools::GFF class
1171 Args : none
1174 =cut
1177 sub fh {
1178 my $self = shift;
1179 my $class = ref($self) || $self;
1180 my $s = Symbol::gensym;
1181 tie $$s,$class,$self;
1182 return $s;
1185 # This accessor is used for accessing the Bio::Seq objects from a GFF3
1186 # file; if the file you are using has no sequence data you can ignore
1187 # this accessor
1189 # This accessor returns a hash reference containing Bio::Seq objects,
1190 # indexed by Bio::Seq->primary_id
1192 sub _seq_by_id_h {
1193 my $self = shift;
1195 return $self->{'_seq_by_id_h'} = shift if @_;
1196 $self->{'_seq_by_id_h'} = {}
1197 unless $self->{'_seq_by_id_h'};
1198 return $self->{'_seq_by_id_h'};
1201 =head2 get_seqs
1203 Title : get_seqs
1204 Usage :
1205 Function: Returns all Bio::Seq objects populated by GFF3 file
1206 Example :
1207 Returns :
1208 Args :
1210 =cut
1212 sub get_seqs {
1213 my ($self,@args) = @_;
1214 return values %{$self->_seq_by_id_h};
1217 =head2 features_attached_to_seqs
1219 Title : features_attached_to_seqs
1220 Usage : $obj->features_attached_to_seqs(1);
1221 Function: For use with GFF3 containg sequence only
1223 Setting this B<before> parsing ensures that all Bio::Seq object
1224 created will have the appropriate features added to them
1226 defaults to false (off)
1228 Note that this mode will incur higher memory usage because features
1229 will have to be cached until the relevant feature comes along
1231 Example :
1232 Returns : value of features_attached_to_seqs (a boolean)
1233 Args : on set, new value (a boolean, optional)
1236 =cut
1238 sub features_attached_to_seqs{
1239 my $self = shift;
1241 return $self->{'_features_attached_to_seqs'} = shift if @_;
1242 return $self->{'_features_attached_to_seqs'};
1245 =head2 ignore_sequence
1247 Title : ignore_sequence
1248 Usage : $obj->ignore_sequence(1);
1249 Function: For use with GFF3 containg sequence only
1251 Setting this B<before> parsing means that all sequence data will be
1252 ignored
1254 Example :
1255 Returns : value of ignore_sequence (a boolean)
1256 Args : on set, new value (a boolean, optional)
1258 =cut
1260 sub ignore_sequence{
1261 my $self = shift;
1263 return $self->{'_ignore_sequence'} = shift if @_;
1264 return $self->{'_ignore_sequence'};
1268 sub DESTROY {
1269 my $self = shift;
1270 $self->close();
1273 sub TIEHANDLE {
1274 my ($class,$val) = @_;
1275 return bless {'gffio' => $val}, $class;
1278 sub READLINE {
1279 my $self = shift;
1280 return $self->{'gffio'}->next_feature() unless wantarray;
1281 my (@list, $obj);
1282 push @list, $obj while $obj = $self->{'gffio'}->next_feature();
1283 return @list;
1286 sub PRINT {
1287 my $self = shift;
1288 $self->{'gffio'}->write_feature(@_);