2 # BioPerl module for Bio::Tools::GFF
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by the Bioperl core team
8 # Copyright Matthew Pocock
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
22 # specify input via -fh or -file
23 my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
25 # loop over the input stream
26 while($feature = $gffio->next_feature()) {
27 # do something with feature
31 # you can also obtain a GFF parser as a SeqAnalasisParserI in
32 # HT analysis pipelines (see Bio::SeqAnalysisParserI and
33 # Bio::Factory::SeqAnalysisParserFactory)
34 my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
35 my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
36 while($feature = $parser->next_feature()) {
37 # do something with feature
42 This class provides a simple GFF parser and writer. In the sense of a
43 SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
44 objects, but is not in any way specific to a particular analysis
45 program and the output that program produces.
47 That is, if you can get your analysis program spit out GFF, here is
50 =head1 GFF3 AND SEQUENCE DATA
52 GFF3 supports sequence data; see
54 http://www.sequenceontology.org/gff3.shtml
56 There are a number of ways to deal with this -
60 $gffio->ignore_sequence(1)
62 prior to parsing the sequence data is ignored; this is useful if you
63 just want the features. It avoids the memory overhead in building and
66 Alternatively, you can call either
74 At the B<end> of parsing to get either a list or hashref of Bio::Seq
75 objects (see the documentation for each of these methods)
77 Note that these objects will not have the features attached - you have
78 to do this yourself, OR call
80 $gffio->features_attached_to_seqs(1)
82 PRIOR to parsing; this will ensure that the Seqs have the features
83 attached; ie you will then be able to call
85 $seq->get_SeqFeatures();
87 And use Bio::SeqIO methods
89 Note that auto-attaching the features to seqs will incur a higher
90 memory overhead as the features must be cached until the sequence data
95 Make a Bio::SeqIO class specifically for GFF3 with sequence data
101 User feedback is an integral part of the evolution of this and other
102 Bioperl modules. Send your comments and suggestions preferably to one
103 of the Bioperl mailing lists. Your participation is much appreciated.
105 bioperl-l@bioperl.org - General discussion
106 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
110 Please direct usage questions or support issues to the mailing list:
112 I<bioperl-l@bioperl.org>
114 rather than to the module maintainer directly. Many experienced and
115 reponsive experts will be able look at the problem and quickly
116 address it. Please include a thorough description of the problem
117 with code and data examples if at all possible.
119 =head2 Reporting Bugs
121 Report bugs to the Bioperl bug tracking system to help us keep track
122 the bugs and their resolution. Bug reports can be submitted the web:
124 https://github.com/bioperl/bioperl-live/issues
126 =head1 AUTHOR - Matthew Pocock
128 Email mrp-at-sanger.ac.uk
132 Jason Stajich, jason-at-biperl-dot-org
133 Chris Mungall, cjm-at-fruitfly-dot-org
134 Steffen Grossmann [SG], grossman at molgen.mpg.de
135 Malcolm Cook, mec-at-stowers-institute.org
139 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
143 # Let the code begin...
145 package Bio
::Tools
::GFF
;
147 use vars
qw($HAS_HTML_ENTITIES);
150 use Bio::Seq::SeqFactory;
151 use Bio::LocatableSeq;
152 use Bio::SeqFeature::Generic;
154 use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
157 my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target);
159 # for skipping data that may be represented elsewhere; currently, this is
161 my %SKIPPED_TAGS = map { $_ => 1 } qw(score);
167 Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2,
168 -file => "filename.gff");
170 my $writer = Bio::Tools::GFF->new(-gff_version => 3,
171 -file => ">filename.gff3");
172 Function: Creates a new instance. Recognized named parameters are -file, -fh,
174 Returns : a new object
175 Args : named parameters
176 -gff_version => [1,2,3]
180 { # make a class variable such that we can generate unique ID's over
181 # a session, no matter how many instances of GFF.pm we make
182 # since these have to be unique within the scope of a GFF file.
184 my $gff3_featureID = 0;
186 sub _incrementGFF3ID
{
188 return ++ $gff3_featureID;
194 my ($class, @args) = @_;
195 my $self = $class->SUPER::new
(@args);
197 my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args);
200 $self->_initialize_io(@args);
201 $self->_parse_header() unless $noparse;
204 if( ! $self->gff_version($gff_version) ) {
205 $self->throw("Can't build a GFF object with the unknown version ".
208 $self->{'_first'} = 1;
215 Title : _parse_header
216 Usage : $gffio->_parse_header()
217 Function: used to turn parse GFF header lines. currently
218 produces Bio::LocatableSeq objects from ##sequence-region
220 Returns : 1 on success
230 local $^W
= 0; # hide warnings when we try and parse from a file opened
231 # for writing - there isn't really a better way to do
232 # AFAIK - cannot detech if a FH is read or write.
233 while(my $line = $self->_readline()){
236 if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){
237 my($seqid,$start,$end) = ($1,$2,$3);
238 push @
{ $self->{'segments'} }, Bio
::LocatableSeq
->new(
239 -id
=> unescape
($seqid),
242 -length => ($end - $start + 1), ## make the length explicit
245 } elsif($line =~ /^(\#\#feature-ontology)/) {
247 $self->warn("$1 header tag parsing unimplemented");
248 } elsif($line =~ /^(\#\#attribute-ontology)/) {
250 $self->warn("$1 header tag parsing unimplemented");
251 } elsif($line =~ /^(\#\#source-ontology)/) {
253 $self->warn("$1 header tag parsing unimplemented");
254 } elsif($line =~ /^(\#\#\#)/) {
256 $self->warn("$1 header tag parsing unimplemented");
257 } elsif($line =~ /^(\#\#FASTA)/) {
258 # initial ##FASTA is optional - artemis does not use it
259 $line = $self->_readline();
260 if ($line !~ /^\>(\S+)/) {
261 $self->throw("##FASTA directive must be followed by fasta header, not: $line");
265 if ($line =~ /^\>(.*)/) {
266 # seq data can be at header or footer
267 my $seq = $self->_parse_sequence($line);
269 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
275 push @unhandled, $line;
278 #looks like the header is over!
279 last unless $line =~ /^\#/;
282 foreach my $line (@unhandled){
283 $self->_pushback($line);
289 sub _parse_sequence
{
290 my ($self, $line) = @_;
292 if ($line =~ /^\>(.*)/) {
297 if ($seqid =~ /(\S+)\s+(.*)/) {
298 ($seqid, $desc) = ($1,$2);
301 while (my $line = $self->_readline) {
302 if ($line =~ /^\#/) {
305 if ($line =~ /^\>/) {
306 $self->_pushback($line);
312 return if $self->ignore_sequence;
314 my $seqfactory = Bio
::Seq
::SeqFactory
->new('Bio::Seq');
315 my $seq = $seqfactory->create(-seq
=>$res,
318 $seq->accession_number($seqid);
319 if ($self->features_attached_to_seqs) {
321 @
{$self->_feature_idx_by_seq_id->{$seqid}};
322 $seq->add_SeqFeature($_) foreach @feats;
323 @
{$self->_feature_idx_by_seq_id->{$seqid}} = ();
328 $self->throw("expected fasta header, not: $line");
336 Usage : my $seq = $gffio->next_segment;
337 Function: Returns a Bio::LocatableSeq object corresponding to a
338 GFF "##sequence-region" header line.
340 Returns : A Bio::LocatableSeq object, or undef if
341 there are no more sequences.
348 my ($self,@args) = @_;
349 return shift @
{ $self->{'segments'} } if defined $self->{'segments'};
357 Usage : $seqfeature = $gffio->next_feature();
358 Function: Returns the next feature available in the input file or stream, or
359 undef if there are no more features.
361 Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
372 # be graceful about empty lines or comments, and make sure we return undef
373 # if the input's consumed
374 while(($gff_string = $self->_readline()) && defined($gff_string)) {
375 if ($gff_string =~ /^\#\#\#/) {
376 # all forward refs have been seen; TODO
378 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
379 $gff_string =~ m{^//});
381 while ($gff_string =~ /^\>(.+)/) {
382 # fasta can be in header or footer
383 my $seq = $self->_parse_sequence($gff_string);
385 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
386 $gff_string = $self->_readline;
387 last unless $gff_string;
392 return unless $gff_string;
394 my $feat = Bio
::SeqFeature
::Generic
->new();
395 $self->from_gff_string($feat, $gff_string);
397 if ($self->features_attached_to_seqs) {
398 push(@
{$self->_feature_idx_by_seq_id->{$feat->seq_id}},
405 sub _feature_idx_by_seq_id
{
407 $self->{__feature_idx_by_seq_id
} = shift if @_;
408 $self->{__feature_idx_by_seq_id
} = {}
409 unless $self->{__feature_idx_by_seq_id
};
410 return $self->{__feature_idx_by_seq_id
};
414 =head2 from_gff_string
416 Title : from_gff_string
417 Usage : $gff->from_gff_string($feature, $gff_string);
418 Function: Sets properties of a SeqFeatureI object from a GFF-formatted
419 string. Interpretation of the string depends on the version
420 that has been specified at initialization.
422 This method is used by next_feature(). It actually dispatches to
423 one of the version-specific (private) methods.
426 Args : A Bio::SeqFeatureI implementing object to be initialized
427 The GFF-formatted string to initialize it from
431 sub from_gff_string
{
432 my ($self, $feat, $gff_string) = @_;
434 if($self->gff_version() == 1) {
435 return $self->_from_gff1_string($feat, $gff_string);
436 } elsif( $self->gff_version() == 3 ) {
437 return $self->_from_gff3_string($feat, $gff_string);
439 return $self->_from_gff2_string($feat, $gff_string);
444 =head2 _from_gff1_string
446 Title : _from_gff1_string
451 Args : A Bio::SeqFeatureI implementing object to be initialized
452 The GFF-formatted string to initialize it from
456 sub _from_gff1_string
{
457 my ($gff, $feat, $string) = @_;
459 my ($seqname, $source, $primary, $start, $end, $score,
460 $strand, $frame, @group) = split(/\t/, $string);
462 if ( !defined $frame ) {
463 $feat->throw("[$string] does not look like GFF to me");
465 $frame = 0 unless( $frame =~ /^\d+$/);
466 $feat->seq_id($seqname);
467 $feat->source_tag($source);
468 $feat->primary_tag($primary);
469 $feat->start($start);
471 $feat->frame($frame);
472 if ( $score eq '.' ) {
473 #$feat->score(undef);
475 $feat->score($score);
477 if ( $strand eq '-' ) { $feat->strand(-1); }
478 if ( $strand eq '+' ) { $feat->strand(1); }
479 if ( $strand eq '.' ) { $feat->strand(0); }
480 foreach my $g ( @group ) {
481 if ( $g =~ /(\S+)=(\S+)/ ) {
484 $feat->add_tag_value($1, $2);
486 $feat->add_tag_value('group', $g);
492 =head2 _from_gff2_string
494 Title : _from_gff2_string
499 Args : A Bio::SeqFeatureI implementing object to be initialized
500 The GFF2-formatted string to initialize it from
505 sub _from_gff2_string
{
506 my ($gff, $feat, $string) = @_;
509 # according to the Sanger website, GFF2 should be single-tab
510 # separated elements, and the free-text at the end should contain
511 # text-translated tab symbols but no "real" tabs, so splitting on
512 # \t is safe, and $attribs gets the entire attributes field to be
515 # sendu: but the tag value pair can (should?) be separated by a tab. The
516 # 'no tabs' thing seems to apply only to the free text that is allowed for
519 my ($seqname, $source, $primary, $start,
520 $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
521 my $attribs = join ' ', @attribs;
523 if ( !defined $frame ) {
524 $feat->throw("[$string] does not look like GFF2 to me");
526 $feat->seq_id($seqname);
527 $feat->source_tag($source);
528 $feat->primary_tag($primary);
529 $feat->start($start);
531 $feat->frame($frame);
532 if ( $score eq '.' ) {
533 # $feat->score(undef);
535 $feat->score($score);
537 if ( $strand eq '-' ) { $feat->strand(-1); }
538 if ( $strand eq '+' ) { $feat->strand(1); }
539 if ( $strand eq '.' ) { $feat->strand(0); }
542 # <Begin Inefficient Code from Mark Wilkinson>
543 # this routine is necessay to allow the presence of semicolons in
544 # quoted text Semicolons are the delimiting character for new
545 # tag/value attributes. it is more or less a "state" machine, with
546 # the "quoted" flag going up and down as we pass thorugh quotes to
547 # distinguish free-text semicolon and hash symbols from GFF control
550 my $flag = 0; # this could be changed to a bit and just be twiddled
553 # run through each character one at a time and check it
554 # NOTE: changed to foreach loop which is more efficient in perl
556 for my $a ( split //, $attribs ) {
557 # flag up on entering quoted text, down on leaving it
558 if( $a eq '"') { $flag = ( $flag == 0 ) ?
1:0 }
559 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
560 elsif( $a eq '#' && ! $flag ) { last }
563 $attribs = join "", @parsed; # rejoin into a single string
565 # <End Inefficient Code>
566 # Please feel free to fix this and make it more "perlish"
568 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
570 foreach my $pair ( @key_vals ) {
571 # replace semicolons that were removed from free-text above.
572 $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
574 # separate the key from the value
575 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
577 if( defined $values ) {
579 # free text is quoted, so match each free-text block
580 # and remove it from the $values string
581 while ($values =~ s/"(.*?)"//){
582 # and push it on to the list of values (tags may have
583 # more than one value... and the value may be undef)
587 # and what is left over should be space-separated
588 # non-free-text values
590 my @othervals = split /\s+/, $values;
591 foreach my $othervalue(@othervals){
592 # get rid of any empty strings which might
593 # result from the split
594 if (CORE
::length($othervalue) > 0) {push @values, $othervalue}
597 foreach my $value(@values){
598 $feat->add_tag_value($key, $value);
605 sub _from_gff3_string
{
606 my ($gff, $feat, $string) = @_;
609 # according to the now nearly final GFF3 spec, columns should
610 # be tab separated, allowing unescaped spaces to occur in
613 my ($seqname, $source, $primary, $start, $end,
614 $score, $strand, $frame, $groups) = split(/\t/, $string);
616 if ( ! defined $frame ) {
617 $feat->throw("[$string] does not look like GFF3 to me");
619 $feat->seq_id($seqname);
620 $feat->source_tag($source);
621 $feat->primary_tag($primary);
622 $feat->start($start);
624 $feat->frame($frame);
625 if ( $score eq '.' ) {
626 #$feat->score(undef);
628 $feat->score($score);
630 if ( $strand eq '-' ) { $feat->strand(-1); }
631 if ( $strand eq '+' ) { $feat->strand(1); }
632 if ( $strand eq '.' ) { $feat->strand(0); }
633 my @groups = split(/\s*;\s*/, $groups);
635 for my $group (@groups) {
636 my ($tag,$value) = split /=/,$group;
637 $tag = unescape
($tag);
638 my @values = map {unescape
($_)} split /,/,$value;
639 for my $v ( @values ) { $feat->add_tag_value($tag,$v); }
643 # taken from Bio::DB::GFF
647 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
654 Title : write_feature
655 Usage : $gffio->write_feature($feature);
656 Function: Writes the specified SeqFeatureI object in GFF format to the stream
657 associated with this instance.
659 Args : An array of Bio::SeqFeatureI implementing objects to be serialized
664 my ($self, @features) = @_;
665 return unless @features;
666 if( $self->{'_first'} && $self->gff_version() == 3 ) {
667 $self->_print("##gff-version 3\n");
669 $self->{'_first'} = 0;
670 foreach my $feature ( @features ) {
671 $self->_print($self->gff_string($feature)."\n");
679 Usage : $gffstr = $gffio->gff_string($feature);
680 Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
681 The formatting depends on the version specified at initialization.
683 This method is used by write_feature(). It actually dispatches to
684 one of the version-specific (private) methods.
686 Returns : A GFF-formatted string representation of the SeqFeature
687 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
692 my ($self, $feature) = @_;
694 if($self->gff_version() == 1) {
695 return $self->_gff1_string($feature);
696 } elsif( $self->gff_version() == 3 ) {
697 return $self->_gff3_string($feature);
698 } elsif( $self->gff_version() == 2.5 ) {
699 return $self->_gff25_string($feature);
701 return $self->_gff2_string($feature);
709 Usage : $gffstr = $gffio->_gff1_string
712 Returns : A GFF1-formatted string representation of the SeqFeature
713 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
718 my ($gff, $feat) = @_;
719 my ($str,$score,$frame,$name,$strand);
721 if( $feat->can('score') ) {
722 $score = $feat->score();
724 $score = '.' unless defined $score;
726 if( $feat->can('frame') ) {
727 $frame = $feat->frame();
729 $frame = '.' unless defined $frame;
731 $strand = $feat->strand();
734 } elsif( $strand == 1 ) {
736 } elsif ( $feat->strand == -1 ) {
740 if( $feat->can('seqname') ) {
741 $name = $feat->seq_id();
757 foreach my $tag ( $feat->get_all_tags ) {
758 next if exists $SKIPPED_TAGS{$tag};
759 foreach my $value ( $feat->get_tag_values($tag) ) {
760 $str .= " $tag=$value" if $value;
771 Usage : $gffstr = $gffio->_gff2_string
774 Returns : A GFF2-formatted string representation of the SeqFeature
775 Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
780 my ($gff, $origfeat) = @_;
782 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
783 $feat = $origfeat->feature2;
787 my ($str1, $str2,$score,$frame,$name,$strand);
789 if( $feat->can('score') ) {
790 $score = $feat->score();
792 $score = '.' unless defined $score;
794 if( $feat->can('frame') ) {
795 $frame = $feat->frame();
797 $frame = '.' unless defined $frame;
799 $strand = $feat->strand();
802 } elsif( $strand == 1 ) {
804 } elsif ( $feat->strand == -1 ) {
808 if( $feat->can('seqname') ) {
809 $name = $feat->seq_id();
816 $feat->primary_tag(),
822 # the routine below is the only modification I made to the original
823 # ->gff_string routine (above) as on November 17th, 2000, the
824 # Sanger webpage describing GFF2 format reads: "From version 2
825 # onwards, the attribute field must have a tag value structure
826 # following the syntax used within objects in a .ace file,
827 # flattened onto one line by semicolon separators. Tags must be
828 # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
829 # must be quoted with double quotes".
834 foreach my $tag ( $feat->get_all_tags ) {
835 next if exists $SKIPPED_TAGS{$tag};
837 foreach my $value ( $feat->get_tag_values($tag) ) {
838 unless( defined $value && length($value) ) {
839 # quote anything other than valid tag/value characters
841 } elsif ($value =~ /[^A-Za-z0-9_]/){
842 # substitute tab and newline chars by their UNIX equivalent
843 $value =~ s/\t/\\t/g;
844 $value =~ s/\n/\\n/g;
845 $value = '"' . $value . '" ';
848 # for this tag (allowed in GFF2 and .ace format)
850 push @group, "$tag ".join(" ", @v);
853 $str2 .= join(' ; ', @group);
854 # Add Target information for Feature Pairs
855 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
856 ! $feat->has_tag('Group') &&
857 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
858 $str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id,
859 ( $origfeat->feature1->strand < 0 ?
860 ( $origfeat->feature1->end,
861 $origfeat->feature1->start) :
862 ( $origfeat->feature1->start,
863 $origfeat->feature1->end)
864 )) . ($str2?
" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw
866 return $str1."\t".$str2;
872 Title : _gff25_string
873 Usage : $gffstr = $gffio->_gff2_string
874 Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF
876 Returns : A GFF2.5-formatted string representation of the SeqFeature
877 Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified
882 my ($gff, $origfeat) = @_;
884 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
885 $feat = $origfeat->feature2;
889 my ($str1, $str2,$score,$frame,$name,$strand);
891 if( $feat->can('score') ) {
892 $score = $feat->score();
894 $score = '.' unless defined $score;
896 if( $feat->can('frame') ) {
897 $frame = $feat->frame();
899 $frame = '.' unless defined $frame;
901 $strand = $feat->strand();
904 } elsif( $strand == 1 ) {
906 } elsif ( $feat->strand == -1 ) {
910 if( $feat->can('seqname') ) {
911 $name = $feat->seq_id();
919 $feat->primary_tag(),
926 my @all_tags = $feat->all_tags;
927 my @group; my @firstgroup;
928 if (@all_tags) { # only play this game if it is worth playing...
929 foreach my $tag ( @all_tags ) {
931 foreach my $value ( $feat->get_tag_values($tag) ) {
932 next if exists $SKIPPED_TAGS{$tag};
933 unless( defined $value && length($value) ) {
935 } elsif ($value =~ /[^A-Za-z0-9_]/){
936 $value =~ s/\t/\\t/g; # substitute tab and newline
938 $value =~ s/\n/\\n/g; # to their UNIX equivalents
939 $value = '"' . $value . '" ';
940 } # if the value contains
941 # anything other than valid
942 # tag/value characters, then
945 # for this tag (allowed in GFF2 and .ace format)
947 if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we wont get both...
948 push @firstgroup, "$tag ".join(" ", @v);
950 push @group, "$tag ".join(" ", @v);
954 $str2 = join(' ; ', (@firstgroup, @group));
955 # Add Target information for Feature Pairs
956 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
957 ! $feat->has_tag('Group') &&
958 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
959 $str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id,
960 ( $origfeat->feature1->strand < 0 ?
961 ( $origfeat->feature1->end,
962 $origfeat->feature1->start) :
963 ( $origfeat->feature1->start,
964 $origfeat->feature1->end)
965 )) . ($str2?
" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw
967 return $str1 . "\t". $str2;
974 Usage : $gffstr = $gffio->_gff3_string
977 Returns : A GFF3-formatted string representation of the SeqFeature
978 Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified
983 my ($gff, $origfeat) = @_;
985 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
986 $feat = $origfeat->feature2;
991 my $ID = $gff->_incrementGFF3ID();
993 my ($score,$frame,$name,$strand);
995 if( $feat->can('score') ) {
996 $score = $feat->score();
998 $score = '.' unless defined $score;
1000 if( $feat->can('frame') ) {
1001 $frame = $feat->frame();
1003 $frame = '1' unless defined $frame;
1005 $strand = $feat->strand();
1009 } elsif( $strand == 1 ) {
1011 } elsif ( $feat->strand == -1 ) {
1015 if( $feat->can('seqname') ) {
1016 $name = $feat->seq_id();
1024 # force leading ID and Parent tags
1025 my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags;
1026 for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} }
1027 keys %GFF3_ID_Tags ) {
1028 unshift @all_tags, $t if $feat->has_tag($t);
1031 for my $tag ( @all_tags ) {
1032 next if exists $SKIPPED_TAGS{$tag};
1033 # next if $tag eq 'Target';
1034 if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){
1035 # simple Target,start,stop
1036 my($target_id, $b,$e,$strand) = $feat->get_tag_values($tag);
1037 next unless(defined($e) && defined($b) && $target_id);
1038 ($b,$e)= ($e,$b) if(defined $strand && $strand<0);
1039 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1040 push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e);
1045 # a string which will hold one or more values
1046 # for this tag, with quoted free text and
1047 # space-separated individual values.
1049 for my $value ( $feat->get_tag_values($tag) ) {
1050 if( defined $value && length($value) ) {
1051 #$value =~ tr/ /+/; #spaces are allowed now
1052 if ( ref $value eq 'Bio::Annotation::Comment') {
1053 $value = $value->text;
1056 if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) {
1057 $value =~ s/\t/\\t/g; # substitute tab and newline
1059 $value =~ s/\n/\\n/g; # to their UNIX equivalents
1061 # Unescaped quotes are not allowed in GFF3
1062 # $value = '"' . $value . '"';
1064 $value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1066 # if it is completely empty, then just make empty double quotes
1071 # can we figure out how to improve this?
1072 $tag = lcfirst($tag) unless ( $tag =~
1073 /^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/);
1075 push @groups, "$tag=".join(",",@v);
1077 # Add Target information for Feature Pairs
1078 if( $feat->has_tag('Target') &&
1079 ! $feat->has_tag('Group') &&
1080 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
1082 my $target_id = $origfeat->feature1->seq_id;
1083 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1085 push @groups, sprintf("Target=%s %d %d",
1087 ( $origfeat->feature1->strand < 0 ?
1088 ( $origfeat->feature1->end,
1089 $origfeat->feature1->start) :
1090 ( $origfeat->feature1->start,
1091 $origfeat->feature1->end)
1095 # unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID'));
1096 if ( $feat->can('name') && defined($feat->name) ) {
1097 # such as might be for Bio::DB::SeqFeature
1098 unshift @groups, 'Name=' . $feat->name;
1101 my $gff_string = "";
1102 if ($feat->location->isa("Bio::Location::SplitLocationI")) {
1103 my @locs = $feat->location->each_Location;
1104 foreach my $loc (@locs) {
1105 $gff_string .= join("\t",
1107 $feat->source_tag() || '.',
1108 $feat->primary_tag(),
1114 join(';', @groups)) . "\n";
1119 $gff_string = join("\t",
1121 $feat->source_tag() || '.',
1122 $feat->primary_tag(),
1128 join(';', @groups));
1136 Title : _gff_version
1137 Usage : $gffversion = $gffio->gff_version
1140 Returns : The GFF version this parser will accept and emit.
1146 my ($self, $value) = @_;
1147 if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) {
1148 $self->{'GFF_VERSION'} = $value;
1150 return $self->{'GFF_VERSION'};
1159 Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1160 Function: does a new() followed by an fh()
1161 Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1162 $feature = <$fh>; # read a feature object
1163 print $fh $feature; # write a feature object
1164 Returns : filehandle tied to the Bio::Tools::GFF class
1171 return unless my $self = $class->new(@_);
1181 Example : $fh = $obj->fh; # make a tied filehandle
1182 $feature = <$fh>; # read a feature object
1183 print $fh $feature; # write a feature object
1184 Returns : filehandle tied to Bio::Tools::GFF class
1192 my $class = ref($self) || $self;
1193 my $s = Symbol
::gensym
;
1194 tie
$$s,$class,$self;
1198 # This accessor is used for accessing the Bio::Seq objects from a GFF3
1199 # file; if the file you are using has no sequence data you can ignore
1202 # This accessor returns a hash reference containing Bio::Seq objects,
1203 # indexed by Bio::Seq->primary_id
1208 return $self->{'_seq_by_id_h'} = shift if @_;
1209 $self->{'_seq_by_id_h'} = {}
1210 unless $self->{'_seq_by_id_h'};
1211 return $self->{'_seq_by_id_h'};
1219 Function: Returns all Bio::Seq objects populated by GFF3 file
1227 my ($self,@args) = @_;
1228 return values %{$self->_seq_by_id_h};
1232 =head2 features_attached_to_seqs
1234 Title : features_attached_to_seqs
1235 Usage : $obj->features_attached_to_seqs(1);
1236 Function: For use with GFF3 containg sequence only
1238 Setting this B<before> parsing ensures that all Bio::Seq object
1239 created will have the appropriate features added to them
1241 defaults to false (off)
1243 Note that this mode will incur higher memory usage because features
1244 will have to be cached until the relevant feature comes along
1247 Returns : value of features_attached_to_seqs (a boolean)
1248 Args : on set, new value (a boolean, optional)
1253 sub features_attached_to_seqs
{
1256 return $self->{'_features_attached_to_seqs'} = shift if @_;
1257 return $self->{'_features_attached_to_seqs'};
1261 =head2 ignore_sequence
1263 Title : ignore_sequence
1264 Usage : $obj->ignore_sequence(1);
1265 Function: For use with GFF3 containg sequence only
1267 Setting this B<before> parsing means that all sequence data will be
1271 Returns : value of ignore_sequence (a boolean)
1272 Args : on set, new value (a boolean, optional)
1276 sub ignore_sequence
{
1279 return $self->{'_ignore_sequence'} = shift if @_;
1280 return $self->{'_ignore_sequence'};
1290 my ($class,$val) = @_;
1291 return bless {'gffio' => $val}, $class;
1296 return $self->{'gffio'}->next_feature() || undef unless wantarray;
1298 push @list, $obj while $obj = $self->{'gffio'}->next_feature();
1304 $self->{'gffio'}->write_feature(@_);