3 # BioPerl module for Bio::Tools::GFF
5 # Cared for by the Bioperl core team
7 # Copyright Matthew Pocock
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
21 # specify input via -fh or -file
22 my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
24 # loop over the input stream
25 while($feature = $gffio->next_feature()) {
26 # do something with feature
30 # you can also obtain a GFF parser as a SeqAnalasisParserI in
31 # HT analysis pipelines (see Bio::SeqAnalysisParserI and
32 # Bio::Factory::SeqAnalysisParserFactory)
33 my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
34 my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
35 while($feature = $parser->next_feature()) {
36 # do something with feature
41 This class provides a simple GFF parser and writer. In the sense of a
42 SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
43 objects, but is not in any way specific to a particular analysis
44 program and the output that program produces.
46 That is, if you can get your analysis program spit out GFF, here is
49 =head1 GFF3 AND SEQUENCE DATA
51 [added by cjm 2004/07/09]
53 GFF3 supports sequence data; see
54 http://song.sourceforge.net/gff3-jan04.shtml
56 There are a number of ways to deal with this -
60 $gffio->ignore_sequence(1)
62 prior to parsing the sequence data is ignored; this is useful if you
63 just want the features. It avoids the memory overhead in building and
66 Alternatively, you can call either
74 At the B<end> of parsing to get either a list or hashref of Bio::Seq
75 objects (see the documentation for each of these methods)
77 Note that these objects will not have the features attached - you have
78 to do this yourself, OR call
80 $gffio->features_attached_to_seqs(1)
82 PRIOR to parsing; this will ensure that the Seqs have the features
83 attached; ie you will then be able to call
85 $seq->get_SeqFeatures();
87 And use Bio::SeqIO methods
89 Note that auto-attaching the features to seqs will incur a higher
90 memory overhead as the features must be cached until the sequence data
95 Make a Bio::SeqIO class specifically for GFF3 with sequence data
101 User feedback is an integral part of the evolution of this and other
102 Bioperl modules. Send your comments and suggestions preferably to one
103 of the Bioperl mailing lists. Your participation is much appreciated.
105 bioperl-l@bioperl.org - General discussion
106 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108 =head2 Reporting Bugs
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 the bugs and their resolution. Bug reports can be submitted the web:
113 http://bugzilla.open-bio.org/
115 =head1 AUTHOR - Matthew Pocock
117 Email mrp-at-sanger.ac.uk
121 Jason Stajich, jason-at-biperl-dot-org
122 Chris Mungall, cjm-at-fruitfly-dot-org
123 Steffen Grossmann [SG], grossman at molgen.mpg.de
124 Malcolm Cook, mec-at-stowers-institute.org
128 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
132 # Let the code begin...
134 package Bio
::Tools
::GFF
;
136 use vars
qw($HAS_HTML_ENTITIES);
139 use Bio::Seq::SeqFactory;
140 use Bio::LocatableSeq;
141 use Bio::SeqFeature::Generic;
143 use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
146 my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target);
151 Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2,
152 -file => "filename.gff");
154 my $writer = Bio::Tools::GFF->new(-gff_version => 3,
155 -file => ">filename.gff3");
156 Function: Creates a new instance. Recognized named parameters are -file, -fh,
158 Returns : a new object
159 Args : named parameters
160 -gff_version => [1,2,3]
165 { # make a class variable such that we can generate unique ID's over
166 # a session, no matter how many instances of GFF.pm we make
167 # since these have to be unique within the scope of a GFF file.
169 my $gff3_featureID = 0;
171 sub _incrementGFF3ID
{
173 return ++ $gff3_featureID;
179 my ($class, @args) = @_;
180 my $self = $class->SUPER::new
(@args);
182 my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args);
185 $self->_initialize_io(@args);
186 $self->_parse_header() unless $noparse;
189 if( ! $self->gff_version($gff_version) ) {
190 $self->throw("Can't build a GFF object with the unknown version ".
193 $self->{'_first'} = 1;
199 Title : _parse_header
200 Usage : $gffio->_parse_header()
201 Function: used to turn parse GFF header lines. currently
202 produces Bio::LocatableSeq objects from ##sequence-region
204 Returns : 1 on success
214 local $^W
= 0; # hide warnings when we try and parse from a file opened
215 # for writing - there isn't really a better way to do
216 # AFAIK - cannot detech if a FH is read or write.
217 while(my $line = $self->_readline()){
220 if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){
221 my($seqid,$start,$end) = ($1,$2,$3);
222 push @
{ $self->{'segments'} }, Bio
::LocatableSeq
->new
224 -id
=> unescape
($seqid),
227 -length => ($end - $start + 1), ## make the length explicit
230 } elsif($line =~ /^(\#\#feature-ontology)/) {
232 $self->warn("$1 header tag parsing unimplemented");
233 } elsif($line =~ /^(\#\#attribute-ontology)/) {
235 $self->warn("$1 header tag parsing unimplemented");
236 } elsif($line =~ /^(\#\#source-ontology)/) {
238 $self->warn("$1 header tag parsing unimplemented");
239 } elsif($line =~ /^(\#\#\#)/) {
241 $self->warn("$1 header tag parsing unimplemented");
242 } elsif($line =~ /^(\#\#FASTA)/) {
243 # initial ##FASTA is optional - artemis does not use it
244 $line = $self->_readline();
245 if ($line !~ /^\>(\S+)/) {
246 $self->throw("##FASTA directive must be followed by fasta header, not: $line");
251 if ($line =~ /^\>(.*)/) {
252 # seq data can be at header or footer
253 my $seq = $self->_parse_sequence($line);
255 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
261 push @unhandled, $line
264 #looks like the header is over!
265 last unless $line =~ /^\#/;
268 foreach my $line (@unhandled){
269 $self->_pushback($line);
275 sub _parse_sequence
{
276 my ($self, $line) = @_;
278 if ($line =~ /^\>(.*)/) {
283 if ($seqid =~ /(\S+)\s+(.*)/) {
284 ($seqid, $desc) = ($1,$2);
287 while (my $line = $self->_readline) {
288 if ($line =~ /^\#/) {
291 if ($line =~ /^\>/) {
292 $self->_pushback($line);
298 return if $self->ignore_sequence;
300 my $seqfactory = Bio
::Seq
::SeqFactory
->new('Bio::Seq');
301 my $seq = $seqfactory->create(-seq
=>$res,
304 $seq->accession_number($seqid);
305 if ($self->features_attached_to_seqs) {
307 @
{$self->_feature_idx_by_seq_id->{$seqid}};
308 $seq->add_SeqFeature($_) foreach @feats;
309 @
{$self->_feature_idx_by_seq_id->{$seqid}} = ();
314 $self->throw("expected fasta header, not: $line");
322 Usage : my $seq = $gffio->next_segment;
323 Function: Returns a Bio::LocatableSeq object corresponding to a
324 GFF "##sequence-region" header line.
326 Returns : A Bio::LocatableSeq object, or undef if
327 there are no more sequences.
334 my ($self,@args) = @_;
335 return shift @
{ $self->{'segments'} } if defined $self->{'segments'};
342 Usage : $seqfeature = $gffio->next_feature();
343 Function: Returns the next feature available in the input file or stream, or
344 undef if there are no more features.
346 Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
357 # be graceful about empty lines or comments, and make sure we return undef
358 # if the input's consumed
359 while(($gff_string = $self->_readline()) && defined($gff_string)) {
360 if ($gff_string =~ /^\#\#\#/) {
361 # all forward refs have been seen; TODO
363 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
364 $gff_string =~ m{^//});
366 while ($gff_string =~ /^\>(.+)/) {
367 # fasta can be in header or footer
368 my $seq = $self->_parse_sequence($gff_string);
370 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
371 $gff_string = $self->_readline;
372 last unless $gff_string;
377 return unless $gff_string;
379 my $feat = Bio
::SeqFeature
::Generic
->new();
380 $self->from_gff_string($feat, $gff_string);
382 if ($self->features_attached_to_seqs) {
383 push(@
{$self->_feature_idx_by_seq_id->{$feat->seq_id}},
390 sub _feature_idx_by_seq_id
{
392 $self->{__feature_idx_by_seq_id
} = shift if @_;
393 $self->{__feature_idx_by_seq_id
} = {}
394 unless $self->{__feature_idx_by_seq_id
};
395 return $self->{__feature_idx_by_seq_id
};
399 =head2 from_gff_string
401 Title : from_gff_string
402 Usage : $gff->from_gff_string($feature, $gff_string);
403 Function: Sets properties of a SeqFeatureI object from a GFF-formatted
404 string. Interpretation of the string depends on the version
405 that has been specified at initialization.
407 This method is used by next_feature(). It actually dispatches to
408 one of the version-specific (private) methods.
411 Args : A Bio::SeqFeatureI implementing object to be initialized
412 The GFF-formatted string to initialize it from
416 sub from_gff_string
{
417 my ($self, $feat, $gff_string) = @_;
419 if($self->gff_version() == 1) {
420 return $self->_from_gff1_string($feat, $gff_string);
421 } elsif( $self->gff_version() == 3 ) {
422 return $self->_from_gff3_string($feat, $gff_string);
424 return $self->_from_gff2_string($feat, $gff_string);
428 =head2 _from_gff1_string
430 Title : _from_gff1_string
435 Args : A Bio::SeqFeatureI implementing object to be initialized
436 The GFF-formatted string to initialize it from
440 sub _from_gff1_string
{
441 my ($gff, $feat, $string) = @_;
443 my ($seqname, $source, $primary, $start, $end, $score,
444 $strand, $frame, @group) = split(/\t/, $string);
446 if ( !defined $frame ) {
447 $feat->throw("[$string] does not look like GFF to me");
449 $frame = 0 unless( $frame =~ /^\d+$/);
450 $feat->seq_id($seqname);
451 $feat->source_tag($source);
452 $feat->primary_tag($primary);
453 $feat->start($start);
455 $feat->frame($frame);
456 if ( $score eq '.' ) {
457 #$feat->score(undef);
459 $feat->score($score);
461 if ( $strand eq '-' ) { $feat->strand(-1); }
462 if ( $strand eq '+' ) { $feat->strand(1); }
463 if ( $strand eq '.' ) { $feat->strand(0); }
464 foreach my $g ( @group ) {
465 if ( $g =~ /(\S+)=(\S+)/ ) {
468 $feat->add_tag_value($1, $2);
470 $feat->add_tag_value('group', $g);
475 =head2 _from_gff2_string
477 Title : _from_gff2_string
482 Args : A Bio::SeqFeatureI implementing object to be initialized
483 The GFF2-formatted string to initialize it from
488 sub _from_gff2_string
{
489 my ($gff, $feat, $string) = @_;
492 # according to the Sanger website, GFF2 should be single-tab
493 # separated elements, and the free-text at the end should contain
494 # text-translated tab symbols but no "real" tabs, so splitting on
495 # \t is safe, and $attribs gets the entire attributes field to be
498 # sendu: but the tag value pair can (should?) be separated by a tab. The
499 # 'no tabs' thing seems to apply only to the free text that is allowed for
502 my ($seqname, $source, $primary, $start,
503 $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
504 my $attribs = join ' ', @attribs;
506 if ( !defined $frame ) {
507 $feat->throw("[$string] does not look like GFF2 to me");
509 $feat->seq_id($seqname);
510 $feat->source_tag($source);
511 $feat->primary_tag($primary);
512 $feat->start($start);
514 $feat->frame($frame);
515 if ( $score eq '.' ) {
516 # $feat->score(undef);
518 $feat->score($score);
520 if ( $strand eq '-' ) { $feat->strand(-1); }
521 if ( $strand eq '+' ) { $feat->strand(1); }
522 if ( $strand eq '.' ) { $feat->strand(0); }
525 # <Begin Inefficient Code from Mark Wilkinson>
526 # this routine is necessay to allow the presence of semicolons in
527 # quoted text Semicolons are the delimiting character for new
528 # tag/value attributes. it is more or less a "state" machine, with
529 # the "quoted" flag going up and down as we pass thorugh quotes to
530 # distinguish free-text semicolon and hash symbols from GFF control
534 my $flag = 0; # this could be changed to a bit and just be twiddled
537 # run through each character one at a time and check it
538 # NOTE: changed to foreach loop which is more efficient in perl
540 for my $a ( split //, $attribs ) {
541 # flag up on entering quoted text, down on leaving it
542 if( $a eq '"') { $flag = ( $flag == 0 ) ?
1:0 }
543 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
544 elsif( $a eq '#' && ! $flag ) { last }
547 $attribs = join "", @parsed; # rejoin into a single string
549 # <End Inefficient Code>
550 # Please feel free to fix this and make it more "perlish"
552 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
554 foreach my $pair ( @key_vals ) {
555 # replace semicolons that were removed from free-text above.
556 $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
558 # separate the key from the value
559 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
562 if( defined $values ) {
564 # free text is quoted, so match each free-text block
565 # and remove it from the $values string
566 while ($values =~ s/"(.*?)"//){
567 # and push it on to the list of values (tags may have
568 # more than one value... and the value may be undef)
572 # and what is left over should be space-separated
573 # non-free-text values
575 my @othervals = split /\s+/, $values;
576 foreach my $othervalue(@othervals){
577 # get rid of any empty strings which might
578 # result from the split
579 if (CORE
::length($othervalue) > 0) {push @values, $othervalue}
582 foreach my $value(@values){
583 $feat->add_tag_value($key, $value);
590 sub _from_gff3_string
{
591 my ($gff, $feat, $string) = @_;
594 # according to the now nearly final GFF3 spec, columns should
595 # be tab separated, allowing unescaped spaces to occur in
598 my ($seqname, $source, $primary, $start, $end,
599 $score, $strand, $frame, $groups) = split(/\t/, $string);
601 if ( ! defined $frame ) {
602 $feat->throw("[$string] does not look like GFF3 to me");
604 $feat->seq_id($seqname);
605 $feat->source_tag($source);
606 $feat->primary_tag($primary);
607 $feat->start($start);
609 $feat->frame($frame);
610 if ( $score eq '.' ) {
611 #$feat->score(undef);
613 $feat->score($score);
615 if ( $strand eq '-' ) { $feat->strand(-1); }
616 if ( $strand eq '+' ) { $feat->strand(1); }
617 if ( $strand eq '.' ) { $feat->strand(0); }
618 my @groups = split(/\s*;\s*/, $groups);
620 for my $group (@groups) {
621 my ($tag,$value) = split /=/,$group;
622 $tag = unescape
($tag);
623 my @values = map {unescape
($_)} split /,/,$value;
624 for my $v ( @values ) { $feat->add_tag_value($tag,$v); }
628 # taken from Bio::DB::GFF
632 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
638 Title : write_feature
639 Usage : $gffio->write_feature($feature);
640 Function: Writes the specified SeqFeatureI object in GFF format to the stream
641 associated with this instance.
643 Args : An array of Bio::SeqFeatureI implementing objects to be serialized
648 my ($self, @features) = @_;
649 return unless @features;
650 if( $self->{'_first'} && $self->gff_version() == 3 ) {
651 $self->_print("##gff-version 3\n");
653 $self->{'_first'} = 0;
654 foreach my $feature ( @features ) {
655 $self->_print($self->gff_string($feature)."\n");
662 Usage : $gffstr = $gffio->gff_string($feature);
663 Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
664 The formatting depends on the version specified at initialization.
666 This method is used by write_feature(). It actually dispatches to
667 one of the version-specific (private) methods.
669 Returns : A GFF-formatted string representation of the SeqFeature
670 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
675 my ($self, $feature) = @_;
677 if($self->gff_version() == 1) {
678 return $self->_gff1_string($feature);
679 } elsif( $self->gff_version() == 3 ) {
680 return $self->_gff3_string($feature);
681 } elsif( $self->gff_version() == 2.5 ) {
682 return $self->_gff25_string($feature);
684 return $self->_gff2_string($feature);
691 Usage : $gffstr = $gffio->_gff1_string
694 Returns : A GFF1-formatted string representation of the SeqFeature
695 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
700 my ($gff, $feat) = @_;
701 my ($str,$score,$frame,$name,$strand);
703 if( $feat->can('score') ) {
704 $score = $feat->score();
706 $score = '.' unless defined $score;
708 if( $feat->can('frame') ) {
709 $frame = $feat->frame();
711 $frame = '.' unless defined $frame;
713 $strand = $feat->strand();
716 } elsif( $strand == 1 ) {
718 } elsif ( $feat->strand == -1 ) {
722 if( $feat->can('seqname') ) {
723 $name = $feat->seq_id();
733 $feat->primary_tag(),
740 foreach my $tag ( $feat->all_tags ) {
741 foreach my $value ( $feat->each_tag_value($tag) ) {
742 $str .= " $tag=$value" if $value;
753 Usage : $gffstr = $gffio->_gff2_string
756 Returns : A GFF2-formatted string representation of the SeqFeature
757 Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
762 my ($gff, $origfeat) = @_;
764 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
765 $feat = $origfeat->feature2;
769 my ($str1, $str2,$score,$frame,$name,$strand);
771 if( $feat->can('score') ) {
772 $score = $feat->score();
774 $score = '.' unless defined $score;
776 if( $feat->can('frame') ) {
777 $frame = $feat->frame();
779 $frame = '.' unless defined $frame;
781 $strand = $feat->strand();
784 } elsif( $strand == 1 ) {
786 } elsif ( $feat->strand == -1 ) {
790 if( $feat->can('seqname') ) {
791 $name = $feat->seq_id();
799 $feat->primary_tag(),
805 # the routine below is the only modification I made to the original
806 # ->gff_string routine (above) as on November 17th, 2000, the
807 # Sanger webpage describing GFF2 format reads: "From version 2
808 # onwards, the attribute field must have a tag value structure
809 # following the syntax used within objects in a .ace file,
810 # flattened onto one line by semicolon separators. Tags must be
811 # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
812 # must be quoted with double quotes".
817 my @all_tags = $feat->all_tags;
819 if (@all_tags) { # only play this game if it is worth playing...
820 foreach my $tag ( @all_tags ) {
822 foreach my $value ( $feat->each_tag_value($tag) ) {
823 unless( defined $value && length($value) ) {
825 } elsif ($value =~ /[^A-Za-z0-9_]/){
826 $value =~ s/\t/\\t/g; # substitute tab and newline
828 $value =~ s/\n/\\n/g; # to their UNIX equivalents
829 $value = '"' . $value . '" ';
830 } # if the value contains
831 # anything other than valid
832 # tag/value characters, then
835 # for this tag (allowed in GFF2 and .ace format)
837 push @group, "$tag ".join(" ", @v);
840 $str2 .= join(' ; ', @group);
841 # Add Target information for Feature Pairs
842 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
843 ! $feat->has_tag('Group') &&
844 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
845 $str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id,
846 ( $origfeat->feature1->strand < 0 ?
847 ( $origfeat->feature1->end,
848 $origfeat->feature1->start) :
849 ( $origfeat->feature1->start,
850 $origfeat->feature1->end)
851 )) . ($str2?
" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw
853 return $str1."\t".$str2;
860 Title : _gff25_string
861 Usage : $gffstr = $gffio->_gff2_string
862 Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF
864 Returns : A GFF2.5-formatted string representation of the SeqFeature
865 Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified
870 my ($gff, $origfeat) = @_;
872 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
873 $feat = $origfeat->feature2;
877 my ($str1, $str2,$score,$frame,$name,$strand);
879 if( $feat->can('score') ) {
880 $score = $feat->score();
882 $score = '.' unless defined $score;
884 if( $feat->can('frame') ) {
885 $frame = $feat->frame();
887 $frame = '.' unless defined $frame;
889 $strand = $feat->strand();
892 } elsif( $strand == 1 ) {
894 } elsif ( $feat->strand == -1 ) {
898 if( $feat->can('seqname') ) {
899 $name = $feat->seq_id();
907 $feat->primary_tag(),
914 my @all_tags = $feat->all_tags;
915 my @group; my @firstgroup;
916 if (@all_tags) { # only play this game if it is worth playing...
917 foreach my $tag ( @all_tags ) {
919 foreach my $value ( $feat->each_tag_value($tag) ) {
920 unless( defined $value && length($value) ) {
922 } elsif ($value =~ /[^A-Za-z0-9_]/){
923 $value =~ s/\t/\\t/g; # substitute tab and newline
925 $value =~ s/\n/\\n/g; # to their UNIX equivalents
926 $value = '"' . $value . '" ';
927 } # if the value contains
928 # anything other than valid
929 # tag/value characters, then
932 # for this tag (allowed in GFF2 and .ace format)
934 if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we wont get both...
935 push @firstgroup, "$tag ".join(" ", @v);
937 push @group, "$tag ".join(" ", @v);
941 $str2 = join(' ; ', (@firstgroup, @group));
942 # Add Target information for Feature Pairs
943 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
944 ! $feat->has_tag('Group') &&
945 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
946 $str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id,
947 ( $origfeat->feature1->strand < 0 ?
948 ( $origfeat->feature1->end,
949 $origfeat->feature1->start) :
950 ( $origfeat->feature1->start,
951 $origfeat->feature1->end)
952 )) . ($str2?
" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw
954 return $str1 . "\t". $str2;
961 Usage : $gffstr = $gffio->_gff3_string
964 Returns : A GFF3-formatted string representation of the SeqFeature
965 Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified
970 my ($gff, $origfeat) = @_;
972 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
973 $feat = $origfeat->feature2;
978 my $ID = $gff->_incrementGFF3ID();
980 my ($score,$frame,$name,$strand);
982 if( $feat->can('score') ) {
983 $score = $feat->score();
985 $score = '.' unless defined $score;
987 if( $feat->can('frame') ) {
988 $frame = $feat->frame();
990 $frame = '.' unless defined $frame;
992 $strand = $feat->strand();
996 } elsif( $strand == 1 ) {
998 } elsif ( $feat->strand == -1 ) {
1002 if( $feat->can('seqname') ) {
1003 $name = $feat->seq_id();
1011 # force leading ID and Parent tags
1012 my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags;
1013 for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} }
1014 keys %GFF3_ID_Tags ) {
1015 unshift @all_tags, $t if $feat->has_tag($t);
1018 for my $tag ( @all_tags ) {
1019 # next if $tag eq 'Target';
1020 if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){
1021 # simple Target,start,stop
1022 my($target_id, $b,$e,$strand) = $feat->get_tag_values($tag);
1023 next unless(defined($e) && defined($b) && $target_id);
1024 ($b,$e)= ($e,$b) if(defined $strand && $strand<0);
1025 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1026 push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e);
1031 # a string which will hold one or more values
1032 # for this tag, with quoted free text and
1033 # space-separated individual values.
1035 for my $value ( $feat->each_tag_value($tag) ) {
1036 if( defined $value && length($value) ) {
1037 #$value =~ tr/ /+/; #spaces are allowed now
1038 if ( ref $value eq 'Bio::Annotation::Comment') {
1039 $value = $value->text;
1042 if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) {
1043 $value =~ s/\t/\\t/g; # substitute tab and newline
1045 $value =~ s/\n/\\n/g; # to their UNIX equivalents
1047 # Unescaped quotes are not allowed in GFF3
1048 # $value = '"' . $value . '"';
1050 $value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1052 # if it is completely empty,
1053 # then just make empty double
1059 # can we figure out how to improve this?
1060 $tag= lcfirst($tag) unless ($tag
1061 =~ /^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/);
1063 push @groups, "$tag=".join(",",@v);
1065 # Add Target information for Feature Pairs
1066 if( $feat->has_tag('Target') &&
1067 ! $feat->has_tag('Group') &&
1068 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
1070 my $target_id = $origfeat->feature1->seq_id;
1071 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1073 push @groups, sprintf("Target=%s %d %d",
1075 ( $origfeat->feature1->strand < 0 ?
1076 ( $origfeat->feature1->end,
1077 $origfeat->feature1->start) :
1078 ( $origfeat->feature1->start,
1079 $origfeat->feature1->end)
1083 # unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID'));
1084 if ( $feat->can('name') && defined($feat->name) ) {
1085 # such as might be for Bio::DB::SeqFeature
1086 unshift @groups, 'Name=' . $feat->name;
1089 my $gff_string = "";
1090 if ($feat->location->isa("Bio::Location::SplitLocationI")) {
1091 my @locs = $feat->location->each_Location;
1092 foreach my $loc (@locs) {
1093 $gff_string .= join("\t",
1095 $feat->source_tag() || '.',
1096 $feat->primary_tag(),
1102 join(';', @groups)) . "\n";
1107 $gff_string = join("\t",
1109 $feat->source_tag() || '.',
1110 $feat->primary_tag(),
1116 join(';', @groups));
1123 Title : _gff_version
1124 Usage : $gffversion = $gffio->gff_version
1127 Returns : The GFF version this parser will accept and emit.
1133 my ($self, $value) = @_;
1134 if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) {
1135 $self->{'GFF_VERSION'} = $value;
1137 return $self->{'GFF_VERSION'};
1145 Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1146 Function: does a new() followed by an fh()
1147 Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1148 $feature = <$fh>; # read a feature object
1149 print $fh $feature; # write a feature object
1150 Returns : filehandle tied to the Bio::Tools::GFF class
1158 return unless my $self = $class->new(@_);
1167 Example : $fh = $obj->fh; # make a tied filehandle
1168 $feature = <$fh>; # read a feature object
1169 print $fh $feature; # write a feature object
1170 Returns : filehandle tied to Bio::Tools::GFF class
1179 my $class = ref($self) || $self;
1180 my $s = Symbol
::gensym
;
1181 tie
$$s,$class,$self;
1185 # This accessor is used for accessing the Bio::Seq objects from a GFF3
1186 # file; if the file you are using has no sequence data you can ignore
1189 # This accessor returns a hash reference containing Bio::Seq objects,
1190 # indexed by Bio::Seq->primary_id
1195 return $self->{'_seq_by_id_h'} = shift if @_;
1196 $self->{'_seq_by_id_h'} = {}
1197 unless $self->{'_seq_by_id_h'};
1198 return $self->{'_seq_by_id_h'};
1205 Function: Returns all Bio::Seq objects populated by GFF3 file
1213 my ($self,@args) = @_;
1214 return values %{$self->_seq_by_id_h};
1217 =head2 features_attached_to_seqs
1219 Title : features_attached_to_seqs
1220 Usage : $obj->features_attached_to_seqs(1);
1221 Function: For use with GFF3 containg sequence only
1223 Setting this B<before> parsing ensures that all Bio::Seq object
1224 created will have the appropriate features added to them
1226 defaults to false (off)
1228 Note that this mode will incur higher memory usage because features
1229 will have to be cached until the relevant feature comes along
1232 Returns : value of features_attached_to_seqs (a boolean)
1233 Args : on set, new value (a boolean, optional)
1238 sub features_attached_to_seqs
{
1241 return $self->{'_features_attached_to_seqs'} = shift if @_;
1242 return $self->{'_features_attached_to_seqs'};
1245 =head2 ignore_sequence
1247 Title : ignore_sequence
1248 Usage : $obj->ignore_sequence(1);
1249 Function: For use with GFF3 containg sequence only
1251 Setting this B<before> parsing means that all sequence data will be
1255 Returns : value of ignore_sequence (a boolean)
1256 Args : on set, new value (a boolean, optional)
1260 sub ignore_sequence
{
1263 return $self->{'_ignore_sequence'} = shift if @_;
1264 return $self->{'_ignore_sequence'};
1274 my ($class,$val) = @_;
1275 return bless {'gffio' => $val}, $class;
1280 return $self->{'gffio'}->next_feature() unless wantarray;
1282 push @list, $obj while $obj = $self->{'gffio'}->next_feature();
1288 $self->{'gffio'}->write_feature(@_);