2 # BioPerl module for Bio::SeqIO::genbank
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Bioperl project bioperl-l(at)bioperl.org
8 # Copyright Elia Stupka and contributors see AUTHORS section
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqIO::genbank - GenBank sequence input/output stream
20 It is probably best not to use this object directly, but
21 rather go through the SeqIO handler:
23 $stream = Bio::SeqIO->new(-file => $filename,
24 -format => 'GenBank');
26 while ( my $seq = $stream->next_seq() ) {
27 # do something with $seq
33 This object can transform Bio::Seq objects to and from GenBank flat
36 There is some flexibility here about how to write GenBank output
37 that is not fully documented.
39 =head2 Optional functions
45 (output only) shows the dna or not
49 (output only) provides a sorting func which is applied to the FTHelpers
52 =item _id_generation_func()
54 This is function which is called as
56 print "ID ", $func($seq), "\n";
58 To generate the ID line. If it is not there, it generates a sensible ID
59 line using a number of tools.
61 If you want to output annotations in Genbank format they need to be
62 stored in a Bio::Annotation::Collection object which is accessible
63 through the Bio::SeqI interface method L<annotation()|annotation>.
65 The following are the names of the keys which are pulled from a
66 L<Bio::Annotation::Collection> object:
68 reference - Should contain Bio::Annotation::Reference objects
69 comment - Should contain Bio::Annotation::Comment objects
70 dblink - Should contain a Bio::Annotation::DBLink object
71 segment - Should contain a Bio::Annotation::SimpleValue object
72 origin - Should contain a Bio::Annotation::SimpleValue object
73 wgs - Should contain a Bio::Annotation::SimpleValue object
77 =head1 Where does the data go?
79 Data parsed in Bio::SeqIO::genbank is stored in a variety of data
80 fields in the sequence object that is returned. Here is a partial list
83 Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
84 the top level object which defines a function called NAME() which
85 stores this information.
87 Items listed as Annotation 'NAME' tell you the data is stored the
88 associated Bio::AnnotationCollectionI object which is associated with
89 Bio::Seq objects. If it is explicitly requested that no annotations
90 should be stored when parsing a record of course they will not be
91 available when you try and get them. If you are having this problem
92 look at the type of SeqBuilder that is being used to contruct your
95 Comments Annotation 'comment'
96 References Annotation 'reference'
97 Segment Annotation 'segment'
98 Origin Annotation 'origin'
99 Dbsource Annotation 'dblink'
101 Accessions PrimarySeq accession_number()
102 Secondary accessions RichSeq get_secondary_accessions()
103 GI number PrimarySeq primary_id()
104 LOCUS PrimarySeq display_id()
105 Keywords RichSeq get_keywords()
106 Dates RichSeq get_dates()
107 Molecule RichSeq molecule()
108 Seq Version RichSeq seq_version()
110 Division RichSeq division()
111 Features Seq get_SeqFeatures()
112 Alphabet PrimarySeq alphabet()
113 Definition PrimarySeq description() or desc()
114 Version PrimarySeq version()
116 Sequence PrimarySeq seq()
118 There is more information in the Feature-Annotation HOWTO about each
119 field and how it is mapped to the Sequence object
120 L<http://bioperl.open-bio.org/wiki/HOWTO:Feature-Annotation>.
126 User feedback is an integral part of the evolution of this and other
127 Bioperl modules. Send your comments and suggestions preferably to one
128 of the Bioperl mailing lists. Your participation is much appreciated.
130 bioperl-l@bioperl.org - General discussion
131 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
135 Please direct usage questions or support issues to the mailing list:
137 I<bioperl-l@bioperl.org>
139 rather than to the module maintainer directly. Many experienced and
140 reponsive experts will be able look at the problem and quickly
141 address it. Please include a thorough description of the problem
142 with code and data examples if at all possible.
144 =head2 Reporting Bugs
146 Report bugs to the Bioperl bug tracking system to help us keep track
147 the bugs and their resolution. Bug reports can be submitted via the web:
149 https://redmine.open-bio.org/projects/bioperl/
151 =head1 AUTHOR - Bioperl Project
153 bioperl-l at bioperl.org
155 Original author Elia Stupka, elia -at- tigem.it
159 Ewan Birney birney at ebi.ac.uk
160 Jason Stajich jason at bioperl.org
161 Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
162 Lincoln Stein lstein at cshl.org
163 Heikki Lehvaslaiho, heikki at ebi.ac.uk
164 Hilmar Lapp, hlapp at gmx.net
165 Donald G. Jackson, donald.jackson at bms.com
166 James Wasmuth, james.wasmuth at ed.ac.uk
167 Brian Osborne, bosborne at alum.mit.edu
168 Chris Fields, cjfields at bioperl dot org
172 The rest of the documentation details each of the object
173 methods. Internal methods are usually preceded with a _
177 # Let the code begin...
179 package Bio
::SeqIO
::genbank
;
182 use Bio
::SeqIO
::FTHelper
;
183 use Bio
::SeqFeature
::Generic
;
185 use Bio
::Seq
::SeqFactory
;
186 use Bio
::Annotation
::Collection
;
187 use Bio
::Annotation
::Comment
;
188 use Bio
::Annotation
::Reference
;
189 use Bio
::Annotation
::DBLink
;
191 use base
qw(Bio::SeqIO);
193 # Note that a qualifier that exceeds one line (i.e. a long label) will
194 # automatically be quoted regardless:
196 our $FTQUAL_LINE_LENGTH = 60;
198 our %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
201 cons_splice direction
205 transl_except transl_table
209 our %DBSOURCE = map {$_ => 1} qw(
210 EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
211 TIGR GO InterPro Pfam PROSITE SGD GermOnline
212 HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
213 H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
214 MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
215 UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
216 PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
217 Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
218 TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
219 PhotoList Gramene WormBase WormPep Genew ZFIN
220 PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
221 swissprot GenBank GenPept REFSEQ embl PDB UniProtKB
222 DIP PeptideAtlas PRIDE CYGD HOGENOME Gene3D
225 our %VALID_MOLTYPE = map {$_ => 1} qw(NA DNA RNA tRNA rRNA cDNA cRNA ms-DNA
226 mRNA uRNA ss-RNA ss-DNA snRNA snoRNA PRT);
228 our %VALID_ALPHABET = (
231 'rc' => '' # rc = release candidate; file has no sequences
235 my($self,@args) = @_;
237 $self->SUPER::_initialize
(@args);
238 # hash for functions for decoding keys.
239 $self->{'_func_ftunit_hash'} = {};
240 $self->_show_dna(1); # sets this to one by default. People can change it
241 if( ! defined $self->sequence_factory ) {
242 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new
243 (-verbose
=> $self->verbose(),
244 -type
=> 'Bio::Seq::RichSeq'));
251 Usage : $seq = $stream->next_seq()
252 Function: returns the next sequence in the stream
253 Returns : Bio::Seq object
259 my ( $self, @args ) = @_;
261 my $builder = $self->sequence_builder();
268 my ( @acc, @features );
269 my ( $display_id, $annotation );
272 # initialize; we may come here because of starting over
277 %params = ( -verbose
=> $self->verbose ); # reset hash
279 while ( defined( $buffer = $self->_readline() ) ) {
280 last if index( $buffer, 'LOCUS ' ) == 0;
282 return unless defined $buffer; # end of file
283 $buffer =~ /^LOCUS\s+(\S.*)$/o
285 "GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"
288 my @tokens = split( ' ', $1 );
290 # this is important to have the id for display in e.g. FTHelper,
291 # otherwise you won't know which entry caused an error
292 $display_id = shift(@tokens);
293 $params{'-display_id'} = $display_id;
295 # may still be useful if we don't want the seq
296 my $seqlength = shift(@tokens);
297 if ( exists $VALID_ALPHABET{$seqlength} ) {
299 # moved one token too far. No locus name?
301 "Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id"
303 $params{'-display_id'} = 'unknown';
304 $params{'-length'} = $display_id;
307 unshift @tokens, $seqlength;
310 $params{'-length'} = $seqlength;
313 # the alphabet of the entry
314 # shouldn't assign alphabet unless one is specifically designated (such as for rc files)
315 my $alphabet = lc( shift @tokens );
316 $params{'-alphabet'} =
317 ( exists $VALID_ALPHABET{$alphabet} )
318 ?
$VALID_ALPHABET{$alphabet}
319 : $self->warn("Unknown alphabet: $alphabet");
321 # for aa there is usually no 'molecule' (mRNA etc)
322 if ( $params{'-alphabet'} eq 'protein' ) {
323 $params{'-molecule'} = 'PRT';
326 $params{'-molecule'} = shift(@tokens);
329 # take care of lower case issues
330 if ( $params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna' ) {
331 $params{'-molecule'} = uc $params{'-molecule'};
333 $self->debug( "Unrecognized molecule type:" . $params{'-molecule'} )
334 if !exists( $VALID_MOLTYPE{ $params{'-molecule'} } );
335 my $circ = shift(@tokens);
336 if ( $circ eq 'circular' ) {
337 $params{'-is_circular'} = 1;
338 $params{'-division'} = shift(@tokens);
341 # 'linear' or 'circular' may actually be omitted altogether
342 $params{'-division'} =
343 ( CORE
::length($circ) == 3 ) ?
$circ : shift(@tokens);
345 my $date = join( ' ', @tokens ); # we lump together the rest
347 # this is per request bug #1513
353 if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {
354 if ( length($date) < 11 ) {
356 # improperly formatted date
357 # But we'll be nice and fix it for them
358 my ( $d, $m, $y ) = ( $2, $3, $4 );
359 if ( length($d) == 1 ) {
363 # guess the century here
364 if ( length($y) == 2 ) {
365 if ( $y > 60 ) { # arbitrarily guess that '60' means 1960
372 "Date was malformed, guessing the century for $date to be $y\n"
375 $params{'-dates'} = [ join( '-', $d, $m, $y ) ];
378 $params{'-dates'} = [$date];
382 # set them all at once
383 $builder->add_slot_value(%params);
386 # parse the rest if desired, otherwise start over
387 if ( !$builder->want_object() ) {
388 $builder->make_object();
392 # set up annotation depending on what the builder wants
393 if ( $builder->want_slot('annotation') ) {
394 $annotation = Bio
::Annotation
::Collection
->new();
396 $buffer = $self->_readline();
397 until ( !defined($buffer) ) {
400 # Description line(s)
401 if (/^DEFINITION\s+(\S.*\S)/) {
403 while ( defined( $_ = $self->_readline ) ) {
404 if (/^\s+(.*)/) { push( @desc, $1 ); next }
407 $builder->add_slot_value( -desc
=> join( ' ', @desc ) );
409 # we'll continue right here because DEFINITION always comes
410 # at the top of the entry
414 # accession number (there can be multiple accessions)
415 if (/^ACCESSION\s+(\S.*\S)/) {
416 push( @acc, split( /\s+/, $1 ) );
417 while ( defined( $_ = $self->_readline ) ) {
418 /^\s+(.*)/ && do { push( @acc, split( /\s+/, $1 ) ); next };
426 elsif (/^PID\s+(\S+)/) {
427 $params{'-pid'} = $1;
431 elsif (/^VERSION\s+(\S.+)$/) {
432 my ( $acc, $gi ) = split( ' ', $1 );
433 if ( $acc =~ /^\w+\.(\d+)/ ) {
434 $params{'-version'} = $1;
435 $params{'-seq_version'} = $1;
437 if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) {
438 $params{'-primary_id'} = substr( $gi, 3 );
443 elsif (/^KEYWORDS\s+(\S.*)/) {
444 my @kw = split( /\s*\;\s*/, $1 );
445 while ( defined( $_ = $self->_readline ) ) {
448 && do { push( @kw, split( /\s*\;\s*/, $1 ) ); next };
452 @kw && $kw[-1] =~ s/\.$//;
453 $params{'-keywords'} = \
@kw;
458 # Organism name and phylogenetic information
459 elsif (/^SOURCE\s+\S/) {
460 if ( $builder->want_slot('species') ) {
461 $species = $self->_read_GenBank_Species( \
$buffer );
462 $builder->add_slot_value( -species
=> $species );
465 while ( defined( $buffer = $self->_readline() ) ) {
466 last if substr( $buffer, 0, 1 ) ne ' ';
473 elsif (/^REFERENCE\s+\S/) {
475 my @refs = $self->_read_GenBank_References( \
$buffer );
476 foreach my $ref (@refs) {
477 $annotation->add_Annotation( 'reference', $ref );
481 while ( defined( $buffer = $self->_readline() ) ) {
482 last if substr( $buffer, 0, 1 ) ne ' ';
489 elsif (/^PROJECT\s+(\S.*)/) {
492 Bio
::Annotation
::SimpleValue
->new( -value
=> $1 );
493 $annotation->add_Annotation( 'project', $project );
498 elsif (/^COMMENT\s+(\S.*)/) {
501 while ( defined( $_ = $self->_readline ) ) {
505 $comment =~ s/\n/ /g;
506 $comment =~ s/ +/ /g;
507 $annotation->add_Annotation(
509 Bio
::Annotation
::Comment
->new(
511 -tagname
=> 'comment'
517 while ( defined( $buffer = $self->_readline() ) ) {
518 last if substr( $buffer, 0, 1 ) ne ' ';
524 # Corresponding Genbank nucleotide id, Genpept only
525 elsif (/^DB(?:SOURCE|LINK)\s+(\S.+)/) {
528 while ( defined( $_ = $self->_readline ) ) {
533 # deal with UniProKB dbsources
535 s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// )
537 $annotation->add_Annotation(
539 Bio
::Annotation
::DBLink
->new(
545 if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
546 $annotation->add_Annotation(
548 Bio
::Annotation
::SimpleValue
->new(
549 -tagname
=> 'date_created',
555 s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g
558 $annotation->add_Annotation(
560 Bio
::Annotation
::SimpleValue
->new(
561 -tagname
=> 'date_updated',
566 $dbsource =~ s/\n/ /g;
568 s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ )
570 # will use $i to determine even or odd
571 # for swissprot the accessions are paired
573 for my $dbsrc ( split( /,\s+/, $1 ) ) {
574 if ( $dbsrc =~ /(\S+)\.(\d+)/
575 || $dbsrc =~ /(\S+)/ )
577 my ( $id, $version ) = ( $1, $2 );
578 $version = '' unless defined $version;
580 if ( $id =~ /^\d\S{3}/ ) {
585 ( $i++ % 2 ) ?
'GenPept' : 'GenBank';
587 $annotation->add_Annotation(
589 Bio
::Annotation
::DBLink
->new(
591 -version
=> $version,
600 $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i )
602 # download screwed up and ncbi didn't put acc in for gi numbers
604 for my $id ( split( /\,\s+/, $1 ) ) {
606 if ( $id =~ /gi:\s+(\d+)/ ) {
608 $db = ( $i++ % 2 ) ?
'GenPept' : 'GenBank';
610 elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) {
618 $annotation->add_Annotation(
620 Bio
::Annotation
::DBLink
->new(
629 $self->debug("Cannot match $dbsource\n");
633 s
/xrefs\s
+\
(non\
-sequence\s
+databases\
):\s
+
637 for my $id ( split( /\,\s+/, $1 ) ) {
640 # this is because GenBank dropped the spaces!!!
641 # I'm sure we're not going to get this right
642 ##if( $id =~ s/^://i ) {
645 $db = substr( $id, 0, index( $id, ':' ) );
646 if ( !exists $DBSOURCE{$db} ) {
647 $db = ''; # do we want 'GenBank' here?
649 $id = substr( $id, index( $id, ':' ) + 1 );
650 $annotation->add_Annotation(
652 Bio
::Annotation
::DBLink
->new(
664 /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ )
666 my ( $db, $id, $version ) = ( $1, $2, $3 );
667 $annotation->add_Annotation(
669 Bio
::Annotation
::DBLink
->new(
671 -version
=> $version,
672 -database
=> $db || 'GenBank',
677 elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
678 my ( $db, $id ) = ( $1, $2 );
679 $annotation->add_Annotation(
681 Bio
::Annotation
::DBLink
->new(
683 -database
=> $db || 'GenBank',
688 elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
689 my ( $db, $version );
694 # Genbank 192 release notes say this: "The second field can consist of
695 # multiple comma-separated identifiers, if a sequence record has
696 # multiple DBLINK cross-references of a given type."
697 # For example: DBLINK Project:100,200,300"
698 @ids = split( /,/, $3 );
701 ( $db, $version ) = ( 'GenBank', $3 );
705 foreach my $id (@ids) {
706 $annotation->add_Annotation(
708 Bio
::Annotation
::DBLink
->new(
710 -version
=> $version,
719 "Unrecognized DBSOURCE data: $dbsource\n");
726 while ( defined( $buffer = $self->_readline() ) ) {
727 last if substr( $buffer, 0, 1 ) ne ' ';
733 # Exit at start of Feature table, or start of sequence
734 last if (/^(FEATURES|ORIGIN)/);
736 # Get next line and loop again
737 $buffer = $self->_readline;
739 return unless defined $buffer;
741 # add them all at once for efficiency
742 $builder->add_slot_value(
743 -accession_number
=> shift(@acc),
744 -secondary_accessions
=> \
@acc,
747 $builder->add_slot_value( -annotation
=> $annotation ) if $annotation;
748 %params = (); # reset before possible re-use to avoid setting twice
750 # start over if we don't want to continue with this entry
751 if ( !$builder->want_object() ) {
752 $builder->make_object();
756 # some "minimal" formats may not necessarily have a feature table
757 if ( $builder->want_slot('features') && defined($_) && /^FEATURES/o ) {
759 # need to read the first line of the feature table
760 $buffer = $self->_readline;
762 # DO NOT read lines in the while condition -- this is done as a side
763 # effect in _read_FTHelper_GenBank!
765 # part of new circular spec:
766 # commented out for now until kinks worked out
768 #$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
770 while ( defined($buffer) ) {
772 # check immediately -- not at the end of the loop
773 # note: GenPept entries obviously do not have a BASE line
774 last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o );
776 # slurp in one feature at a time -- at return, the start of
777 # the next feature will have been read already, so we need
778 # to pass a reference, and the called method must set this
779 # to the last line read before returning
781 my $ftunit = $self->_read_FTHelper_GenBank( \
$buffer );
783 # implement new circular spec: features that cross the origin are now
784 # seamless instead of being 2 separate joined features
785 # commented out until kinks get worked out
786 #if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
787 #&& $sourceEnd == $2 && $3 == 1) {
790 #$ftunit->{'loc'} = "$start..$end";
793 # fix suggested by James Diggans
795 if ( !defined $ftunit ) {
797 # GRRRR. We have fallen over. Try to recover
798 $self->warn( "Unexpected error in feature table for "
799 . $params{'-display_id'}
800 . " Skipping feature, attempting to recover" );
801 unless ( ( $buffer =~ /^\s{5,5}\S+/o )
802 or ( $buffer =~ /^\S+/o ) )
804 $buffer = $self->_readline();
806 next; # back to reading FTHelpers
811 $ftunit->_generic_seqfeature( $self->location_factory(),
814 # add taxon_id from source if available
817 && ( $feat->primary_tag eq 'source' )
818 && $feat->has_tag('db_xref')
820 !$species->ncbi_taxid()
821 || ( $species->ncbi_taxid
822 && $species->ncbi_taxid =~ /^list/ )
826 foreach my $tagval ( $feat->get_tag_values('db_xref') ) {
827 if ( index( $tagval, "taxon:" ) == 0 ) {
828 $species->ncbi_taxid( substr( $tagval, 6 ) );
834 # add feature to list of features
835 push( @features, $feat );
837 $builder->add_slot_value( -features
=> \
@features );
842 # CONTIG lines: TODO, this needs to be cleaned up
843 if (/^CONTIG\s+(.*)/o) {
845 while ( defined( $_ = $self->_readline)) {
846 last if m{^ORIGIN|//}o;
851 $annotation->add_Annotation(
852 Bio
::Annotation
::SimpleValue
->new(
853 -tagname
=> 'contig',
859 elsif (/^WGS|WGS_SCAFLD\s+/o) { # catch WGS/WGS_SCAFLD lines
860 while ( $_ =~ s/(^WGS|WGS_SCAFLD)\s+// ) { # gulp lines
862 $annotation->add_Annotation(
863 Bio
::Annotation
::SimpleValue
->new(
868 $_ = $self->_readline;
871 elsif ( !m{^ORIGIN|//}o ) { # advance to the sequence, if any
872 while ( defined( $_ = $self->_readline ) ) {
873 last if m{^(ORIGIN|//)};
877 if ( !$builder->want_object() ) {
878 $builder->make_object(); # implicit end-of-object
881 if ( $builder->want_slot('seq') ) {
882 # the fact that we want a sequence does not necessarily mean that
883 # there also is a sequence ...
884 if ( defined($_) && s/^ORIGIN\s+// ) {
885 if ( $annotation && length($_) > 0 ) {
886 $annotation->add_Annotation(
888 Bio
::Annotation
::SimpleValue
->new(
889 -tagname
=> 'origin',
895 while ( defined( $_ = $self->_readline ) ) {
902 $builder->add_slot_value( -seq
=> $seqc );
905 elsif ( defined($_) && ( substr( $_, 0, 2 ) ne '//' ) ) {
907 # advance to the end of the record
908 while ( defined( $_ = $self->_readline ) ) {
909 last if substr( $_, 0, 2 ) eq '//';
913 # Unlikely, but maybe the sequence is so weird that we don't want it
914 # anymore. We don't want to return undef if the stream's not exhausted
916 $seq = $builder->make_object();
917 next RECORDSTART
unless $seq;
919 } # end while RECORDSTART
927 Usage : $stream->write_seq($seq)
928 Function: writes the $seq object (must be seq) to the stream
929 Returns : 1 for success and 0 for error
930 Args : array of 1 to n Bio::SeqI objects
936 my ($self,@seqs) = @_;
938 foreach my $seq ( @seqs ) {
939 $self->throw("Attempting to write with no seq!") unless defined $seq;
941 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
942 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
948 my $len = $seq->length();
950 if ( $seq->can('division') ) {
951 $div = $seq->division;
953 if( !defined $div || ! $div ) { $div = 'UNK'; }
954 my $alpha = $seq->alphabet;
955 if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) {
956 $mol = $alpha || 'DNA';
959 my $circular = 'linear ';
960 $circular = 'circular' if $seq->is_circular;
962 local($^W
) = 0; # supressing warnings about uninitialized fields.
965 if( $self->_id_generation_func ) {
966 $temp_line = &{$self->_id_generation_func}($seq);
969 if( $seq->can('get_dates') ) {
970 ($date) = $seq->get_dates();
973 $self->warn("No whitespace allowed in GenBank display id [". $seq->display_id. "]")
974 if $seq->display_id =~ /\s/;
976 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s\n",
977 'LOCUS', $seq->id(),$len,
978 (lc($alpha) eq 'protein') ?
('aa','', '') :
979 ('bp', '',$mol),$circular,$div,$date);
982 $self->_print($temp_line);
983 $self->_write_line_GenBank_regex("DEFINITION ", " ",
984 $seq->desc(),"\\s\+\|\$",80);
986 # if there, write the accession line
988 if( $self->_ac_generation_func ) {
989 $temp_line = &{$self->_ac_generation_func}($seq);
990 $self->_print("ACCESSION $temp_line\n");
993 push(@acc, $seq->accession_number());
994 if( $seq->isa('Bio::Seq::RichSeqI') ) {
995 push(@acc, $seq->get_secondary_accessions());
997 $self->_print("ACCESSION ", join(" ", @acc), "\n");
998 # otherwise - cannot print <sigh>
1001 # if PID defined, print it
1002 if($seq->isa('Bio::Seq::RichSeqI') && $seq->pid()) {
1003 $self->_print("PID ", $seq->pid(), "\n");
1006 # if there, write the version line
1008 if( defined $self->_sv_generation_func() ) {
1009 $temp_line = &{$self->_sv_generation_func}($seq);
1011 $self->_print("VERSION $temp_line\n");
1014 if($seq->isa('Bio::Seq::RichSeqI') && defined($seq->seq_version)) {
1015 my $id = $seq->primary_id(); # this may be a GI number
1016 $self->_print("VERSION ",
1017 $seq->accession_number(), ".", $seq->seq_version,
1018 ($id && ($id =~ /^\d+$/) ?
" GI:".$id : ""),
1023 # if there, write the PROJECT line
1024 for my $proj ( $seq->annotation->get_Annotations('project') ) {
1025 $self->_print("PROJECT ".$proj->value."\n");
1028 # if there, write the DBSOURCE line
1029 foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) {
1030 my ($db, $id) = ($ref->database, $ref->primary_id);
1031 my $prefix = $db eq 'Project' ?
'DBLINK' : 'DBSOURCE';
1032 my $text = $db eq 'GenBank' ?
'' :
1033 $db eq 'Project' ?
"$db:$id" : "$db accession $id";
1034 $self->_print(sprintf ("%-11s %s\n",$prefix, $text));
1037 # if there, write the keywords line
1038 if( defined $self->_kw_generation_func() ) {
1039 $temp_line = &{$self->_kw_generation_func}($seq);
1040 $self->_print("KEYWORDS $temp_line\n");
1042 if( $seq->can('keywords') ) {
1043 my $kw = $seq->keywords;
1044 $kw .= '.' if( $kw !~ /\.$/ );
1045 $self->_print("KEYWORDS $kw\n");
1049 # SEGMENT if it exists
1050 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) {
1051 $self->_print(sprintf ("%-11s %s\n",'SEGMENT',
1056 if (my $spec = $seq->species) {
1057 my ($on, $sn, $cn) = ($spec->can('organelle') ?
$spec->organelle : '',
1058 $spec->scientific_name,
1059 $spec->common_name);
1061 if ($spec->isa('Bio::Species')) {
1062 @classification = $spec->classification;
1063 shift(@classification);
1065 # Bio::Taxon should have a DB handle of some type attached, so
1066 # derive the classification from that
1069 $node = $node->ancestor || last;
1070 unshift(@classification, $node->node_name);
1071 #$node eq $root && last;
1073 @classification = reverse @classification;
1075 my $abname = $spec->name('abbreviated') ?
# from genbank file
1076 $spec->name('abbreviated')->[0] : $sn;
1077 my $sl = $on ?
"$on " : '';
1078 $sl .= $cn ?
$abname." ($cn)" : "$abname";
1080 $self->_write_line_GenBank_regex("SOURCE ", ' 'x12
, $sl, "\\s\+\|\$",80);
1081 $self->_print(" ORGANISM ", $spec->scientific_name, "\n");
1082 my $OC = join('; ', (reverse(@classification))) .'.';
1083 $self->_write_line_GenBank_regex(' 'x12
,' 'x12
,
1084 $OC,"\\s\+\|\$",80);
1089 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
1090 $temp_line = "REFERENCE $count";
1092 $temp_line .= sprintf (" (%s %d to %d)",
1093 ($seq->alphabet() eq "protein" ?
1094 "residues" : "bases"),
1095 $ref->start,$ref->end);
1096 } elsif ($ref->gb_reference) {
1097 $temp_line .= sprintf (" (%s)", $ref->gb_reference);
1099 $self->_print("$temp_line\n");
1100 $self->_write_line_GenBank_regex(" AUTHORS ",' 'x12
,
1101 $ref->authors,"\\s\+\|\$",80);
1102 $self->_write_line_GenBank_regex(" CONSRTM ",' 'x12
,
1103 $ref->consortium,"\\s\+\|\$",80) if $ref->consortium;
1104 $self->_write_line_GenBank_regex(" TITLE "," "x12
,
1105 $ref->title,"\\s\+\|\$",80);
1106 $self->_write_line_GenBank_regex(" JOURNAL "," "x12
,
1107 $ref->location,"\\s\+\|\$",80);
1108 if( $ref->medline) {
1109 $self->_write_line_GenBank_regex(" MEDLINE "," "x12
,
1110 $ref->medline, "\\s\+\|\$",80);
1111 # I am assuming that pubmed entries only exist when there
1112 # are also MEDLINE entries due to the indentation
1114 # This could be a wrong assumption
1115 if( $ref->pubmed ) {
1116 $self->_write_line_GenBank_regex(" PUBMED "," "x12
,
1117 $ref->pubmed, "\\s\+\|\$",
1120 # put remark at the end
1121 if ($ref->comment) {
1122 $self->_write_line_GenBank_regex(" REMARK "," "x12
,
1123 $ref->comment,"\\s\+\|\$",80);
1129 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
1130 $self->_write_line_GenBank_regex("COMMENT "," "x12
,
1131 $comment->text,"\\s\+\|\$",80);
1135 $self->_print("FEATURES Location/Qualifiers\n");
1137 if( defined $self->_post_sort ) {
1138 # we need to read things into an array. Process. Sort them. Print 'em
1140 my $post_sort_func = $self->_post_sort();
1143 foreach my $sf ( $seq->top_SeqFeatures ) {
1144 push(@fth,Bio
::SeqIO
::FTHelper
::from_SeqFeature
($sf,$seq));
1147 @fth = sort { &$post_sort_func($a,$b) } @fth;
1149 foreach my $fth ( @fth ) {
1150 $self->_print_GenBank_FTHelper($fth);
1153 # not post sorted. And so we can print as we get them.
1154 # lower memory load...
1156 foreach my $sf ( $seq->top_SeqFeatures ) {
1157 my @fth = Bio
::SeqIO
::FTHelper
::from_SeqFeature
($sf,$seq);
1158 foreach my $fth ( @fth ) {
1159 if( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
1160 $sf->throw("Cannot process FTHelper... $fth");
1162 $self->_print_GenBank_FTHelper($fth);
1167 # deal with WGS; WGS_SCAFLD present only if WGS is also present
1168 if($seq->annotation->get_Annotations('wgs')) {
1170 (map {$seq->annotation->get_Annotations($_)} qw(wgs wgs_scaffold)) {
1171 $self->_print(sprintf ("%-11s %s\n",uc($wgs->tagname),
1174 $self->_show_dna(0);
1176 if($seq->annotation->get_Annotations('contig')) {
1179 foreach my $contig ($seq->annotation->get_Annotations('contig')) {
1181 $cline = uc($contig->tagname)." ".$contig->value."\n";
1183 $cline = " ".$contig->value."\n";
1185 $self->_print($cline);
1188 $self->_show_dna(0);
1190 if( $seq->length == 0 ) { $self->_show_dna(0) }
1192 if( $self->_show_dna() == 0 ) {
1193 $self->_print("\n//\n");
1197 # finished printing features.
1199 $str =~ tr/A-Z/a-z/;
1201 my ($o) = $seq->annotation->get_Annotations('origin');
1202 $self->_print(sprintf("%-12s%s\n",
1203 'ORIGIN', $o ?
$o->value : ''));
1204 # print out the sequence
1205 my $nuc = 60; # Number of nucleotides per line
1206 my $whole_pat = 'a10' x
6; # Pattern for unpacking a whole line
1207 my $out_pat = 'A11' x
6; # Pattern for packing a line
1208 my $length = length($str);
1210 # Calculate the number of nucleotides which fit on whole lines
1211 my $whole = int($length / $nuc) * $nuc;
1213 # Print the whole lines
1215 for ($i = 0; $i < $whole; $i += $nuc) {
1216 my $blocks = pack $out_pat,
1218 substr($str, $i, $nuc);
1220 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59));
1223 # Print the last line
1224 if (my $last = substr($str, $i)) {
1225 my $last_len = length($last);
1226 my $last_pat = 'a10' x
int($last_len / 10) .
1227 'a'. $last_len % 10;
1228 my $blocks = pack $out_pat,
1229 unpack($last_pat, $last);
1231 $self->_print(sprintf("%9d $blocks\n",
1232 $length - $last_len + 1));
1235 $self->_print("//\n");
1237 $self->flush if $self->_flush_on_write && defined $self->_fh;
1242 =head2 _print_GenBank_FTHelper
1244 Title : _print_GenBank_FTHelper
1253 sub _print_GenBank_FTHelper
{
1254 my ( $self, $fth ) = @_;
1256 if ( !ref $fth || !$fth->isa('Bio::SeqIO::FTHelper') ) {
1258 "$fth is not a FTHelper class. Attempting to print but there could be issues"
1262 my $spacer = ( length $fth->key >= 15 ) ?
' ' : '';
1263 $self->_write_line_GenBank_regex(
1264 sprintf( " %-16s%s", $fth->key, $spacer ),
1265 " " x
21, $fth->loc, "\,\|\$", 80 );
1267 foreach my $tag ( keys %{ $fth->field } ) {
1268 # Account for hash structure in Annotation::DBLink, not the expected array
1269 if ( $tag eq 'db_xref' && grep /HASH/, @
{ $fth->field->{$tag} }) {
1270 for my $ref ( @
{ $fth->field->{$tag} } ) {
1271 my $db = $ref->{'database'};
1272 my $id = $ref->{'primary_id'};
1273 $self->_write_line_GenBank_regex( " " x
21, " " x
21,
1274 "/$tag=\"$db:$id\"", "\.\|\$", 80 );
1277 # The usual case, where all values are found in an array
1279 foreach my $value ( @
{ $fth->field->{$tag} } ) {
1280 $value =~ s/\"/\"\"/g;
1281 if ( $value eq "_no_value" ) {
1282 $self->_write_line_GenBank_regex( " " x
21, " " x
21,
1283 "/$tag", "\.\|\$", 80 );
1286 # There are almost 3x more quoted qualifier values and they
1287 # are more common too so we take quoted ones first.
1288 # Long qualifiers, that will be line wrapped, are always quoted
1289 elsif ( !$FTQUAL_NO_QUOTE{$tag}
1290 or length("/$tag=$value") >= $FTQUAL_LINE_LENGTH )
1292 my ($pat) = ( $value =~ /\s/ ?
'\s|$' : '.|$' );
1293 $self->_write_line_GenBank_regex( " " x
21, " " x
21,
1294 "/$tag=\"$value\"", $pat, 80 );
1298 $self->_write_line_GenBank_regex( " " x
21, " " x
21,
1299 "/$tag=$value", "\.\|\$", 80 );
1307 =head2 _read_GenBank_References
1309 Title : _read_GenBank_References
1311 Function: Reads references from GenBank format. Internal function really
1317 sub _read_GenBank_References
{
1318 my ($self,$buffer) = @_;
1322 # assumme things are starting with RN
1324 if( $$buffer !~ /^REFERENCE/ ) {
1325 warn("Not parsing line '$$buffer' which maybe important");
1330 my (@title,@loc,@authors,@consort,@com,@medline,@pubmed);
1332 REFLOOP
: while( defined($_) || defined($_ = $self->_readline) ) {
1333 if (/^\s{2}AUTHORS\s+(.*)/o) {
1334 push (@authors, $1);
1335 while ( defined($_ = $self->_readline) ) {
1336 /^\s{9,}(.*)/o && do { push (@authors, $1);next;};
1339 $ref->authors(join(' ', @authors));
1341 if (/^\s{2}CONSRTM\s+(.*)/o) {
1342 push (@consort, $1);
1343 while ( defined($_ = $self->_readline) ) {
1344 /^\s{9,}(.*)/o && do { push (@consort, $1);next;};
1347 $ref->consortium(join(' ', @consort));
1349 if (/^\s{2}TITLE\s+(.*)/o) {
1351 while ( defined($_ = $self->_readline) ) {
1352 /^\s{9,}(.*)/o && do { push (@title, $1);
1357 $ref->title(join(' ', @title));
1359 if (/^\s{2}JOURNAL\s+(.*)/o) {
1361 while ( defined($_ = $self->_readline) ) {
1362 # we only match when there are at least 4 spaces
1363 # there is probably a better way to match this
1364 # as it assumes that the describing tag is short enough
1365 /^\s{9,}(.*)/o && do { push(@loc, $1);
1370 $ref->location(join(' ', @loc));
1373 if (/^\s{2}REMARK\s+(.*)/o) {
1375 while ( defined($_ = $self->_readline) ) {
1376 /^\s{9,}(.*)/o && do { push(@com, $1);
1381 $ref->comment(join(' ', @com));
1384 if( /^\s{2}MEDLINE\s+(.*)/ ) {
1386 while ( defined($_ = $self->_readline) ) {
1387 /^\s{9,}(.*)/ && do { push(@medline, $1);
1392 $ref->medline(join(' ', @medline));
1395 if( /^\s{3}PUBMED\s+(.*)/ ) {
1397 while ( defined($_ = $self->_readline) ) {
1398 /^\s{9,}(.*)/ && do { push(@pubmed, $1);
1403 $ref->pubmed(join(' ', @pubmed));
1407 /^REFERENCE/o && do {
1408 # store current reference
1409 $self->_add_ref_to_array(\
@refs,$ref) if defined $ref;
1417 # create the new reference object
1418 $ref = Bio
::Annotation
::Reference
->new(-tagname
=> 'reference');
1419 # check whether start and end base is given
1420 if (/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){
1423 } elsif (/^REFERENCE\s+\d+\s+\((.*)\)/) {
1424 $ref->gb_reference($1);
1428 /^(FEATURES)|(COMMENT)/o && last;
1430 $_ = undef; # Empty $_ to trigger read of next line
1433 # store last reference
1434 $self->_add_ref_to_array(\
@refs,$ref) if defined $ref;
1438 #print "\nnumber of references found: ", $#refs+1,"\n";
1443 =head2 _add_ref_to_array
1445 Title: _add_ref_to_array
1447 Function: Adds a Reference object to an array of Reference objects, takes
1448 care of possible cleanups to be done (currently, only author and title
1449 will be chopped of trailing semicolons).
1450 Args: A reference to an array of Reference objects and
1451 the Reference object to be added
1456 sub _add_ref_to_array
{
1457 my ($self, $refs, $ref) = @_;
1459 # first, polish author and title by removing possible trailing semicolons
1460 my $au = $ref->authors();
1461 my $title = $ref->title();
1462 $au =~ s/;\s*$//g if $au;
1463 $title =~ s/;\s*$//g if $title;
1465 $ref->title($title);
1466 # the rest should be clean already, so go ahead and add it
1467 push(@
{$refs}, $ref);
1470 =head2 _read_GenBank_Species
1472 Title : _read_GenBank_Species
1474 Function: Reads the GenBank Organism species and classification
1475 lines. Able to deal with unconvential Organism naming
1476 formats, and varietas in plants
1477 Example : ORGANISM unknown marine gamma proteobacterium NOR5
1479 $species = unknown marine gamma proteobacterium NOR5
1481 ORGANISM Drosophila sp. 'white tip scutellum'
1483 $species = sp. 'white tip scutellum'
1484 (yes, this really is a species and that is its name)
1487 ORGANISM Ajellomyces capsulatus var. farciminosus
1488 $genus = Ajellomyces
1489 $species = capsulatus
1490 $subspecies = var. farciminosus
1492 ORGANISM Hepatitis delta virus
1493 $genus = undef (though this virus has a genus in its lineage, we
1494 cannot know that without a database lookup)
1495 $species = Hepatitis delta virus
1497 Returns : A Bio::Species object
1498 Args : A reference to the current line buffer
1502 sub _read_GenBank_Species
{
1503 my ($self, $buffer) = @_;
1505 my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown',
1506 'Unspecified', 'Unknown', 'None', 'unclassified',
1507 'unidentified organism', 'not supplied');
1508 # dictionary of synonyms for taxid 32644
1509 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified');
1510 # all above can be part of valid species name
1512 my $line = $$buffer;
1514 my( $sub_species, $species, $genus, $sci_name, $common, $class_lines,
1515 $source_flag, $abbr_name, $organelle, $sl );
1516 my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION);
1517 # upon first entering the loop, we must not read a new line -- the SOURCE
1518 # line is already in the buffer (HL 05/10/2000)
1519 my ($ann, $tag, $data);
1520 while (defined($line) || defined($line = $self->_readline())) {
1521 # de-HTMLify (links that may be encountered here don't contain
1522 # escaped '>', so a simple-minded approach suffices)
1523 $line =~ s{<[^>]+>}{}g;
1524 if ($line =~ m{^(?:\s{0,2})(\w
+)\s
+(.+)?
$}ox
) {
1525 ($tag, $data) = ($1, $2 || '');
1526 last if ($tag && !exists $source{$tag});
1529 ($data = $line) =~ s{^\s+}{};
1531 $tag = 'CLASSIFICATION' if ($tag ne 'CLASSIFICATION' && $tag eq 'ORGANISM' && $line =~ m{[;\.]+});
1533 (exists $ann->{$tag}) ?
($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
1537 ($sl, $class_lines, $sci_name) = ($ann->{SOURCE
}, $ann->{CLASSIFICATION
}, $ann->{ORGANISM
});
1541 $sci_name || return;
1543 # parse out organelle, common name, abbreviated name if present;
1544 # this should catch everything, but falls back to
1545 # entire SOURCE line just in case
1547 (mitochondrion
|chloroplast
|plastid
)?
1549 \s
*(?
: \
( (.*?
) \
) )?\
.?
1552 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
1554 $abbr_name = $sl; # nothing caught; this is a backup!
1557 # Convert data in classification lines into classification array.
1558 # only split on ';' or '.' so that classification that is 2 or more words will
1559 # still get matched, use map() to remove trailing/leading/intervening spaces
1560 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?<!subgen)[;\.]+/, $class_lines;
1562 # do we have a genus?
1563 my $possible_genus = quotemeta($class[-1])
1564 . ($class[-2] ?
"|" . quotemeta($class[-2]) : '');
1565 if ($sci_name =~ /^($possible_genus)/) {
1567 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1570 $species = $sci_name;
1573 # is this organism of rank species or is it lower?
1574 # (we don't catch everything lower than species, but it doesn't matter -
1575 # this is just so we abide by previous behaviour whilst not calling a
1576 # species a subspecies)
1577 if ($species && $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) {
1578 ($species, $sub_species) = ($1, $2);
1581 # Don't make a species object if it's empty or "Unknown" or "None"
1582 # return unless $genus and $genus !~ /^(Unknown|None)$/oi;
1583 # Don't make a species object if it belongs to taxid 32644
1584 # my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names;
1585 my $unkn = grep { $_ eq $sl } @unkn_names;
1586 return unless ($species || $genus) and $unkn == 0;
1588 # Bio::Species array needs array in Species -> Kingdom direction
1589 push(@class, $sci_name);
1590 @class = reverse @class;
1592 my $make = Bio
::Species
->new();
1593 $make->scientific_name($sci_name);
1594 $make->classification(@class) if @class > 0;
1595 $make->common_name( $common ) if $common;
1596 $make->name('abbreviated', $abbr_name) if $abbr_name;
1597 $make->organelle($organelle) if $organelle;
1598 #$make->sub_species( $sub_species ) if $sub_species;
1602 =head2 _read_FTHelper_GenBank
1604 Title : _read_FTHelper_GenBank
1605 Usage : _read_FTHelper_GenBank($buffer)
1606 Function: reads the next FT key line
1608 Returns : Bio::SeqIO::FTHelper object
1609 Args : filehandle and reference to a scalar
1613 sub _read_FTHelper_GenBank
{
1614 my ($self,$buffer) = @_;
1616 my ($key, # The key of the feature
1617 $loc # The location line from the feature
1619 my @qual = (); # An array of lines making up the qualifiers
1621 if ($$buffer =~ /^\s{5}(\S+)\s+(.+?)\s*$/o) {
1624 # Read all the lines up to the next feature
1625 while ( defined($_ = $self->_readline) ) {
1626 if (/^(\s+)(.+?)\s*$/o) {
1627 # Lines inside features are preceded by 21 spaces
1628 # A new feature is preceded by 5 spaces
1629 if (length($1) > 6) {
1630 # Add to qualifiers if we're in the qualifiers, or if it's
1631 # the first qualifier
1632 if (@qual || (index($2,'/') == 0)) {
1635 # We're still in the location line, so append to location
1640 # We've reached the start of the next feature
1644 # We're at the end of the feature table
1650 $self->debug("no feature key!\n");
1651 # change suggested by JDiggans to avoid infinite loop-
1652 # see bugreport 1062.
1653 # reset buffer to prevent infinite loop
1654 $$buffer = $self->_readline();
1658 # Put the first line of the next feature into the buffer
1661 # Make the new FTHelper object
1662 my $out = Bio
::SeqIO
::FTHelper
->new();
1663 $out->verbose($self->verbose());
1667 # Now parse and add any qualifiers. (@qual is kept
1668 # intact to provide informative error messages.)
1670 for (my $i = 0; $i < @qual; $i++) {
1672 my( $qualifier, $value ) = (m{^/([^=]+)(?:=(.+))?})
1673 or $self->warn("cannot see new qualifier in feature $key: ".
1675 $qualifier = '' unless( defined $qualifier);
1676 if (defined $value) {
1677 # Do we have a quoted value?
1678 if (substr($value, 0, 1) eq '"') {
1679 # Keep adding to value until we find the trailing quote
1680 # and the quotes are balanced
1681 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) {
1683 $self->warn("Unbalanced quote in:\n" .
1685 "No further qualifiers will " .
1686 "be added for this feature");
1689 $i++; # modifying a for-loop variable inside of the loop
1690 # is not the best programming style ...
1691 my $next = $qual[$i];
1693 # add to value with a space unless the value appears
1694 # to be a sequence (translation for example)
1695 # if(($value.$next) =~ /[^A-Za-z\"\-]/o) {
1696 # changed to explicitly look for translation tag - cjf 06/8/29
1697 if ($qualifier !~ /^translation$/i ) {
1702 # Trim leading and trailing quotes
1703 $value =~ s/^"|"$//g;
1704 # Undouble internal quotes
1705 $value =~ s/""/\"/g;
1706 } elsif ( $value =~ /^\(/ ) { # values quoted by ()s
1707 # Keep adding to value until we find the trailing bracket
1708 # and the ()s are balanced
1709 my $left = ($value =~ tr/\(/\(/); # count left parens
1710 my $right = ($value =~ tr/\)/\)/); # count right parens
1712 while( $left != $right ) { # was "$value !~ /\)$/ or $left != $right"
1714 $self->warn("Unbalanced parens in:\n".
1716 "\nNo further qualifiers will ".
1717 "be added for this feature");
1721 my $next = $qual[$i];
1723 $left += ($next =~ tr/\(/\(/);
1724 $right += ($next =~ tr/\)/\)/);
1728 $value = '_no_value';
1730 # Store the qualifier
1731 $out->field->{$qualifier} ||= [];
1732 push(@
{$out->field->{$qualifier}},$value);
1737 =head2 _write_line_GenBank
1739 Title : _write_line_GenBank
1741 Function: internal function
1748 sub _write_line_GenBank
{
1749 my ($self,$pre1,$pre2,$line,$length) = @_;
1751 $length || $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1752 my $subl = $length - length $pre2;
1753 my $linel = length $line;
1756 my $subr = substr($line,0,$length - length $pre1);
1758 $self->_print("$pre1$subr\n");
1759 for($i= ($length - length $pre1);$i < $linel; $i += $subl) {
1760 $subr = substr($line,$i,$subl);
1761 $self->_print("$pre2$subr\n");
1766 =head2 _write_line_GenBank_regex
1768 Title : _write_line_GenBank_regex
1770 Function: internal function for writing lines of specified
1771 length, with different first and the next line
1772 left hand headers and split at specific points in the
1780 regex for line breaks,
1786 sub _write_line_GenBank_regex
{
1787 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1789 #print STDOUT "Going to print with $line!\n";
1791 $length || $self->throw( "Miscalled write_line_GenBank without length. Programming error!");
1793 my $subl = $length - (length $pre1) - 2;
1796 CHUNK
: while($line) {
1797 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1798 if($line =~ m/^(.{0,$subl})($pat)(.*)/ ) {
1800 $line = substr($line,length($l));
1801 # be strict about not padding spaces according to
1804 next CHUNK
if ($l eq '');
1809 # if we get here none of the patterns matched $subl or less chars
1810 $self->warn("trouble dissecting \"$line\"\n into chunks ".
1811 "of $subl chars or less - this tag won't print right");
1812 # insert a space char to prevent infinite loops
1813 $line = substr($line,0,$subl) . " " . substr($line,$subl);
1815 my $s = shift @lines;
1816 $self->_print("$pre1$s\n") if $s;
1817 foreach my $s ( @lines ) {
1818 $self->_print("$pre2$s\n");
1825 Usage : $obj->_post_sort($newval)
1827 Returns : value of _post_sort
1828 Args : newvalue (optional)
1834 my ($obj,$value) = @_;
1835 if( defined $value) {
1836 $obj->{'_post_sort'} = $value;
1838 return $obj->{'_post_sort'};
1845 Usage : $obj->_show_dna($newval)
1847 Returns : value of _show_dna
1848 Args : newvalue (optional)
1854 my ($obj,$value) = @_;
1855 if( defined $value) {
1856 $obj->{'_show_dna'} = $value;
1858 return $obj->{'_show_dna'};
1861 =head2 _id_generation_func
1863 Title : _id_generation_func
1864 Usage : $obj->_id_generation_func($newval)
1866 Returns : value of _id_generation_func
1867 Args : newvalue (optional)
1872 sub _id_generation_func
{
1873 my ($obj,$value) = @_;
1874 if( defined $value ) {
1875 $obj->{'_id_generation_func'} = $value;
1877 return $obj->{'_id_generation_func'};
1881 =head2 _ac_generation_func
1883 Title : _ac_generation_func
1884 Usage : $obj->_ac_generation_func($newval)
1886 Returns : value of _ac_generation_func
1887 Args : newvalue (optional)
1891 sub _ac_generation_func
{
1892 my ($obj,$value) = @_;
1893 if( defined $value ) {
1894 $obj->{'_ac_generation_func'} = $value;
1896 return $obj->{'_ac_generation_func'};
1900 =head2 _sv_generation_func
1902 Title : _sv_generation_func
1903 Usage : $obj->_sv_generation_func($newval)
1905 Returns : value of _sv_generation_func
1906 Args : newvalue (optional)
1911 sub _sv_generation_func
{
1912 my ($obj,$value) = @_;
1913 if( defined $value ) {
1914 $obj->{'_sv_generation_func'} = $value;
1916 return $obj->{'_sv_generation_func'};
1921 =head2 _kw_generation_func
1923 Title : _kw_generation_func
1924 Usage : $obj->_kw_generation_func($newval)
1926 Returns : value of _kw_generation_func
1927 Args : newvalue (optional)
1932 sub _kw_generation_func
{
1933 my ($obj,$value) = @_;
1934 if( defined $value ) {
1935 $obj->{'_kw_generation_func'} = $value;
1937 return $obj->{'_kw_generation_func'};