2 # BioPerl module for Bio::SeqIO::genbank
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Bioperl project bioperl-l(at)bioperl.org
8 # Copyright Elia Stupka and contributors see AUTHORS section
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqIO::genbank - GenBank sequence input/output stream
20 It is probably best not to use this object directly, but
21 rather go through the SeqIO handler:
23 $stream = Bio::SeqIO->new(-file => $filename,
24 -format => 'GenBank');
26 while ( my $seq = $stream->next_seq() ) {
27 # do something with $seq
33 This object can transform Bio::Seq objects to and from GenBank flat
36 There is some flexibility here about how to write GenBank output
37 that is not fully documented.
39 =head2 Optional functions
45 (output only) shows the dna or not
49 (output only) provides a sorting func which is applied to the FTHelpers
52 =item _id_generation_func()
54 This is function which is called as
56 print "ID ", $func($seq), "\n";
58 To generate the ID line. If it is not there, it generates a sensible ID
59 line using a number of tools.
61 If you want to output annotations in Genbank format they need to be
62 stored in a Bio::Annotation::Collection object which is accessible
63 through the Bio::SeqI interface method L<annotation()|annotation>.
65 The following are the names of the keys which are pulled from a
66 L<Bio::Annotation::Collection> object:
68 reference - Should contain Bio::Annotation::Reference objects
69 comment - Should contain Bio::Annotation::Comment objects
70 dblink - Should contain a Bio::Annotation::DBLink object
71 segment - Should contain a Bio::Annotation::SimpleValue object
72 origin - Should contain a Bio::Annotation::SimpleValue object
73 wgs - Should contain a Bio::Annotation::SimpleValue object
77 =head1 Where does the data go?
79 Data parsed in Bio::SeqIO::genbank is stored in a variety of data
80 fields in the sequence object that is returned. Here is a partial list
83 Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
84 the top level object which defines a function called NAME() which
85 stores this information.
87 Items listed as Annotation 'NAME' tell you the data is stored the
88 associated Bio::AnnotationCollectionI object which is associated with
89 Bio::Seq objects. If it is explictly requested that no annotations
90 should be stored when parsing a record of course they will not be
91 available when you try and get them. If you are having this problem
92 look at the type of SeqBuilder that is being used to contruct your
95 Comments Annotation 'comment'
96 References Annotation 'reference'
97 Segment Annotation 'segment'
98 Origin Annotation 'origin'
99 Dbsource Annotation 'dblink'
101 Accessions PrimarySeq accession_number()
102 Secondary accessions RichSeq get_secondary_accessions()
103 GI number PrimarySeq primary_id()
104 LOCUS PrimarySeq display_id()
105 Keywords RichSeq get_keywords()
106 Dates RichSeq get_dates()
107 Molecule RichSeq molecule()
108 Seq Version RichSeq seq_version()
110 Division RichSeq division()
111 Features Seq get_SeqFeatures()
112 Alphabet PrimarySeq alphabet()
113 Definition PrimarySeq description() or desc()
114 Version PrimarySeq version()
116 Sequence PrimarySeq seq()
118 There is more information in the Feature-Annotation HOWTO about each
119 field and how it is mapped to the Sequence object
120 L<http://bioperl.open-bio.org/wiki/HOWTO:Feature-Annotation>.
126 User feedback is an integral part of the evolution of this and other
127 Bioperl modules. Send your comments and suggestions preferably to one
128 of the Bioperl mailing lists. Your participation is much appreciated.
130 bioperl-l@bioperl.org - General discussion
131 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
135 Please direct usage questions or support issues to the mailing list:
137 I<bioperl-l@bioperl.org>
139 rather than to the module maintainer directly. Many experienced and
140 reponsive experts will be able look at the problem and quickly
141 address it. Please include a thorough description of the problem
142 with code and data examples if at all possible.
144 =head2 Reporting Bugs
146 Report bugs to the Bioperl bug tracking system to help us keep track
147 the bugs and their resolution. Bug reports can be submitted via the web:
149 https://redmine.open-bio.org/projects/bioperl/
151 =head1 AUTHOR - Bioperl Project
153 bioperl-l at bioperl.org
155 Original author Elia Stupka, elia -at- tigem.it
159 Ewan Birney birney at ebi.ac.uk
160 Jason Stajich jason at bioperl.org
161 Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
162 Lincoln Stein lstein at cshl.org
163 Heikki Lehvaslaiho, heikki at ebi.ac.uk
164 Hilmar Lapp, hlapp at gmx.net
165 Donald G. Jackson, donald.jackson at bms.com
166 James Wasmuth, james.wasmuth at ed.ac.uk
167 Brian Osborne, bosborne at alum.mit.edu
168 Chris Fields, cjfields at bioperl dot org
172 The rest of the documentation details each of the object
173 methods. Internal methods are usually preceded with a _
177 # Let the code begin...
179 package Bio
::SeqIO
::genbank
;
182 use Bio
::SeqIO
::FTHelper
;
183 use Bio
::SeqFeature
::Generic
;
185 use Bio
::Seq
::SeqFactory
;
186 use Bio
::Annotation
::Collection
;
187 use Bio
::Annotation
::Comment
;
188 use Bio
::Annotation
::Reference
;
189 use Bio
::Annotation
::DBLink
;
191 use base
qw(Bio::SeqIO);
193 our %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
196 cons_splice direction
200 transl_except transl_table
204 our %DBSOURCE = map {$_ => 1} qw(
205 EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
206 TIGR GO InterPro Pfam PROSITE SGD GermOnline
207 HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
208 H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
209 MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
210 UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
211 PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
212 Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
213 TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
214 PhotoList Gramene WormBase WormPep Genew ZFIN
215 PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
216 swissprot GenBank GenPept REFSEQ embl PDB UniProtKB
217 DIP PeptideAtlas PRIDE CYGD HOGENOME Gene3D
220 our %VALID_MOLTYPE = map {$_ => 1} qw(NA DNA RNA tRNA rRNA cDNA cRNA ms-DNA
221 mRNA uRNA ss-RNA ss-DNA snRNA snoRNA PRT);
223 our %VALID_ALPHABET = (
226 'rc' => '' # rc = release candidate; file has no sequences
230 my($self,@args) = @_;
232 $self->SUPER::_initialize
(@args);
233 # hash for functions for decoding keys.
234 $self->{'_func_ftunit_hash'} = {};
235 $self->_show_dna(1); # sets this to one by default. People can change it
236 if( ! defined $self->sequence_factory ) {
237 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new
238 (-verbose
=> $self->verbose(),
239 -type
=> 'Bio::Seq::RichSeq'));
246 Usage : $seq = $stream->next_seq()
247 Function: returns the next sequence in the stream
248 Returns : Bio::Seq object
254 my ($self,@args) = @_;
256 my $builder = $self->sequence_builder();
263 my (@acc, @features);
264 my ($display_id, $annotation);
267 # initialize; we may come here because of starting over
272 %params = (-verbose
=> $self->verbose); # reset hash
274 while(defined($buffer = $self->_readline())) {
275 last if index($buffer,'LOCUS ') == 0;
277 return unless defined $buffer; # end of file
278 $buffer =~ /^LOCUS\s+(\S.*)$/o ||
279 $self->throw("GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'");
281 my @tokens = split(' ', $1);
283 # this is important to have the id for display in e.g. FTHelper,
284 # otherwise you won't know which entry caused an error
285 $display_id = shift(@tokens);
286 $params{'-display_id'} = $display_id;
287 # may still be useful if we don't want the seq
288 my $seqlength = shift(@tokens);
289 if (exists $VALID_ALPHABET{$seqlength}) {
290 # moved one token too far. No locus name?
291 $self->warn("Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id");
292 $params{'-display_id'} = 'unknown';
293 $params{'-length'} = $display_id;
295 unshift @tokens, $seqlength;
297 $params{'-length'} = $seqlength;
299 # the alphabet of the entry
300 # shouldn't assign alphabet unless one is specifically designated (such as for rc files)
301 my $alphabet = lc(shift @tokens);
302 $params{'-alphabet'} = (exists $VALID_ALPHABET{$alphabet}) ?
$VALID_ALPHABET{$alphabet} :
303 $self->warn("Unknown alphabet: $alphabet");
304 # for aa there is usually no 'molecule' (mRNA etc)
305 if ($params{'-alphabet'} eq 'protein') {
306 $params{'-molecule'} = 'PRT'
308 $params{'-molecule'} = shift(@tokens);
310 # take care of lower case issues
311 if ($params{'-molecule'} eq 'dna' || $params{'-molecule'} eq 'rna') {
312 $params{'-molecule'} = uc $params{'-molecule'};
314 $self->debug("Unrecognized molecule type:".$params{'-molecule'}) if
315 !exists($VALID_MOLTYPE{$params{'-molecule'}});
316 my $circ = shift(@tokens);
317 if ($circ eq 'circular') {
318 $params{'-is_circular'} = 1;
319 $params{'-division'} = shift(@tokens);
321 # 'linear' or 'circular' may actually be omitted altogether
322 $params{'-division'} =
323 (CORE
::length($circ) == 3 ) ?
$circ : shift(@tokens);
325 my $date = join(' ', @tokens); # we lump together the rest
327 # this is per request bug #1513
333 if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
334 if( length($date) < 11 ) {
335 # improperly formatted date
336 # But we'll be nice and fix it for them
337 my ($d,$m,$y) = ($2,$3,$4);
338 if( length($d) == 1 ) {
341 # guess the century here
342 if( length($y) == 2 ) {
343 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
348 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
350 $params{'-dates'} = [join('-',$d,$m,$y)];
352 $params{'-dates'} = [$date];
355 # set them all at once
356 $builder->add_slot_value(%params);
359 # parse the rest if desired, otherwise start over
360 if(! $builder->want_object()) {
361 $builder->make_object();
365 # set up annotation depending on what the builder wants
366 if($builder->want_slot('annotation')) {
367 $annotation = Bio
::Annotation
::Collection
->new();
369 $buffer = $self->_readline();
370 until( !defined ($buffer) ) {
372 # Description line(s)
373 if (/^DEFINITION\s+(\S.*\S)/) {
375 while ( defined($_ = $self->_readline) ) {
376 if( /^\s+(.*)/ ) { push (@desc, $1); next };
379 $builder->add_slot_value(-desc
=> join(' ', @desc));
380 # we'll continue right here because DEFINITION always comes
381 # at the top of the entry
384 # accession number (there can be multiple accessions)
385 if( /^ACCESSION\s+(\S.*\S)/ ) {
386 push(@acc, split(/\s+/,$1));
387 while( defined($_ = $self->_readline) ) {
388 /^\s+(.*)/ && do { push (@acc, split(/\s+/,$1)); next };
395 elsif( /^PID\s+(\S+)/ ) {
396 $params{'-pid'} = $1;
399 elsif( /^VERSION\s+(\S.+)$/ ) {
400 my ($acc,$gi) = split(' ',$1);
401 if($acc =~ /^\w+\.(\d+)/) {
402 $params{'-version'} = $1;
403 $params{'-seq_version'} = $1;
405 if($gi && (index($gi,"GI:") == 0)) {
406 $params{'-primary_id'} = substr($gi,3);
410 elsif( /^KEYWORDS\s+(\S.*)/ ) {
411 my @kw = split(/\s*\;\s*/,$1);
412 while( defined($_ = $self->_readline) ) {
414 /^\s+(.*)/ && do { push (@kw, split(/\s*\;\s*/,$1)); next };
418 @kw && $kw[-1] =~ s/\.$//;
419 $params{'-keywords'} = \
@kw;
423 # Organism name and phylogenetic information
424 elsif (/^SOURCE\s+\S/) {
425 if($builder->want_slot('species')) {
426 $species = $self->_read_GenBank_Species(\
$buffer);
427 $builder->add_slot_value(-species
=> $species);
429 while(defined($buffer = $self->_readline())) {
430 last if substr($buffer,0,1) ne ' ';
436 elsif (/^REFERENCE\s+\S/) {
438 my @refs = $self->_read_GenBank_References(\
$buffer);
439 foreach my $ref ( @refs ) {
440 $annotation->add_Annotation('reference',$ref);
443 while(defined($buffer = $self->_readline())) {
444 last if substr($buffer,0,1) ne ' ';
450 elsif (/^PROJECT\s+(\S.*)/) {
452 my $project = new Bio
::Annotation
::SimpleValue
->new(-value
=> $1);
453 $annotation->add_Annotation('project',$project);
457 elsif (/^COMMENT\s+(\S.*)/) {
460 while (defined($_ = $self->_readline)) {
464 $comment =~ s/\n/ /g;
465 $comment =~ s/ +/ /g;
466 $annotation->add_Annotation('comment',
467 Bio
::Annotation
::Comment
->new(-text
=> $comment,
468 -tagname
=> 'comment'));
471 while(defined($buffer = $self->_readline())) {
472 last if substr($buffer,0,1) ne ' ';
477 # Corresponding Genbank nucleotide id, Genpept only
478 elsif( /^DB(?:SOURCE|LINK)\s+(\S.+)/ ) {
481 while (defined($_ = $self->_readline)) {
485 # deal with UniProKB dbsources
486 if( $dbsource =~ s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
487 $annotation->add_Annotation
489 Bio
::Annotation
::DBLink
->new
492 -tagname
=> 'dblink'));
493 if( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
494 $annotation->add_Annotation
496 Bio
::Annotation
::SimpleValue
->new
497 (-tagname
=> 'date_created',
500 while( $dbsource =~ s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
501 $annotation->add_Annotation
503 Bio
::Annotation
::SimpleValue
->new
504 (-tagname
=> 'date_updated',
507 $dbsource =~ s/\n/ /g;
508 if( $dbsource =~ s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
509 # will use $i to determine even or odd
510 # for swissprot the accessions are paired
512 for my $dbsrc ( split(/,\s+/,$1) ) {
513 if( $dbsrc =~ /(\S+)\.(\d+)/ ||
514 $dbsrc =~ /(\S+)/ ) {
515 my ($id,$version) = ($1,$2);
516 $version ='' unless defined $version;
518 if( $id =~ /^\d\S{3}/) {
521 $db = ($i++ % 2 ) ?
'GenPept' : 'GenBank';
523 $annotation->add_Annotation
525 Bio
::Annotation
::DBLink
->new
527 -version
=> $version,
529 -tagname
=> 'dblink'));
532 } elsif( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
533 # download screwed up and ncbi didn't put acc in for gi numbers
535 for my $id ( split(/\,\s+/,$1) ) {
537 if( $id =~ /gi:\s+(\d+)/ ) {
539 $db = ($i++ % 2 ) ?
'GenPept' : 'GenBank';
540 } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
547 $annotation->add_Annotation
549 Bio
::Annotation
::DBLink
->new
550 (-primary_id
=> $acc,
552 -tagname
=> 'dblink'));
555 $self->debug("Cannot match $dbsource\n");
557 if( $dbsource =~ s
/xrefs\s
+\
(non\
-sequence\s
+databases\
):\s
+
558 ((?
:\S
+,\s
+)+\S
+)//x ) {
559 for my $id ( split(/\,\s+/,$1) ) {
561 # this is because GenBank dropped the spaces!!!
562 # I'm sure we're not going to get this right
563 ##if( $id =~ s/^://i ) {
566 $db = substr($id,0,index($id,':'));
567 if (! exists $DBSOURCE{ $db }) {
568 $db = ''; # do we want 'GenBank' here?
570 $id = substr($id,index($id,':')+1);
571 $annotation->add_Annotation
573 Bio
::Annotation
::DBLink
->new
576 -tagname
=> 'dblink'));
581 if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
582 my ($db,$id,$version) = ($1,$2,$3);
583 $annotation->add_Annotation
585 Bio
::Annotation
::DBLink
->new
587 -version
=> $version,
588 -database
=> $db || 'GenBank',
589 -tagname
=> 'dblink'));
590 } elsif ( $dbsource =~ /(\S+)([\.:])\s*(\d+)/ ) {
591 my ($id, $db, $version);
593 ($db, $id) = ($1, $3);
595 ($db, $id, $version) = ('GenBank', $1, $3);
597 $annotation->add_Annotation('dblink',
598 Bio
::Annotation
::DBLink
->new(
600 -version
=> $version,
602 -tagname
=> 'dblink')
605 $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
611 while(defined($buffer = $self->_readline())) {
612 last if substr($buffer,0,1) ne ' ';
617 # Exit at start of Feature table, or start of sequence
618 last if( /^(FEATURES|ORIGIN)/ );
619 # Get next line and loop again
620 $buffer = $self->_readline;
622 return unless defined $buffer;
624 # add them all at once for efficiency
625 $builder->add_slot_value(-accession_number
=> shift(@acc),
626 -secondary_accessions
=> \
@acc,
628 $builder->add_slot_value(-annotation
=> $annotation) if $annotation;
629 %params = (); # reset before possible re-use to avoid setting twice
631 # start over if we don't want to continue with this entry
632 if(! $builder->want_object()) {
633 $builder->make_object();
636 # some "minimal" formats may not necessarily have a feature table
637 if($builder->want_slot('features') && defined($_) && /^FEATURES/o) {
638 # need to read the first line of the feature table
639 $buffer = $self->_readline;
640 # DO NOT read lines in the while condition -- this is done as a side
641 # effect in _read_FTHelper_GenBank!
643 # part of new circular spec:
644 # commented out for now until kinks worked out
646 #$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
648 while( defined($buffer) ) {
649 # check immediately -- not at the end of the loop
650 # note: GenPept entries obviously do not have a BASE line
651 last if( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o);
653 # slurp in one feature at a time -- at return, the start of
654 # the next feature will have been read already, so we need
655 # to pass a reference, and the called method must set this
656 # to the last line read before returning
658 my $ftunit = $self->_read_FTHelper_GenBank(\
$buffer);
660 # implement new circular spec: features that cross the origin are now
661 # seamless instead of being 2 separate joined features
662 # commented out until kinks get worked out
663 #if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
664 #&& $sourceEnd == $2 && $3 == 1) {
667 #$ftunit->{'loc'} = "$start..$end";
670 # fix suggested by James Diggans
672 if( !defined $ftunit ) {
673 # GRRRR. We have fallen over. Try to recover
674 $self->warn("Unexpected error in feature table for ".$params{'-display_id'}." Skipping feature, attempting to recover");
675 unless( ($buffer =~ /^\s{5,5}\S+/o) or
676 ($buffer =~ /^\S+/o)) {
677 $buffer = $self->_readline();
679 next; # back to reading FTHelpers
684 $ftunit->_generic_seqfeature($self->location_factory(),
686 # add taxon_id from source if available
687 if($species && ($feat->primary_tag eq 'source') &&
688 $feat->has_tag('db_xref') && (! $species->ncbi_taxid() ||
689 ($species->ncbi_taxid && $species->ncbi_taxid =~ /^list/))) {
690 foreach my $tagval ($feat->get_tag_values('db_xref')) {
691 if(index($tagval,"taxon:") == 0) {
692 $species->ncbi_taxid(substr($tagval,6));
697 # add feature to list of features
698 push(@features, $feat);
700 $builder->add_slot_value(-features
=> \
@features);
707 while($_ !~ m{^//}) { # end of file
708 $_ =~ /^(?:CONTIG)?\s+(.*)/;
710 $_ = $self->_readline;
713 $annotation->add_Annotation(
714 Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'contig',
718 $self->_pushback($_);
719 } elsif( /^WGS|WGS_SCAFLD\s+/o ) { # catch WGS/WGS_SCAFLD lines
720 while($_ =~ s/(^WGS|WGS_SCAFLD)\s+//){ # gulp lines
722 $annotation->add_Annotation(
723 Bio
::Annotation
::SimpleValue
->new(-value
=> $_,
724 -tagname
=> lc($1)));
725 $_ = $self->_readline;
727 } elsif(! m{^(ORIGIN|//)} ) { # advance to the sequence, if any
728 while (defined( $_ = $self->_readline) ) {
729 last if m{^(ORIGIN|//)};
733 if(! $builder->want_object()) {
734 $builder->make_object(); # implicit end-of-object
737 if($builder->want_slot('seq')) {
738 # the fact that we want a sequence does not necessarily mean that
739 # there also is a sequence ...
740 if(defined($_) && s/^ORIGIN\s+//) {
742 if( $annotation && length($_) > 0 ) {
743 $annotation->add_Annotation('origin',
744 Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'origin',
748 while( defined($_ = $self->_readline) ) {
754 #$self->debug("sequence length is ". length($seqc) ."\n");
755 $builder->add_slot_value(-seq
=> $seqc);
757 } elsif ( defined($_) && (substr($_,0,2) ne '//')) {
758 # advance to the end of the record
759 while( defined($_ = $self->_readline) ) {
760 last if substr($_,0,2) eq '//';
763 # Unlikely, but maybe the sequence is so weird that we don't want it
764 # anymore. We don't want to return undef if the stream's not exhausted
766 $seq = $builder->make_object();
767 next RECORDSTART
unless $seq;
769 } # end while RECORDSTART
777 Usage : $stream->write_seq($seq)
778 Function: writes the $seq object (must be seq) to the stream
779 Returns : 1 for success and 0 for error
780 Args : array of 1 to n Bio::SeqI objects
786 my ($self,@seqs) = @_;
788 foreach my $seq ( @seqs ) {
789 $self->throw("Attempting to write with no seq!") unless defined $seq;
791 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
792 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
798 my $len = $seq->length();
800 if ( $seq->can('division') ) {
801 $div = $seq->division;
803 if( !defined $div || ! $div ) { $div = 'UNK'; }
804 my $alpha = $seq->alphabet;
805 if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) {
806 $mol = $alpha || 'DNA';
809 my $circular = 'linear ';
810 $circular = 'circular' if $seq->is_circular;
812 local($^W
) = 0; # supressing warnings about uninitialized fields.
815 if( $self->_id_generation_func ) {
816 $temp_line = &{$self->_id_generation_func}($seq);
819 if( $seq->can('get_dates') ) {
820 ($date) = $seq->get_dates();
823 $self->warn("No whitespace allowed in GenBank display id [". $seq->display_id. "]")
824 if $seq->display_id =~ /\s/;
826 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s\n",
827 'LOCUS', $seq->id(),$len,
828 (lc($alpha) eq 'protein') ?
('aa','', '') :
829 ('bp', '',$mol),$circular,$div,$date);
832 $self->_print($temp_line);
833 $self->_write_line_GenBank_regex("DEFINITION ", " ",
834 $seq->desc(),"\\s\+\|\$",80);
836 # if there, write the accession line
838 if( $self->_ac_generation_func ) {
839 $temp_line = &{$self->_ac_generation_func}($seq);
840 $self->_print("ACCESSION $temp_line\n");
843 push(@acc, $seq->accession_number());
844 if( $seq->isa('Bio::Seq::RichSeqI') ) {
845 push(@acc, $seq->get_secondary_accessions());
847 $self->_print("ACCESSION ", join(" ", @acc), "\n");
848 # otherwise - cannot print <sigh>
851 # if PID defined, print it
852 if($seq->isa('Bio::Seq::RichSeqI') && $seq->pid()) {
853 $self->_print("PID ", $seq->pid(), "\n");
856 # if there, write the version line
858 if( defined $self->_sv_generation_func() ) {
859 $temp_line = &{$self->_sv_generation_func}($seq);
861 $self->_print("VERSION $temp_line\n");
864 if($seq->isa('Bio::Seq::RichSeqI') && defined($seq->seq_version)) {
865 my $id = $seq->primary_id(); # this may be a GI number
866 $self->_print("VERSION ",
867 $seq->accession_number(), ".", $seq->seq_version,
868 ($id && ($id =~ /^\d+$/) ?
" GI:".$id : ""),
873 # if there, write the PROJECT line
874 for my $proj ( $seq->annotation->get_Annotations('project') ) {
875 $self->_print("PROJECT ".$proj->value."\n");
878 # if there, write the DBSOURCE line
879 foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) {
880 my ($db, $id) = ($ref->database, $ref->primary_id);
881 my $prefix = $db eq 'Project' ?
'DBLINK' : 'DBSOURCE';
882 my $text = $db eq 'GenBank' ?
'' :
883 $db eq 'Project' ?
"$db:$id" : "$db accession $id";
884 $self->_print(sprintf ("%-11s %s\n",$prefix, $text));
887 # if there, write the keywords line
888 if( defined $self->_kw_generation_func() ) {
889 $temp_line = &{$self->_kw_generation_func}($seq);
890 $self->_print("KEYWORDS $temp_line\n");
892 if( $seq->can('keywords') ) {
893 my $kw = $seq->keywords;
894 $kw .= '.' if( $kw !~ /\.$/ );
895 $self->_print("KEYWORDS $kw\n");
899 # SEGMENT if it exists
900 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) {
901 $self->_print(sprintf ("%-11s %s\n",'SEGMENT',
906 if (my $spec = $seq->species) {
907 my ($on, $sn, $cn) = ($spec->can('organelle') ?
$spec->organelle : '',
908 $spec->scientific_name,
911 if ($spec->isa('Bio::Species')) {
912 @classification = $spec->classification;
913 shift(@classification);
915 # Bio::Taxon should have a DB handle of some type attached, so
916 # derive the classification from that
919 $node = $node->ancestor || last;
920 unshift(@classification, $node->node_name);
921 #$node eq $root && last;
923 @classification = reverse @classification;
925 my $abname = $spec->name('abbreviated') ?
# from genbank file
926 $spec->name('abbreviated')->[0] : $sn;
927 my $sl = $on ?
"$on " : '';
928 $sl .= $cn ?
$abname." ($cn)" : "$abname";
930 $self->_write_line_GenBank_regex("SOURCE ", ' 'x12
, $sl, "\\s\+\|\$",80);
931 $self->_print(" ORGANISM ", $spec->scientific_name, "\n");
932 my $OC = join('; ', (reverse(@classification))) .'.';
933 $self->_write_line_GenBank_regex(' 'x12
,' 'x12
,
939 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
940 $temp_line = "REFERENCE $count";
942 $temp_line .= sprintf (" (%s %d to %d)",
943 ($seq->alphabet() eq "protein" ?
944 "residues" : "bases"),
945 $ref->start,$ref->end);
946 } elsif ($ref->gb_reference) {
947 $temp_line .= sprintf (" (%s)", $ref->gb_reference);
949 $self->_print("$temp_line\n");
950 $self->_write_line_GenBank_regex(" AUTHORS ",' 'x12
,
951 $ref->authors,"\\s\+\|\$",80);
952 $self->_write_line_GenBank_regex(" CONSRTM ",' 'x12
,
953 $ref->consortium,"\\s\+\|\$",80) if $ref->consortium;
954 $self->_write_line_GenBank_regex(" TITLE "," "x12
,
955 $ref->title,"\\s\+\|\$",80);
956 $self->_write_line_GenBank_regex(" JOURNAL "," "x12
,
957 $ref->location,"\\s\+\|\$",80);
959 $self->_write_line_GenBank_regex(" MEDLINE "," "x12
,
960 $ref->medline, "\\s\+\|\$",80);
961 # I am assuming that pubmed entries only exist when there
962 # are also MEDLINE entries due to the indentation
964 # This could be a wrong assumption
966 $self->_write_line_GenBank_regex(" PUBMED "," "x12
,
967 $ref->pubmed, "\\s\+\|\$",
970 # put remark at the end
972 $self->_write_line_GenBank_regex(" REMARK "," "x12
,
973 $ref->comment,"\\s\+\|\$",80);
979 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
980 $self->_write_line_GenBank_regex("COMMENT "," "x12
,
981 $comment->text,"\\s\+\|\$",80);
983 $self->_print("FEATURES Location/Qualifiers\n");
985 if( defined $self->_post_sort ) {
986 # we need to read things into an array. Process. Sort them. Print 'em
988 my $post_sort_func = $self->_post_sort();
991 foreach my $sf ( $seq->top_SeqFeatures ) {
992 push(@fth,Bio
::SeqIO
::FTHelper
::from_SeqFeature
($sf,$seq));
995 @fth = sort { &$post_sort_func($a,$b) } @fth;
997 foreach my $fth ( @fth ) {
998 $self->_print_GenBank_FTHelper($fth);
1001 # not post sorted. And so we can print as we get them.
1002 # lower memory load...
1004 foreach my $sf ( $seq->top_SeqFeatures ) {
1005 my @fth = Bio
::SeqIO
::FTHelper
::from_SeqFeature
($sf,$seq);
1006 foreach my $fth ( @fth ) {
1007 if( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
1008 $sf->throw("Cannot process FTHelper... $fth");
1010 $self->_print_GenBank_FTHelper($fth);
1015 # deal with WGS; WGS_SCAFLD present only if WGS is also present
1016 if($seq->annotation->get_Annotations('wgs')) {
1018 (map {$seq->annotation->get_Annotations($_)} qw(wgs wgs_scaffold)) {
1019 $self->_print(sprintf ("%-11s %s\n",uc($wgs->tagname),
1022 $self->_show_dna(0);
1024 if($seq->annotation->get_Annotations('contig')) {
1027 foreach my $contig ($seq->annotation->get_Annotations('contig')) {
1029 $cline = uc($contig->tagname)." ".$contig->value."\n";
1031 $cline = " ".$contig->value."\n";
1033 $self->_print($cline);
1036 $self->_show_dna(0);
1038 if( $seq->length == 0 ) { $self->_show_dna(0) }
1040 if( $self->_show_dna() == 0 ) {
1041 $self->_print("\n//\n");
1045 # finished printing features.
1047 $str =~ tr/A-Z/a-z/;
1049 my ($o) = $seq->annotation->get_Annotations('origin');
1050 $self->_print(sprintf("%-12s%s\n",
1051 'ORIGIN', $o ?
$o->value : ''));
1052 # print out the sequence
1053 my $nuc = 60; # Number of nucleotides per line
1054 my $whole_pat = 'a10' x
6; # Pattern for unpacking a whole line
1055 my $out_pat = 'A11' x
6; # Pattern for packing a line
1056 my $length = length($str);
1058 # Calculate the number of nucleotides which fit on whole lines
1059 my $whole = int($length / $nuc) * $nuc;
1061 # Print the whole lines
1063 for ($i = 0; $i < $whole; $i += $nuc) {
1064 my $blocks = pack $out_pat,
1066 substr($str, $i, $nuc);
1068 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59));
1071 # Print the last line
1072 if (my $last = substr($str, $i)) {
1073 my $last_len = length($last);
1074 my $last_pat = 'a10' x
int($last_len / 10) .
1075 'a'. $last_len % 10;
1076 my $blocks = pack $out_pat,
1077 unpack($last_pat, $last);
1079 $self->_print(sprintf("%9d $blocks\n",
1080 $length - $last_len + 1));
1083 $self->_print("//\n");
1085 $self->flush if $self->_flush_on_write && defined $self->_fh;
1090 =head2 _print_GenBank_FTHelper
1092 Title : _print_GenBank_FTHelper
1102 sub _print_GenBank_FTHelper
{
1103 my ($self,$fth) = @_;
1105 if( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
1106 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
1108 my $spacer = (length $fth->key >= 15) ?
' ' : '';
1109 $self->_write_line_GenBank_regex(sprintf(" %-16s%s",$fth->key,$spacer),
1111 $fth->loc,"\,\|\$",80);
1112 foreach my $tag ( keys %{$fth->field} ) {
1113 foreach my $value ( @
{$fth->field->{$tag}} ) {
1114 $value =~ s/\"/\"\"/g;
1115 if ($value eq "_no_value") {
1116 $self->_write_line_GenBank_regex(" "x21
,
1118 "/$tag","\.\|\$",80);
1120 # there are almost 3x more quoted qualifier values and they
1121 # are more common too so we take quoted ones first
1122 elsif (!$FTQUAL_NO_QUOTE{$tag}) {
1123 my ($pat) = ($value =~ /\s/ ?
'\s|$' : '.|$');
1124 $self->_write_line_GenBank_regex(" "x21
,
1126 "/$tag=\"$value\"",$pat,80);
1129 $self->_write_line_GenBank_regex(" "x21
,
1131 "/$tag=$value","\.\|\$",80);
1137 =head2 _read_GenBank_References
1139 Title : _read_GenBank_References
1141 Function: Reads references from GenBank format. Internal function really
1147 sub _read_GenBank_References
{
1148 my ($self,$buffer) = @_;
1152 # assumme things are starting with RN
1154 if( $$buffer !~ /^REFERENCE/ ) {
1155 warn("Not parsing line '$$buffer' which maybe important");
1160 my (@title,@loc,@authors,@consort,@com,@medline,@pubmed);
1162 REFLOOP
: while( defined($_) || defined($_ = $self->_readline) ) {
1163 if (/^\s{2}AUTHORS\s+(.*)/o) {
1164 push (@authors, $1);
1165 while ( defined($_ = $self->_readline) ) {
1166 /^\s{9,}(.*)/o && do { push (@authors, $1);next;};
1169 $ref->authors(join(' ', @authors));
1171 if (/^\s{2}CONSRTM\s+(.*)/o) {
1172 push (@consort, $1);
1173 while ( defined($_ = $self->_readline) ) {
1174 /^\s{9,}(.*)/o && do { push (@consort, $1);next;};
1177 $ref->consortium(join(' ', @consort));
1179 if (/^\s{2}TITLE\s+(.*)/o) {
1181 while ( defined($_ = $self->_readline) ) {
1182 /^\s{9,}(.*)/o && do { push (@title, $1);
1187 $ref->title(join(' ', @title));
1189 if (/^\s{2}JOURNAL\s+(.*)/o) {
1191 while ( defined($_ = $self->_readline) ) {
1192 # we only match when there are at least 4 spaces
1193 # there is probably a better way to match this
1194 # as it assumes that the describing tag is short enough
1195 /^\s{9,}(.*)/o && do { push(@loc, $1);
1200 $ref->location(join(' ', @loc));
1203 if (/^\s{2}REMARK\s+(.*)/o) {
1205 while ( defined($_ = $self->_readline) ) {
1206 /^\s{9,}(.*)/o && do { push(@com, $1);
1211 $ref->comment(join(' ', @com));
1214 if( /^\s{2}MEDLINE\s+(.*)/ ) {
1216 while ( defined($_ = $self->_readline) ) {
1217 /^\s{9,}(.*)/ && do { push(@medline, $1);
1222 $ref->medline(join(' ', @medline));
1225 if( /^\s{3}PUBMED\s+(.*)/ ) {
1227 while ( defined($_ = $self->_readline) ) {
1228 /^\s{9,}(.*)/ && do { push(@pubmed, $1);
1233 $ref->pubmed(join(' ', @pubmed));
1237 /^REFERENCE/o && do {
1238 # store current reference
1239 $self->_add_ref_to_array(\
@refs,$ref) if defined $ref;
1247 # create the new reference object
1248 $ref = Bio
::Annotation
::Reference
->new(-tagname
=> 'reference');
1249 # check whether start and end base is given
1250 if (/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){
1253 } elsif (/^REFERENCE\s+\d+\s+\((.*)\)/) {
1254 $ref->gb_reference($1);
1258 /^(FEATURES)|(COMMENT)/o && last;
1260 $_ = undef; # Empty $_ to trigger read of next line
1263 # store last reference
1264 $self->_add_ref_to_array(\
@refs,$ref) if defined $ref;
1268 #print "\nnumber of references found: ", $#refs+1,"\n";
1274 # This is undocumented as it shouldn't be called by anywhere else as
1275 # read_GenBank_References. For those who still want to know:
1277 # Purpose: adds a Reference object to an array of Reference objects, takes
1278 # care of possible cleanups to be done (currently, only author and title
1279 # will be chopped of trailing semicolons).
1281 # a reference to an array of Reference objects
1282 # the Reference object to be added
1285 sub _add_ref_to_array
{
1286 my ($self, $refs, $ref) = @_;
1288 # first, polish author and title by removing possible trailing semicolons
1289 my $au = $ref->authors();
1290 my $title = $ref->title();
1291 $au =~ s/;\s*$//g if $au;
1292 $title =~ s/;\s*$//g if $title;
1294 $ref->title($title);
1295 # the rest should be clean already, so go ahead and add it
1296 push(@
{$refs}, $ref);
1299 =head2 _read_GenBank_Species
1301 Title : _read_GenBank_Species
1303 Function: Reads the GenBank Organism species and classification
1304 lines. Able to deal with unconvential Organism naming
1305 formats, and varietas in plants
1306 Example : ORGANISM unknown marine gamma proteobacterium NOR5
1308 $species = unknown marine gamma proteobacterium NOR5
1310 ORGANISM Drosophila sp. 'white tip scutellum'
1312 $species = sp. 'white tip scutellum'
1313 (yes, this really is a species and that is its name)
1316 ORGANISM Ajellomyces capsulatus var. farciminosus
1317 $genus = Ajellomyces
1318 $species = capsulatus
1319 $subspecies = var. farciminosus
1321 ORGANISM Hepatitis delta virus
1322 $genus = undef (though this virus has a genus in its lineage, we
1323 cannot know that without a database lookup)
1324 $species = Hepatitis delta virus
1326 Returns : A Bio::Species object
1327 Args : A reference to the current line buffer
1331 sub _read_GenBank_Species
{
1332 my ($self, $buffer) = @_;
1334 my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown',
1335 'Unspecified', 'Unknown', 'None', 'unclassified',
1336 'unidentified organism', 'not supplied');
1337 # dictionary of synonyms for taxid 32644
1338 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified');
1339 # all above can be part of valid species name
1341 my $line = $$buffer;
1343 my( $sub_species, $species, $genus, $sci_name, $common, $class_lines,
1344 $source_flag, $abbr_name, $organelle, $sl );
1345 my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION);
1346 # upon first entering the loop, we must not read a new line -- the SOURCE
1347 # line is already in the buffer (HL 05/10/2000)
1348 my ($ann, $tag, $data);
1349 while (defined($line) || defined($line = $self->_readline())) {
1350 # de-HTMLify (links that may be encountered here don't contain
1351 # escaped '>', so a simple-minded approach suffices)
1352 $line =~ s{<[^>]+>}{}g;
1353 if ($line =~ m{^(?:\s{0,2})(\w
+)\s
+(.+)?
$}ox
) {
1354 ($tag, $data) = ($1, $2 || '');
1355 last if ($tag && !exists $source{$tag});
1358 ($data = $line) =~ s{^\s+}{};
1360 $tag = 'CLASSIFICATION' if ($tag ne 'CLASSIFICATION' && $tag eq 'ORGANISM' && $line =~ m{[;\.]+});
1362 (exists $ann->{$tag}) ?
($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
1366 ($sl, $class_lines, $sci_name) = ($ann->{SOURCE
}, $ann->{CLASSIFICATION
}, $ann->{ORGANISM
});
1370 $sci_name || return;
1372 # parse out organelle, common name, abbreviated name if present;
1373 # this should catch everything, but falls back to
1374 # entire SOURCE line just in case
1376 (mitochondrion
|chloroplast
|plastid
)?
1378 \s
*(?
: \
( (.*?
) \
) )?\
.?
1381 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
1383 $abbr_name = $sl; # nothing caught; this is a backup!
1386 # Convert data in classification lines into classification array.
1387 # only split on ';' or '.' so that classification that is 2 or more words will
1388 # still get matched, use map() to remove trailing/leading/intervening spaces
1389 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?<!subgen)[;\.]+/, $class_lines;
1391 # do we have a genus?
1392 my $possible_genus = quotemeta($class[-1])
1393 . ($class[-2] ?
"|" . quotemeta($class[-2]) : '');
1394 if ($sci_name =~ /^($possible_genus)/) {
1396 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1399 $species = $sci_name;
1402 # is this organism of rank species or is it lower?
1403 # (we don't catch everything lower than species, but it doesn't matter -
1404 # this is just so we abide by previous behaviour whilst not calling a
1405 # species a subspecies)
1406 if ($species && $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) {
1407 ($species, $sub_species) = ($1, $2);
1410 # Don't make a species object if it's empty or "Unknown" or "None"
1411 # return unless $genus and $genus !~ /^(Unknown|None)$/oi;
1412 # Don't make a species object if it belongs to taxid 32644
1413 # my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names;
1414 my $unkn = grep { $_ eq $sl } @unkn_names;
1415 return unless ($species || $genus) and $unkn == 0;
1417 # Bio::Species array needs array in Species -> Kingdom direction
1418 push(@class, $sci_name);
1419 @class = reverse @class;
1421 my $make = Bio
::Species
->new();
1422 $make->scientific_name($sci_name);
1423 $make->classification(@class) if @class > 0;
1424 $make->common_name( $common ) if $common;
1425 $make->name('abbreviated', $abbr_name) if $abbr_name;
1426 $make->organelle($organelle) if $organelle;
1427 #$make->sub_species( $sub_species ) if $sub_species;
1431 =head2 _read_FTHelper_GenBank
1433 Title : _read_FTHelper_GenBank
1434 Usage : _read_FTHelper_GenBank($buffer)
1435 Function: reads the next FT key line
1437 Returns : Bio::SeqIO::FTHelper object
1438 Args : filehandle and reference to a scalar
1442 sub _read_FTHelper_GenBank
{
1443 my ($self,$buffer) = @_;
1445 my ($key, # The key of the feature
1446 $loc # The location line from the feature
1448 my @qual = (); # An array of lines making up the qualifiers
1450 if ($$buffer =~ /^\s{5}(\S+)\s+(.+?)\s*$/o) {
1453 # Read all the lines up to the next feature
1454 while ( defined($_ = $self->_readline) ) {
1455 if (/^(\s+)(.+?)\s*$/o) {
1456 # Lines inside features are preceded by 21 spaces
1457 # A new feature is preceded by 5 spaces
1458 if (length($1) > 6) {
1459 # Add to qualifiers if we're in the qualifiers, or if it's
1460 # the first qualifier
1461 if (@qual || (index($2,'/') == 0)) {
1464 # We're still in the location line, so append to location
1469 # We've reached the start of the next feature
1473 # We're at the end of the feature table
1479 $self->debug("no feature key!\n");
1480 # change suggested by JDiggans to avoid infinite loop-
1481 # see bugreport 1062.
1482 # reset buffer to prevent infinite loop
1483 $$buffer = $self->_readline();
1487 # Put the first line of the next feature into the buffer
1490 # Make the new FTHelper object
1491 my $out = Bio
::SeqIO
::FTHelper
->new();
1492 $out->verbose($self->verbose());
1496 # Now parse and add any qualifiers. (@qual is kept
1497 # intact to provide informative error messages.)
1499 for (my $i = 0; $i < @qual; $i++) {
1501 my( $qualifier, $value ) = (m{^/([^=]+)(?:=(.+))?})
1502 or $self->warn("cannot see new qualifier in feature $key: ".
1504 $qualifier = '' unless( defined $qualifier);
1505 if (defined $value) {
1506 # Do we have a quoted value?
1507 if (substr($value, 0, 1) eq '"') {
1508 # Keep adding to value until we find the trailing quote
1509 # and the quotes are balanced
1510 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) {
1512 $self->warn("Unbalanced quote in:\n" .
1514 "No further qualifiers will " .
1515 "be added for this feature");
1518 $i++; # modifying a for-loop variable inside of the loop
1519 # is not the best programming style ...
1520 my $next = $qual[$i];
1522 # add to value with a space unless the value appears
1523 # to be a sequence (translation for example)
1524 # if(($value.$next) =~ /[^A-Za-z\"\-]/o) {
1525 # changed to explicitly look for translation tag - cjf 06/8/29
1526 if ($qualifier ne 'translation') {
1531 # Trim leading and trailing quotes
1532 $value =~ s/^"|"$//g;
1533 # Undouble internal quotes
1534 $value =~ s/""/\"/g;
1535 } elsif ( $value =~ /^\(/ ) { # values quoted by ()s
1536 # Keep adding to value until we find the trailing bracket
1537 # and the ()s are balanced
1538 my $left = ($value =~ tr/\(/\(/); # count left parens
1539 my $right = ($value =~ tr/\)/\)/); # count right parens
1541 while( $left != $right ) { # was "$value !~ /\)$/ or $left != $right"
1543 $self->warn("Unbalanced parens in:\n".
1545 "\nNo further qualifiers will ".
1546 "be added for this feature");
1550 my $next = $qual[$i];
1552 $left += ($next =~ tr/\(/\(/);
1553 $right += ($next =~ tr/\)/\)/);
1557 $value = '_no_value';
1559 # Store the qualifier
1560 $out->field->{$qualifier} ||= [];
1561 push(@
{$out->field->{$qualifier}},$value);
1566 =head2 _write_line_GenBank
1568 Title : _write_line_GenBank
1570 Function: internal function
1577 sub _write_line_GenBank
{
1578 my ($self,$pre1,$pre2,$line,$length) = @_;
1580 $length || $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1581 my $subl = $length - length $pre2;
1582 my $linel = length $line;
1585 my $subr = substr($line,0,$length - length $pre1);
1587 $self->_print("$pre1$subr\n");
1588 for($i= ($length - length $pre1);$i < $linel; $i += $subl) {
1589 $subr = substr($line,$i,$subl);
1590 $self->_print("$pre2$subr\n");
1595 =head2 _write_line_GenBank_regex
1597 Title : _write_line_GenBank_regex
1599 Function: internal function for writing lines of specified
1600 length, with different first and the next line
1601 left hand headers and split at specific points in the
1609 regex for line breaks,
1615 sub _write_line_GenBank_regex
{
1616 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1618 #print STDOUT "Going to print with $line!\n";
1620 $length || $self->throw( "Miscalled write_line_GenBank without length. Programming error!");
1622 my $subl = $length - (length $pre1) - 2;
1625 CHUNK
: while($line) {
1626 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1627 if($line =~ m/^(.{0,$subl})($pat)(.*)/ ) {
1629 $line = substr($line,length($l));
1630 # be strict about not padding spaces according to
1633 next CHUNK
if ($l eq '');
1638 # if we get here none of the patterns matched $subl or less chars
1639 $self->warn("trouble dissecting \"$line\"\n into chunks ".
1640 "of $subl chars or less - this tag won't print right");
1641 # insert a space char to prevent infinite loops
1642 $line = substr($line,0,$subl) . " " . substr($line,$subl);
1644 my $s = shift @lines;
1645 $self->_print("$pre1$s\n") if $s;
1646 foreach my $s ( @lines ) {
1647 $self->_print("$pre2$s\n");
1654 Usage : $obj->_post_sort($newval)
1656 Returns : value of _post_sort
1657 Args : newvalue (optional)
1663 my ($obj,$value) = @_;
1664 if( defined $value) {
1665 $obj->{'_post_sort'} = $value;
1667 return $obj->{'_post_sort'};
1674 Usage : $obj->_show_dna($newval)
1676 Returns : value of _show_dna
1677 Args : newvalue (optional)
1683 my ($obj,$value) = @_;
1684 if( defined $value) {
1685 $obj->{'_show_dna'} = $value;
1687 return $obj->{'_show_dna'};
1690 =head2 _id_generation_func
1692 Title : _id_generation_func
1693 Usage : $obj->_id_generation_func($newval)
1695 Returns : value of _id_generation_func
1696 Args : newvalue (optional)
1701 sub _id_generation_func
{
1702 my ($obj,$value) = @_;
1703 if( defined $value ) {
1704 $obj->{'_id_generation_func'} = $value;
1706 return $obj->{'_id_generation_func'};
1710 =head2 _ac_generation_func
1712 Title : _ac_generation_func
1713 Usage : $obj->_ac_generation_func($newval)
1715 Returns : value of _ac_generation_func
1716 Args : newvalue (optional)
1720 sub _ac_generation_func
{
1721 my ($obj,$value) = @_;
1722 if( defined $value ) {
1723 $obj->{'_ac_generation_func'} = $value;
1725 return $obj->{'_ac_generation_func'};
1729 =head2 _sv_generation_func
1731 Title : _sv_generation_func
1732 Usage : $obj->_sv_generation_func($newval)
1734 Returns : value of _sv_generation_func
1735 Args : newvalue (optional)
1740 sub _sv_generation_func
{
1741 my ($obj,$value) = @_;
1742 if( defined $value ) {
1743 $obj->{'_sv_generation_func'} = $value;
1745 return $obj->{'_sv_generation_func'};
1750 =head2 _kw_generation_func
1752 Title : _kw_generation_func
1753 Usage : $obj->_kw_generation_func($newval)
1755 Returns : value of _kw_generation_func
1756 Args : newvalue (optional)
1761 sub _kw_generation_func
{
1762 my ($obj,$value) = @_;
1763 if( defined $value ) {
1764 $obj->{'_kw_generation_func'} = $value;
1766 return $obj->{'_kw_generation_func'};