sync w/ main trunk
[bioperl-live.git] / Bio / SeqIO / embl.pm
blob9adf9a42491a4ef98f4c4b07dfbe8112a65c4707
1 # $Id$
3 # BioPerl module for Bio::SeqIO::EMBL
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@ebi.ac.uk>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SeqIO::embl - EMBL sequence input/output stream
19 =head1 SYNOPSIS
21 It is probably best not to use this object directly, but
22 rather go through the SeqIO handler system. Go:
24 $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
26 while ( (my $seq = $stream->next_seq()) ) {
27 # do something with $seq
30 =head1 DESCRIPTION
32 This object can transform Bio::Seq objects to and from EMBL flat
33 file databases.
35 There is a lot of flexibility here about how to dump things which
36 should be documented more fully.
38 There should be a common object that this and Genbank share (probably
39 with Swissprot). Too much of the magic is identical.
41 =head2 Optional functions
43 =over 3
45 =item _show_dna()
47 (output only) shows the dna or not
49 =item _post_sort()
51 (output only) provides a sorting func which is applied to the FTHelpers
52 before printing
54 =item _id_generation_func()
56 This is function which is called as
58 print "ID ", $func($annseq), "\n";
60 To generate the ID line. If it is not there, it generates a sensible ID
61 line using a number of tools.
63 If you want to output annotations in EMBL format they need to be
64 stored in a Bio::Annotation::Collection object which is accessible
65 through the Bio::SeqI interface method L<annotation()|annotation>.
67 The following are the names of the keys which are polled from a
68 L<Bio::Annotation::Collection> object.
70 reference - Should contain Bio::Annotation::Reference objects
71 comment - Should contain Bio::Annotation::Comment objects
72 dblink - Should contain Bio::Annotation::DBLink objects
74 =back
76 =head1 FEEDBACK
78 =head2 Mailing Lists
80 User feedback is an integral part of the evolution of this and other
81 Bioperl modules. Send your comments and suggestions preferably to one
82 of the Bioperl mailing lists. Your participation is much appreciated.
84 bioperl-l@bioperl.org - General discussion
85 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
87 =head2 Support
89 Please direct usage questions or support issues to the mailing list:
91 L<bioperl-l@bioperl.org>
93 rather than to the module maintainer directly. Many experienced and
94 reponsive experts will be able look at the problem and quickly
95 address it. Please include a thorough description of the problem
96 with code and data examples if at all possible.
98 =head2 Reporting Bugs
100 Report bugs to the Bioperl bug tracking system to help us keep track
101 the bugs and their resolution. Bug reports can be submitted via
102 the web:
104 http://bugzilla.open-bio.org/
106 =head1 AUTHOR - Ewan Birney
108 Email birney@ebi.ac.uk
110 =head1 APPENDIX
112 The rest of the documentation details each of the object
113 methods. Internal methods are usually preceded with a _
115 =cut
118 # Let the code begin...
121 package Bio::SeqIO::embl;
122 use vars qw(%FTQUAL_NO_QUOTE);
123 use strict;
124 use Bio::SeqIO::FTHelper;
125 use Bio::SeqFeature::Generic;
126 use Bio::Species;
127 use Bio::Seq::SeqFactory;
128 use Bio::Annotation::Collection;
129 use Bio::Annotation::Comment;
130 use Bio::Annotation::Reference;
131 use Bio::Annotation::DBLink;
133 use base qw(Bio::SeqIO);
135 %FTQUAL_NO_QUOTE=(
136 'anticodon'=>1,
137 'citation'=>1,
138 'codon'=>1,
139 'codon_start'=>1,
140 'cons_splice'=>1,
141 'direction'=>1,
142 'evidence'=>1,
143 'label'=>1,
144 'mod_base'=> 1,
145 'number'=> 1,
146 'rpt_type'=> 1,
147 'rpt_unit'=> 1,
148 'transl_except'=> 1,
149 'transl_table'=> 1,
150 'usedin'=> 1,
153 sub _initialize {
154 my($self,@args) = @_;
156 $self->SUPER::_initialize(@args);
157 # hash for functions for decoding keys.
158 $self->{'_func_ftunit_hash'} = {};
159 # sets this to one by default. People can change it
160 $self->_show_dna(1);
161 if ( ! defined $self->sequence_factory ) {
162 $self->sequence_factory(Bio::Seq::SeqFactory->new
163 (-verbose => $self->verbose(),
164 -type => 'Bio::Seq::RichSeq'));
168 =head2 next_seq
170 Title : next_seq
171 Usage : $seq = $stream->next_seq()
172 Function: returns the next sequence in the stream
173 Returns : Bio::Seq object
174 Args :
176 =cut
178 sub next_seq {
179 my ($self,@args) = @_;
180 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
181 $date, $comment, @date_arr);
183 my ($annotation, %params, @features) =
184 Bio::Annotation::Collection->new();
186 $line = $self->_readline;
187 # This needs to be before the first eof() test
189 if ( !defined $line ) {
190 return; # no throws - end of file
193 if ( $line =~ /^\s+$/ ) {
194 while ( defined ($line = $self->_readline) ) {
195 $line =~/^\S/ && last;
197 # return without error if the whole next sequence was just a single
198 # blank line and then eof
199 return unless $line;
202 # no ID as 1st non-blank line, need short circuit and exit routine
203 $self->throw("EMBL stream with no ID. Not embl in my book")
204 unless $line =~ /^ID\s+\S+/;
206 # At this point we are sure that $line contains an ID header line
207 my $alphabet;
208 if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
210 # New style header (EMBL Release >= 87, after June 2006)
211 my $topology;
212 my $sv;
214 # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
215 # This regexp comes from the new2old.pl conversion script, from EBI
216 if ($line =~ m/^ID (\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
217 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
219 if (defined($sv)) {
220 $params{'-seq_version'} = $sv;
221 $params{'-version'} = $sv;
224 if ($topology eq "circular") {
225 $params{'-is_circular'} = 1;
228 if (defined $mol ) {
229 if ($mol =~ /DNA/) {
230 $alphabet='dna';
231 } elsif ($mol =~ /RNA/) {
232 $alphabet='rna';
233 } elsif ($mol =~ /AA/) {
234 $alphabet='protein';
237 } else {
239 # Old style header (EMBL Release < 87, before June 2006)
240 if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
241 ($name, $mol, $div) = ($1, $2, $3);
244 if ($mol) {
245 if ( $mol =~ /circular/ ) {
246 $params{'-is_circular'} = 1;
247 $mol =~ s|circular ||;
249 if (defined $mol ) {
250 if ($mol =~ /DNA/) {
251 $alphabet='dna';
252 } elsif ($mol =~ /RNA/) {
253 $alphabet='rna';
254 } elsif ($mol =~ /AA/) {
255 $alphabet='protein';
261 unless( defined $name && length($name) ) {
262 $name = "unknown_id";
265 # $self->warn("not parsing upper annotation in EMBL file yet!");
266 my $buffer = $line;
267 local $_;
268 BEFORE_FEATURE_TABLE :
269 until ( !defined $buffer ) {
270 $_ = $buffer;
271 # Exit at start of Feature table
272 if ( /^(F[HT]|SQ)/ ) {
273 $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
274 last;
276 # Description line(s)
277 if (/^DE\s+(\S.*\S)/) {
278 $desc .= $desc ? " $1" : $1;
281 #accession number
282 if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
283 my @accs = split(/[; ]+/, $1); # allow space in addition
284 $params{'-accession_number'} = shift @accs
285 unless defined $params{'-accession_number'};
286 push @{$params{'-secondary_accessions'}}, @accs;
289 #version number
290 if ( /^SV\s+\S+\.(\d+);?/ ) {
291 my $sv = $1;
292 #$sv =~ s/\;//;
293 $params{'-seq_version'} = $sv;
294 $params{'-version'} = $sv;
297 #date (NOTE: takes last date line)
298 if ( /^DT\s+(.+)$/ ) {
299 my $line = $1;
300 my ($date, $version) = split(' ', $line, 2);
301 $date =~ tr/,//d; # remove comma if new version
302 if ($version =~ /\(Rel\. (\d+), Created\)/xms ) {
303 my $release = Bio::Annotation::SimpleValue->new(
304 -tagname => 'creation_release',
305 -value => $1
307 $annotation->add_Annotation($release);
308 } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/xms ) {
309 my $release = Bio::Annotation::SimpleValue->new(
310 -tagname => 'update_release',
311 -value => $1
313 $annotation->add_Annotation($release);
315 my $update = Bio::Annotation::SimpleValue->new(
316 -tagname => 'update_version',
317 -value => $2
319 $annotation->add_Annotation($update);
321 push @{$params{'-dates'}}, $date;
324 #keywords
325 if ( /^KW (.*)\S*$/ ) {
326 my @kw = split(/\s*\;\s*/,$1);
327 push @{$params{'-keywords'}}, @kw;
330 # Organism name and phylogenetic information
331 elsif (/^O[SC]/) {
332 # pass the accession number so we can give an informative throw message if necessary
333 my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
334 $params{'-species'}= $species;
337 # NCBI TaxID Xref
338 elsif (/^OX/) {
339 my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
340 foreach my $dblink ( @links ) {
341 $annotation->add_Annotation('dblink',$dblink);
345 # References
346 elsif (/^R/) {
347 my @refs = $self->_read_EMBL_References(\$buffer);
348 foreach my $ref ( @refs ) {
349 $annotation->add_Annotation('reference',$ref);
353 # DB Xrefs
354 elsif (/^DR/) {
355 my @links = $self->_read_EMBL_DBLink(\$buffer);
356 foreach my $dblink ( @links ) {
357 $annotation->add_Annotation('dblink',$dblink);
361 # Comments
362 elsif (/^CC\s+(.*)/) {
363 $comment .= $1;
364 $comment .= " ";
365 while (defined ($_ = $self->_readline) ) {
366 if (/^CC\s+(.*)/) {
367 $comment .= $1;
368 $comment .= " ";
369 } else {
370 last;
373 my $commobj = Bio::Annotation::Comment->new();
374 $commobj->text($comment);
375 $annotation->add_Annotation('comment',$commobj);
376 $comment = "";
379 # Get next line.
380 $buffer = $self->_readline;
383 while ( defined ($_ = $self->_readline) ) {
384 /^FT\s{3}\w/ && last;
385 /^SQ / && last;
386 /^CO / && last;
388 $buffer = $_;
390 if (defined($buffer) && $buffer =~ /^FT /) {
391 until ( !defined ($buffer) ) {
392 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
394 # process ftunit
395 my $feat =
396 $ftunit->_generic_seqfeature($self->location_factory(), $name);
398 # add taxon_id from source if available
399 if ($params{'-species'} && ($feat->primary_tag eq 'source')
400 && $feat->has_tag('db_xref')
401 && (! $params{'-species'}->ncbi_taxid())) {
402 foreach my $tagval ($feat->get_tag_values('db_xref')) {
403 if (index($tagval,"taxon:") == 0) {
404 $params{'-species'}->ncbi_taxid(substr($tagval,6));
405 last;
410 # add feature to list of features
411 push(@features, $feat);
413 if ( $buffer !~ /^FT/ ) {
414 last;
418 # skip comments
419 while ( defined ($buffer) && $buffer =~ /^XX/ ) {
420 $buffer = $self->_readline();
423 if ( $buffer =~ /^CO/ ) {
424 until ( !defined ($buffer) ) {
425 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
426 # process ftunit
427 push(@features,
428 $ftunit->_generic_seqfeature($self->location_factory(),
429 $name));
431 if ( $buffer !~ /^CO/ ) {
432 last;
436 if ( $buffer !~ /^SQ/ ) {
437 while ( defined ($_ = $self->_readline) ) {
438 /^SQ/ && last;
441 $seqc = "";
442 while ( defined ($_ = $self->_readline) ) {
443 m{^//} && last;
444 $_ = uc($_);
445 s/[^A-Za-z]//g;
446 $seqc .= $_;
448 my $seq = $self->sequence_factory->create
449 (-verbose => $self->verbose(),
450 -division => $div,
451 -seq => $seqc,
452 -desc => $desc,
453 -display_id => $name,
454 -annotation => $annotation,
455 -molecule => $mol,
456 -alphabet => $alphabet,
457 -features => \@features,
458 %params);
459 return $seq;
464 =head2 _write_ID_line
466 Title : _write_ID_line
467 Usage : $self->_write_ID_line($seq);
468 Function: Writes the EMBL Release 87 format ID line to the stream, unless
469 : there is a user-supplied ID line generation function in which
470 : case that is used instead.
471 : ( See Bio::SeqIO::embl::_id_generation_function(). )
472 Returns : nothing
473 Args : Bio::Seq object
475 =cut
477 sub _write_ID_line {
479 my ($self, $seq) = @_;
481 my $id_line;
482 # If there is a user-supplied ID generation function, use it.
483 if ( $self->_id_generation_func ) {
484 $id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
486 # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
487 else {
489 # The sequence name is supposed to be the primary accession number,
490 my $name = $seq->accession_number();
491 if (!$name) {
492 # but if it is not present, use the sequence ID.
493 $name = $seq->id();
496 $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
498 # Use the sequence version, or default to 1.
499 my $version = $seq->version() || 1;
501 my $len = $seq->length();
503 # Taxonomic division.
504 my $div;
505 if ( $seq->can('division') && defined($seq->division) &&
506 $self->_is_valid_division($seq->division) ) {
507 $div = $seq->division();
508 } else {
509 $div ||= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
512 my $mol;
513 # If the molecule type is a valid EMBL type, use it.
514 if ( $seq->can('molecule')
515 && defined($seq->molecule)
516 && $self->_is_valid_molecule_type($seq->molecule)
518 $mol = $seq->molecule();
520 # Otherwise, choose unassigned DNA or RNA based on the alphabet.
521 elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
522 my $alphabet =$seq->primary_seq->alphabet;
523 if ($alphabet eq 'dna') {
524 $mol ='unassigned DNA';
525 } elsif ($alphabet eq 'rna') {
526 $mol='unassigned RNA';
527 } elsif ($alphabet eq 'protein') {
528 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
529 $mol='AA'; # AA is not a valid EMBL molecule type.
533 my $topology = 'linear';
534 if ($seq->is_circular) {
535 $topology = 'circular';
538 $mol ||= ''; # 'unassigned'; ?
539 $id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
540 $self->_print($id_line);
544 =head2 _is_valid_division
546 Title : _is_valid_division
547 Usage : $self->_is_valid_division($div)
548 Function: tests division code for validity
549 Returns : true if $div is a valid EMBL release 87 taxonomic division.
550 Args : taxonomic division code string
552 =cut
554 sub _is_valid_division {
555 my ($self, $division) = @_;
557 my %EMBL_divisions = (
558 "PHG" => 1, # Bacteriophage
559 "ENV" => 1, # Environmental Sample
560 "FUN" => 1, # Fungal
561 "HUM" => 1, # Human
562 "INV" => 1, # Invertebrate
563 "MAM" => 1, # Other Mammal
564 "VRT" => 1, # Other Vertebrate
565 "MUS" => 1, # Mus musculus
566 "PLN" => 1, # Plant
567 "PRO" => 1, # Prokaryote
568 "ROD" => 1, # Other Rodent
569 "SYN" => 1, # Synthetic
570 "UNC" => 1, # Unclassified
571 "VRL" => 1 # Viral
574 return exists($EMBL_divisions{$division});
577 =head2 _is_valid_molecule_type
579 Title : _is_valid_molecule_type
580 Usage : $self->_is_valid_molecule_type($mol)
581 Function: tests molecule type for validity
582 Returns : true if $mol is a valid EMBL release 87 molecule type.
583 Args : molecule type string
585 =cut
587 sub _is_valid_molecule_type {
588 my ($self, $moltype) = @_;
590 my %EMBL_molecule_types = (
591 "genomic DNA" => 1,
592 "genomic RNA" => 1,
593 "mRNA" => 1,
594 "tRNA" => 1,
595 "rRNA" => 1,
596 "snoRNA" => 1,
597 "snRNA" => 1,
598 "scRNA" => 1,
599 "pre-RNA" => 1,
600 "other RNA" => 1,
601 "other DNA" => 1,
602 "unassigned DNA" => 1,
603 "unassigned RNA" => 1
606 return exists($EMBL_molecule_types{$moltype});
609 =head2 write_seq
611 Title : write_seq
612 Usage : $stream->write_seq($seq)
613 Function: writes the $seq object (must be seq) to the stream
614 Returns : 1 for success and undef for error
615 Args : array of 1 to n Bio::SeqI objects
618 =cut
620 sub write_seq {
621 my ($self,@seqs) = @_;
623 foreach my $seq ( @seqs ) {
624 $self->throw("Attempting to write with no seq!") unless defined $seq;
625 unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
626 $self->warn("$seq is not a SeqI compliant sequence object!")
627 if $self->verbose >= 0;
628 unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
629 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
632 my $str = $seq->seq || '';
634 # Write the ID line.
635 $self->_write_ID_line($seq);
638 # Write the accession line if present
639 my( $acc );
641 if ( my $func = $self->_ac_generation_func ) {
642 $acc = &{$func}($seq);
643 } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
644 defined($seq->accession_number) ) {
645 $acc = $seq->accession_number;
646 $acc = join("; ", $acc, $seq->get_secondary_accessions);
647 } elsif ( $seq->can('accession_number') ) {
648 $acc = $seq->accession_number;
651 if (defined $acc) {
652 $self->_print("AC $acc;\n",
653 "XX\n") || return;
657 # Date lines
658 my $switch=0;
659 if ( $seq->can('get_dates') ) {
660 my @dates = $seq->get_dates();
661 my $ct = 1;
662 my $date_flag = 0;
663 my ($cr) = $seq->annotation->get_Annotations("creation_release");
664 my ($ur) = $seq->annotation->get_Annotations("update_release");
665 my ($uv) = $seq->annotation->get_Annotations("update_version");
667 unless ($cr && $ur && $ur) {
668 $date_flag = 1;
671 foreach my $dt (@dates) {
672 if (!$date_flag) {
673 $self->_write_line_EMBL_regex("DT ","DT ",
674 $dt." (Rel. $cr, Created)",
675 '\s+|$',80) if $ct == 1;
676 $self->_write_line_EMBL_regex("DT ","DT ",
677 $dt." (Rel. $ur, Last updated, Version $uv)",
678 '\s+|$',80) if $ct == 2;
679 } else { # other formats?
680 $self->_write_line_EMBL_regex("DT ","DT ",
681 $dt,'\s+|$',80);
683 $switch =1;
684 $ct++;
686 if ($switch == 1) {
687 $self->_print("XX\n") || return;
691 # Description lines
692 $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80) || return; #'
693 $self->_print( "XX\n") || return;
695 # if there, write the kw line
697 my( $kw );
698 if ( my $func = $self->_kw_generation_func ) {
699 $kw = &{$func}($seq);
700 } elsif ( $seq->can('keywords') ) {
701 $kw = $seq->keywords;
703 if (defined $kw) {
704 $self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+|$', 80) || return; #'
705 $self->_print( "XX\n") || return;
709 # Organism lines
711 if ($seq->can('species') && (my $spec = $seq->species)) {
712 my @class = $spec->classification();
713 shift @class; # get rid of species name. Some embl files include
714 # the species name in the OC lines, but this seems
715 # more like an error than something we need to
716 # emulate
717 my $OS = $spec->scientific_name;
718 if ($spec->common_name) {
719 $OS .= ' ('.$spec->common_name.')';
721 $self->_print("OS $OS\n") || return;
722 my $OC = join('; ', reverse(@class)) .'.';
723 $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80) || return;
724 if ($spec->organelle) {
725 $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80) || return;
727 $self->_print("XX\n") || return;
730 # Reference lines
731 my $t = 1;
732 if ( $seq->can('annotation') && defined $seq->annotation ) {
733 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
734 $self->_print( "RN [$t]\n") || return;
736 # Having no RP line is legal, but we need both
737 # start and end for a valid location.
738 if ($ref->comment) {
739 $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80) || return; #'
741 my $start = $ref->start;
742 my $end = $ref->end;
743 if ($start and $end) {
744 $self->_print( "RP $start-$end\n") || return;
745 } elsif ($start or $end) {
746 $self->throw("Both start and end are needed for a valid RP line.".
747 " Got: start='$start' end='$end'");
750 if (my $med = $ref->medline) {
751 $self->_print( "RX MEDLINE; $med.\n") || return;
753 if (my $pm = $ref->pubmed) {
754 $self->_print( "RX PUBMED; $pm.\n") || return;
756 my $authors = $ref->authors;
757 $authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
759 $self->_write_line_EMBL_regex("RA ", "RA ",
760 $authors . ";",
761 '\s+|$', 80) || return; #'
763 # If there is no title to the reference, it appears
764 # as a single semi-colon. All titles must end in
765 # a semi-colon.
766 my $ref_title = $ref->title || '';
767 $ref_title =~ s/[\s;]*$/;/;
768 $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80) || return; #'
769 $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80) || return; #'
770 $self->_print("XX\n") || return;
771 $t++;
774 # DB Xref lines
775 if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
776 for my $dr (@db_xref) {
777 my $db_name = $dr->database;
778 my $prim = $dr->primary_id;
780 my $opt = $dr->optional_id || '';
781 my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
782 $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80) || return; #'
784 $self->_print("XX\n") || return;
787 # Comment lines
788 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
789 $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80) || return; #'
790 $self->_print("XX\n") || return;
793 # "\\s\+\|\$"
795 ## FEATURE TABLE
797 $self->_print("FH Key Location/Qualifiers\n") || return;
798 $self->_print("FH\n") || return;
800 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
801 if ($feats[0]) {
802 if ( defined $self->_post_sort ) {
803 # we need to read things into an array.
804 # Process. Sort them. Print 'em
806 my $post_sort_func = $self->_post_sort();
807 my @fth;
809 foreach my $sf ( @feats ) {
810 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
813 @fth = sort { &$post_sort_func($a,$b) } @fth;
815 foreach my $fth ( @fth ) {
816 $self->_print_EMBL_FTHelper($fth) || return;
818 } else {
819 # not post sorted. And so we can print as we get them.
820 # lower memory load...
822 foreach my $sf ( @feats ) {
823 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
824 foreach my $fth ( @fth ) {
825 if ( $fth->key eq 'CONTIG') {
826 $self->_show_dna(0);
828 $self->_print_EMBL_FTHelper($fth) || return;
834 if ( $self->_show_dna() == 0 ) {
835 $self->_print( "//\n") || return;
836 return;
838 $self->_print( "XX\n") || return;
840 # finished printing features.
842 $str =~ tr/A-Z/a-z/;
844 # Count each nucleotide
845 my $alen = $str =~ tr/a/a/;
846 my $clen = $str =~ tr/c/c/;
847 my $glen = $str =~ tr/g/g/;
848 my $tlen = $str =~ tr/t/t/;
850 my $len = $seq->length();
851 my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
852 if ( $olen < 0 ) {
853 $self->warn("Weird. More atgc than bases. Problem!");
856 $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
858 my $nuc = 60; # Number of nucleotides per line
859 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
860 my $out_pat = 'A11' x 6; # Pattern for packing a line
861 my $length = length($str);
863 # Calculate the number of nucleotides which fit on whole lines
864 my $whole = int($length / $nuc) * $nuc;
866 # Print the whole lines
867 my( $i );
868 for ($i = 0; $i < $whole; $i += $nuc) {
869 my $blocks = pack $out_pat,
870 unpack $whole_pat,
871 substr($str, $i, $nuc);
872 $self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) || return;
875 # Print the last line
876 if (my $last = substr($str, $i)) {
877 my $last_len = length($last);
878 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
879 my $blocks = pack $out_pat,
880 unpack($last_pat, $last);
881 $self->_print(sprintf(" $blocks%9d\n", $length)) ||
882 return; # Add the length to the end
885 $self->_print( "//\n") || return;
887 $self->flush if $self->_flush_on_write && defined $self->_fh;
889 return 1;
892 =head2 _print_EMBL_FTHelper
894 Title : _print_EMBL_FTHelper
895 Usage :
896 Function: Internal function
897 Returns : 1 if writing suceeded, otherwise undef
898 Args :
901 =cut
903 sub _print_EMBL_FTHelper {
904 my ($self,$fth) = @_;
906 if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
907 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
911 #$self->_print( "FH Key Location/Qualifiers\n");
912 #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
913 # let
914 if ( $fth->key eq 'CONTIG' ) {
915 $self->_print("XX\n") || return;
916 $self->_write_line_EMBL_regex("CO ",
917 "CO ",$fth->loc,
918 '\,|$',80) || return; #'
919 return 1;
921 $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
922 "FT ",$fth->loc,
923 '\,|$',80) || return; #'
924 foreach my $tag ( keys %{$fth->field} ) {
925 if ( ! defined $fth->field->{$tag} ) {
926 next;
928 foreach my $value ( @{$fth->field->{$tag}} ) {
929 $value =~ s/\"/\"\"/g;
930 if ($value eq "_no_value") {
931 $self->_write_line_EMBL_regex("FT ",
932 "FT ",
933 "/$tag",'.|$',80) || return; #'
935 # there are almost 3x more quoted qualifier values and they
936 # are more common too so we take quoted ones first
937 elsif (!$FTQUAL_NO_QUOTE{$tag}) {
938 my $pat = $value =~ /\s/ ? '\s|\-|$' : '.|\-|$';
939 $self->_write_line_EMBL_regex("FT ",
940 "FT ",
941 "/$tag=\"$value\"",$pat,80) || return;
942 } else {
943 $self->_write_line_EMBL_regex("FT ",
944 "FT ",
945 "/$tag=$value",'.|$',80) || return; #'
950 return 1;
954 =head2 _read_EMBL_References
956 Title : _read_EMBL_References
957 Usage :
958 Function: Reads references from EMBL format. Internal function really
959 Example :
960 Returns :
961 Args :
964 =cut
966 sub _read_EMBL_References {
967 my ($self,$buffer) = @_;
968 my (@refs);
970 # assume things are starting with RN
972 if ( $$buffer !~ /^RN/ ) {
973 warn("Not parsing line '$$buffer' which maybe important");
975 my $b1;
976 my $b2;
977 my $title;
978 my $loc;
979 my $au;
980 my $med;
981 my $pm;
982 my $com;
984 while ( defined ($_ = $self->_readline) ) {
985 /^R/ || last;
986 /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
987 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
988 /^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
989 /^RA (.*)/ && do {
990 $au = $self->_concatenate_lines($au,$1); next;
992 /^RT (.*)/ && do {
993 $title = $self->_concatenate_lines($title,$1); next;
995 /^RL (.*)/ && do {
996 $loc = $self->_concatenate_lines($loc,$1); next;
998 /^RC (.*)/ && do {
999 $com = $self->_concatenate_lines($com,$1); next;
1003 my $ref = Bio::Annotation::Reference->new();
1004 $au =~ s/;\s*$//g;
1005 $title =~ s/;\s*$//g;
1007 $ref->start($b1);
1008 $ref->end($b2);
1009 $ref->authors($au);
1010 $ref->title($title);
1011 $ref->location($loc);
1012 $ref->medline($med);
1013 $ref->comment($com);
1014 $ref->pubmed($pm);
1016 push(@refs,$ref);
1017 $$buffer = $_;
1019 return @refs;
1022 =head2 _read_EMBL_Species
1024 Title : _read_EMBL_Species
1025 Usage :
1026 Function: Reads the EMBL Organism species and classification
1027 lines.
1028 Example :
1029 Returns : A Bio::Species object
1030 Args : a reference to the current line buffer, accession number
1032 =cut
1034 sub _read_EMBL_Species {
1035 my( $self, $buffer, $acc ) = @_;
1036 my $org;
1038 $_ = $$buffer;
1039 my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1040 while (defined( $_ ||= $self->_readline )) {
1041 if (/^OS\s+(.+)/) {
1042 $sci_name .= ($sci_name) ? ' '.$1 : $1;
1043 } elsif (s/^OC\s+(.+)$//) {
1044 $class_lines .= $1;
1045 } elsif (/^OG\s+(.*)/) {
1046 $org = $1;
1047 } else {
1048 last;
1051 $_ = undef; # Empty $_ to trigger read of next line
1054 # $$buffer = $_;
1055 $self->_pushback($_);
1057 $sci_name =~ s{\.$}{};
1058 $sci_name || return;
1060 # Convert data in classification lines into classification array.
1061 # only split on ';' or '.' so that classification that is 2 or more words
1062 # will still get matched, use map() to remove trailing/leading/intervening
1063 # spaces
1064 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?<!subgen)[;\.]+/, $class_lines;
1066 # do we have a genus?
1067 my $possible_genus = $class[-1];
1068 $possible_genus .= "|$class[-2]" if $class[-2];
1069 if ($sci_name =~ /^($possible_genus)/) {
1070 $genus = $1;
1071 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1072 } else {
1073 $species = $sci_name;
1076 # Don't make a species object if it is "Unknown" or "None"
1077 if ($genus) {
1078 return if $genus =~ /^(Unknown|None)$/i;
1081 # is this organism of rank species or is it lower?
1082 # (doesn't catch everything, but at least the guess isn't dangerous)
1083 if ($species =~ /subsp\.|var\./) {
1084 ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1087 # sometimes things have common name in brackets, like
1088 # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1089 # name bit. Probably dangerous if real scientific species name ends in
1090 # bracketed bit.
1091 unless ($class[-1] eq 'Viruses') {
1092 ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1093 $sci_name =~ s/\s+\(.+\)$// if $common;
1096 # Bio::Species array needs array in Species -> Kingdom direction
1097 unless ($class[-1] eq $sci_name) {
1098 push(@class, $sci_name);
1100 @class = reverse @class;
1102 # do minimal sanity checks before we hand off to Bio::Species which won't
1103 # be able to give informative throw messages if it has to throw because
1104 # of problems here
1105 $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1106 my %names;
1107 foreach my $i (0..$#class) {
1108 my $name = $class[$i];
1109 $names{$name}++;
1110 if ($names{$name} > 1 && $name ne $class[$i - 1]) {
1111 $self->throw("$acc seems to have an invalid species classification.");
1114 my $make = Bio::Species->new();
1115 $make->scientific_name($sci_name);
1116 $make->classification(@class);
1117 unless ($class[-1] eq 'Viruses') {
1118 $make->genus($genus) if $genus;
1119 $make->species($species) if $species;
1120 $make->sub_species($sub_species) if $sub_species;
1121 $make->common_name($common) if $common;
1123 $make->organelle($org) if $org;
1124 return $make;
1127 =head2 _read_EMBL_DBLink
1129 Title : _read_EMBL_DBLink
1130 Usage :
1131 Function: Reads the EMBL database cross reference ("DR") lines
1132 Example :
1133 Returns : A list of Bio::Annotation::DBLink objects
1134 Args :
1136 =cut
1138 sub _read_EMBL_DBLink {
1139 my( $self,$buffer ) = @_;
1140 my( @db_link );
1142 $_ = $$buffer;
1143 while (defined( $_ ||= $self->_readline )) {
1144 if ( /^DR ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1145 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1146 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1147 -primary_id => $prim_id,
1148 -optional_id => $sec_id);
1150 push(@db_link, $link);
1151 } else {
1152 last;
1154 $_ = undef; # Empty $_ to trigger read of next line
1157 $$buffer = $_;
1158 return @db_link;
1161 =head2 _read_EMBL_TaxID_DBLink
1163 Title : _read_EMBL_TaxID_DBLink
1164 Usage :
1165 Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1166 Example :
1167 Returns : A list of Bio::Annotation::DBLink objects
1168 Args :
1170 =cut
1172 sub _read_EMBL_TaxID_DBLink {
1173 my( $self,$buffer ) = @_;
1174 my( @db_link );
1176 $_ = $$buffer;
1177 while (defined( $_ ||= $self->_readline )) {
1178 if ( /^OX (\S+)=(\d+);$/ ) {
1179 my ($databse, $prim_id) = ($1,$2);
1180 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1181 -primary_id => $prim_id,);
1182 push(@db_link, $link);
1183 } else {
1184 last;
1186 $_ = undef; # Empty $_ to trigger read of next line
1189 $$buffer = $_;
1190 return @db_link;
1193 =head2 _filehandle
1195 Title : _filehandle
1196 Usage : $obj->_filehandle($newval)
1197 Function:
1198 Example :
1199 Returns : value of _filehandle
1200 Args : newvalue (optional)
1203 =cut
1205 sub _filehandle{
1206 my ($obj,$value) = @_;
1207 if ( defined $value) {
1208 $obj->{'_filehandle'} = $value;
1210 return $obj->{'_filehandle'};
1214 =head2 _read_FTHelper_EMBL
1216 Title : _read_FTHelper_EMBL
1217 Usage : _read_FTHelper_EMBL($buffer)
1218 Function: reads the next FT key line
1219 Example :
1220 Returns : Bio::SeqIO::FTHelper object
1221 Args : filehandle and reference to a scalar
1224 =cut
1226 sub _read_FTHelper_EMBL {
1227 my ($self,$buffer) = @_;
1229 my ($key, # The key of the feature
1230 $loc, # The location line from the feature
1231 @qual, # An arrray of lines making up the qualifiers
1234 if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
1235 $key = $1;
1236 $loc = $2;
1237 # Read all the lines up to the next feature
1238 while ( defined($_ = $self->_readline) ) {
1239 if (/^FT(\s+)(.+?)\s*$/) {
1240 # Lines inside features are preceeded by 19 spaces
1241 # A new feature is preceeded by 3 spaces
1242 if (length($1) > 4) {
1243 # Add to qualifiers if we're in the qualifiers
1244 if (@qual) {
1245 push(@qual, $2);
1247 # Start the qualifier list if it's the first qualifier
1248 elsif (substr($2, 0, 1) eq '/') {
1249 @qual = ($2);
1251 # We're still in the location line, so append to location
1252 else {
1253 $loc .= $2;
1255 } else {
1256 # We've reached the start of the next feature
1257 last;
1259 } else {
1260 # We're at the end of the feature table
1261 last;
1264 } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1265 $key = 'CONTIG';
1266 $loc = $1;
1267 # Read all the lines up to the next feature
1268 while ( defined($_ = $self->_readline) ) {
1269 if (/^CO\s+(\S+)\s*$/) {
1270 $loc .= $1;
1271 } else {
1272 # We've reached the start of the next feature
1273 last;
1276 } else {
1277 # No feature key
1278 return;
1281 # Put the first line of the next feature into the buffer
1282 $$buffer = $_;
1284 # Make the new FTHelper object
1285 my $out = Bio::SeqIO::FTHelper->new();
1286 $out->verbose($self->verbose());
1287 $out->key($key);
1288 $out->loc($loc);
1290 # Now parse and add any qualifiers. (@qual is kept
1291 # intact to provide informative error messages.)
1292 QUAL: for (my $i = 0; $i < @qual; $i++) {
1293 $_ = $qual[$i];
1294 my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
1295 or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1296 . join('', map "$_\n", @qual));
1297 if (defined $value) {
1298 # Do we have a quoted value?
1299 if (substr($value, 0, 1) eq '"') {
1300 # Keep adding to value until we find the trailing quote
1301 # and the quotes are balanced
1302 QUOTES:
1303 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1304 $i++;
1305 my $next = $qual[$i];
1306 if (!defined($next)) {
1307 $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1308 "\nAdding quote to close...".
1309 "Check sequence quality!");
1310 $value .= '"';
1311 last QUOTES;
1314 # Join to value with space if value or next line contains a space
1315 $value .= (grep /\s/, ($value, $next)) ? " $next" : $next;
1317 # Trim leading and trailing quotes
1318 $value =~ s/^"|"$//g;
1319 # Undouble internal quotes
1320 $value =~ s/""/"/g; #"
1322 } else {
1323 $value = '_no_value';
1326 # Store the qualifier
1327 $out->field->{$qualifier} ||= [];
1328 push(@{$out->field->{$qualifier}},$value);
1331 return $out;
1334 =head2 _write_line_EMBL
1336 Title : _write_line_EMBL
1337 Usage :
1338 Function: internal function
1339 Example :
1340 Returns : 1 if writing suceeded, else undef
1341 Args :
1344 =cut
1346 sub _write_line_EMBL {
1347 my ($self,$pre1,$pre2,$line,$length) = @_;
1349 $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1350 my $subl = $length - length $pre2;
1351 my $linel = length $line;
1352 my $i;
1354 my $sub = substr($line,0,$length - length $pre1);
1356 $self->_print( "$pre1$sub\n") || return;
1358 for ($i= ($length - length $pre1);$i < $linel;) {
1359 $sub = substr($line,$i,($subl));
1360 $self->_print( "$pre2$sub\n") || return;
1361 $i += $subl;
1364 return 1;
1367 =head2 _write_line_EMBL_regex
1369 Title : _write_line_EMBL_regex
1370 Usage :
1371 Function: internal function for writing lines of specified
1372 length, with different first and the next line
1373 left hand headers and split at specific points in the
1374 text
1375 Example :
1376 Returns : nothing
1377 Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1380 =cut
1382 sub _write_line_EMBL_regex {
1383 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1385 #print STDOUT "Going to print with $line!\n";
1387 $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1389 my $subl = $length - (length $pre1) -1 ;
1390 my( @lines );
1392 CHUNK: while($line) {
1393 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1394 if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1395 my $l = $1.$2;
1396 $l =~ s/#/ /g # remove word wrap protection char '#'
1397 if $pre1 eq "RA ";
1398 my $newl = $3;
1399 $line = substr($line,length($l));
1400 # be strict about not padding spaces according to
1401 # genbank format
1402 $l =~ s/\s+$//;
1403 push(@lines, $l);
1404 next CHUNK;
1407 # if we get here none of the patterns matched $subl or less chars
1408 $self->warn("trouble dissecting \"$line\"\n into chunks ".
1409 "of $subl chars or less - this tag won't print right");
1410 # insert a space char to prevent infinite loops
1411 $line = substr($line,0,$subl) . " " . substr($line,$subl);
1413 my $s = shift @lines;
1414 ($self->_print("$pre1$s\n") || return) if $s;
1415 foreach my $s ( @lines ) {
1416 $self->_print("$pre2$s\n") || return;
1419 return 1;
1422 =head2 _post_sort
1424 Title : _post_sort
1425 Usage : $obj->_post_sort($newval)
1426 Function:
1427 Returns : value of _post_sort
1428 Args : newvalue (optional)
1431 =cut
1433 sub _post_sort{
1434 my $obj = shift;
1435 if ( @_ ) {
1436 my $value = shift;
1437 $obj->{'_post_sort'} = $value;
1439 return $obj->{'_post_sort'};
1443 =head2 _show_dna
1445 Title : _show_dna
1446 Usage : $obj->_show_dna($newval)
1447 Function:
1448 Returns : value of _show_dna
1449 Args : newvalue (optional)
1452 =cut
1454 sub _show_dna{
1455 my $obj = shift;
1456 if ( @_ ) {
1457 my $value = shift;
1458 $obj->{'_show_dna'} = $value;
1460 return $obj->{'_show_dna'};
1464 =head2 _id_generation_func
1466 Title : _id_generation_func
1467 Usage : $obj->_id_generation_func($newval)
1468 Function:
1469 Returns : value of _id_generation_func
1470 Args : newvalue (optional)
1473 =cut
1475 sub _id_generation_func{
1476 my $obj = shift;
1477 if ( @_ ) {
1478 my $value = shift;
1479 $obj->{'_id_generation_func'} = $value;
1481 return $obj->{'_id_generation_func'};
1485 =head2 _ac_generation_func
1487 Title : _ac_generation_func
1488 Usage : $obj->_ac_generation_func($newval)
1489 Function:
1490 Returns : value of _ac_generation_func
1491 Args : newvalue (optional)
1494 =cut
1496 sub _ac_generation_func{
1497 my $obj = shift;
1498 if ( @_ ) {
1499 my $value = shift;
1500 $obj->{'_ac_generation_func'} = $value;
1502 return $obj->{'_ac_generation_func'};
1506 =head2 _sv_generation_func
1508 Title : _sv_generation_func
1509 Usage : $obj->_sv_generation_func($newval)
1510 Function:
1511 Returns : value of _sv_generation_func
1512 Args : newvalue (optional)
1515 =cut
1517 sub _sv_generation_func{
1518 my $obj = shift;
1519 if ( @_ ) {
1520 my $value = shift;
1521 $obj->{'_sv_generation_func'} = $value;
1523 return $obj->{'_sv_generation_func'};
1527 =head2 _kw_generation_func
1529 Title : _kw_generation_func
1530 Usage : $obj->_kw_generation_func($newval)
1531 Function:
1532 Returns : value of _kw_generation_func
1533 Args : newvalue (optional)
1536 =cut
1538 sub _kw_generation_func{
1539 my $obj = shift;
1540 if ( @_ ) {
1541 my $value = shift;
1542 $obj->{'_kw_generation_func'} = $value;
1544 return $obj->{'_kw_generation_func'};