Bio/SeqIO/embl.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::EMBL
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::embl - EMBL sequence input/output stream
  17
  18 =head1 SYNOPSIS
  19
  20 It is probably best not to use this object directly, but
  21 rather go through the SeqIO handler system. Go:
  22
  23     $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
  24
  25     while ( (my $seq = $stream->next_seq()) ) {
  26         # do something with $seq
  27     }
  28
  29 =head1 DESCRIPTION
  30
  31 This object can transform Bio::Seq objects to and from EMBL flat
  32 file databases.
  33
  34 There is a lot of flexibility here about how to dump things which
  35 should be documented more fully.
  36
  37 There should be a common object that this and Genbank share (probably
  38 with Swissprot). Too much of the magic is identical.
  39
  40 =head2 Optional functions
  41
  42 =over 3
  43
  44 =item _show_dna()
  45
  46 (output only) shows the dna or not
  47
  48 =item _post_sort()
  49
  50 (output only) provides a sorting func which is applied to the FTHelpers
  51 before printing
  52
  53 =item _id_generation_func()
  54
  55 This is function which is called as
  56
  57    print "ID   ", $func($annseq), "\n";
  58
  59 To generate the ID line. If it is not there, it generates a sensible ID
  60 line using a number of tools.
  61
  62 If you want to output annotations in EMBL format they need to be
  63 stored in a Bio::Annotation::Collection object which is accessible
  64 through the Bio::SeqI interface method L<annotation()|annotation>.
  65
  66 The following are the names of the keys which are polled from a
  67 L<Bio::Annotation::Collection> object.
  68
  69  reference  - Should contain Bio::Annotation::Reference objects
  70  comment    - Should contain Bio::Annotation::Comment objects
  71  dblink     - Should contain Bio::Annotation::DBLink objects
  72
  73 =back
  74
  75 =head1 FEEDBACK
  76
  77 =head2 Mailing Lists
  78
  79 User feedback is an integral part of the evolution of this and other
  80 Bioperl modules. Send your comments and suggestions preferably to one
  81 of the Bioperl mailing lists.  Your participation is much appreciated.
  82
  83   bioperl-l@bioperl.org                  - General discussion
  84   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  85
  86 =head2 Support
  87
  88 Please direct usage questions or support issues to the mailing list:
  89
  90 I<bioperl-l@bioperl.org>
  91
  92 rather than to the module maintainer directly. Many experienced and
  93 reponsive experts will be able look at the problem and quickly
  94 address it. Please include a thorough description of the problem
  95 with code and data examples if at all possible.
  96
  97 =head2 Reporting Bugs
  98
  99 Report bugs to the Bioperl bug tracking system to help us keep track
 100 the bugs and their resolution. Bug reports can be submitted via
 101 the web:
 102
 103   https://github.com/bioperl/bioperl-live/issues
 104
 105 =head1 AUTHOR - Ewan Birney
 106
 107 Email birney@ebi.ac.uk
 108
 109 =head1 APPENDIX
 110
 111 The rest of the documentation details each of the object
 112 methods. Internal methods are usually preceded with a _
 113
 114 =cut
 115
 116
 117 # Let the code begin...
 118
 119
 120 package Bio::SeqIO::embl;
 121 use vars qw(%FTQUAL_NO_QUOTE);
 122 use strict;
 123 use Bio::SeqIO::FTHelper;
 124 use Bio::SeqFeature::Generic;
 125 use Bio::Species;
 126 use Bio::Seq::SeqFactory;
 127 use Bio::Annotation::Collection;
 128 use Bio::Annotation::Comment;
 129 use Bio::Annotation::Reference;
 130 use Bio::Annotation::DBLink;
 131
 132 use base qw(Bio::SeqIO);
 133
 134 # Note that a qualifier that exceeds one line (i.e. a long label) will
 135 # automatically be quoted regardless:
 136 %FTQUAL_NO_QUOTE=(
 137                   'anticodon'=>1,
 138                   'citation'=>1,
 139                   'codon'=>1,
 140                   'codon_start'=>1,
 141                   'cons_splice'=>1,
 142                   'direction'=>1,
 143                   'evidence'=>1,
 144                   'label'=>1,
 145                   'mod_base'=> 1,
 146                   'number'=> 1,
 147                   'rpt_type'=> 1,
 148                   'rpt_unit'=> 1,
 149                   'transl_except'=> 1,
 150                   'transl_table'=> 1,
 151                   'usedin'=> 1,
 152                  );
 153
 154 sub _initialize {
 155     my($self,@args) = @_;
 156
 157     $self->SUPER::_initialize(@args);
 158     # hash for functions for decoding keys.
 159     $self->{'_func_ftunit_hash'} = {};
 160     # sets this to one by default. People can change it
 161     $self->_show_dna(1);
 162     if ( ! defined $self->sequence_factory ) {
 163         $self->sequence_factory(Bio::Seq::SeqFactory->new
 164                                 (-verbose => $self->verbose(),
 165                                  -type => 'Bio::Seq::RichSeq'));
 166     }
 167 }
 168
 169 =head2 next_seq
 170
 171  Title   : next_seq
 172  Usage   : $seq = $stream->next_seq()
 173  Function: returns the next sequence in the stream
 174  Returns : Bio::Seq object
 175  Args    :
 176
 177 =cut
 178
 179 sub next_seq {
 180     my ($self,@args) = @_;
 181     my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
 182         $date, $comment, @date_arr);
 183
 184     my ($annotation, %params, @features) =
 185         Bio::Annotation::Collection->new();
 186
 187     $line = $self->_readline;
 188     # This needs to be before the first eof() test
 189
 190     if ( !defined $line ) {
 191         return;                 # no throws - end of file
 192     }
 193
 194     if ( $line =~ /^\s+$/ ) {
 195         while ( defined ($line = $self->_readline) ) {
 196             $line =~/^\S/ && last;
 197         }
 198         # return without error if the whole next sequence was just a single
 199         # blank line and then eof
 200         return unless $line;
 201     }
 202
 203     # no ID as 1st non-blank line, need short circuit and exit routine
 204     $self->throw("EMBL stream with no ID. Not embl in my book")
 205         unless $line =~ /^ID\s+\S+/;
 206
 207     # At this point we are sure that $line contains an ID header line
 208     my $alphabet;
 209     if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
 210
 211         # New style header (EMBL Release >= 87, after June 2006)
 212         my $topology;
 213         my $sv;
 214
 215         # ID   DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
 216         # This regexp comes from the new2old.pl conversion script, from EBI
 217         if ($line =~ m/^ID   (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
 218             ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
 219         }
 220         if (defined $sv) {
 221             $params{'-seq_version'} = $sv;
 222             $params{'-version'} = $sv;
 223         }
 224
 225         if (defined $topology && $topology eq 'circular') {
 226             $params{'-is_circular'} = 1;
 227         }
 228
 229         if (defined $mol ) {
 230             if ($mol =~ /DNA/) {
 231                 $alphabet = 'dna';
 232             } elsif ($mol =~ /RNA/) {
 233                 $alphabet = 'rna';
 234             } elsif ($mol =~ /AA/) {
 235                 $alphabet = 'protein';
 236             }
 237         }
 238     } else {
 239
 240         # Old style header (EMBL Release < 87, before June 2006)
 241         if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
 242         ($name, $mol, $div) = ($1, $2, $3);
 243         }
 244
 245         if ($mol) {
 246             if ( $mol =~ /circular/ ) {
 247             $params{'-is_circular'} = 1;
 248             $mol =~  s|circular ||;
 249             }
 250             if (defined $mol ) {
 251             if ($mol =~ /DNA/) {
 252                 $alphabet='dna';
 253             } elsif ($mol =~ /RNA/) {
 254                 $alphabet='rna';
 255             } elsif ($mol =~ /AA/) {
 256                 $alphabet='protein';
 257             }
 258             }
 259         }
 260     }
 261
 262     unless( defined $name && length($name) ) {
 263     $name = "unknown_id";
 264     }
 265
 266     # $self->warn("not parsing upper annotation in EMBL file yet!");
 267     my $buffer = $line;
 268     local $_;
 269     BEFORE_FEATURE_TABLE :
 270           my $ncbi_taxid;
 271           until ( !defined $buffer ) {
 272               $_ = $buffer;
 273               # Exit at start of Feature table
 274               if ( /^(F[HT]|SQ)/ ) {
 275                   $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
 276                   last;
 277               }
 278               # Description line(s)
 279               if (/^DE\s+(\S.*\S)/) {
 280                   $desc .= $desc ? " $1" : $1;
 281               }
 282
 283               #accession number
 284               if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
 285                   my @accs = split(/[; ]+/, $1); # allow space in addition
 286                   $params{'-accession_number'} = shift @accs
 287                       unless defined $params{'-accession_number'};
 288                   push @{$params{'-secondary_accessions'}}, @accs;
 289               }
 290
 291               #version number
 292               if ( /^SV\s+\S+\.(\d+);?/ ) {
 293                   my $sv = $1;
 294                   #$sv =~ s/\;//;
 295                   $params{'-seq_version'} = $sv;
 296                   $params{'-version'} = $sv;
 297               }
 298
 299               #date (NOTE: takes last date line)
 300               if ( /^DT\s+(.+)$/ ) {
 301                   my $line = $1;
 302                   my ($date, $version) = split(' ', $line, 2);
 303                   $date =~ tr/,//d; # remove comma if new version
 304                   if ($version) {
 305                   if ($version =~ /\(Rel\. (\d+), Created\)/xms ) {
 306                       my $release = Bio::Annotation::SimpleValue->new(
 307                                                                       -tagname    => 'creation_release',
 308                                                                       -value      => $1
 309                                                                      );
 310                       $annotation->add_Annotation($release);
 311                   } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/xms ) {
 312                       my $release = Bio::Annotation::SimpleValue->new(
 313                                                                       -tagname    => 'update_release',
 314                                                                       -value      => $1
 315                                                                      );
 316                       $annotation->add_Annotation($release);
 317
 318                       my $update = Bio::Annotation::SimpleValue->new(
 319                                                                      -tagname    => 'update_version',
 320                                                                      -value      => $2
 321                                                                     );
 322                       $annotation->add_Annotation($update);
 323                   }
 324                   }
 325                   push @{$params{'-dates'}}, $date;
 326               }
 327
 328               #keywords
 329               if ( /^KW   (.*)\S*$/ ) {
 330                   my @kw = split(/\s*\;\s*/,$1);
 331                   push @{$params{'-keywords'}}, @kw;
 332               }
 333
 334               # Organism name and phylogenetic information
 335               elsif (/^O[SC]/) {
 336                   # pass the accession number so we can give an informative throw message if necessary
 337                   my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
 338                   $params{'-species'}= $species;
 339               }
 340
 341               # NCBI TaxID Xref
 342               elsif (/^OX/) {
 343                   if (/NCBI_TaxID=(\d+)/) {
 344                       $ncbi_taxid=$1;
 345                   }
 346
 347                   my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
 348                   foreach my $dblink ( @links ) {
 349                       $annotation->add_Annotation('dblink',$dblink);
 350                   }
 351               }
 352
 353               # References
 354               elsif (/^R/) {
 355                   my @refs = $self->_read_EMBL_References(\$buffer);
 356                   foreach my $ref ( @refs ) {
 357                       $annotation->add_Annotation('reference',$ref);
 358                   }
 359               }
 360
 361               # DB Xrefs
 362               elsif (/^DR/) {
 363                   my @links = $self->_read_EMBL_DBLink(\$buffer);
 364                   foreach my $dblink ( @links ) {
 365                       $annotation->add_Annotation('dblink',$dblink);
 366                   }
 367               }
 368
 369               # Comments
 370               elsif (/^CC\s+(.*)/) {
 371                   $comment .= $1;
 372                   $comment .= " ";
 373                   while (defined ($_ = $self->_readline) ) {
 374                       if (/^CC\s+(.*)/) {
 375                           $comment .= $1;
 376                           $comment .= " ";
 377                       } else {
 378                           last;
 379                       }
 380                   }
 381                   my $commobj = Bio::Annotation::Comment->new();
 382                   $commobj->text($comment);
 383                   $annotation->add_Annotation('comment',$commobj);
 384                   $comment = "";
 385               }
 386
 387               # Get next line.
 388               $buffer = $self->_readline;
 389           }
 390
 391     while ( defined ($_ = $self->_readline) ) {
 392         /^FT\s{3}\w/ && last;
 393         /^SQ / && last;
 394         /^CO / && last;
 395     }
 396     $buffer = $_;
 397
 398     if (defined($buffer) && $buffer =~ /^FT /) {
 399         until ( !defined ($buffer) ) {
 400             my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
 401
 402             # process ftunit
 403             my $feat =
 404                 $ftunit->_generic_seqfeature($self->location_factory(), $name);
 405
 406             # add taxon_id from source if available
 407             # Notice, this will override what is found in the OX line.
 408             # this is by design as this seems to be the official way
 409             # of specifying a TaxID
 410             if ($params{'-species'} && ($feat->primary_tag eq 'source')
 411                 && $feat->has_tag('db_xref')
 412                 && (! $params{'-species'}->ncbi_taxid())) {
 413                 foreach my $tagval ($feat->get_tag_values('db_xref')) {
 414                     if (index($tagval,"taxon:") == 0) {
 415                         $params{'-species'}->ncbi_taxid(substr($tagval,6));
 416                         last;
 417                     }
 418                 }
 419             }
 420
 421             # add feature to list of features
 422             push(@features, $feat);
 423
 424             if ( $buffer !~ /^FT/ ) {
 425                 last;
 426             }
 427         }
 428     }
 429     # Set taxid found in OX line
 430     if ($params{'-species'} && defined $ncbi_taxid
 431         && (! $params{'-species'}->ncbi_taxid())) {
 432         $params{'-species'}->ncbi_taxid($ncbi_taxid);
 433     }
 434
 435     # skip comments
 436     while ( defined ($buffer) && $buffer =~ /^XX/ ) {
 437         $buffer = $self->_readline();
 438     }
 439
 440     if ( $buffer =~ /^CO/  ) {
 441         # bug#2982
 442         # special : create contig as annotation
 443         while ( defined ($buffer) ) {
 444             $annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
 445             if ( !$buffer || $buffer !~ /^CO/ ) {
 446                 last;
 447             }
 448         }
 449         $buffer ||= '';
 450     }
 451     if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
 452     if ( $buffer !~ /^SQ/ ) {
 453         while ( defined ($_ = $self->_readline) ) {
 454             /^SQ/ && last;
 455         }
 456     }
 457     $seqc = "";
 458     while ( defined ($_ = $self->_readline) ) {
 459         m{^//} && last;
 460         $_ = uc($_);
 461         s/[^A-Za-z]//g;
 462         $seqc .= $_;
 463     }
 464 }
 465     my $seq = $self->sequence_factory->create
 466         (-verbose => $self->verbose(),
 467          -division => $div,
 468          -seq => $seqc,
 469          -desc => $desc,
 470          -display_id => $name,
 471          -annotation => $annotation,
 472          -molecule => $mol,
 473          -alphabet => $alphabet,
 474          -features => \@features,
 475          %params);
 476     return $seq;
 477 }
 478
 479
 480
 481 =head2 _write_ID_line
 482
 483  Title   : _write_ID_line
 484  Usage   : $self->_write_ID_line($seq);
 485  Function: Writes the EMBL Release 87 format ID line to the stream, unless
 486          : there is a user-supplied ID line generation function in which
 487          : case that is used instead.
 488          : ( See Bio::SeqIO::embl::_id_generation_function(). )
 489  Returns : nothing
 490  Args    : Bio::Seq object
 491
 492 =cut
 493
 494 sub _write_ID_line {
 495
 496     my ($self, $seq) = @_;
 497
 498     my $id_line;
 499     # If there is a user-supplied ID generation function, use it.
 500     if ( $self->_id_generation_func ) {
 501         $id_line = "ID   " . &{$self->_id_generation_func}($seq) . "\nXX\n";
 502     }
 503     # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
 504     else {
 505
 506         # The sequence name is supposed to be the primary accession number,
 507         my $name = $seq->accession_number();
 508         if ( not(defined $name) || $name eq 'unknown') {
 509             # but if it is not present, use the sequence ID or the empty string
 510             $name = $seq->id() || '';
 511         }
 512
 513         $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
 514
 515         # Use the sequence version, or default to 1.
 516         my $version = $seq->version() || 1;
 517
 518         my $len = $seq->length();
 519
 520         # Taxonomic division.
 521         my $div;
 522         if ( $seq->can('division') && defined($seq->division) &&
 523              $self->_is_valid_division($seq->division) ) {
 524             $div = $seq->division();
 525         } else {
 526             $div ||= 'UNC';     # 'UNC' is the EMBL division code for 'unclassified'.
 527         }
 528
 529         my $mol;
 530         # If the molecule type is a valid EMBL type, use it.
 531         if (  $seq->can('molecule')
 532               && defined($seq->molecule)
 533               && $self->_is_valid_molecule_type($seq->molecule)
 534            ) {
 535             $mol = $seq->molecule();
 536         }
 537         # Otherwise, choose unassigned DNA or RNA based on the alphabet.
 538         elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
 539             my $alphabet =$seq->primary_seq->alphabet;
 540             if ($alphabet eq 'dna') {
 541                 $mol ='unassigned DNA';
 542             } elsif ($alphabet eq 'rna') {
 543                 $mol='unassigned RNA';
 544             } elsif ($alphabet eq 'protein') {
 545                 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
 546                 $mol='AA';  # AA is not a valid EMBL molecule type.
 547             }
 548         }
 549
 550         my $topology = 'linear';
 551         if ($seq->is_circular) {
 552             $topology = 'circular';
 553         }
 554
 555         $mol ||= '';            # 'unassigned'; ?
 556         $id_line = "ID   $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
 557         $self->_print($id_line);
 558     }
 559 }
 560
 561 =head2 _is_valid_division
 562
 563  Title   : _is_valid_division
 564  Usage   : $self->_is_valid_division($div)
 565  Function: tests division code for validity
 566  Returns : true if $div is a valid EMBL release 87 taxonomic division.
 567  Args    : taxonomic division code string
 568
 569 =cut
 570
 571 sub _is_valid_division {
 572     my ($self, $division) = @_;
 573
 574     my %EMBL_divisions = (
 575                           "PHG"    => 1, # Bacteriophage
 576                           "ENV"    => 1, # Environmental Sample
 577                           "FUN"    => 1, # Fungal
 578                           "HUM"    => 1, # Human
 579                           "INV"    => 1, # Invertebrate
 580                           "MAM"    => 1, # Other Mammal
 581                           "VRT"    => 1, # Other Vertebrate
 582                           "MUS"    => 1, # Mus musculus
 583                           "PLN"    => 1, # Plant
 584                           "PRO"    => 1, # Prokaryote
 585                           "ROD"    => 1, # Other Rodent
 586                           "SYN"    => 1, # Synthetic
 587                           "UNC"    => 1, # Unclassified
 588                           "VRL"    => 1 # Viral
 589                          );
 590
 591     return exists($EMBL_divisions{$division});
 592 }
 593
 594 =head2 _is_valid_molecule_type
 595
 596  Title   : _is_valid_molecule_type
 597  Usage   : $self->_is_valid_molecule_type($mol)
 598  Function: tests molecule type for validity
 599  Returns : true if $mol is a valid EMBL release 87 molecule type.
 600  Args    : molecule type string
 601
 602 =cut
 603
 604 sub _is_valid_molecule_type {
 605     my ($self, $moltype) = @_;
 606
 607     my %EMBL_molecule_types = (
 608                                "genomic DNA"    => 1,
 609                                "genomic RNA"    => 1,
 610                                "mRNA"           => 1,
 611                                "tRNA"           => 1,
 612                                "rRNA"           => 1,
 613                                "snoRNA"         => 1,
 614                                "snRNA"          => 1,
 615                                "scRNA"          => 1,
 616                                "pre-RNA"        => 1,
 617                                "other RNA"      => 1,
 618                                "other DNA"      => 1,
 619                                "unassigned DNA" => 1,
 620                                "unassigned RNA" => 1
 621                               );
 622
 623     return exists($EMBL_molecule_types{$moltype});
 624 }
 625
 626 =head2 write_seq
 627
 628  Title   : write_seq
 629  Usage   : $stream->write_seq($seq)
 630  Function: writes the $seq object (must be seq) to the stream
 631  Returns : 1 for success and undef for error
 632  Args    : array of 1 to n Bio::SeqI objects
 633
 634
 635 =cut
 636
 637 sub write_seq {
 638     my ($self,@seqs) = @_;
 639
 640     foreach my $seq ( @seqs ) {
 641         $self->throw("Attempting to write with no seq!") unless defined $seq;
 642         unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
 643             $self->warn("$seq is not a SeqI compliant sequence object!")
 644                 if $self->verbose >= 0;
 645             unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
 646                 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
 647             }
 648         }
 649         my $str = $seq->seq || '';
 650
 651         # Write the ID line.
 652         $self->_write_ID_line($seq);
 653
 654
 655         # Write the accession line if present
 656         my( $acc );
 657         {
 658             if ( my $func = $self->_ac_generation_func ) {
 659                 $acc = &{$func}($seq);
 660             } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
 661                       defined($seq->accession_number) ) {
 662                 $acc = $seq->accession_number;
 663                 $acc = join("; ", $acc, $seq->get_secondary_accessions);
 664             } elsif ( $seq->can('accession_number') ) {
 665                 $acc = $seq->accession_number;
 666             }
 667
 668             if (defined $acc) {
 669                 $self->_print("AC   $acc;\n",
 670                               "XX\n") || return;
 671             }
 672         }
 673
 674         # Date lines
 675         my $switch=0;
 676         if ( $seq->can('get_dates') ) {
 677             my @dates =  $seq->get_dates();
 678             my $ct = 1;
 679             my $date_flag = 0;
 680             my ($cr) = $seq->annotation->get_Annotations("creation_release");
 681             my ($ur) = $seq->annotation->get_Annotations("update_release");
 682             my ($uv) = $seq->annotation->get_Annotations("update_version");
 683
 684             unless ($cr && $ur && $ur) {
 685                 $date_flag = 1;
 686             }
 687
 688             foreach my $dt (@dates) {
 689                 if (!$date_flag) {
 690                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 691                                                   $dt." (Rel. $cr, Created)",
 692                                                   '\s+|$',80) if $ct == 1;
 693                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 694                                                   $dt." (Rel. $ur, Last updated, Version $uv)",
 695                                                   '\s+|$',80) if $ct == 2;
 696                 } else {        # other formats?
 697                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 698                                                   $dt,'\s+|$',80);
 699                 }
 700                 $switch =1;
 701                 $ct++;
 702             }
 703             if ($switch == 1) {
 704                 $self->_print("XX\n") || return;
 705             }
 706         }
 707
 708         # Description lines
 709         $self->_write_line_EMBL_regex("DE   ","DE   ",$seq->desc(),'\s+|$',80) || return; #'
 710         $self->_print( "XX\n") || return;
 711
 712         # if there, write the kw line
 713         {
 714             my( $kw );
 715             if ( my $func = $self->_kw_generation_func ) {
 716                 $kw = &{$func}($seq);
 717             } elsif ( $seq->can('keywords') ) {
 718                 $kw = $seq->keywords;
 719             }
 720             if (defined $kw) {
 721                 $self->_write_line_EMBL_regex("KW   ", "KW   ", $kw, '\s+|$', 80) || return; #'
 722                 $self->_print( "XX\n") || return;
 723             }
 724         }
 725
 726         # Organism lines
 727
 728         if ($seq->can('species') && (my $spec = $seq->species)) {
 729             my @class = $spec->classification();
 730             shift @class;       # get rid of species name. Some embl files include
 731                                 # the species name in the OC lines, but this seems
 732                                 # more like an error than something we need to
 733                                 # emulate
 734             my $OS = $spec->scientific_name;
 735             if ($spec->common_name) {
 736                 $OS .= ' ('.$spec->common_name.')';
 737             }
 738             $self->_print("OS   $OS\n") || return;
 739             my $OC = join('; ', reverse(@class)) .'.';
 740             $self->_write_line_EMBL_regex("OC   ","OC   ",$OC,'; |$',80) || return;
 741             if ($spec->organelle) {
 742                 $self->_write_line_EMBL_regex("OG   ","OG   ",$spec->organelle,'; |$',80) || return;
 743             }
 744             my $ncbi_taxid = $spec->ncbi_taxid;
 745             if ($ncbi_taxid) {
 746                 $self->_print("OX   NCBI_TaxID=$ncbi_taxid\n") || return;
 747             }
 748             $self->_print("XX\n") || return;
 749         }
 750         # Reference lines
 751         my $t = 1;
 752         if ( $seq->can('annotation') && defined $seq->annotation ) {
 753             foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
 754                 $self->_print( "RN   [$t]\n") || return;
 755
 756                 # Having no RP line is legal, but we need both
 757                 # start and end for a valid location.
 758                 if ($ref->comment) {
 759                     $self->_write_line_EMBL_regex("RC   ", "RC   ", $ref->comment, '\s+|$', 80) || return; #'
 760                 }
 761                 my $start = $ref->start;
 762                 my $end   = $ref->end;
 763                 if ($start and $end) {
 764                     $self->_print( "RP   $start-$end\n") || return;
 765                 } elsif ($start or $end) {
 766                     $self->throw("Both start and end are needed for a valid RP line.".
 767                                  "  Got: start='$start' end='$end'");
 768                 }
 769
 770                 if (my $med = $ref->medline) {
 771                     $self->_print( "RX   MEDLINE; $med.\n") || return;
 772                 }
 773                 if (my $pm = $ref->pubmed) {
 774                     $self->_print( "RX   PUBMED; $pm.\n") || return;
 775                 }
 776                 my $authors = $ref->authors;
 777                 $authors =~ s/([\w\.]) (\w)/$1#$2/g;  # add word wrap protection char '#'
 778
 779                 $self->_write_line_EMBL_regex("RA   ", "RA   ",
 780                                               $authors . ";",
 781                                               '\s+|$', 80) || return; #'
 782
 783                 # If there is no title to the reference, it appears
 784                 # as a single semi-colon.  All titles must end in
 785                 # a semi-colon.
 786                 my $ref_title = $ref->title || '';
 787                 $ref_title =~ s/[\s;]*$/;/;
 788                 $self->_write_line_EMBL_regex("RT   ", "RT   ", $ref_title,    '\s+|$', 80) || return; #'
 789                 $self->_write_line_EMBL_regex("RL   ", "RL   ", $ref->location, '\s+|$', 80) || return; #'
 790                 $self->_print("XX\n") || return;
 791                 $t++;
 792             }
 793
 794             # DB Xref lines
 795             if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
 796                 for my $dr (@db_xref) {
 797                     my $db_name = $dr->database;
 798                     my $prim    = $dr->primary_id;
 799
 800                     my $opt     = $dr->optional_id || '';
 801                     my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
 802                     $self->_write_line_EMBL_regex("DR   ", "DR   ", $line, '\s+|$', 80) || return; #'
 803                 }
 804                 $self->_print("XX\n") || return;
 805             }
 806
 807             # Comment lines
 808             foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
 809                 $self->_write_line_EMBL_regex("CC   ", "CC   ", $comment->text, '\s+|$', 80) || return; #'
 810                 $self->_print("XX\n") || return;
 811             }
 812         }
 813         # "\\s\+\|\$"
 814
 815         ## FEATURE TABLE
 816
 817         $self->_print("FH   Key             Location/Qualifiers\n") || return;
 818         $self->_print("FH\n") || return;
 819
 820         my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
 821         if ($feats[0]) {
 822             if ( defined $self->_post_sort ) {
 823                 # we need to read things into an array.
 824                 # Process. Sort them. Print 'em
 825
 826                 my $post_sort_func = $self->_post_sort();
 827                 my @fth;
 828
 829                 foreach my $sf ( @feats ) {
 830                     push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
 831                 }
 832
 833                 @fth = sort { &$post_sort_func($a,$b) } @fth;
 834
 835                 foreach my $fth ( @fth ) {
 836                     $self->_print_EMBL_FTHelper($fth) || return;
 837                 }
 838             } else {
 839                 # not post sorted. And so we can print as we get them.
 840                 # lower memory load...
 841
 842                 foreach my $sf ( @feats ) {
 843                     my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
 844                     foreach my $fth ( @fth ) {
 845                         if ( $fth->key eq 'CONTIG') {
 846                             $self->_show_dna(0);
 847                         }
 848                         $self->_print_EMBL_FTHelper($fth) || return;
 849                     }
 850                 }
 851             }
 852         }
 853
 854         if ( $self->_show_dna() == 0 ) {
 855             $self->_print( "//\n") || return;
 856             return;
 857         }
 858         $self->_print( "XX\n") || return;
 859
 860         # finished printing features.
 861
 862         # print contig if present : bug#2982
 863         if ( $seq->can('annotation') && defined $seq->annotation) {
 864             foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
 865                 if ($ctg->value) {
 866                     $self->_write_line_EMBL_regex("CO   ","CO   ", $ctg->value,
 867                                                   '[,]|$', 80) || return;
 868                 }
 869             }
 870         }
 871         # print sequence lines only if sequence is present! bug#2982
 872         if (length($str)) {
 873             $str =~ tr/A-Z/a-z/;
 874
 875             # Count each nucleotide
 876             my $alen = $str =~ tr/a/a/;
 877             my $clen = $str =~ tr/c/c/;
 878             my $glen = $str =~ tr/g/g/;
 879             my $tlen = $str =~ tr/t/t/;
 880
 881             my $len = $seq->length();
 882             my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
 883             if ( $olen < 0 ) {
 884                 $self->warn("Weird. More atgc than bases. Problem!");
 885             }
 886
 887             $self->_print("SQ   Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
 888
 889             my $nuc = 60;       # Number of nucleotides per line
 890             my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
 891             my $out_pat   = 'A11' x 6; # Pattern for packing a line
 892             my $length = length($str);
 893
 894             # Calculate the number of nucleotides which fit on whole lines
 895             my $whole = int($length / $nuc) * $nuc;
 896
 897             # Print the whole lines
 898             my( $i );
 899             for ($i = 0; $i < $whole; $i += $nuc) {
 900                 my $blocks = pack $out_pat,
 901                 unpack $whole_pat,
 902                 substr($str, $i, $nuc);
 903                 $self->_print(sprintf("     $blocks%9d\n", $i + $nuc)) || return;
 904             }
 905
 906             # Print the last line
 907             if (my $last = substr($str, $i)) {
 908                 my $last_len = length($last);
 909                 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
 910                 my $blocks = pack $out_pat,
 911                 unpack($last_pat, $last);
 912                 $self->_print(sprintf("     $blocks%9d\n", $length)) ||
 913                     return;         # Add the length to the end
 914             }
 915         }
 916
 917         $self->_print( "//\n") || return;
 918
 919         $self->flush if $self->_flush_on_write && defined $self->_fh;
 920     }
 921     return 1;
 922 }
 923
 924 =head2 _print_EMBL_FTHelper
 925
 926  Title   : _print_EMBL_FTHelper
 927  Usage   :
 928  Function: Internal function
 929  Returns : 1 if writing suceeded, otherwise undef
 930  Args    :
 931
 932
 933 =cut
 934
 935 sub _print_EMBL_FTHelper {
 936     my ($self,$fth) = @_;
 937
 938     if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
 939         $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
 940     }
 941
 942
 943     #$self->_print( "FH   Key             Location/Qualifiers\n");
 944     #$self->_print( sprintf("FT   %-15s  %s\n",$fth->key,$fth->loc));
 945     # let
 946     if ( $fth->key eq 'CONTIG' ) {
 947         $self->_print("XX\n") || return;
 948         $self->_write_line_EMBL_regex("CO   ",
 949                                       "CO   ",$fth->loc,
 950                                       '\,|$',80) || return; #'
 951         return 1;
 952     }
 953     $self->_write_line_EMBL_regex(sprintf("FT   %-15s ",$fth->key),
 954                                   "FT                   ",$fth->loc,
 955                                   '\,|$',80) || return; #'
 956     foreach my $tag ( keys %{$fth->field} ) {
 957         if ( ! defined $fth->field->{$tag} ) {
 958             next;
 959         }
 960         foreach my $value ( @{$fth->field->{$tag}} ) {
 961             $value =~ s/\"/\"\"/g;
 962             if ($value eq "_no_value") {
 963                 $self->_write_line_EMBL_regex("FT                   ",
 964                                               "FT                   ",
 965                                               "/$tag",'.|$',80) || return; #'
 966             }
 967             # there are almost 3x more quoted qualifier values and they
 968             # are more common too so we take quoted ones first
 969             #
 970             # Long qualifiers, that will be line wrapped, are always quoted
 971             elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
 972                 my $pat = $value =~ /\s/ ? '\s|\-|$' : '.|\-|$';
 973                 $self->_write_line_EMBL_regex("FT                   ",
 974                                               "FT                   ",
 975                                               "/$tag=\"$value\"",$pat,80) || return;
 976             } else {
 977                 $self->_write_line_EMBL_regex("FT                   ",
 978                                               "FT                   ",
 979                                               "/$tag=$value",'.|$',80) || return; #'
 980                                           }
 981             }
 982         }
 983
 984         return 1;
 985     }
 986
 987
 988
 989 =head2 _read_EMBL_Contig()
 990
 991  Title   : _read_EMBL_Contig
 992  Usage   :
 993  Function: convert CO lines into annotations
 994  Returns :
 995  Args    :
 996
 997 =cut
 998
 999 sub _read_EMBL_Contig {
1000     my ($self, $buffer) = @_;
1001     my @ret;
1002     if ( $$buffer !~ /^CO/ ) {
1003         warn("Not parsing line '$$buffer' which maybe important");
1004     }
1005     $self->_pushback($$buffer);
1006     while ( defined ($_ = $self->_readline) ) {
1007         /^C/ || last;
1008         /^CO\s+(.*)/ && do {
1009         push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010                                                       -value => $1);
1011         };
1012     }
1013     $$buffer = $_;
1014     return @ret;
1015
1016 }
1017 #'
1018 =head2 _read_EMBL_References
1019
1020  Title   : _read_EMBL_References
1021  Usage   :
1022  Function: Reads references from EMBL format. Internal function really
1023  Example :
1024  Returns :
1025  Args    :
1026
1027
1028 =cut
1029
1030 sub _read_EMBL_References {
1031     my ($self,$buffer) = @_;
1032     my (@refs);
1033
1034     # assume things are starting with RN
1035
1036     if ( $$buffer !~ /^RN/ ) {
1037         warn("Not parsing line '$$buffer' which maybe important");
1038     }
1039     my $b1;
1040     my $b2;
1041     my $title;
1042     my $loc;
1043     my $au;
1044     my $med;
1045     my $pm;
1046     my $com;
1047
1048     while ( defined ($_ = $self->_readline) ) {
1049         /^R/ || last;
1050         /^RP   (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
1051         /^RX   MEDLINE;\s+(\d+)/ && do {$med=$1};
1052         /^RX   PUBMED;\s+(\d+)/ && do {$pm=$1};
1053         /^RA   (.*)/ && do {
1054             $au = $self->_concatenate_lines($au,$1); next;
1055         };
1056         /^RT   (.*)/ && do {
1057             $title = $self->_concatenate_lines($title,$1); next;
1058         };
1059         /^RL   (.*)/ && do {
1060             $loc = $self->_concatenate_lines($loc,$1); next;
1061         };
1062         /^RC   (.*)/ && do {
1063             $com = $self->_concatenate_lines($com,$1); next;
1064         };
1065     }
1066
1067     my $ref = Bio::Annotation::Reference->new();
1068     $au =~ s/;\s*$//g;
1069     $title =~ s/;\s*$//g;
1070
1071     $ref->start($b1);
1072     $ref->end($b2);
1073     $ref->authors($au);
1074     $ref->title($title);
1075     $ref->location($loc);
1076     $ref->medline($med);
1077     $ref->comment($com);
1078     $ref->pubmed($pm);
1079
1080     push(@refs,$ref);
1081     $$buffer = $_;
1082
1083     return @refs;
1084 }
1085
1086 =head2 _read_EMBL_Species
1087
1088  Title   : _read_EMBL_Species
1089  Usage   :
1090  Function: Reads the EMBL Organism species and classification
1091            lines.
1092  Example :
1093  Returns : A Bio::Species object
1094  Args    : a reference to the current line buffer, accession number
1095
1096 =cut
1097
1098 sub _read_EMBL_Species {
1099     my( $self, $buffer, $acc ) = @_;
1100     my $org;
1101
1102     $_ = $$buffer;
1103     my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104     while (defined( $_ ||= $self->_readline )) {
1105         if (/^OS\s+(.+)/) {
1106             $sci_name .= ($sci_name) ? ' '.$1 : $1;
1107         } elsif (s/^OC\s+(.+)$//) {
1108             $class_lines .= $1;
1109         } elsif (/^OG\s+(.*)/) {
1110             $org = $1;
1111         } else {
1112             last;
1113         }
1114
1115         $_ = undef;             # Empty $_ to trigger read of next line
1116     }
1117
1118 #    $$buffer = $_;
1119         $self->_pushback($_);
1120
1121     $sci_name =~ s{\.$}{};
1122     $sci_name || return;
1123
1124     # Convert data in classification lines into classification array.
1125     # only split on ';' or '.' so that classification that is 2 or more words
1126     # will still get matched, use map() to remove trailing/leading/intervening
1127     # spaces
1128     my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?<!subgen)[;\.]+/, $class_lines;
1129
1130     # do we have a genus?
1131     my $possible_genus = $class[-1];
1132     $possible_genus .= "|$class[-2]" if $class[-2];
1133     if ($sci_name =~ /^($possible_genus)/) {
1134         $genus = $1;
1135         ($species) = $sci_name =~ /^$genus\s+(.+)/;
1136     } else {
1137         $species = $sci_name;
1138     }
1139
1140     # Don't make a species object if it is "Unknown" or "None"
1141     if ($genus) {
1142         return if $genus =~ /^(Unknown|None)$/i;
1143     }
1144
1145     # is this organism of rank species or is it lower?
1146     # (doesn't catch everything, but at least the guess isn't dangerous)
1147     if ($species =~ /subsp\.|var\./) {
1148         ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1149     }
1150
1151     # sometimes things have common name in brackets, like
1152     # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153     # name bit. Probably dangerous if real scientific species name ends in
1154     # bracketed bit.
1155     unless ($class[-1] eq 'Viruses') {
1156         ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1157         $sci_name =~ s/\s+\(.+\)$// if $common;
1158     }
1159
1160     # Bio::Species array needs array in Species -> Kingdom direction
1161     unless ($class[-1] eq $sci_name) {
1162         push(@class, $sci_name);
1163     }
1164     @class = reverse @class;
1165
1166     # do minimal sanity checks before we hand off to Bio::Species which won't
1167     # be able to give informative throw messages if it has to throw because
1168     # of problems here
1169     $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170     my %names;
1171     foreach my $i (0..$#class) {
1172         my $name = $class[$i];
1173         $names{$name}++;
1174         # this code breaks examples like: Xenopus (Silurana) tropicalis
1175         # commenting out, see bug 3158
1176
1177         #if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178         #    $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179         #}
1180     }
1181     my $make = Bio::Species->new();
1182     $make->scientific_name($sci_name);
1183     $make->classification(@class);
1184     unless ($class[-1] eq 'Viruses') {
1185         $make->genus($genus) if $genus;
1186         $make->species($species) if $species;
1187         $make->sub_species($sub_species) if $sub_species;
1188         $make->common_name($common) if $common;
1189     }
1190     $make->organelle($org) if $org;
1191     return $make;
1192 }
1193
1194 =head2 _read_EMBL_DBLink
1195
1196  Title   : _read_EMBL_DBLink
1197  Usage   :
1198  Function: Reads the EMBL database cross reference ("DR") lines
1199  Example :
1200  Returns : A list of Bio::Annotation::DBLink objects
1201  Args    :
1202
1203 =cut
1204
1205 sub _read_EMBL_DBLink {
1206     my( $self,$buffer ) = @_;
1207     my( @db_link );
1208
1209     $_ = $$buffer;
1210     while (defined( $_ ||= $self->_readline )) {
1211         if ( /^DR   ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1212         my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213         my $link = Bio::Annotation::DBLink->new(-database    => $databse,
1214                             -primary_id  => $prim_id,
1215                             -optional_id => $sec_id);
1216
1217             push(@db_link, $link);
1218     } else {
1219             last;
1220         }
1221         $_ = undef;             # Empty $_ to trigger read of next line
1222     }
1223
1224     $$buffer = $_;
1225     return @db_link;
1226 }
1227
1228 =head2 _read_EMBL_TaxID_DBLink
1229
1230  Title   : _read_EMBL_TaxID_DBLink
1231  Usage   :
1232  Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233  Example :
1234  Returns : A list of Bio::Annotation::DBLink objects
1235  Args    :
1236
1237 =cut
1238
1239 sub _read_EMBL_TaxID_DBLink {
1240     my( $self,$buffer ) = @_;
1241     my( @db_link );
1242
1243     $_ = $$buffer;
1244     while (defined( $_ ||= $self->_readline )) {
1245         if ( /^OX   (\S+)=(\d+);$/ ) {
1246             my ($databse, $prim_id) = ($1,$2);
1247             my $link = Bio::Annotation::DBLink->new(-database    => $databse,
1248                                                     -primary_id  => $prim_id,);
1249             push(@db_link, $link);
1250         } else {
1251             last;
1252         }
1253         $_ = undef;             # Empty $_ to trigger read of next line
1254     }
1255
1256     $$buffer = $_;
1257     return @db_link;
1258 }
1259
1260 =head2 _filehandle
1261
1262  Title   : _filehandle
1263  Usage   : $obj->_filehandle($newval)
1264  Function:
1265  Example :
1266  Returns : value of _filehandle
1267  Args    : newvalue (optional)
1268
1269
1270 =cut
1271
1272 sub _filehandle{
1273     my ($obj,$value) = @_;
1274     if ( defined $value) {
1275         $obj->{'_filehandle'} = $value;
1276     }
1277     return $obj->{'_filehandle'};
1278
1279 }
1280
1281 =head2 _read_FTHelper_EMBL
1282
1283  Title   : _read_FTHelper_EMBL
1284  Usage   : _read_FTHelper_EMBL($buffer)
1285  Function: reads the next FT key line
1286  Example :
1287  Returns : Bio::SeqIO::FTHelper object
1288  Args    : filehandle and reference to a scalar
1289
1290
1291 =cut
1292
1293 sub _read_FTHelper_EMBL {
1294     my ($self,$buffer) = @_;
1295
1296     my ($key,                   # The key of the feature
1297         $loc,                   # The location line from the feature
1298         @qual,                  # An arrray of lines making up the qualifiers
1299        );
1300
1301     if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
1302         $key = $1;
1303         $loc = $2;
1304         # Read all the lines up to the next feature
1305         while ( defined($_ = $self->_readline) ) {
1306             if (/^FT(\s+)(.+?)\s*$/) {
1307                 # Lines inside features are preceeded by 19 spaces
1308                 # A new feature is preceeded by 3 spaces
1309                 if (length($1) > 4) {
1310                     # Add to qualifiers if we're in the qualifiers
1311                     if (@qual) {
1312                         push(@qual, $2);
1313                     }
1314                     # Start the qualifier list if it's the first qualifier
1315                     elsif (substr($2, 0, 1) eq '/') {
1316                         @qual = ($2);
1317                     }
1318                     # We're still in the location line, so append to location
1319                     else {
1320                         $loc .= $2;
1321                     }
1322                 } else {
1323                     # We've reached the start of the next feature
1324                     last;
1325                 }
1326             } else {
1327                 # We're at the end of the feature table
1328                 last;
1329             }
1330         }
1331     } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332     $key = 'CONTIG';
1333     $loc = $1;
1334     # Read all the lines up to the next feature
1335     while ( defined($_ = $self->_readline) ) {
1336         if (/^CO\s+(\S+)\s*$/) {
1337         $loc .= $1;
1338         } else {
1339         # We've reached the start of the next feature
1340         last;
1341         }
1342     }
1343     } else {
1344         # No feature key
1345         return;
1346     }
1347
1348     # Put the first line of the next feature into the buffer
1349     $$buffer = $_;
1350
1351     # Make the new FTHelper object
1352     my $out = Bio::SeqIO::FTHelper->new();
1353     $out->verbose($self->verbose());
1354     $out->key($key);
1355     $out->loc($loc);
1356
1357     # Now parse and add any qualifiers.  (@qual is kept
1358     # intact to provide informative error messages.)
1359   QUAL: for (my $i = 0; $i < @qual; $i++) {
1360         $_ = $qual[$i];
1361         my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
1362             or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1363                             . join('', map "$_\n", @qual));
1364         if (defined $value) {
1365             # Do we have a quoted value?
1366             if (substr($value, 0, 1) eq '"') {
1367                 # Keep adding to value until we find the trailing quote
1368                 # and the quotes are balanced
1369               QUOTES:
1370                 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1371                     $i++;
1372                     my $next = $qual[$i];
1373                     if (!defined($next)) {
1374                         $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1375                                     "\nAdding quote to close...".
1376                                     "Check sequence quality!");
1377                         $value .= '"';
1378                         last QUOTES;
1379                     }
1380
1381                     # Protein sequence translations need to be joined without spaces,
1382                     # other qualifiers need those.
1383                     if ($qualifier eq "translation") {
1384                         $value .= $next;
1385                     } else {
1386                         $value .= " $next";
1387                     }
1388                 }
1389                 # Trim leading and trailing quotes
1390                 $value =~ s/^"|"$//g;
1391                 # Undouble internal quotes
1392                 $value =~ s/""/"/g; #"
1393             }
1394         } else {
1395             $value = '_no_value';
1396         }
1397
1398         # Store the qualifier
1399         $out->field->{$qualifier} ||= [];
1400         push(@{$out->field->{$qualifier}},$value);
1401     }
1402
1403     return $out;
1404 }
1405
1406 =head2 _write_line_EMBL
1407
1408  Title   : _write_line_EMBL
1409  Usage   :
1410  Function: internal function
1411  Example :
1412  Returns : 1 if writing suceeded, else undef
1413  Args    :
1414
1415
1416 =cut
1417
1418 sub _write_line_EMBL {
1419     my ($self,$pre1,$pre2,$line,$length) = @_;
1420
1421     $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1422     my $subl = $length - length $pre2;
1423     my $linel = length $line;
1424     my $i;
1425
1426     my $sub = substr($line,0,$length - length $pre1);
1427
1428     $self->_print( "$pre1$sub\n") || return;
1429
1430     for ($i= ($length - length $pre1);$i < $linel;) {
1431         $sub = substr($line,$i,($subl));
1432         $self->_print( "$pre2$sub\n") || return;
1433         $i += $subl;
1434     }
1435
1436     return 1;
1437 }
1438
1439 =head2 _write_line_EMBL_regex
1440
1441  Title   : _write_line_EMBL_regex
1442  Usage   :
1443  Function: internal function for writing lines of specified
1444            length, with different first and the next line
1445            left hand headers and split at specific points in the
1446            text
1447  Example :
1448  Returns : nothing
1449  Args    : file handle, first header, second header, text-line, regex for line breaks, total line length
1450
1451
1452 =cut
1453
1454 sub _write_line_EMBL_regex {
1455     my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1456
1457     #print STDOUT "Going to print with $line!\n";
1458
1459     $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1460
1461     my $subl = $length - (length $pre1) -1 ;
1462     my( @lines );
1463
1464   CHUNK: while($line) {
1465         foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1466             if ($line =~ m/^(.{0,$subl})($pat)(.*)/ ) {
1467                 my $l = $1.$2;
1468                 $l =~ s/#/ /g  # remove word wrap protection char '#'
1469                     if $pre1 eq "RA   ";
1470                 my $newl = $3;
1471                 $line = substr($line,length($l));
1472                 # be strict about not padding spaces according to
1473                 # genbank format
1474                 $l =~ s/\s+$//;
1475                 next CHUNK if ($l eq '');
1476                 push(@lines, $l);
1477                 next CHUNK;
1478             }
1479         }
1480         # if we get here none of the patterns matched $subl or less chars
1481         $self->warn("trouble dissecting \"$line\"\n     into chunks ".
1482                     "of $subl chars or less - this tag won't print right");
1483         # insert a space char to prevent infinite loops
1484         $line = substr($line,0,$subl) . " " . substr($line,$subl);
1485     }
1486     my $s = shift @lines;
1487     ($self->_print("$pre1$s\n") || return) if $s;
1488     foreach my $s ( @lines ) {
1489         $self->_print("$pre2$s\n") || return;
1490     }
1491
1492     return 1;
1493 }
1494
1495 =head2 _post_sort
1496
1497  Title   : _post_sort
1498  Usage   : $obj->_post_sort($newval)
1499  Function:
1500  Returns : value of _post_sort
1501  Args    : newvalue (optional)
1502
1503
1504 =cut
1505
1506 sub _post_sort{
1507     my $obj = shift;
1508     if ( @_ ) {
1509         my $value = shift;
1510         $obj->{'_post_sort'} = $value;
1511     }
1512     return $obj->{'_post_sort'};
1513
1514 }
1515
1516 =head2 _show_dna
1517
1518  Title   : _show_dna
1519  Usage   : $obj->_show_dna($newval)
1520  Function:
1521  Returns : value of _show_dna
1522  Args    : newvalue (optional)
1523
1524
1525 =cut
1526
1527 sub _show_dna{
1528     my $obj = shift;
1529     if ( @_ ) {
1530         my $value = shift;
1531         $obj->{'_show_dna'} = $value;
1532     }
1533     return $obj->{'_show_dna'};
1534
1535 }
1536
1537 =head2 _id_generation_func
1538
1539  Title   : _id_generation_func
1540  Usage   : $obj->_id_generation_func($newval)
1541  Function:
1542  Returns : value of _id_generation_func
1543  Args    : newvalue (optional)
1544
1545
1546 =cut
1547
1548 sub _id_generation_func{
1549     my $obj = shift;
1550     if ( @_ ) {
1551         my $value = shift;
1552         $obj->{'_id_generation_func'} = $value;
1553     }
1554     return $obj->{'_id_generation_func'};
1555
1556 }
1557
1558 =head2 _ac_generation_func
1559
1560  Title   : _ac_generation_func
1561  Usage   : $obj->_ac_generation_func($newval)
1562  Function:
1563  Returns : value of _ac_generation_func
1564  Args    : newvalue (optional)
1565
1566
1567 =cut
1568
1569 sub _ac_generation_func{
1570     my $obj = shift;
1571     if ( @_ ) {
1572         my $value = shift;
1573         $obj->{'_ac_generation_func'} = $value;
1574     }
1575     return $obj->{'_ac_generation_func'};
1576
1577 }
1578
1579 =head2 _sv_generation_func
1580
1581  Title   : _sv_generation_func
1582  Usage   : $obj->_sv_generation_func($newval)
1583  Function:
1584  Returns : value of _sv_generation_func
1585  Args    : newvalue (optional)
1586
1587
1588 =cut
1589
1590 sub _sv_generation_func{
1591     my $obj = shift;
1592     if ( @_ ) {
1593         my $value = shift;
1594         $obj->{'_sv_generation_func'} = $value;
1595     }
1596     return $obj->{'_sv_generation_func'};
1597
1598 }
1599
1600 =head2 _kw_generation_func
1601
1602  Title   : _kw_generation_func
1603  Usage   : $obj->_kw_generation_func($newval)
1604  Function:
1605  Returns : value of _kw_generation_func
1606  Args    : newvalue (optional)
1607
1608
1609 =cut
1610
1611 sub _kw_generation_func{
1612     my $obj = shift;
1613     if ( @_ ) {
1614         my $value = shift;
1615         $obj->{'_kw_generation_func'} = $value;
1616     }
1617     return $obj->{'_kw_generation_func'};
1618
1619 }
1620
1621 1;