Bio/SeqIO/embl.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::SeqIO::EMBL
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   8 #
   9 # Copyright Ewan Birney
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::SeqIO::embl - EMBL sequence input/output stream
  18
  19 =head1 SYNOPSIS
  20
  21 It is probably best not to use this object directly, but
  22 rather go through the SeqIO handler system. Go:
  23
  24     $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
  25
  26     while ( (my $seq = $stream->next_seq()) ) {
  27         # do something with $seq
  28     }
  29
  30 =head1 DESCRIPTION
  31
  32 This object can transform Bio::Seq objects to and from EMBL flat
  33 file databases.
  34
  35 There is a lot of flexibility here about how to dump things which
  36 should be documented more fully.
  37
  38 There should be a common object that this and Genbank share (probably
  39 with Swissprot). Too much of the magic is identical.
  40
  41 =head2 Optional functions
  42
  43 =over 3
  44
  45 =item _show_dna()
  46
  47 (output only) shows the dna or not
  48
  49 =item _post_sort()
  50
  51 (output only) provides a sorting func which is applied to the FTHelpers
  52 before printing
  53
  54 =item _id_generation_func()
  55
  56 This is function which is called as
  57
  58    print "ID   ", $func($annseq), "\n";
  59
  60 To generate the ID line. If it is not there, it generates a sensible ID
  61 line using a number of tools.
  62
  63 If you want to output annotations in EMBL format they need to be
  64 stored in a Bio::Annotation::Collection object which is accessible
  65 through the Bio::SeqI interface method L<annotation()|annotation>.
  66
  67 The following are the names of the keys which are polled from a
  68 L<Bio::Annotation::Collection> object.
  69
  70  reference  - Should contain Bio::Annotation::Reference objects
  71  comment    - Should contain Bio::Annotation::Comment objects
  72  dblink     - Should contain Bio::Annotation::DBLink objects
  73
  74 =back
  75
  76 =head1 FEEDBACK
  77
  78 =head2 Mailing Lists
  79
  80 User feedback is an integral part of the evolution of this and other
  81 Bioperl modules. Send your comments and suggestions preferably to one
  82 of the Bioperl mailing lists.  Your participation is much appreciated.
  83
  84   bioperl-l@bioperl.org                  - General discussion
  85   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  86
  87 =head2 Support
  88
  89 Please direct usage questions or support issues to the mailing list:
  90
  91 L<bioperl-l@bioperl.org>
  92
  93 rather than to the module maintainer directly. Many experienced and
  94 reponsive experts will be able look at the problem and quickly
  95 address it. Please include a thorough description of the problem
  96 with code and data examples if at all possible.
  97
  98 =head2 Reporting Bugs
  99
 100 Report bugs to the Bioperl bug tracking system to help us keep track
 101 the bugs and their resolution. Bug reports can be submitted via
 102 the web:
 103
 104   http://bugzilla.open-bio.org/
 105
 106 =head1 AUTHOR - Ewan Birney
 107
 108 Email birney@ebi.ac.uk
 109
 110 =head1 APPENDIX
 111
 112 The rest of the documentation details each of the object
 113 methods. Internal methods are usually preceded with a _
 114
 115 =cut
 116
 117
 118 # Let the code begin...
 119
 120
 121 package Bio::SeqIO::embl;
 122 use vars qw(%FTQUAL_NO_QUOTE);
 123 use strict;
 124 use Bio::SeqIO::FTHelper;
 125 use Bio::SeqFeature::Generic;
 126 use Bio::Species;
 127 use Bio::Seq::SeqFactory;
 128 use Bio::Annotation::Collection;
 129 use Bio::Annotation::Comment;
 130 use Bio::Annotation::Reference;
 131 use Bio::Annotation::DBLink;
 132
 133 use base qw(Bio::SeqIO);
 134
 135 %FTQUAL_NO_QUOTE=(
 136                   'anticodon'=>1,
 137                   'citation'=>1,
 138                   'codon'=>1,
 139                   'codon_start'=>1,
 140                   'cons_splice'=>1,
 141                   'direction'=>1,
 142                   'evidence'=>1,
 143                   'label'=>1,
 144                   'mod_base'=> 1,
 145                   'number'=> 1,
 146                   'rpt_type'=> 1,
 147                   'rpt_unit'=> 1,
 148                   'transl_except'=> 1,
 149                   'transl_table'=> 1,
 150                   'usedin'=> 1,
 151                  );
 152
 153 sub _initialize {
 154     my($self,@args) = @_;
 155
 156     $self->SUPER::_initialize(@args);
 157     # hash for functions for decoding keys.
 158     $self->{'_func_ftunit_hash'} = {};
 159     # sets this to one by default. People can change it
 160     $self->_show_dna(1);
 161     if ( ! defined $self->sequence_factory ) {
 162         $self->sequence_factory(Bio::Seq::SeqFactory->new
 163                                 (-verbose => $self->verbose(),
 164                                  -type => 'Bio::Seq::RichSeq'));
 165     }
 166 }
 167
 168 =head2 next_seq
 169
 170  Title   : next_seq
 171  Usage   : $seq = $stream->next_seq()
 172  Function: returns the next sequence in the stream
 173  Returns : Bio::Seq object
 174  Args    :
 175
 176 =cut
 177
 178 sub next_seq {
 179     my ($self,@args) = @_;
 180     my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
 181         $date, $comment, @date_arr);
 182
 183     my ($annotation, %params, @features) =
 184         Bio::Annotation::Collection->new();
 185
 186     $line = $self->_readline;
 187     # This needs to be before the first eof() test
 188
 189     if ( !defined $line ) {
 190         return;                 # no throws - end of file
 191     }
 192
 193     if ( $line =~ /^\s+$/ ) {
 194         while ( defined ($line = $self->_readline) ) {
 195             $line =~/^\S/ && last;
 196         }
 197         # return without error if the whole next sequence was just a single
 198         # blank line and then eof
 199         return unless $line;
 200     }
 201
 202     # no ID as 1st non-blank line, need short circuit and exit routine
 203     $self->throw("EMBL stream with no ID. Not embl in my book")
 204         unless $line =~ /^ID\s+\S+/;
 205
 206     # At this point we are sure that $line contains an ID header line
 207     my $alphabet;
 208     if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
 209
 210         # New style header (EMBL Release >= 87, after June 2006)
 211         my $topology;
 212         my $sv;
 213
 214         # ID   DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
 215         # This regexp comes from the new2old.pl conversion script, from EBI
 216         if ($line =~ m/^ID   (\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
 217         ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
 218         }
 219         if (defined($sv)) {
 220         $params{'-seq_version'} = $sv;
 221         $params{'-version'} = $sv;
 222         }
 223
 224         if ($topology eq "circular") {
 225         $params{'-is_circular'} = 1;
 226         }
 227
 228     if (defined $mol ) {
 229         if ($mol =~ /DNA/) {
 230         $alphabet='dna';
 231         } elsif ($mol =~ /RNA/) {
 232         $alphabet='rna';
 233         } elsif ($mol =~ /AA/) {
 234         $alphabet='protein';
 235         }
 236     }
 237     } else {
 238
 239         # Old style header (EMBL Release < 87, before June 2006)
 240         if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
 241         ($name, $mol, $div) = ($1, $2, $3);
 242         }
 243
 244         if ($mol) {
 245             if ( $mol =~ /circular/ ) {
 246             $params{'-is_circular'} = 1;
 247             $mol =~  s|circular ||;
 248             }
 249             if (defined $mol ) {
 250             if ($mol =~ /DNA/) {
 251                 $alphabet='dna';
 252             } elsif ($mol =~ /RNA/) {
 253                 $alphabet='rna';
 254             } elsif ($mol =~ /AA/) {
 255                 $alphabet='protein';
 256             }
 257             }
 258         }
 259     }
 260
 261     unless( defined $name && length($name) ) {
 262     $name = "unknown_id";
 263     }
 264
 265     # $self->warn("not parsing upper annotation in EMBL file yet!");
 266     my $buffer = $line;
 267     local $_;
 268     BEFORE_FEATURE_TABLE :
 269           until ( !defined $buffer ) {
 270               $_ = $buffer;
 271               # Exit at start of Feature table
 272               if ( /^(F[HT]|SQ)/ ) {
 273                   $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
 274                   last;
 275               }
 276               # Description line(s)
 277               if (/^DE\s+(\S.*\S)/) {
 278                   $desc .= $desc ? " $1" : $1;
 279               }
 280
 281               #accession number
 282               if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
 283                   my @accs = split(/[; ]+/, $1); # allow space in addition
 284                   $params{'-accession_number'} = shift @accs
 285                       unless defined $params{'-accession_number'};
 286                   push @{$params{'-secondary_accessions'}}, @accs;
 287               }
 288
 289               #version number
 290               if ( /^SV\s+\S+\.(\d+);?/ ) {
 291                   my $sv = $1;
 292                   #$sv =~ s/\;//;
 293                   $params{'-seq_version'} = $sv;
 294                   $params{'-version'} = $sv;
 295               }
 296
 297               #date (NOTE: takes last date line)
 298               if ( /^DT\s+(.+)$/ ) {
 299                   my $line = $1;
 300                   my ($date, $version) = split(' ', $line, 2);
 301                   $date =~ tr/,//d; # remove comma if new version
 302                   if ($version =~ /\(Rel\. (\d+), Created\)/xms ) {
 303                       my $release = Bio::Annotation::SimpleValue->new(
 304                                                                       -tagname    => 'creation_release',
 305                                                                       -value      => $1
 306                                                                      );
 307                       $annotation->add_Annotation($release);
 308                   } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/xms ) {
 309                       my $release = Bio::Annotation::SimpleValue->new(
 310                                                                       -tagname    => 'update_release',
 311                                                                       -value      => $1
 312                                                                      );
 313                       $annotation->add_Annotation($release);
 314
 315                       my $update = Bio::Annotation::SimpleValue->new(
 316                                                                      -tagname    => 'update_version',
 317                                                                      -value      => $2
 318                                                                     );
 319                       $annotation->add_Annotation($update);
 320                   }
 321                   push @{$params{'-dates'}}, $date;
 322               }
 323
 324               #keywords
 325               if ( /^KW   (.*)\S*$/ ) {
 326                   my @kw = split(/\s*\;\s*/,$1);
 327                   push @{$params{'-keywords'}}, @kw;
 328               }
 329
 330               # Organism name and phylogenetic information
 331               elsif (/^O[SC]/) {
 332                   # pass the accession number so we can give an informative throw message if necessary
 333                   my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
 334                   $params{'-species'}= $species;
 335               }
 336
 337               # NCBI TaxID Xref
 338               elsif (/^OX/) {
 339                   my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
 340                   foreach my $dblink ( @links ) {
 341                       $annotation->add_Annotation('dblink',$dblink);
 342                   }
 343               }
 344
 345               # References
 346               elsif (/^R/) {
 347                   my @refs = $self->_read_EMBL_References(\$buffer);
 348                   foreach my $ref ( @refs ) {
 349                       $annotation->add_Annotation('reference',$ref);
 350                   }
 351               }
 352
 353               # DB Xrefs
 354               elsif (/^DR/) {
 355                   my @links = $self->_read_EMBL_DBLink(\$buffer);
 356                   foreach my $dblink ( @links ) {
 357                       $annotation->add_Annotation('dblink',$dblink);
 358                   }
 359               }
 360
 361               # Comments
 362               elsif (/^CC\s+(.*)/) {
 363                   $comment .= $1;
 364                   $comment .= " ";
 365                   while (defined ($_ = $self->_readline) ) {
 366                       if (/^CC\s+(.*)/) {
 367                           $comment .= $1;
 368                           $comment .= " ";
 369                       } else {
 370                           last;
 371                       }
 372                   }
 373                   my $commobj = Bio::Annotation::Comment->new();
 374                   $commobj->text($comment);
 375                   $annotation->add_Annotation('comment',$commobj);
 376                   $comment = "";
 377               }
 378
 379               # Get next line.
 380               $buffer = $self->_readline;
 381           }
 382
 383     while ( defined ($_ = $self->_readline) ) {
 384         /^FT\s{3}\w/ && last;
 385         /^SQ / && last;
 386         /^CO / && last;
 387     }
 388     $buffer = $_;
 389
 390     if (defined($buffer) && $buffer =~ /^FT /) {
 391         until ( !defined ($buffer) ) {
 392             my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
 393
 394             # process ftunit
 395             my $feat =
 396                 $ftunit->_generic_seqfeature($self->location_factory(), $name);
 397
 398             # add taxon_id from source if available
 399             if ($params{'-species'} && ($feat->primary_tag eq 'source')
 400                 && $feat->has_tag('db_xref')
 401                 && (! $params{'-species'}->ncbi_taxid())) {
 402                 foreach my $tagval ($feat->get_tag_values('db_xref')) {
 403                     if (index($tagval,"taxon:") == 0) {
 404                         $params{'-species'}->ncbi_taxid(substr($tagval,6));
 405                         last;
 406                     }
 407                 }
 408             }
 409
 410             # add feature to list of features
 411             push(@features, $feat);
 412
 413             if ( $buffer !~ /^FT/ ) {
 414                 last;
 415             }
 416         }
 417     }
 418     # skip comments
 419     while ( defined ($buffer) && $buffer =~ /^XX/ ) {
 420         $buffer = $self->_readline();
 421     }
 422
 423     if ( $buffer =~ /^CO/  ) {
 424         until ( !defined ($buffer) ) {
 425             my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
 426             # process ftunit
 427             push(@features,
 428                  $ftunit->_generic_seqfeature($self->location_factory(),
 429                                               $name));
 430
 431             if ( $buffer !~ /^CO/ ) {
 432                 last;
 433             }
 434         }
 435     }
 436     if ( $buffer !~ /^SQ/  ) {
 437         while ( defined ($_ = $self->_readline) ) {
 438             /^SQ/ && last;
 439         }
 440     }
 441     $seqc = "";
 442     while ( defined ($_ = $self->_readline) ) {
 443         m{^//} && last;
 444         $_ = uc($_);
 445         s/[^A-Za-z]//g;
 446         $seqc .= $_;
 447     }
 448     my $seq = $self->sequence_factory->create
 449         (-verbose => $self->verbose(),
 450          -division => $div,
 451          -seq => $seqc,
 452          -desc => $desc,
 453          -display_id => $name,
 454          -annotation => $annotation,
 455          -molecule => $mol,
 456          -alphabet => $alphabet,
 457          -features => \@features,
 458          %params);
 459     return $seq;
 460 }
 461
 462
 463
 464 =head2 _write_ID_line
 465
 466  Title   : _write_ID_line
 467  Usage   : $self->_write_ID_line($seq);
 468  Function: Writes the EMBL Release 87 format ID line to the stream, unless
 469          : there is a user-supplied ID line generation function in which
 470          : case that is used instead.
 471          : ( See Bio::SeqIO::embl::_id_generation_function(). )
 472  Returns : nothing
 473  Args    : Bio::Seq object
 474
 475 =cut
 476
 477 sub _write_ID_line {
 478
 479     my ($self, $seq) = @_;
 480
 481     my $id_line;
 482     # If there is a user-supplied ID generation function, use it.
 483     if ( $self->_id_generation_func ) {
 484         $id_line = "ID   " . &{$self->_id_generation_func}($seq) . "\nXX\n";
 485     }
 486     # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
 487     else {
 488
 489         # The sequence name is supposed to be the primary accession number,
 490         my $name = $seq->accession_number();
 491         if (!$name) {
 492             # but if it is not present, use the sequence ID.
 493             $name = $seq->id();
 494         }
 495
 496         $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
 497
 498         # Use the sequence version, or default to 1.
 499         my $version = $seq->version() || 1;
 500
 501         my $len = $seq->length();
 502
 503         # Taxonomic division.
 504         my $div;
 505         if ( $seq->can('division') && defined($seq->division) &&
 506              $self->_is_valid_division($seq->division) ) {
 507             $div = $seq->division();
 508         } else {
 509             $div ||= 'UNC';     # 'UNC' is the EMBL division code for 'unclassified'.
 510         }
 511
 512         my $mol;
 513         # If the molecule type is a valid EMBL type, use it.
 514         if (  $seq->can('molecule')
 515               && defined($seq->molecule)
 516               && $self->_is_valid_molecule_type($seq->molecule)
 517            ) {
 518             $mol = $seq->molecule();
 519         }
 520         # Otherwise, choose unassigned DNA or RNA based on the alphabet.
 521         elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
 522             my $alphabet =$seq->primary_seq->alphabet;
 523             if ($alphabet eq 'dna') {
 524                 $mol ='unassigned DNA';
 525             } elsif ($alphabet eq 'rna') {
 526                 $mol='unassigned RNA';
 527             } elsif ($alphabet eq 'protein') {
 528                 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
 529                 $mol='AA';  # AA is not a valid EMBL molecule type.
 530             }
 531         }
 532
 533         my $topology = 'linear';
 534         if ($seq->is_circular) {
 535             $topology = 'circular';
 536         }
 537
 538         $mol ||= '';            # 'unassigned'; ?
 539         $id_line = "ID   $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
 540         $self->_print($id_line);
 541     }
 542 }
 543
 544 =head2 _is_valid_division
 545
 546  Title   : _is_valid_division
 547  Usage   : $self->_is_valid_division($div)
 548  Function: tests division code for validity
 549  Returns : true if $div is a valid EMBL release 87 taxonomic division.
 550  Args    : taxonomic division code string
 551
 552 =cut
 553
 554 sub _is_valid_division {
 555     my ($self, $division) = @_;
 556
 557     my %EMBL_divisions = (
 558                           "PHG"    => 1, # Bacteriophage
 559                           "ENV"    => 1, # Environmental Sample
 560                           "FUN"    => 1, # Fungal
 561                           "HUM"    => 1, # Human
 562                           "INV"    => 1, # Invertebrate
 563                           "MAM"    => 1, # Other Mammal
 564                           "VRT"    => 1, # Other Vertebrate
 565                           "MUS"    => 1, # Mus musculus
 566                           "PLN"    => 1, # Plant
 567                           "PRO"    => 1, # Prokaryote
 568                           "ROD"    => 1, # Other Rodent
 569                           "SYN"    => 1, # Synthetic
 570                           "UNC"    => 1, # Unclassified
 571                           "VRL"    => 1 # Viral
 572                          );
 573
 574     return exists($EMBL_divisions{$division});
 575 }
 576
 577 =head2 _is_valid_molecule_type
 578
 579  Title   : _is_valid_molecule_type
 580  Usage   : $self->_is_valid_molecule_type($mol)
 581  Function: tests molecule type for validity
 582  Returns : true if $mol is a valid EMBL release 87 molecule type.
 583  Args    : molecule type string
 584
 585 =cut
 586
 587 sub _is_valid_molecule_type {
 588     my ($self, $moltype) = @_;
 589
 590     my %EMBL_molecule_types = (
 591                                "genomic DNA"    => 1,
 592                                "genomic RNA"    => 1,
 593                                "mRNA"           => 1,
 594                                "tRNA"           => 1,
 595                                "rRNA"           => 1,
 596                                "snoRNA"         => 1,
 597                                "snRNA"          => 1,
 598                                "scRNA"          => 1,
 599                                "pre-RNA"        => 1,
 600                                "other RNA"      => 1,
 601                                "other DNA"      => 1,
 602                                "unassigned DNA" => 1,
 603                                "unassigned RNA" => 1
 604                               );
 605
 606     return exists($EMBL_molecule_types{$moltype});
 607 }
 608
 609 =head2 write_seq
 610
 611  Title   : write_seq
 612  Usage   : $stream->write_seq($seq)
 613  Function: writes the $seq object (must be seq) to the stream
 614  Returns : 1 for success and undef for error
 615  Args    : array of 1 to n Bio::SeqI objects
 616
 617
 618 =cut
 619
 620 sub write_seq {
 621     my ($self,@seqs) = @_;
 622
 623     foreach my $seq ( @seqs ) {
 624         $self->throw("Attempting to write with no seq!") unless defined $seq;
 625         unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
 626             $self->warn("$seq is not a SeqI compliant sequence object!")
 627                 if $self->verbose >= 0;
 628             unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
 629                 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
 630             }
 631         }
 632         my $str = $seq->seq || '';
 633
 634         # Write the ID line.
 635         $self->_write_ID_line($seq);
 636
 637
 638         # Write the accession line if present
 639         my( $acc );
 640         {
 641             if ( my $func = $self->_ac_generation_func ) {
 642                 $acc = &{$func}($seq);
 643             } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
 644                       defined($seq->accession_number) ) {
 645                 $acc = $seq->accession_number;
 646                 $acc = join("; ", $acc, $seq->get_secondary_accessions);
 647             } elsif ( $seq->can('accession_number') ) {
 648                 $acc = $seq->accession_number;
 649             }
 650
 651             if (defined $acc) {
 652                 $self->_print("AC   $acc;\n",
 653                               "XX\n") || return;
 654             }
 655         }
 656
 657         # Date lines
 658         my $switch=0;
 659         if ( $seq->can('get_dates') ) {
 660             my @dates =  $seq->get_dates();
 661             my $ct = 1;
 662             my $date_flag = 0;
 663             my ($cr) = $seq->annotation->get_Annotations("creation_release");
 664             my ($ur) = $seq->annotation->get_Annotations("update_release");
 665             my ($uv) = $seq->annotation->get_Annotations("update_version");
 666
 667             unless ($cr && $ur && $ur) {
 668                 $date_flag = 1;
 669             }
 670
 671             foreach my $dt (@dates) {
 672                 if (!$date_flag) {
 673                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 674                                                   $dt." (Rel. $cr, Created)",
 675                                                   '\s+|$',80) if $ct == 1;
 676                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 677                                                   $dt." (Rel. $ur, Last updated, Version $uv)",
 678                                                   '\s+|$',80) if $ct == 2;
 679                 } else {        # other formats?
 680                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 681                                                   $dt,'\s+|$',80);
 682                 }
 683                 $switch =1;
 684                 $ct++;
 685             }
 686             if ($switch == 1) {
 687                 $self->_print("XX\n") || return;
 688             }
 689         }
 690
 691         # Description lines
 692         $self->_write_line_EMBL_regex("DE   ","DE   ",$seq->desc(),'\s+|$',80) || return; #'
 693         $self->_print( "XX\n") || return;
 694
 695         # if there, write the kw line
 696         {
 697             my( $kw );
 698             if ( my $func = $self->_kw_generation_func ) {
 699                 $kw = &{$func}($seq);
 700             } elsif ( $seq->can('keywords') ) {
 701                 $kw = $seq->keywords;
 702             }
 703             if (defined $kw) {
 704                 $self->_write_line_EMBL_regex("KW   ", "KW   ", $kw, '\s+|$', 80) || return; #'
 705                 $self->_print( "XX\n") || return;
 706             }
 707         }
 708
 709         # Organism lines
 710
 711         if ($seq->can('species') && (my $spec = $seq->species)) {
 712             my @class = $spec->classification();
 713             shift @class;       # get rid of species name. Some embl files include
 714                                 # the species name in the OC lines, but this seems
 715                                 # more like an error than something we need to
 716                                 # emulate
 717             my $OS = $spec->scientific_name;
 718             if ($spec->common_name) {
 719                 $OS .= ' ('.$spec->common_name.')';
 720             }
 721             $self->_print("OS   $OS\n") || return;
 722             my $OC = join('; ', reverse(@class)) .'.';
 723             $self->_write_line_EMBL_regex("OC   ","OC   ",$OC,'; |$',80) || return;
 724             if ($spec->organelle) {
 725                 $self->_write_line_EMBL_regex("OG   ","OG   ",$spec->organelle,'; |$',80) || return;
 726             }
 727             $self->_print("XX\n") || return;
 728         }
 729
 730         # Reference lines
 731         my $t = 1;
 732         if ( $seq->can('annotation') && defined $seq->annotation ) {
 733             foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
 734                 $self->_print( "RN   [$t]\n") || return;
 735
 736                 # Having no RP line is legal, but we need both
 737                 # start and end for a valid location.
 738                 if ($ref->comment) {
 739                     $self->_write_line_EMBL_regex("RC   ", "RC   ", $ref->comment, '\s+|$', 80) || return; #'
 740                 }
 741                 my $start = $ref->start;
 742                 my $end   = $ref->end;
 743                 if ($start and $end) {
 744                     $self->_print( "RP   $start-$end\n") || return;
 745                 } elsif ($start or $end) {
 746                     $self->throw("Both start and end are needed for a valid RP line.".
 747                                  "  Got: start='$start' end='$end'");
 748                 }
 749
 750                 if (my $med = $ref->medline) {
 751                     $self->_print( "RX   MEDLINE; $med.\n") || return;
 752                 }
 753                 if (my $pm = $ref->pubmed) {
 754                     $self->_print( "RX   PUBMED; $pm.\n") || return;
 755                 }
 756                 my $authors = $ref->authors;
 757                 $authors =~ s/([\w\.]) (\w)/$1#$2/g;  # add word wrap protection char '#'
 758
 759                 $self->_write_line_EMBL_regex("RA   ", "RA   ",
 760                                               $authors . ";",
 761                                               '\s+|$', 80) || return; #'
 762
 763                 # If there is no title to the reference, it appears
 764                 # as a single semi-colon.  All titles must end in
 765                 # a semi-colon.
 766                 my $ref_title = $ref->title || '';
 767                 $ref_title =~ s/[\s;]*$/;/;
 768                 $self->_write_line_EMBL_regex("RT   ", "RT   ", $ref_title,    '\s+|$', 80) || return; #'
 769                 $self->_write_line_EMBL_regex("RL   ", "RL   ", $ref->location, '\s+|$', 80) || return; #'
 770                 $self->_print("XX\n") || return;
 771                 $t++;
 772             }
 773
 774             # DB Xref lines
 775             if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
 776                 for my $dr (@db_xref) {
 777                     my $db_name = $dr->database;
 778                     my $prim    = $dr->primary_id;
 779
 780                     my $opt     = $dr->optional_id || '';
 781                     my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
 782                     $self->_write_line_EMBL_regex("DR   ", "DR   ", $line, '\s+|$', 80) || return; #'
 783                 }
 784                 $self->_print("XX\n") || return;
 785             }
 786
 787             # Comment lines
 788             foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
 789                 $self->_write_line_EMBL_regex("CC   ", "CC   ", $comment->text, '\s+|$', 80) || return; #'
 790                 $self->_print("XX\n") || return;
 791             }
 792         }
 793         # "\\s\+\|\$"
 794
 795         ## FEATURE TABLE
 796
 797         $self->_print("FH   Key             Location/Qualifiers\n") || return;
 798         $self->_print("FH\n") || return;
 799
 800         my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
 801         if ($feats[0]) {
 802             if ( defined $self->_post_sort ) {
 803                 # we need to read things into an array.
 804                 # Process. Sort them. Print 'em
 805
 806                 my $post_sort_func = $self->_post_sort();
 807                 my @fth;
 808
 809                 foreach my $sf ( @feats ) {
 810                     push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
 811                 }
 812
 813                 @fth = sort { &$post_sort_func($a,$b) } @fth;
 814
 815                 foreach my $fth ( @fth ) {
 816                     $self->_print_EMBL_FTHelper($fth) || return;
 817                 }
 818             } else {
 819                 # not post sorted. And so we can print as we get them.
 820                 # lower memory load...
 821
 822                 foreach my $sf ( @feats ) {
 823                     my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
 824                     foreach my $fth ( @fth ) {
 825                         if ( $fth->key eq 'CONTIG') {
 826                             $self->_show_dna(0);
 827                         }
 828                         $self->_print_EMBL_FTHelper($fth) || return;
 829                     }
 830                 }
 831             }
 832         }
 833
 834         if ( $self->_show_dna() == 0 ) {
 835             $self->_print( "//\n") || return;
 836             return;
 837         }
 838         $self->_print( "XX\n") || return;
 839
 840         # finished printing features.
 841
 842         $str =~ tr/A-Z/a-z/;
 843
 844         # Count each nucleotide
 845         my $alen = $str =~ tr/a/a/;
 846         my $clen = $str =~ tr/c/c/;
 847         my $glen = $str =~ tr/g/g/;
 848         my $tlen = $str =~ tr/t/t/;
 849
 850         my $len = $seq->length();
 851         my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
 852         if ( $olen < 0 ) {
 853             $self->warn("Weird. More atgc than bases. Problem!");
 854         }
 855
 856         $self->_print("SQ   Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
 857
 858         my $nuc = 60;       # Number of nucleotides per line
 859         my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
 860         my $out_pat   = 'A11' x 6; # Pattern for packing a line
 861         my $length = length($str);
 862
 863         # Calculate the number of nucleotides which fit on whole lines
 864         my $whole = int($length / $nuc) * $nuc;
 865
 866         # Print the whole lines
 867         my( $i );
 868         for ($i = 0; $i < $whole; $i += $nuc) {
 869             my $blocks = pack $out_pat,
 870                 unpack $whole_pat,
 871                     substr($str, $i, $nuc);
 872             $self->_print(sprintf("     $blocks%9d\n", $i + $nuc)) || return;
 873         }
 874
 875         # Print the last line
 876         if (my $last = substr($str, $i)) {
 877             my $last_len = length($last);
 878             my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
 879             my $blocks = pack $out_pat,
 880                 unpack($last_pat, $last);
 881             $self->_print(sprintf("     $blocks%9d\n", $length)) ||
 882                 return;         # Add the length to the end
 883         }
 884
 885         $self->_print( "//\n") || return;
 886
 887         $self->flush if $self->_flush_on_write && defined $self->_fh;
 888     }
 889     return 1;
 890 }
 891
 892 =head2 _print_EMBL_FTHelper
 893
 894  Title   : _print_EMBL_FTHelper
 895  Usage   :
 896  Function: Internal function
 897  Returns : 1 if writing suceeded, otherwise undef
 898  Args    :
 899
 900
 901 =cut
 902
 903 sub _print_EMBL_FTHelper {
 904     my ($self,$fth) = @_;
 905
 906     if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
 907         $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
 908     }
 909
 910
 911     #$self->_print( "FH   Key             Location/Qualifiers\n");
 912     #$self->_print( sprintf("FT   %-15s  %s\n",$fth->key,$fth->loc));
 913     # let
 914     if ( $fth->key eq 'CONTIG' ) {
 915         $self->_print("XX\n") || return;
 916         $self->_write_line_EMBL_regex("CO   ",
 917                                       "CO   ",$fth->loc,
 918                                       '\,|$',80) || return; #'
 919         return 1;
 920     }
 921     $self->_write_line_EMBL_regex(sprintf("FT   %-15s ",$fth->key),
 922                                   "FT                   ",$fth->loc,
 923                                   '\,|$',80) || return; #'
 924     foreach my $tag ( keys %{$fth->field} ) {
 925         if ( ! defined $fth->field->{$tag} ) {
 926             next;
 927         }
 928         foreach my $value ( @{$fth->field->{$tag}} ) {
 929             $value =~ s/\"/\"\"/g;
 930             if ($value eq "_no_value") {
 931                 $self->_write_line_EMBL_regex("FT                   ",
 932                                               "FT                   ",
 933                                               "/$tag",'.|$',80) || return; #'
 934             }
 935             # there are almost 3x more quoted qualifier values and they
 936             # are more common too so we take quoted ones first
 937             elsif (!$FTQUAL_NO_QUOTE{$tag}) {
 938                 my $pat = $value =~ /\s/ ? '\s|\-|$' : '.|\-|$';
 939                 $self->_write_line_EMBL_regex("FT                   ",
 940                                               "FT                   ",
 941                                               "/$tag=\"$value\"",$pat,80) || return;
 942             } else {
 943                 $self->_write_line_EMBL_regex("FT                   ",
 944                                               "FT                   ",
 945                                               "/$tag=$value",'.|$',80) || return; #'
 946                                           }
 947             }
 948         }
 949
 950         return 1;
 951     }
 952
 953 #'
 954 =head2 _read_EMBL_References
 955
 956  Title   : _read_EMBL_References
 957  Usage   :
 958  Function: Reads references from EMBL format. Internal function really
 959  Example :
 960  Returns :
 961  Args    :
 962
 963
 964 =cut
 965
 966 sub _read_EMBL_References {
 967     my ($self,$buffer) = @_;
 968     my (@refs);
 969
 970     # assume things are starting with RN
 971
 972     if ( $$buffer !~ /^RN/ ) {
 973         warn("Not parsing line '$$buffer' which maybe important");
 974     }
 975     my $b1;
 976     my $b2;
 977     my $title;
 978     my $loc;
 979     my $au;
 980     my $med;
 981     my $pm;
 982     my $com;
 983
 984     while ( defined ($_ = $self->_readline) ) {
 985         /^R/ || last;
 986         /^RP   (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
 987         /^RX   MEDLINE;\s+(\d+)/ && do {$med=$1};
 988         /^RX   PUBMED;\s+(\d+)/ && do {$pm=$1};
 989         /^RA   (.*)/ && do {
 990             $au = $self->_concatenate_lines($au,$1); next;
 991         };
 992         /^RT   (.*)/ && do {
 993             $title = $self->_concatenate_lines($title,$1); next;
 994         };
 995         /^RL   (.*)/ && do {
 996             $loc = $self->_concatenate_lines($loc,$1); next;
 997         };
 998         /^RC   (.*)/ && do {
 999             $com = $self->_concatenate_lines($com,$1); next;
1000         };
1001     }
1002
1003     my $ref = Bio::Annotation::Reference->new();
1004     $au =~ s/;\s*$//g;
1005     $title =~ s/;\s*$//g;
1006
1007     $ref->start($b1);
1008     $ref->end($b2);
1009     $ref->authors($au);
1010     $ref->title($title);
1011     $ref->location($loc);
1012     $ref->medline($med);
1013     $ref->comment($com);
1014     $ref->pubmed($pm);
1015
1016     push(@refs,$ref);
1017     $$buffer = $_;
1018
1019     return @refs;
1020 }
1021
1022 =head2 _read_EMBL_Species
1023
1024  Title   : _read_EMBL_Species
1025  Usage   :
1026  Function: Reads the EMBL Organism species and classification
1027            lines.
1028  Example :
1029  Returns : A Bio::Species object
1030  Args    : a reference to the current line buffer, accession number
1031
1032 =cut
1033
1034 sub _read_EMBL_Species {
1035     my( $self, $buffer, $acc ) = @_;
1036     my $org;
1037
1038     $_ = $$buffer;
1039     my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1040     while (defined( $_ ||= $self->_readline )) {
1041         if (/^OS\s+(.+)/) {
1042             $sci_name .= ($sci_name) ? ' '.$1 : $1;
1043         } elsif (s/^OC\s+(.+)$//) {
1044             $class_lines .= $1;
1045         } elsif (/^OG\s+(.*)/) {
1046             $org = $1;
1047         } else {
1048             last;
1049         }
1050
1051         $_ = undef;             # Empty $_ to trigger read of next line
1052     }
1053
1054 #    $$buffer = $_;
1055         $self->_pushback($_);
1056
1057     $sci_name =~ s{\.$}{};
1058     $sci_name || return;
1059
1060     # Convert data in classification lines into classification array.
1061     # only split on ';' or '.' so that classification that is 2 or more words
1062     # will still get matched, use map() to remove trailing/leading/intervening
1063     # spaces
1064     my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?<!subgen)[;\.]+/, $class_lines;
1065
1066     # do we have a genus?
1067     my $possible_genus = $class[-1];
1068     $possible_genus .= "|$class[-2]" if $class[-2];
1069     if ($sci_name =~ /^($possible_genus)/) {
1070         $genus = $1;
1071         ($species) = $sci_name =~ /^$genus\s+(.+)/;
1072     } else {
1073         $species = $sci_name;
1074     }
1075
1076     # Don't make a species object if it is "Unknown" or "None"
1077     if ($genus) {
1078         return if $genus =~ /^(Unknown|None)$/i;
1079     }
1080
1081     # is this organism of rank species or is it lower?
1082     # (doesn't catch everything, but at least the guess isn't dangerous)
1083     if ($species =~ /subsp\.|var\./) {
1084         ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1085     }
1086
1087     # sometimes things have common name in brackets, like
1088     # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1089     # name bit. Probably dangerous if real scientific species name ends in
1090     # bracketed bit.
1091     unless ($class[-1] eq 'Viruses') {
1092         ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1093         $sci_name =~ s/\s+\(.+\)$// if $common;
1094     }
1095
1096     # Bio::Species array needs array in Species -> Kingdom direction
1097     unless ($class[-1] eq $sci_name) {
1098         push(@class, $sci_name);
1099     }
1100     @class = reverse @class;
1101
1102     # do minimal sanity checks before we hand off to Bio::Species which won't
1103     # be able to give informative throw messages if it has to throw because
1104     # of problems here
1105     $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1106     my %names;
1107     foreach my $i (0..$#class) {
1108         my $name = $class[$i];
1109         $names{$name}++;
1110         if ($names{$name} > 1 && $name ne $class[$i - 1]) {
1111             $self->throw("$acc seems to have an invalid species classification.");
1112         }
1113     }
1114     my $make = Bio::Species->new();
1115     $make->scientific_name($sci_name);
1116     $make->classification(@class);
1117     unless ($class[-1] eq 'Viruses') {
1118         $make->genus($genus) if $genus;
1119         $make->species($species) if $species;
1120         $make->sub_species($sub_species) if $sub_species;
1121         $make->common_name($common) if $common;
1122     }
1123     $make->organelle($org) if $org;
1124     return $make;
1125 }
1126
1127 =head2 _read_EMBL_DBLink
1128
1129  Title   : _read_EMBL_DBLink
1130  Usage   :
1131  Function: Reads the EMBL database cross reference ("DR") lines
1132  Example :
1133  Returns : A list of Bio::Annotation::DBLink objects
1134  Args    :
1135
1136 =cut
1137
1138 sub _read_EMBL_DBLink {
1139     my( $self,$buffer ) = @_;
1140     my( @db_link );
1141
1142     $_ = $$buffer;
1143     while (defined( $_ ||= $self->_readline )) {
1144         if ( /^DR   ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1145         my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1146         my $link = Bio::Annotation::DBLink->new(-database    => $databse,
1147                             -primary_id  => $prim_id,
1148                             -optional_id => $sec_id);
1149
1150             push(@db_link, $link);
1151     } else {
1152             last;
1153         }
1154         $_ = undef;             # Empty $_ to trigger read of next line
1155     }
1156
1157     $$buffer = $_;
1158     return @db_link;
1159 }
1160
1161 =head2 _read_EMBL_TaxID_DBLink
1162
1163  Title   : _read_EMBL_TaxID_DBLink
1164  Usage   :
1165  Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1166  Example :
1167  Returns : A list of Bio::Annotation::DBLink objects
1168  Args    :
1169
1170 =cut
1171
1172 sub _read_EMBL_TaxID_DBLink {
1173     my( $self,$buffer ) = @_;
1174     my( @db_link );
1175
1176     $_ = $$buffer;
1177     while (defined( $_ ||= $self->_readline )) {
1178         if ( /^OX   (\S+)=(\d+);$/ ) {
1179             my ($databse, $prim_id) = ($1,$2);
1180             my $link = Bio::Annotation::DBLink->new(-database    => $databse,
1181                                                     -primary_id  => $prim_id,);
1182             push(@db_link, $link);
1183         } else {
1184             last;
1185         }
1186         $_ = undef;             # Empty $_ to trigger read of next line
1187     }
1188
1189     $$buffer = $_;
1190     return @db_link;
1191 }
1192
1193 =head2 _filehandle
1194
1195  Title   : _filehandle
1196  Usage   : $obj->_filehandle($newval)
1197  Function:
1198  Example :
1199  Returns : value of _filehandle
1200  Args    : newvalue (optional)
1201
1202
1203 =cut
1204
1205 sub _filehandle{
1206     my ($obj,$value) = @_;
1207     if ( defined $value) {
1208         $obj->{'_filehandle'} = $value;
1209     }
1210     return $obj->{'_filehandle'};
1211
1212 }
1213
1214 =head2 _read_FTHelper_EMBL
1215
1216  Title   : _read_FTHelper_EMBL
1217  Usage   : _read_FTHelper_EMBL($buffer)
1218  Function: reads the next FT key line
1219  Example :
1220  Returns : Bio::SeqIO::FTHelper object
1221  Args    : filehandle and reference to a scalar
1222
1223
1224 =cut
1225
1226 sub _read_FTHelper_EMBL {
1227     my ($self,$buffer) = @_;
1228
1229     my ($key,                   # The key of the feature
1230         $loc,                   # The location line from the feature
1231         @qual,                  # An arrray of lines making up the qualifiers
1232        );
1233
1234     if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
1235         $key = $1;
1236         $loc = $2;
1237         # Read all the lines up to the next feature
1238         while ( defined($_ = $self->_readline) ) {
1239             if (/^FT(\s+)(.+?)\s*$/) {
1240                 # Lines inside features are preceeded by 19 spaces
1241                 # A new feature is preceeded by 3 spaces
1242                 if (length($1) > 4) {
1243                     # Add to qualifiers if we're in the qualifiers
1244                     if (@qual) {
1245                         push(@qual, $2);
1246                     }
1247                     # Start the qualifier list if it's the first qualifier
1248                     elsif (substr($2, 0, 1) eq '/') {
1249                         @qual = ($2);
1250                     }
1251                     # We're still in the location line, so append to location
1252                     else {
1253                         $loc .= $2;
1254                     }
1255                 } else {
1256                     # We've reached the start of the next feature
1257                     last;
1258                 }
1259             } else {
1260                 # We're at the end of the feature table
1261                 last;
1262             }
1263         }
1264     } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1265     $key = 'CONTIG';
1266     $loc = $1;
1267     # Read all the lines up to the next feature
1268     while ( defined($_ = $self->_readline) ) {
1269         if (/^CO\s+(\S+)\s*$/) {
1270         $loc .= $1;
1271         } else {
1272         # We've reached the start of the next feature
1273         last;
1274         }
1275     }
1276     } else {
1277         # No feature key
1278         return;
1279     }
1280
1281     # Put the first line of the next feature into the buffer
1282     $$buffer = $_;
1283
1284     # Make the new FTHelper object
1285     my $out = Bio::SeqIO::FTHelper->new();
1286     $out->verbose($self->verbose());
1287     $out->key($key);
1288     $out->loc($loc);
1289
1290     # Now parse and add any qualifiers.  (@qual is kept
1291     # intact to provide informative error messages.)
1292   QUAL: for (my $i = 0; $i < @qual; $i++) {
1293         $_ = $qual[$i];
1294         my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
1295             or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1296                             . join('', map "$_\n", @qual));
1297         if (defined $value) {
1298             # Do we have a quoted value?
1299             if (substr($value, 0, 1) eq '"') {
1300                 # Keep adding to value until we find the trailing quote
1301                 # and the quotes are balanced
1302               QUOTES:
1303                 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1304                     $i++;
1305                     my $next = $qual[$i];
1306                     if (!defined($next)) {
1307                         $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1308                                     "\nAdding quote to close...".
1309                                     "Check sequence quality!");
1310                         $value .= '"';
1311                         last QUOTES;
1312                     }
1313
1314                     # Join to value with space if value or next line contains a space
1315                     $value .= (grep /\s/, ($value, $next)) ? " $next" : $next;
1316                 }
1317                 # Trim leading and trailing quotes
1318                 $value =~ s/^"|"$//g;
1319                 # Undouble internal quotes
1320                 $value =~ s/""/"/g; #"
1321             }
1322         } else {
1323             $value = '_no_value';
1324         }
1325
1326         # Store the qualifier
1327         $out->field->{$qualifier} ||= [];
1328         push(@{$out->field->{$qualifier}},$value);
1329     }
1330
1331     return $out;
1332 }
1333
1334 =head2 _write_line_EMBL
1335
1336  Title   : _write_line_EMBL
1337  Usage   :
1338  Function: internal function
1339  Example :
1340  Returns : 1 if writing suceeded, else undef
1341  Args    :
1342
1343
1344 =cut
1345
1346 sub _write_line_EMBL {
1347     my ($self,$pre1,$pre2,$line,$length) = @_;
1348
1349     $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1350     my $subl = $length - length $pre2;
1351     my $linel = length $line;
1352     my $i;
1353
1354     my $sub = substr($line,0,$length - length $pre1);
1355
1356     $self->_print( "$pre1$sub\n") || return;
1357
1358     for ($i= ($length - length $pre1);$i < $linel;) {
1359         $sub = substr($line,$i,($subl));
1360         $self->_print( "$pre2$sub\n") || return;
1361         $i += $subl;
1362     }
1363
1364     return 1;
1365 }
1366
1367 =head2 _write_line_EMBL_regex
1368
1369  Title   : _write_line_EMBL_regex
1370  Usage   :
1371  Function: internal function for writing lines of specified
1372            length, with different first and the next line
1373            left hand headers and split at specific points in the
1374            text
1375  Example :
1376  Returns : nothing
1377  Args    : file handle, first header, second header, text-line, regex for line breaks, total line length
1378
1379
1380 =cut
1381
1382 sub _write_line_EMBL_regex {
1383     my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1384
1385     #print STDOUT "Going to print with $line!\n";
1386
1387     $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1388
1389     my $subl = $length - (length $pre1) -1 ;
1390     my( @lines );
1391
1392   CHUNK: while($line) {
1393         foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1394             if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1395                 my $l = $1.$2;
1396                 $l =~ s/#/ /g  # remove word wrap protection char '#'
1397                     if $pre1 eq "RA   ";
1398                 my $newl = $3;
1399                 $line = substr($line,length($l));
1400                 # be strict about not padding spaces according to
1401                 # genbank format
1402                 $l =~ s/\s+$//;
1403                 push(@lines, $l);
1404                 next CHUNK;
1405             }
1406         }
1407         # if we get here none of the patterns matched $subl or less chars
1408         $self->warn("trouble dissecting \"$line\"\n     into chunks ".
1409                     "of $subl chars or less - this tag won't print right");
1410         # insert a space char to prevent infinite loops
1411         $line = substr($line,0,$subl) . " " . substr($line,$subl);
1412     }
1413     my $s = shift @lines;
1414     ($self->_print("$pre1$s\n") || return) if $s;
1415     foreach my $s ( @lines ) {
1416         $self->_print("$pre2$s\n") || return;
1417     }
1418
1419     return 1;
1420 }
1421
1422 =head2 _post_sort
1423
1424  Title   : _post_sort
1425  Usage   : $obj->_post_sort($newval)
1426  Function:
1427  Returns : value of _post_sort
1428  Args    : newvalue (optional)
1429
1430
1431 =cut
1432
1433 sub _post_sort{
1434     my $obj = shift;
1435     if ( @_ ) {
1436         my $value = shift;
1437         $obj->{'_post_sort'} = $value;
1438     }
1439     return $obj->{'_post_sort'};
1440
1441 }
1442
1443 =head2 _show_dna
1444
1445  Title   : _show_dna
1446  Usage   : $obj->_show_dna($newval)
1447  Function:
1448  Returns : value of _show_dna
1449  Args    : newvalue (optional)
1450
1451
1452 =cut
1453
1454 sub _show_dna{
1455     my $obj = shift;
1456     if ( @_ ) {
1457         my $value = shift;
1458         $obj->{'_show_dna'} = $value;
1459     }
1460     return $obj->{'_show_dna'};
1461
1462 }
1463
1464 =head2 _id_generation_func
1465
1466  Title   : _id_generation_func
1467  Usage   : $obj->_id_generation_func($newval)
1468  Function:
1469  Returns : value of _id_generation_func
1470  Args    : newvalue (optional)
1471
1472
1473 =cut
1474
1475 sub _id_generation_func{
1476     my $obj = shift;
1477     if ( @_ ) {
1478         my $value = shift;
1479         $obj->{'_id_generation_func'} = $value;
1480     }
1481     return $obj->{'_id_generation_func'};
1482
1483 }
1484
1485 =head2 _ac_generation_func
1486
1487  Title   : _ac_generation_func
1488  Usage   : $obj->_ac_generation_func($newval)
1489  Function:
1490  Returns : value of _ac_generation_func
1491  Args    : newvalue (optional)
1492
1493
1494 =cut
1495
1496 sub _ac_generation_func{
1497     my $obj = shift;
1498     if ( @_ ) {
1499         my $value = shift;
1500         $obj->{'_ac_generation_func'} = $value;
1501     }
1502     return $obj->{'_ac_generation_func'};
1503
1504 }
1505
1506 =head2 _sv_generation_func
1507
1508  Title   : _sv_generation_func
1509  Usage   : $obj->_sv_generation_func($newval)
1510  Function:
1511  Returns : value of _sv_generation_func
1512  Args    : newvalue (optional)
1513
1514
1515 =cut
1516
1517 sub _sv_generation_func{
1518     my $obj = shift;
1519     if ( @_ ) {
1520         my $value = shift;
1521         $obj->{'_sv_generation_func'} = $value;
1522     }
1523     return $obj->{'_sv_generation_func'};
1524
1525 }
1526
1527 =head2 _kw_generation_func
1528
1529  Title   : _kw_generation_func
1530  Usage   : $obj->_kw_generation_func($newval)
1531  Function:
1532  Returns : value of _kw_generation_func
1533  Args    : newvalue (optional)
1534
1535
1536 =cut
1537
1538 sub _kw_generation_func{
1539     my $obj = shift;
1540     if ( @_ ) {
1541         my $value = shift;
1542         $obj->{'_kw_generation_func'} = $value;
1543     }
1544     return $obj->{'_kw_generation_func'};
1545
1546 }
1547
1548 1;