Bio/SeqIO/embldriver.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::embldriver
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::embldriver - EMBL sequence input/output stream
  17
  18 =head1 SYNOPSIS
  19
  20 It is probably best not to use this object directly, but
  21 rather go through the SeqIO handler system. Go:
  22
  23     $stream = Bio::SeqIO->new(-file => $filename, -format => 'embldriver');
  24
  25     while ( (my $seq = $stream->next_seq()) ) {
  26         # do something with $seq
  27     }
  28
  29 =head1 DESCRIPTION
  30
  31 This object can transform Bio::Seq objects to and from EMBL flat
  32 file databases.
  33
  34 There is a lot of flexibility here about how to dump things which
  35 should be documented more fully.
  36
  37 There should be a common object that this and Genbank share (probably
  38 with Swissprot). Too much of the magic is identical.
  39
  40 =head2 Optional functions
  41
  42 =over 3
  43
  44 =item _show_dna()
  45
  46 (output only) shows the dna or not
  47
  48 =item _post_sort()
  49
  50 (output only) provides a sorting func which is applied to the FTHelpers
  51 before printing
  52
  53 =item _id_generation_func()
  54
  55 This is function which is called as
  56
  57    print "ID   ", $func($annseq), "\n";
  58
  59 To generate the ID line. If it is not there, it generates a sensible ID
  60 line using a number of tools.
  61
  62 If you want to output annotations in EMBL format they need to be
  63 stored in a Bio::Annotation::Collection object which is accessible
  64 through the Bio::SeqI interface method L<annotation()|annotation>.
  65
  66 The following are the names of the keys which are polled from a
  67 L<Bio::Annotation::Collection> object.
  68
  69  reference  - Should contain Bio::Annotation::Reference objects
  70  comment    - Should contain Bio::Annotation::Comment objects
  71  dblink     - Should contain Bio::Annotation::DBLink objects
  72
  73 =back
  74
  75 =head1 FEEDBACK
  76
  77 =head2 Mailing Lists
  78
  79 User feedback is an integral part of the evolution of this and other
  80 Bioperl modules. Send your comments and suggestions preferably to one
  81 of the Bioperl mailing lists.  Your participation is much appreciated.
  82
  83   bioperl-l@bioperl.org                  - General discussion
  84   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  85
  86 =head2 Support
  87
  88 Please direct usage questions or support issues to the mailing list:
  89
  90 I<bioperl-l@bioperl.org>
  91
  92 rather than to the module maintainer directly. Many experienced and
  93 reponsive experts will be able look at the problem and quickly
  94 address it. Please include a thorough description of the problem
  95 with code and data examples if at all possible.
  96
  97 =head2 Reporting Bugs
  98
  99 Report bugs to the Bioperl bug tracking system to help us keep track
 100 the bugs and their resolution. Bug reports can be submitted via
 101 the web:
 102
 103   https://redmine.open-bio.org/projects/bioperl/
 104
 105 =head1 AUTHOR - Ewan Birney
 106
 107 Email birney@ebi.ac.uk
 108
 109 =head1 APPENDIX
 110
 111 The rest of the documentation details each of the object
 112 methods. Internal methods are usually preceded with a _
 113
 114 =cut
 115
 116 # Let the code begin...
 117
 118 package Bio::SeqIO::embldriver;
 119 use vars qw(%FTQUAL_NO_QUOTE);
 120 use strict;
 121 use Bio::SeqIO::Handler::GenericRichSeqHandler;
 122 use Data::Dumper;
 123
 124 use base qw(Bio::SeqIO);
 125
 126 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
 127     anticodon           citation
 128     codon               codon_start
 129     cons_splice         direction
 130     evidence            label
 131     mod_base            number
 132     rpt_type            rpt_unit
 133     transl_except       transl_table
 134     usedin
 135     LOCATION
 136     );
 137
 138 my %DATA_KEY = (
 139     ID      => 'ID',
 140     AC      => 'ACCESSION',
 141     DT      => 'DATE',
 142     DE      => 'DESCRIPTION',
 143     KW      => 'KEYWORDS',
 144     OS      => 'SOURCE',
 145     OC      => 'CLASSIFICATION',
 146     OG      => 'ORGANELLE',
 147     RN      => 'REFERENCE',
 148     RA      => 'AUTHORS',
 149     RC      => 'COMMENT',
 150     RG      => 'CONSRTM',
 151     RP      => 'POSITION',
 152     RX      => 'CROSSREF',
 153     RT      => 'TITLE',
 154     RL      => 'LOCATION',
 155     XX      => 'SPACER',
 156     FH      => 'FEATHEADER',
 157     FT      => 'FEATURES',
 158     AH      => 'TPA_HEADER',  # Third party annotation
 159     AS      => 'TPA_DATA',  # Third party annotation
 160     DR      => 'DBLINK',
 161     CC      => 'COMMENT',
 162     CO      => 'CO',
 163     CON     => 'CON',
 164     WGS     => 'WGS',
 165     ANN     => 'ANN',
 166     TPA     => 'TPA',
 167     SQ      => 'SEQUENCE',
 168     );
 169
 170 my %SEC = (
 171     OC      => 'CLASSIFICATION',
 172     OH      => 'HOST', # not currently handled, bundled with organism data for now
 173     OG      => 'ORGANELLE',
 174     OX      => 'CROSSREF',
 175     RA      => 'AUTHORS',
 176     RC      => 'COMMENT',
 177     RG      => 'CONSRTM',
 178     RP      => 'POSITION',
 179     RX      => 'CROSSREF',
 180     RT      => 'TITLE',
 181     RL      => 'JOURNAL',
 182     AS      => 'ASSEMBLYINFO',  # Third party annotation
 183     );
 184
 185 my %DELIM = (
 186     #CC      => "\n",
 187     #DR      => "\n",
 188     #DT      => "\n",
 189             );
 190
 191 # signals to process what's in the hash prior to next round
 192 # these should be changed to map secondary data
 193 my %PRIMARY = map {$_ => 1} qw(ID AC DT DE SV KW OS RN AH DR FH CC SQ FT WGS CON ANN TPA //);
 194
 195 sub _initialize {
 196     my($self,@args) = @_;
 197
 198     $self->SUPER::_initialize(@args);
 199     my $handler = $self->_rearrange([qw(HANDLER)],@args);
 200     # hash for functions for decoding keys.
 201     $handler ? $self->seqhandler($handler) :
 202     $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
 203                     -format => 'embl',
 204                     -verbose => $self->verbose,
 205                     -builder => $self->sequence_builder
 206                     ));
 207     #
 208     if( ! defined $self->sequence_factory ) {
 209         $self->sequence_factory(Bio::Seq::SeqFactory->new
 210                 (-verbose => $self->verbose(),
 211                  -type => 'Bio::Seq::RichSeq'));
 212     }
 213 }
 214
 215 =head2 next_seq
 216
 217  Title   : next_seq
 218  Usage   : $seq = $stream->next_seq()
 219  Function: returns the next sequence in the stream
 220  Returns : Bio::Seq object
 221  Args    :
 222
 223 =cut
 224
 225 sub next_seq {
 226     my $self = shift;
 227     my $hobj = $self->seqhandler;
 228     local($/) = "\n";
 229     my ($featkey, $qual, $annkey, $delim, $seqdata);
 230     my $lastann = '';
 231     my $ct = 0;
 232     PARSER:
 233     while(defined(my $line = $self->_readline)) {
 234         next PARSER if $line =~ m{^\s*$};
 235         chomp $line;
 236         my ($ann,$data) = split m{\s{2,3}}, $line , 2;
 237         next PARSER if ($ann eq 'XX' || $ann eq 'FH');
 238         if ($ann) {
 239             $data ||='';
 240             if ($ann eq 'FT') {
 241                 # seqfeatures
 242                 if ($data =~ m{^(\S+)\s+([^\n]+)}) {
 243                     $hobj->data_handler($seqdata) if $seqdata;
 244                     $seqdata = ();
 245                     ($seqdata->{FEATURE_KEY}, $data) = ($1, $2);
 246                     $seqdata->{NAME} = $ann;
 247                     $qual = 'LOCATION';
 248                 } elsif ($data =~ m{^\s+/([^=]+)=?(.+)?}) {
 249                     ($qual, $data) = ($1, $2 ||'');
 250                     $ct = (exists $seqdata->{$qual}) ?
 251                         ((ref($seqdata->{$qual}))  ? scalar(@{ $seqdata->{$qual} }) : 1)
 252                         : 0 ;
 253                 }
 254                 $data =~ s{^\s+}{};
 255                 $data =~ tr{"}{}d; # we don't care about quotes yet...
 256                 my $delim = ($FTQUAL_NO_QUOTE{$qual}) ? '' : ' ';
 257                 if ($ct == 0) {
 258                     $seqdata->{$qual} .= ($seqdata->{$qual}) ?
 259                         $delim.$data :
 260                         $data;
 261                 } else {
 262                     if (!ref($seqdata->{$qual})) {
 263                         $seqdata->{$qual} = [$seqdata->{$qual}];
 264                     }
 265                     (exists $seqdata->{$qual}->[$ct]) ?
 266                         (($seqdata->{$qual}->[$ct]) .= $delim.$data) :
 267                         (($seqdata->{$qual}->[$ct]) .= $data);
 268                 }
 269             } else {
 270                 # simple annotations
 271                 $data =~ s{;$}{};
 272                 last PARSER if $ann eq '//';
 273                 if ($ann ne $lastann) {
 274                     if (!$SEC{$ann} && $seqdata) {
 275                         $hobj->data_handler($seqdata);
 276                         # can't use undef here; it can lead to subtle mem leaks
 277                         $seqdata = ();
 278                     }
 279                     $annkey = (!$SEC{$ann})    ? 'DATA'     : # primary data
 280                               $SEC{$ann};
 281                     $seqdata->{'NAME'} = $ann if !$SEC{$ann};
 282                 }
 283
 284                 # toss the data for SQ lines; this needs to be done after the
 285                 # call to the data handler
 286
 287                 next PARSER if $ann eq 'SQ';
 288                 my $delim = $DELIM{$ann} || ' ';
 289                 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ?
 290                     $delim.$data : $data;
 291                 $lastann = $ann;
 292             }
 293         } else {
 294             # this should only be sequence (fingers crossed!)
 295             SEQUENCE:
 296             while (defined ($line = $self->_readline)) {
 297                 if (index($line, '//') == 0) {
 298                     $data =~ tr{0-9 \n}{}d;
 299                     $seqdata->{DATA} = $data;
 300                     #$self->debug(Dumper($seqdata));
 301                     $hobj->data_handler($seqdata);
 302                     $seqdata = ();
 303                     last PARSER;
 304                 } else {
 305                     $data .= $line;
 306                     $line = undef;
 307                 }
 308             }
 309         }
 310     }
 311     $hobj->data_handler($seqdata) if $seqdata;
 312     $seqdata = ();
 313     return $hobj->build_sequence;
 314 }
 315
 316 sub next_chunk {
 317     my $self = shift;
 318     my $ct = 0;
 319     PARSER:
 320     while(defined(my $line = $self->_readline)) {
 321         next if $line =~ m{^\s*$};
 322         chomp $line;
 323         my ($ann,$data) = split m{\s{2,3}}, $line , 2;
 324         $data ||= '';
 325         $self->debug("Ann: [$ann]\n\tData: [$data]\n");
 326         last PARSER if $ann =~ m{//};
 327     }
 328 }
 329
 330 =head2 write_seq
 331
 332  Title   : write_seq
 333  Usage   : $stream->write_seq($seq)
 334  Function: writes the $seq object (must be seq) to the stream
 335  Returns : 1 for success and 0 for error
 336  Args    : array of 1 to n Bio::SeqI objects
 337
 338 =cut
 339
 340 sub write_seq {
 341     shift->throw("Use Bio::SeqIO::embl for output");
 342     # maybe make a Writer class as well????
 343 }
 344
 345 =head2 seqhandler
 346
 347  Title   : seqhandler
 348  Usage   : $stream->seqhandler($handler)
 349  Function: Get/Set teh Bio::Seq::HandlerBaseI object
 350  Returns : Bio::Seq::HandlerBaseI
 351  Args    : Bio::Seq::HandlerBaseI
 352
 353 =cut
 354
 355 sub seqhandler {
 356     my ($self, $handler) = @_;
 357     if ($handler) {
 358         $self->throw("Not a Bio::HandlerBaseI") unless
 359         ref($handler) && $handler->isa("Bio::HandlerBaseI");
 360         $self->{'_seqhandler'} = $handler;
 361     }
 362     return $self->{'_seqhandler'};
 363 }
 364
 365 1;
 366
 367 __END__
 368