lib/Bio/SeqIO/gbdriver.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::gbdriver
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Bioperl project bioperl-l(at)bioperl.org
   7 #
   8 # Copyright Chris Fields and contributors see AUTHORS section
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::gbdriver - GenBank handler-based push parser
  17
  18 =head1 SYNOPSIS
  19
  20   #It is probably best not to use this object directly, but
  21   #rather go through the SeqIO handler:
  22
  23   $stream = Bio::SeqIO->new(-file => $filename,
  24                             -format => 'gbdriver');
  25
  26   while ( my $seq = $stream->next_seq() ) {
  27       # do something with $seq
  28   }
  29
  30 =head1 DESCRIPTION
  31
  32 This object can transform Bio::Seq objects to and from GenBank flat file
  33 databases. The key difference between this parser and the tried-and-true
  34 Bio::SeqIO::genbank parser is this version separates the parsing and data
  35 manipulation into a 'driver' method (next_seq) and separate object handlers
  36 which deal with the data passed to it.
  37
  38 =head2 The Driver
  39
  40 The main purpose of the driver routine, in this case next_seq(), is to carve out
  41 the data into meaningful chunks which are passed along to relevant handlers (see
  42 below).
  43
  44 Each chunk of data in the has a NAME tag attached to it, similar to that for XML
  45 parsing. This designates the type of data passed (annotation type or seqfeature)
  46 and the handler to be called for processing the data.
  47
  48 For GenBank annotations, the data is divided up and passed along to handlers
  49 according to whether the data is tagged with a field name (i.e. LOCUS) and
  50 whether the field name represents 'primary' annotation (in this case, is present
  51 at the beginning of the line, such as REFERENCE). If the field is primary, it is
  52 assigned to the NAME tag. Field names which aren't primary (have at least 2
  53 spaces before the name, like ORGANISM) are appended to the preceding primary
  54 field name as additional tags.
  55
  56 For feature table data each new feature name signals the beginning of a new
  57 chunk of data. 'FEATURES' is attached to NAME, the feature key ('CDS', 'gene',
  58 etc) is attached as the PRIMARY_ID, and the location is assigned to it's own tag
  59 name (LOCATION). Feature qualifiers are added as additional keys, with multiple
  60 keys included in an array.
  61
  62 Once a particular event occurs (new primary tag, sequence, end of record), the
  63 data is passed along to be processed by a handler or (if no handler is defined)
  64 tossed away.
  65
  66 Internally, the hash ref for a representative annotation (here a REFERENCE)
  67 looks like this:
  68
  69   $VAR1 = {
  70             'JOURNAL' => 'Unpublished (2003)',
  71             'TITLE' => 'The DNA sequence of Homo sapiens',
  72             'NAME' => 'REFERENCE',
  73             'REFERENCE' => '1  (bases 1 to 10001)',
  74             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
  75           };
  76
  77 and a SeqFeature as this:
  78
  79   $VAR1 = {
  80             'db_xref' => [
  81                            'GeneID:127086',
  82                            'InterimID:127086'
  83                          ],
  84             'LOCATION' => 'complement(3024..6641)',
  85             'NAME' => 'FEATURES',
  86             'FEATURE_KEY' => 'gene',
  87             'gene' => 'LOC127086',
  88             'note' => 'Derived by automated computational analysis using
  89                        gene prediction method: GNOMON.'
  90           };
  91
  92 Note that any driver implementation would suffice as long as it fulfilled the
  93 requirements above.
  94
  95 =head1 FEEDBACK
  96
  97 =head2 Mailing Lists
  98
  99 User feedback is an integral part of the evolution of this and other
 100 Bioperl modules. Send your comments and suggestions preferably to one
 101 of the Bioperl mailing lists.  Your participation is much appreciated.
 102
 103   bioperl-l@bioperl.org                  - General discussion
 104   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 105
 106 =head2 Support
 107
 108 Please direct usage questions or support issues to the mailing list:
 109
 110 I<bioperl-l@bioperl.org>
 111
 112 rather than to the module maintainer directly. Many experienced and
 113 reponsive experts will be able look at the problem and quickly
 114 address it. Please include a thorough description of the problem
 115 with code and data examples if at all possible.
 116
 117 =head2 Reporting Bugs
 118
 119 Report bugs to the Bioperl bug tracking system to help us keep track
 120 the bugs and their resolution. Bug reports can be submitted via the web:
 121
 122   https://github.com/bioperl/bioperl-live/issues
 123
 124 =head1 AUTHOR - Bioperl Project
 125
 126 bioperl-l at bioperl.org
 127
 128 Original author Elia Stupka, elia -at- tigem.it
 129
 130 =head1 CONTRIBUTORS
 131
 132 Ewan Birney birney at ebi.ac.uk
 133 Jason Stajich jason at bioperl.org
 134 Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
 135 Lincoln Stein lstein at cshl.org
 136 Heikki Lehvaslaiho, heikki at ebi.ac.uk
 137 Hilmar Lapp, hlapp at gmx.net
 138 Donald G. Jackson, donald.jackson at bms.com
 139 James Wasmuth, james.wasmuth at ed.ac.uk
 140 Brian Osborne, bosborne at alum.mit.edu
 141
 142 =head1 APPENDIX
 143
 144 The rest of the documentation details each of the object
 145 methods. Internal methods are usually preceded with a _
 146
 147 =cut
 148
 149 # POD is at the end of the module
 150
 151 # Let the code begin...
 152
 153 package Bio::SeqIO::gbdriver;
 154 use strict;
 155 use warnings;
 156 use Data::Dumper;
 157 use Bio::SeqIO::Handler::GenericRichSeqHandler;
 158 use Bio::Seq::SeqFactory;
 159
 160 use base qw(Bio::SeqIO);
 161
 162 # map all annotation keys to consistent INSDC-based tags for all handlers
 163
 164 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
 165     anticodon           citation
 166     codon               codon_start
 167     cons_splice         direction
 168     evidence            label
 169     mod_base            number
 170     rpt_type            rpt_unit
 171     transl_except       transl_table
 172     usedin
 173     );
 174
 175
 176 # 1) change this to indicate what should be secondary, not primary, which allows
 177 # unknown or new stuff to be passed to handler automatically; current behavior
 178 # appends unknowns to previous data, which isn't good since it's subtly passing
 179 # by important data
 180 # 2) add mapping details about how to separate data using specific delimiters
 181
 182
 183 # Features are the only ones postprocessed for now
 184 # Uncomment relevant code in next_seq and add keys as needed...
 185 my %POSTPROCESS_DATA = map {$_ => 1} qw (FEATURES);
 186
 187 sub _initialize {
 188     my($self,@args) = @_;
 189
 190     $self->SUPER::_initialize(@args);
 191     my $handler = $self->_rearrange([qw(HANDLER)],@args);
 192     # hash for functions for decoding keys.
 193     $handler ? $self->seqhandler($handler) :
 194     $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
 195                     -format => 'genbank',
 196                     -verbose => $self->verbose,
 197                     -builder => $self->sequence_builder
 198                     ));
 199     if( ! defined $self->sequence_factory ) {
 200         $self->sequence_factory(Bio::Seq::SeqFactory->new
 201                 (-verbose => $self->verbose(),
 202                  -type => 'Bio::Seq::RichSeq'));
 203     }
 204 }
 205
 206 =head2 next_seq
 207
 208  Title   : next_seq
 209  Usage   : $seq = $stream->next_seq()
 210  Function: returns the next sequence in the stream
 211  Returns : Bio::Seq object
 212  Args    :
 213
 214 =cut
 215
 216 # at this point there is minimal sequence validation,
 217 # but the parser seems to hold up nicely so far...
 218
 219 sub next_seq {
 220     my $self = shift;
 221     local($/) = "\n";
 222     my ($ann, $data, $annkey);
 223     my $endrec = my $seenfeat = 0;
 224     my $seqdata;
 225     my $seenlocus;
 226     my $hobj = $self->seqhandler;
 227     my $handlers = $self->seqhandler->handler_methods;
 228     #$self->debug(Dumper($handlers));
 229     PARSER:
 230     while (defined(my $line = $self->_readline)) {
 231         next if $line =~ m{^\s*$};
 232
 233         # have to catch this at the top of the loop, then exit SEQ loop on //
 234         # The reason? The regex match for ann/feat keys also matches some lines
 235         # in the sequence; no easy way around it since some feature keys may
 236         # start with a number as well
 237         if ($ann && $ann eq 'ORIGIN') {
 238             SEQ:
 239             while (defined($line)) {
 240                 last SEQ if index($line,'//') == 0;
 241                 $seqdata->{DATA} .= uc $line;
 242                 $line = $self->_readline;
 243             }
 244             $seqdata->{DATA} =~ tr{0-9 \n}{}d;
 245         }
 246         $endrec = 1 if (index($line,'//')==0);
 247
 248         if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
 249             ($ann, $data) = ($2, $3);
 250             unless ($seenlocus) {
 251                 $self->throw("No LOCUS found.  Not GenBank in my book!")
 252                     if ($ann ne 'LOCUS');
 253                 $seenlocus = 1;
 254             }
 255             # use the spacer to determine the annotation type
 256             my $len = length($1 || '');
 257
 258             $annkey  = ($len == 0 || $len > 4)   ? 'DATA'  : $ann;
 259
 260             # Push off the previously cached data to the handler
 261             # whenever a new primary annotation or seqfeature is found
 262             # Note use of $endrec for catching end of record
 263             if (($annkey eq 'DATA') && $seqdata) {
 264                 chomp $seqdata->{DATA};
 265                 # postprocessing for some data
 266                 if ($seqdata->{NAME} eq 'FEATURES') {
 267                     $self->_process_features($seqdata)
 268                 }
 269
 270                 # using handlers directly, slightly faster
 271                 #my $method = (exists $handlers->{ $seqdata->{NAME} }) ?
 272                 #        ($handlers->{$seqdata->{NAME}}) :
 273                 #    (exists $handlers->{'_DEFAULT_'}) ?
 274                 #        ($handlers->{'_DEFAULT_'}) :
 275                 #    undef;
 276                 #($method) ? ($hobj->$method($seqdata) ) :
 277                 #        $self->debug("No handler defined for ",$seqdata->{NAME},"\n");
 278
 279                 # using handler methods in the Handler object, more centralized
 280                 #$self->debug(Dumper($seqdata));
 281                 $hobj->data_handler($seqdata);
 282
 283                 # bail here on //
 284                 last PARSER if $endrec;
 285                 # reset for next round
 286                 $seqdata = undef;
 287             }
 288
 289             $seqdata->{NAME} =  ($len == 0) ? $ann :   # primary ann
 290                                 ($len > 4 ) ? 'FEATURES': # sf feature key
 291                                 $seqdata->{NAME};      # all rest are sec. ann
 292             if ($seqdata->{NAME} eq 'FEATURES') {
 293                 $seqdata->{FEATURE_KEY} = $ann;
 294             }
 295             # throw back to top if seq is found to avoid regex
 296             next PARSER if $ann eq 'ORIGIN';
 297
 298         } else {
 299             ($data = $line) =~ s{^\s+}{};
 300             chomp $data;
 301         }
 302         my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
 303         $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
 304     }
 305     return $hobj->build_sequence;
 306 }
 307
 308 sub next_chunk {
 309     my $self = shift;
 310     local($/) = "\n";
 311     my ($ann, $data, $annkey);
 312     my $endrec = my $seenfeat = 0;
 313     my $seqdata;
 314     my $seenlocus;
 315     my $hobj = $self->seqhandler;
 316     PARSER:
 317     while (defined(my $line = $self->_readline)) {
 318         next if $line =~ m{^\s*$};
 319         # have to catch this at the top of the loop, then exit SEQ loop on //
 320         # The reason? The regex match for ann/feat keys also matches some lines
 321         # in the sequence; no easy way around it since some feature keys may
 322         # start with a number as well
 323         if ($ann && $ann eq 'ORIGIN') {
 324             SEQ:
 325             while (defined($line)) {
 326                 last SEQ if index($line,'//') == 0;
 327                 $seqdata->{DATA} .= uc $line;
 328                 $line = $self->_readline;
 329             }
 330             $seqdata->{DATA} =~ tr{0-9 \n}{}d;
 331         }
 332         $endrec = 1 if (index($line,'//')==0);
 333
 334         if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
 335             ($ann, $data) = ($2, $3);
 336             unless ($seenlocus) {
 337                 $self->throw("No LOCUS found.  Not GenBank in my book!")
 338                     if ($ann ne 'LOCUS');
 339                 $seenlocus = 1;
 340             }
 341             # use the spacer to determine the annotation type
 342             my $len = length($1 || '');
 343
 344             $annkey  = ($len == 0 || $len > 4)   ? 'DATA'  : $ann;
 345
 346             # Push off the previously cached data to the handler
 347             # whenever a new primary annotation or seqfeature is found
 348             # Note use of $endrec for catching end of record
 349             if (($annkey eq 'DATA') && $seqdata) {
 350                 chomp $seqdata->{DATA};
 351                 # postprocessing for some data
 352                 if ($seqdata->{NAME} eq 'FEATURES') {
 353                     $self->_process_features($seqdata)
 354                 }
 355                 # using handler methods in the Handler object, more centralized
 356                 $hobj->data_handler($seqdata);
 357                 # bail here on //
 358                 last PARSER if $endrec;
 359                 # reset for next round
 360                 $seqdata = undef;
 361             }
 362
 363             $seqdata->{NAME} =  ($len == 0) ? $ann :   # primary ann
 364                                 ($len > 4 ) ? 'FEATURES': # sf feature key
 365                                 $seqdata->{NAME};      # all rest are sec. ann
 366             if ($seqdata->{NAME} eq 'FEATURES') {
 367                 $seqdata->{FEATURE_KEY} = $ann;
 368             }
 369             # throw back to top if seq is found to avoid regex
 370             next PARSER if $ann eq 'ORIGIN';
 371         } else {
 372             ($data = $line) =~ s{^\s+}{};
 373             chomp $data;
 374         }
 375         my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
 376         $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
 377     }
 378 }
 379
 380 =head2 write_seq
 381
 382  Title   : write_seq
 383  Usage   : $stream->write_seq($seq)
 384  Function: writes the $seq object (must be seq) to the stream
 385  Returns : 1 for success and 0 for error
 386  Args    : array of 1 to n Bio::SeqI objects
 387
 388 =cut
 389
 390 sub write_seq {
 391     shift->throw("Use Bio::SeqIO::genbank for output");
 392     # maybe make a Writer class as well????
 393 }
 394
 395 =head2 seqhandler
 396
 397  Title   : seqhandler
 398  Usage   : $stream->seqhandler($handler)
 399  Function: Get/Set the Bio::Seq::HandlerBaseI object
 400  Returns : Bio::Seq::HandlerBaseI
 401  Args    : Bio::Seq::HandlerBaseI
 402
 403 =cut
 404
 405 sub seqhandler {
 406     my ($self, $handler) = @_;
 407     if ($handler) {
 408         $self->throw("Not a Bio::HandlerBaseI") unless
 409         ref($handler) && $handler->isa("Bio::HandlerBaseI");
 410         $self->{'_seqhandler'} = $handler;
 411     }
 412     return $self->{'_seqhandler'};
 413 }
 414
 415 #=head2 _process_features
 416 #
 417 # Title   : _process_features
 418 # Usage   : $self->_process_features($seqdata)
 419 # Function: Process feature data chunk into usable bits
 420 # Returns :
 421 # Args    : data chunk
 422 #
 423 #=cut
 424
 425 sub _process_features {
 426     my ($self, $seqdata) = @_;
 427     my @ftlines = split m{\n}, $seqdata->{DATA};
 428     delete $seqdata->{DATA};
 429     # don't deal with balancing quotes for now; just get rid of them...
 430     # Should we worry about checking whether these are balanced
 431     # for round-tripping tests?
 432     map { s{"}{}g } @ftlines;
 433     # all sfs start with the location...
 434     my $qual = 'LOCATION';
 435     my $ct = 0;
 436     for my $qualdata (@ftlines) {
 437         if ($qualdata =~ m{^/([^=]+)=?(.+)?}) {
 438             ($qual, $qualdata) = ($1, $2);
 439             $qualdata ||= ''; # for those qualifiers that have no data, like 'pseudo'
 440             $ct = (exists $seqdata->{$qual}) ?
 441                   ((ref($seqdata->{$qual}))  ? scalar(@{ $seqdata->{$qual} }) : 1)
 442                   : 0 ;
 443         }
 444         my $delim = ($qual eq 'translation' || exists $FTQUAL_NO_QUOTE{$qual}) ?
 445             '' : ' ';
 446         # if more than one, turn into an array ref and append
 447         if ($ct == 0) {
 448             (exists $seqdata->{$qual}) ? ($seqdata->{$qual}.= $delim.$qualdata || '') :
 449                                          ($seqdata->{$qual} .= $qualdata || '');
 450         } else {
 451             if (!ref($seqdata->{$qual})) {
 452                 $seqdata->{$qual} = [$seqdata->{$qual}];
 453             }
 454             (exists $seqdata->{$qual}->[$ct]) ? (($seqdata->{$qual}->[$ct]) .= $delim.$qualdata) :
 455                                              (($seqdata->{$qual}->[$ct]) .= $qualdata);
 456         }
 457     }
 458 }
 459
 460 1;
 461
 462 __END__
 463