branch-1-6/Bio/SeqIO/gbdriver.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::SeqIO::gbdriver
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Bioperl project bioperl-l(at)bioperl.org
   8 #
   9 # Copyright Chris Fields and contributors see AUTHORS section
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::SeqIO::gbdriver - GenBank handler-based push parser
  18
  19 =head1 SYNOPSIS
  20
  21   #It is probably best not to use this object directly, but
  22   #rather go through the SeqIO handler:
  23
  24   $stream = Bio::SeqIO->new(-file => $filename,
  25                             -format => 'gbdriver');
  26
  27   while ( my $seq = $stream->next_seq() ) {
  28       # do something with $seq
  29   }
  30
  31 =head1 DESCRIPTION
  32
  33 This object can transform Bio::Seq objects to and from GenBank flat file
  34 databases. The key difference between this parser and the tried-and-true
  35 Bio::SeqIO::genbank parser is this version separates the parsing and data
  36 manipulation into a 'driver' method (next_seq) and separate object handlers
  37 which deal with the data passed to it.
  38
  39 =head2 The Driver
  40
  41 The main purpose of the driver routine, in this case next_seq(), is to carve out
  42 the data into meaningful chunks which are passed along to relevant handlers (see
  43 below).
  44
  45 Each chunk of data in the has a NAME tag attached to it, similar to that for XML
  46 parsing. This designates the type of data passed (annotation type or seqfeature)
  47 and the handler to be called for processing the data.
  48
  49 For GenBank annotations, the data is divided up and passed along to handlers
  50 according to whether the data is tagged with a field name (i.e. LOCUS) and
  51 whether the field name represents 'primary' annotation (in this case, is present
  52 at the beginning of the line, such as REFERENCE). If the field is primary, it is
  53 assigned to the NAME tag. Field names which aren't primary (have at least 2
  54 spaces before the name, like ORGANISM) are appended to the preceding primary
  55 field name as additional tags.
  56
  57 For feature table data each new feature name signals the beginning of a new
  58 chunk of data. 'FEATURES' is attached to NAME, the feature key ('CDS', 'gene',
  59 etc) is attached as the PRIMARY_ID, and the location is assigned to it's own tag
  60 name (LOCATION). Feature qualifiers are added as additional keys, with multiple
  61 keys included in an array.
  62
  63 Once a particular event occurs (new primary tag, sequence, end of record), the
  64 data is passed along to be processed by a handler or (if no handler is defined)
  65 tossed away.
  66
  67 Internally, the hash ref for a representative annotation (here a REFERENCE)
  68 looks like this:
  69
  70   $VAR1 = {
  71             'JOURNAL' => 'Unpublished (2003)',
  72             'TITLE' => 'The DNA sequence of Homo sapiens',
  73             'NAME' => 'REFERENCE',
  74             'REFERENCE' => '1  (bases 1 to 10001)',
  75             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
  76           };
  77
  78 and a SeqFeature as this:
  79
  80   $VAR1 = {
  81             'db_xref' => [
  82                            'GeneID:127086',
  83                            'InterimID:127086'
  84                          ],
  85             'LOCATION' => 'complement(3024..6641)',
  86             'NAME' => 'FEATURES',
  87             'FEATURE_KEY' => 'gene',
  88             'gene' => 'LOC127086',
  89             'note' => 'Derived by automated computational analysis using
  90                        gene prediction method: GNOMON.'
  91           };
  92
  93 Note that any driver implementation would suffice as long as it fulfilled the
  94 requirements above.
  95
  96 =head1 FEEDBACK
  97
  98 =head2 Mailing Lists
  99
 100 User feedback is an integral part of the evolution of this and other
 101 Bioperl modules. Send your comments and suggestions preferably to one
 102 of the Bioperl mailing lists.  Your participation is much appreciated.
 103
 104   bioperl-l@bioperl.org                  - General discussion
 105   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 106
 107 =head2 Support
 108
 109 Please direct usage questions or support issues to the mailing list:
 110
 111 I<bioperl-l@bioperl.org>
 112
 113 rather than to the module maintainer directly. Many experienced and
 114 reponsive experts will be able look at the problem and quickly
 115 address it. Please include a thorough description of the problem
 116 with code and data examples if at all possible.
 117
 118 =head2 Reporting Bugs
 119
 120 Report bugs to the Bioperl bug tracking system to help us keep track
 121 the bugs and their resolution. Bug reports can be submitted via the web:
 122
 123   http://bugzilla.open-bio.org/
 124
 125 =head1 AUTHOR - Bioperl Project
 126
 127 bioperl-l at bioperl.org
 128
 129 Original author Elia Stupka, elia -at- tigem.it
 130
 131 =head1 CONTRIBUTORS
 132
 133 Ewan Birney birney at ebi.ac.uk
 134 Jason Stajich jason at bioperl.org
 135 Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
 136 Lincoln Stein lstein at cshl.org
 137 Heikki Lehvaslaiho, heikki at ebi.ac.uk
 138 Hilmar Lapp, hlapp at gmx.net
 139 Donald G. Jackson, donald.jackson at bms.com
 140 James Wasmuth, james.wasmuth at ed.ac.uk
 141 Brian Osborne, bosborne at alum.mit.edu
 142
 143 =head1 APPENDIX
 144
 145 The rest of the documentation details each of the object
 146 methods. Internal methods are usually preceded with a _
 147
 148 =cut
 149
 150 # POD is at the end of the module
 151
 152 # Let the code begin...
 153
 154 package Bio::SeqIO::gbdriver;
 155 use strict;
 156 use warnings;
 157 use Data::Dumper;
 158 use Bio::SeqIO::Handler::GenericRichSeqHandler;
 159 use Bio::Seq::SeqFactory;
 160
 161 use base qw(Bio::SeqIO);
 162
 163 # map all annotation keys to consistent INSDC-based tags for all handlers
 164
 165 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
 166     anticodon           citation
 167     codon               codon_start
 168     cons_splice         direction
 169     evidence            label
 170     mod_base            number
 171     rpt_type            rpt_unit
 172     transl_except       transl_table
 173     usedin
 174     );
 175
 176
 177 # 1) change this to indicate what should be secondary, not primary, which allows
 178 # unknown or new stuff to be passed to handler automatically; current behavior
 179 # appends unknowns to previous data, which isn't good since it's subtly passing
 180 # by important data
 181 # 2) add mapping details about how to separate data using specific delimiters
 182
 183
 184 # Features are the only ones postprocessed for now
 185 # Uncomment relevant code in next_seq and add keys as needed...
 186 my %POSTPROCESS_DATA = map {$_ => 1} qw (FEATURES);
 187
 188 sub _initialize {
 189     my($self,@args) = @_;
 190
 191     $self->SUPER::_initialize(@args);
 192     my $handler = $self->_rearrange([qw(HANDLER)],@args);
 193     # hash for functions for decoding keys.
 194     $handler ? $self->seqhandler($handler) :
 195     $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
 196                     -format => 'genbank',
 197                     -verbose => $self->verbose,
 198                     -builder => $self->sequence_builder
 199                     ));
 200     if( ! defined $self->sequence_factory ) {
 201         $self->sequence_factory(Bio::Seq::SeqFactory->new
 202                 (-verbose => $self->verbose(),
 203                  -type => 'Bio::Seq::RichSeq'));
 204     }
 205 }
 206
 207 =head2 next_seq
 208
 209  Title   : next_seq
 210  Usage   : $seq = $stream->next_seq()
 211  Function: returns the next sequence in the stream
 212  Returns : Bio::Seq object
 213  Args    :
 214
 215 =cut
 216
 217 # at this point there is minimal sequence validation,
 218 # but the parser seems to hold up nicely so far...
 219
 220 sub next_seq {
 221     my $self = shift;
 222     local($/) = "\n";
 223     my ($ann, $data, $annkey);
 224     my $endrec = my $seenfeat = 0;
 225     my $seqdata;
 226     my $seenlocus;
 227     my $hobj = $self->seqhandler;
 228     my $handlers = $self->seqhandler->handler_methods;
 229     #$self->debug(Dumper($handlers));
 230     PARSER:
 231     while (defined(my $line = $self->_readline)) {
 232         next if $line =~ m{^\s*$};
 233
 234         # have to catch this at the top of the loop, then exit SEQ loop on //
 235         # The reason? The regex match for ann/feat keys also matches some lines
 236         # in the sequence; no easy way around it since some feature keys may
 237         # start with a number as well
 238         if ($ann && $ann eq 'ORIGIN') {
 239             SEQ:
 240             while (defined($line)) {
 241                 last SEQ if index($line,'//') == 0;
 242                 $seqdata->{DATA} .= uc $line;
 243                 $line = $self->_readline;
 244             }
 245             $seqdata->{DATA} =~ tr{0-9 \n}{}d;
 246         }
 247         $endrec = 1 if (index($line,'//')==0);
 248
 249         if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
 250             ($ann, $data) = ($2, $3);
 251             unless ($seenlocus) {
 252                 $self->throw("No LOCUS found.  Not GenBank in my book!")
 253                     if ($ann ne 'LOCUS');
 254                 $seenlocus = 1;
 255             }
 256             # use the spacer to determine the annotation type
 257             my $len = length($1 || '');
 258
 259             $annkey  = ($len == 0 || $len > 4)   ? 'DATA'  : $ann;
 260
 261             # Push off the previously cached data to the handler
 262             # whenever a new primary annotation or seqfeature is found
 263             # Note use of $endrec for catching end of record
 264             if (($annkey eq 'DATA') && $seqdata) {
 265                 chomp $seqdata->{DATA};
 266                 # postprocessing for some data
 267                 if ($seqdata->{NAME} eq 'FEATURES') {
 268                     $self->_process_features($seqdata)
 269                 }
 270
 271                 # using handlers directly, slightly faster
 272                 #my $method = (exists $handlers->{ $seqdata->{NAME} }) ?
 273                 #        ($handlers->{$seqdata->{NAME}}) :
 274                 #    (exists $handlers->{'_DEFAULT_'}) ?
 275                 #        ($handlers->{'_DEFAULT_'}) :
 276                 #    undef;
 277                 #($method) ? ($hobj->$method($seqdata) ) :
 278                 #        $self->debug("No handler defined for ",$seqdata->{NAME},"\n");
 279
 280                 # using handler methods in the Handler object, more centralized
 281                 #$self->debug(Dumper($seqdata));
 282                 $hobj->data_handler($seqdata);
 283
 284                 # bail here on //
 285                 last PARSER if $endrec;
 286                 # reset for next round
 287                 $seqdata = undef;
 288             }
 289
 290             $seqdata->{NAME} =  ($len == 0) ? $ann :   # primary ann
 291                                 ($len > 4 ) ? 'FEATURES': # sf feature key
 292                                 $seqdata->{NAME};      # all rest are sec. ann
 293             if ($seqdata->{NAME} eq 'FEATURES') {
 294                 $seqdata->{FEATURE_KEY} = $ann;
 295             }
 296             # throw back to top if seq is found to avoid regex
 297             next PARSER if $ann eq 'ORIGIN';
 298
 299         } else {
 300             ($data = $line) =~ s{^\s+}{};
 301             chomp $data;
 302         }
 303         my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
 304         $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
 305     }
 306     return $hobj->build_sequence;
 307 }
 308
 309 sub next_chunk {
 310     my $self = shift;
 311     local($/) = "\n";
 312     my ($ann, $data, $annkey);
 313     my $endrec = my $seenfeat = 0;
 314     my $seqdata;
 315     my $seenlocus;
 316     my $hobj = $self->seqhandler;
 317     PARSER:
 318     while (defined(my $line = $self->_readline)) {
 319         next if $line =~ m{^\s*$};
 320         # have to catch this at the top of the loop, then exit SEQ loop on //
 321         # The reason? The regex match for ann/feat keys also matches some lines
 322         # in the sequence; no easy way around it since some feature keys may
 323         # start with a number as well
 324         if ($ann && $ann eq 'ORIGIN') {
 325             SEQ:
 326             while (defined($line)) {
 327                 last SEQ if index($line,'//') == 0;
 328                 $seqdata->{DATA} .= uc $line;
 329                 $line = $self->_readline;
 330             }
 331             $seqdata->{DATA} =~ tr{0-9 \n}{}d;
 332         }
 333         $endrec = 1 if (index($line,'//')==0);
 334
 335         if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
 336             ($ann, $data) = ($2, $3);
 337             unless ($seenlocus) {
 338                 $self->throw("No LOCUS found.  Not GenBank in my book!")
 339                     if ($ann ne 'LOCUS');
 340                 $seenlocus = 1;
 341             }
 342             # use the spacer to determine the annotation type
 343             my $len = length($1 || '');
 344
 345             $annkey  = ($len == 0 || $len > 4)   ? 'DATA'  : $ann;
 346
 347             # Push off the previously cached data to the handler
 348             # whenever a new primary annotation or seqfeature is found
 349             # Note use of $endrec for catching end of record
 350             if (($annkey eq 'DATA') && $seqdata) {
 351                 chomp $seqdata->{DATA};
 352                 # postprocessing for some data
 353                 if ($seqdata->{NAME} eq 'FEATURES') {
 354                     $self->_process_features($seqdata)
 355                 }
 356                 # using handler methods in the Handler object, more centralized
 357                 $hobj->data_handler($seqdata);
 358                 # bail here on //
 359                 last PARSER if $endrec;
 360                 # reset for next round
 361                 $seqdata = undef;
 362             }
 363
 364             $seqdata->{NAME} =  ($len == 0) ? $ann :   # primary ann
 365                                 ($len > 4 ) ? 'FEATURES': # sf feature key
 366                                 $seqdata->{NAME};      # all rest are sec. ann
 367             if ($seqdata->{NAME} eq 'FEATURES') {
 368                 $seqdata->{FEATURE_KEY} = $ann;
 369             }
 370             # throw back to top if seq is found to avoid regex
 371             next PARSER if $ann eq 'ORIGIN';
 372         } else {
 373             ($data = $line) =~ s{^\s+}{};
 374             chomp $data;
 375         }
 376         my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
 377         $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
 378     }
 379 }
 380
 381 =head2 write_seq
 382
 383  Title   : write_seq
 384  Usage   : $stream->write_seq($seq)
 385  Function: writes the $seq object (must be seq) to the stream
 386  Returns : 1 for success and 0 for error
 387  Args    : array of 1 to n Bio::SeqI objects
 388
 389 =cut
 390
 391 sub write_seq {
 392     shift->throw("Use Bio::SeqIO::genbank for output");
 393     # maybe make a Writer class as well????
 394 }
 395
 396 =head2 seqhandler
 397
 398  Title   : seqhandler
 399  Usage   : $stream->seqhandler($handler)
 400  Function: Get/Set teh Bio::Seq::HandlerBaseI object
 401  Returns : Bio::Seq::HandlerBaseI
 402  Args    : Bio::Seq::HandlerBaseI
 403
 404 =cut
 405
 406 sub seqhandler {
 407     my ($self, $handler) = @_;
 408     if ($handler) {
 409         $self->throw("Not a Bio::HandlerBaseI") unless
 410         ref($handler) && $handler->isa("Bio::HandlerBaseI");
 411         $self->{'_seqhandler'} = $handler;
 412     }
 413     return $self->{'_seqhandler'};
 414 }
 415
 416 #=head2 _process_features
 417 #
 418 # Title   : _process_features
 419 # Usage   : $self->_process_features($seqdata)
 420 # Function: Process feature data chunk into usable bits
 421 # Returns :
 422 # Args    : data chunk
 423 #
 424 #=cut
 425
 426 sub _process_features {
 427     my ($self, $seqdata) = @_;
 428     my @ftlines = split m{\n}, $seqdata->{DATA};
 429     delete $seqdata->{DATA};
 430     # don't deal with balancing quotes for now; just get rid of them...
 431     # Should we worry about checking whether these are balanced
 432     # for round-tripping tests?
 433     map { s{"}{}g } @ftlines;
 434     # all sfs start with the location...
 435     my $qual = 'LOCATION';
 436     my $ct = 0;
 437     for my $qualdata (@ftlines) {
 438         if ($qualdata =~ m{^/([^=]+)=?(.+)?}) {
 439             ($qual, $qualdata) = ($1, $2);
 440             $qualdata ||= ''; # for those qualifiers that have no data, like 'pseudo'
 441             $ct = (exists $seqdata->{$qual}) ?
 442                   ((ref($seqdata->{$qual}))  ? scalar(@{ $seqdata->{$qual} }) : 1)
 443                   : 0 ;
 444         }
 445         my $delim = ($qual eq 'translation' || exists $FTQUAL_NO_QUOTE{$qual}) ?
 446             '' : ' ';
 447         # if more than one, turn into an array ref and append
 448         if ($ct == 0) {
 449             (exists $seqdata->{$qual}) ? ($seqdata->{$qual}.= $delim.$qualdata || '') :
 450                                          ($seqdata->{$qual} .= $qualdata || '');
 451         } else {
 452             if (!ref($seqdata->{$qual})) {
 453                 $seqdata->{$qual} = [$seqdata->{$qual}];
 454             }
 455             (exists $seqdata->{$qual}->[$ct]) ? (($seqdata->{$qual}->[$ct]) .= $delim.$qualdata) :
 456                                              (($seqdata->{$qual}->[$ct]) .= $qualdata);
 457         }
 458     }
 459 }
 460
 461 1;
 462
 463 __END__
 464