Bio/Index/EMBL.pm

   1 #
   2 # BioPerl module for Bio::Index::EMBL
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
   7 #
   8 # You may distribute this module under the same terms as perl itself
   9
  10 # POD documentation - main docs before the code
  11
  12 =head1 NAME
  13
  14 Bio::Index::EMBL - Interface for indexing (multiple) EMBL/Swissprot
  15 .dat files (i.e. flat file EMBL/Swissprot format).
  16
  17 =head1 SYNOPSIS
  18
  19     # Complete code for making an index for several
  20     # EMBL files
  21     use Bio::Index::EMBL;
  22     use strict;
  23
  24     my $Index_File_Name = shift;
  25     my $inx = Bio::Index::EMBL->new(-filename => $Index_File_Name,
  26                                     -write_flag => 'WRITE');
  27     $inx->make_index(@ARGV);
  28
  29     # Print out several sequences present in the index
  30     # in Fasta format
  31     use Bio::Index::EMBL;
  32     use strict;
  33
  34     my $Index_File_Name = shift;
  35     my $inx = Bio::Index::EMBL->new(-filename => $Index_File_Name);
  36     my $out = Bio::SeqIO->new(-format => 'Fasta',-fh => \*STDOUT);
  37
  38     foreach my $id (@ARGV) {
  39         my $seq = $inx->fetch($id); # Returns Bio::Seq object
  40         $out->write_seq($seq);
  41     }
  42
  43     # alternatively
  44     my ($id, $acc);
  45     my $seq1 = $inx->get_Seq_by_id($id);
  46     my $seq2 = $inx->get_Seq_by_acc($acc);
  47
  48 =head1 DESCRIPTION
  49
  50 Inherits functions for managing dbm files from Bio::Index::Abstract.pm,
  51 and provides the basic functionality for indexing EMBL files, and
  52 retrieving the sequence from them. Heavily snaffled from James Gilbert
  53 and his Fasta system. Note: for best results 'use strict'.
  54
  55 The keys are the identifiers in the ID and AC lines.
  56
  57 =head1 FEEDBACK
  58
  59 =head2 Mailing Lists
  60
  61 User feedback is an integral part of the evolution of this and other
  62 Bioperl modules. Send your comments and suggestions preferably to one
  63 of the Bioperl mailing lists.  Your participation is much appreciated.
  64
  65   bioperl-l@bioperl.org                  - General discussion
  66   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  67
  68
  69
  70 =head2 Support
  71
  72 Please direct usage questions or support issues to the mailing list:
  73
  74 I<bioperl-l@bioperl.org>
  75
  76 rather than to the module maintainer directly. Many experienced and
  77 responsive experts will be able to look at the problem and quickly
  78 address it. Please include a thorough description of the problem
  79 with code and data examples if at all possible.
  80
  81 =head2 Reporting Bugs
  82
  83 Report bugs to the Bioperl bug tracking system to help us keep track
  84 of the bugs and their resolution.  Bug reports can be submitted via the
  85 web:
  86
  87   https://redmine.open-bio.org/projects/bioperl/
  88
  89 =head1 AUTHOR - Ewan Birney
  90
  91 Email - birney@sanger.ac.uk
  92
  93 =head1 APPENDIX
  94
  95 The rest of the documentation details each of the object
  96 methods. Internal methods are usually preceded with a _
  97
  98 =cut
  99
 100
 101 # Let's begin the code...
 102
 103
 104 package Bio::Index::EMBL;
 105
 106 use strict;
 107 use Bio::Seq;
 108
 109 use base qw(Bio::Index::AbstractSeq);
 110
 # Returns the stamp string that says what kind of index this class builds.
 sub _type_stamp {
     my $stamp = '__EMBL_FLAT__';
     return $stamp;
 }
 114
 115
 # Returns the version number (0.1) of this index class.
 sub _version {
     my $index_version = 0.1;
     return $index_version;
 }
 119
 120 =head2 _index_file
 121
 122   Title   : _index_file
 123   Usage   : $index->_index_file( $file_name, $i )
 124   Function: Specialist function to index EMBL format files.
 125             Is provided with a filename and an integer
 126             by make_index in its SUPER class.
 127   Example :
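 128   Returns : 1 once the whole file has been read
 129   Args    : $file_name - name of the EMBL flat file; $i - index number of that file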
 130
 131 =cut
 132
 # Index one EMBL/Swissprot flat file.
 #
 # Reads $file line by line. An entry starts at its "ID" line and ends at the
 # "//" terminator line. When the terminator is reached, the offset of the
 # entry's ID line is stored under the entry's ID and under every accession
 # collected from its "AC" lines, via $self->add_record($key, $i, $offset).
 #
 # Args    : $file - name of the flat file to read
 #           $i    - index number of that file, handed on to add_record
 # Returns : 1 once the whole file has been read
 # Throws  : through $self->throw when the file cannot be opened, or when a
 #           "//" terminator is reached with no ID line seen for that entry
 sub _index_file {
     my( $self,
         $file, # File name
         $i     # Index-number of file being indexed
         ) = @_;

     open my $EMBL, '<', $file or $self->throw("Could not read file '$file': $!");

     # In Windows, text files have '\r\n' as line separator, but when reading in
     # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
     # the length of the string read is 4 although the line is 5 bytes in the file.
     # We assume that all lines have the same line separator and only probe the
     # first line to measure that difference.
     my $init_pos   = tell($EMBL);
     my $first_line = <$EMBL>;
     my $correction = 0;
     if( defined $first_line ) {    # undef for an empty file: nothing to measure
         $correction = tell($EMBL) - $init_pos - length $first_line;
     }
     seek $EMBL, $init_pos, 0; # Rewind position to proceed to read the file

     my $begin = 0;  # Offset from start of file of the current entry's ID line
     my $id;         # ID of the entry being read; undef until its ID line is seen
     my @accs;       # Accessions of the entry being read; also put into the index

     # Main indexing loop. A lexical line variable is used so the caller's $_
     # is left alone.
     while ( my $line = <$EMBL> ) {
         if( $line =~ m{^//} ) {
             if( ! defined $id ) {
                 $self->throw("Got to a end of entry line for an EMBL flat file with no parsed ID. Considering this a problem!");
             }
             if( ! @accs ) {
                 $self->warn("For id [$id] in embl flat file, got no accession number. Storing id index anyway");
             }

             $self->add_record($id, $i, $begin);

             foreach my $acc (@accs) {
                 if( $acc ne $id ) {
                     $self->add_record($acc, $i, $begin);
                 }
             }

             # Start the next entry from a clean slate. Without this, the
             # accessions (and ID) of earlier entries would be carried over
             # and stored again with a later entry's offset.
             @accs = ();
             $id   = undef;
         } elsif ( $line =~ /^ID\s+(\S+)/ ) {
             $id = $1;
             # The handle is now positioned just after the ID line, so step
             # back over the line (plus the per-line separator correction) to
             # get the offset of its first character. Assumes tell is in bytes.
             $begin = tell($EMBL) - length( $line ) - $correction;
         } elsif ( $line =~ /^AC\s+(.*)?/ ) {
             # An AC line holds one or more accessions separated by ';' and/or spaces.
             push @accs , split (/[; ]+/, $1);
         }
     }

     close $EMBL;
     return 1;
 }
 195
 196 =head2 _file_format
 197
 198  Title   : _file_format
 199  Usage   : Internal function for indexing system
 200  Function: Provides file format for this database
 201  Example :
 202  Returns : The string 'EMBL'
 203  Args    : None (any arguments passed are ignored)
 204
 205
 206 =cut
 207
 # Returns the name of the file format for this database, 'EMBL'.
 # Any arguments passed in are ignored.
 sub _file_format {
     my $self = shift;

     my $format_name = 'EMBL';
     return $format_name;
 }
 213
 214
 215
 216 1;