branch-1-6/Bio/Index/Fastq.pm

   1 #
   2 #
   3 # BioPerl module for Bio::Index::Fastq
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Tony Cox <avc@sanger.ac.uk>
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::Index::Fastq - Interface for indexing (multiple) fastq files
  16
  17 =head1 SYNOPSIS
  18
  19     # Complete code for making an index for several
  20     # fastq files
  21     use Bio::Index::Fastq;
  22     use strict;
  23
  24     my $Index_File_Name = shift;
  25     my $inx = Bio::Index::Fastq->new(
  26         '-filename' => $Index_File_Name,
  27         '-write_flag' => 1);
  28     $inx->make_index(@ARGV);
  29
  30     # Print out several sequences present in the index
  31     # in Fastq format
  32     use Bio::Index::Fastq;
  33     use strict;
  34
  35     my $Index_File_Name = shift;
  36     my $inx = Bio::Index::Fastq->new('-filename' => $Index_File_Name);
  37     my $out = Bio::SeqIO->new('-format' => 'Fastq','-fh' => \*STDOUT);
  38
  39     foreach my $id (@ARGV) {
  40         my $seq = $inx->fetch($id); # Returns Bio::Seq::Quality object
  41         $out->write_seq($seq);
  42     }
  43
  44     # or, alternatively
  45     my $id;
  46     my $seq = $inx->get_Seq_by_id($id); #identical to fetch
  47
  48 =head1 DESCRIPTION
  49
  50 Inherits functions for managing dbm files from Bio::Index::Abstract.pm,
   51 and provides the basic functionality for indexing fastq files, and
  52 retrieving the sequence from them. Note: for best results 'use strict'.
  53
  54 Bio::Index::Fastq supports the Bio::DB::BioSeqI interface, meaning
  55 it can be used as a Sequence database for other parts of bioperl
  56
   57 =head1 FEEDBACK
  58
  59 =head2 Mailing Lists
  60
  61 User feedback is an integral part of the evolution of this and other
  62 Bioperl modules. Send your comments and suggestions preferably to one
  63 of the Bioperl mailing lists.  Your participation is much appreciated.
  64
  65   bioperl-l@bioperl.org                  - General discussion
  66   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  67
  68 =head2 Support
  69
  70 Please direct usage questions or support issues to the mailing list:
  71
  72 I<bioperl-l@bioperl.org>
  73
  74 rather than to the module maintainer directly. Many experienced and
   75 responsive experts will be able to look at the problem and quickly
  76 address it. Please include a thorough description of the problem
  77 with code and data examples if at all possible.
  78
  79 =head2 Reporting Bugs
  80
  81 Report bugs to the Bioperl bug tracking system to help us keep track
   82 of the bugs and their resolution.  Bug reports can be submitted via the
  83 web:
  84
  85   http://bugzilla.open-bio.org/
  86
  87 =head1 AUTHOR - Tony Cox
  88
  89 Email - avc@sanger.ac.uk
  90
  91 =head1 APPENDIX
  92
  93 The rest of the documentation details each of the object
  94 methods. Internal methods are usually preceded with a _
  95
  96 =cut
  97
  98
  99 # Let the code begin...
 100
 101
 102 package Bio::Index::Fastq;
 103
 104 use strict;
 105
 106 use Bio::Seq;
 107
 108 use base qw(Bio::Index::AbstractSeq);
 109
 110 #
 111 # Suggested fix by Michael G Schwern <schwern@pobox.com> to
 112 # get around a clash with CPAN shell...
 113 #
 114
 115 sub _version {
 116     return 0.2;
 117 }
 118
 119 =head2 _file_format
 120
 121  Title   : _file_format
 122  Function: The file format for this package, which is needed
 123            by the SeqIO system when reading the sequence.
 124  Returns : 'Fastq'
 125
 126 =cut
 127
 128 sub _file_format {
 129     return 'Fastq';
 130 }
 131
 132
 133
 134 =head2 _index_file
 135
 136   Title   : _index_file
 137   Usage   : $index->_index_file( $file_name, $i )
 138   Function: Specialist function to index FASTQ format files.
 139             Is provided with a filename and an integer
 140             by make_index in its SUPER class.
 141   Example :
 142   Returns :
 143   Args    :
 144
 145 =cut
 146
 147 sub _index_file {
 148     my( $self,
 149         $file, # File name
 150         $i,    # Index-number of file being indexed
 151         ) = @_;
 152
 153     my( $begin,     # Offset from start of file of the start
 154                     # of the last found record.
 155         );
 156
 157     $begin = 0;
 158
 159     my $id_parser = $self->id_parser;
 160     my $c = 0;
 161     open my $FASTQ, '<', $file or $self->throw("Can't open file for read : $file");
 162     # Main indexing loop
 163     while (<$FASTQ>) {
 164         if (/^@/) {
 165             # $begin is the position of the first character after the '@'
 166             my $begin = tell($FASTQ) - length( $_ ) + 1;
 167             foreach my $id (&$id_parser($_)) {
 168                 $self->add_record($id, $i, $begin);
 169                 $c++;
 170             }
 171         }
 172     }
 173
 174     close $FASTQ;
 175     return ($c);
 176 }
 177
 178 =head2 id_parser
 179
 180   Title   : id_parser
 181   Usage   : $index->id_parser( CODE )
 182   Function: Stores or returns the code used by record_id to
 183             parse the ID for record from a string.  Useful
 184             for (for instance) specifying a different
 185             parser for different flavours of FASTQ file.
 186             Returns \&default_id_parser (see below) if not
 187             set. If you supply your own id_parser
 188             subroutine, then it should expect a fastq
 189             description line.  An entry will be added to
 190             the index for each string in the list returned.
 191   Example : $index->id_parser( \&my_id_parser )
 192   Returns : ref to CODE if called without arguments
 193   Args    : CODE
 194
 195 =cut
 196
 197 sub id_parser {
 198     my( $self, $code ) = @_;
 199
 200     if ($code) {
 201         $self->{'_id_parser'} = $code;
 202     }
 203     return $self->{'_id_parser'} || \&default_id_parser;
 204 }
 205
 206
 207
 208 =head2 default_id_parser
 209
 210   Title   : default_id_parser
 211   Usage   : $id = default_id_parser( $header )
 212   Function: The default Fastq ID parser for Fastq.pm
 213             Returns $1 from applying the regexp /^>\s*(\S+)/
 214             to $header.
 215   Returns : ID string
 216   Args    : a fastq header line string
 217
 218 =cut
 219
 220 sub default_id_parser {
 221     if ($_[0] =~ /^@\s*(\S+)/) {
 222         return $1;
 223     } else {
 224         return;
 225     }
 226 }
 227
 228 1;