Bio/SearchIO/hmmer_pull.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::SearchIO::hmmer_pull
   4 #
   5 # Cared for by Sendu Bala <bix@sendu.me.uk>
   6 #
   7 # Copyright Sendu Bala
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::SearchIO::hmmer_pull - A parser for HMMER output
  16
  17 =head1 SYNOPSIS
  18
  19     # do not use this class directly; it is available through Bio::SearchIO
  20     use Bio::SearchIO;
  21     my $in = Bio::SearchIO->new(-format => 'hmmer_pull',
  22                                -file   => 't/data/hmmpfam.bigout');
  23     while (my $result = $in->next_result) {
  24         # this is a Bio::Search::Result::HmmpfamResult object
  25         print $result->query_name(), " for HMM ", $result->hmm_name(), "\n";
  26         while (my $hit = $result->next_hit) {
  27             print $hit->name(), "\n";
  28             while (my $hsp = $hit->next_hsp) {
  29                 print "length is ", $hsp->length(), "\n";
  30             }
  31         }
  32     }
  33
  34 =head1 DESCRIPTION
  35
  36 This object implements a pull-parser for HMMER output. It is fast since it
  37 only does work on request (hence 'pull').
  38
  39 =head1 FEEDBACK
  40
  41 =head2 Mailing Lists
  42
  43 User feedback is an integral part of the evolution of this and other
  44 Bioperl modules. Send your comments and suggestions preferably to
  45 the Bioperl mailing list.  Your participation is much appreciated.
  46
  47   bioperl-l@bioperl.org                  - General discussion
  48   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  49
  50 =head2 Reporting Bugs
  51
  52 Report bugs to the Bioperl bug tracking system to help us keep track
  53 of the bugs and their resolution. Bug reports can be submitted via the
  54 web:
  55
  56   http://bugzilla.open-bio.org/
  57
  58 =head1 AUTHOR - Sendu Bala
  59
  60 Email bix@sendu.me.uk
  61
  62 =head1 APPENDIX
  63
  64 The rest of the documentation details each of the object methods.
  65 Internal methods are usually preceded with a _
  66
  67 =cut
  68
  69 # Let the code begin...
  70
  71 package Bio::SearchIO::hmmer_pull;
  72
  73 use strict;
  74
  75
  76 use base qw(Bio::SearchIO Bio::PullParserI);
  77
  78 =head2 new
  79
  80  Title   : new
  81  Usage   : my $obj = Bio::SearchIO::hmmer_pull->new();
  82  Function: Builds a new Bio::SearchIO::hmmer_pull object
  83  Returns : Bio::SearchIO::hmmer_pull
  84  Args    : -fh/-file => HMMER output filehandle/filename
  85            -format   => 'hmmer_pull'
  86            -evalue   => float or scientific notation number to be used
  87                         as an evalue cutoff for hits
  88            -score    => integer or scientific notation number to be used
  89                         as a score value cutoff for hits
  90            -hsps     => integer minimum number of hsps (domains) a hit must have
  91            -piped_behaviour => 'temp_file'|'memory'|'sequential_read'
  92
  93            -piped_behaviour defines what the parser should do if the input is
  94             an unseekable filehandle (eg. piped input), see
  95             Bio::PullParserI::chunk for details. Default is 'sequential_read'.
  96
  97 =cut
  98
  # Set up the parser's state from the constructor arguments.
  #
  # The normal Bio::SearchIO initialization is deliberately not performed.
  #
  # Named arguments (extracted with _rearrange):
  #   -writer           passed on to writer() when true
  #   -file / -fh       the input; one of them is required
  #   -piped_behaviour  handed to chunk(); defaults to 'sequential_read'
  #   -evalue, -score, -hsps
  #                     cutoffs, stored in the matching *_cutoff fields
  #
  # Throws "-file or -fh must be supplied" when no input is given.
  sub _initialize {
      my ($self, @args) = @_;

      my ($writer, $file, $fh, $piped_behaviour, $evalue, $score, $hsps) =
          $self->_rearrange([qw(WRITER
                                FILE FH
                                PIPED_BEHAVIOUR
                                EVALUE
                                SCORE
                                HSPS)], @args);
      $self->writer($writer) if $writer;

      # The complete set of fields known to this parser. Several of them are
      # filled in later by the _discover_* methods; the cutoffs start out
      # with the '[unset]' placeholder.
      $self->_fields( { header              => undef,
                        algorithm           => undef,
                        algorithm_version   => undef,
                        algorithm_reference => '',
                        hmm_file            => undef,
                        hmm_name            => undef,
                        sequence_file       => undef,
                        sequence_database   => undef,
                        database_name       => undef,
                        database_letters    => undef,
                        database_entries    => undef,
                        next_result         => undef,
                        evalue_cutoff       => '[unset]',
                        score_cutoff        => '[unset]',
                        hsps_cutoff         => '[unset]' } );

      # Record every cutoff the caller actually supplied. The check is for a
      # defined, non-empty value rather than a true one, so that an explicit
      # cutoff of 0 (e.g. -score => 0) is stored instead of being dropped.
      # NOTE(review): confirm the result/hit classes that read these fields
      # treat a stored 0 as a real cutoff.
      my @cutoffs = ( [ evalue_cutoff => $evalue ],
                      [ score_cutoff  => $score  ],
                      [ hsps_cutoff   => $hsps   ] );
      for my $cutoff (@cutoffs) {
          my ($field, $value) = @{$cutoff};
          next unless defined $value && length $value;
          $self->_fields->{$field} = $value;
      }

      # Each of these fields is set by _discover_header, so they are declared
      # as depending on 'header'.
      $self->_dependencies( { algorithm         => 'header',
                              algorithm_version => 'header',
                              hmm_file          => 'header',
                              hmm_name          => 'header',
                              sequence_file     => 'header',
                              sequence_database => 'header' } );

      $self->chunk($file || $fh || $self->throw("-file or -fh must be supplied"),
                   -piped_behaviour => $piped_behaviour || 'sequential_read');
  }
 143
 # Read the report header and fill in the fields derived from it:
 # algorithm, algorithm_version, hmm_file/hmm_name and
 # sequence_file/sequence_database. Also records, in _after_header, the
 # position reported by _chunk_tell once the header has been read.
 sub _discover_header {
     my ($self) = @_;

     # Go back to the start and fetch the header; _get_chunk_by_nol is asked
     # for 8 (presumably a number of lines - confirm in Bio::PullParserI).
     $self->_chunk_seek(0);
     my $header_text = $self->_get_chunk_by_nol(8);
     $self->{_after_header} = $self->_chunk_tell;

     my $fields = $self->_fields;

     # Program name, e.g. the "hmmpfam" of "hmmpfam - search ...", upper-cased
     my ($program) = $header_text =~ /^(hmm\S+) - search/m;
     $fields->{algorithm} = uc $program;

     my ($version) = $header_text =~ /^HMMER\s+?(\S+)/m;
     $fields->{algorithm_version} = $version;

     # The last whitespace-free token of the "HMM file:" line serves as both
     # the hmm_file and the hmm_name
     my ($hmm) = $header_text =~ /^HMM file:\s.+?(\S+)$/m;
     $fields->{hmm_file} = $hmm;
     $fields->{hmm_name} = $fields->{hmm_file};

     # Likewise for the "Sequence file:" / "Sequence database:" line
     my ($sequences) = $header_text =~ /^Sequence (?:file|database):\s.+?(\S+)$/m;
     $fields->{sequence_file}     = $sequences;
     $fields->{sequence_database} = $fields->{sequence_file};

     # Flag the header itself as discovered
     $fields->{header} = 1;
 }
 163
 # Set the database_name field from whichever file played the role of the
 # database: the HMM file for HMMPFAM reports, the sequence file for
 # HMMSEARCH reports. Any other algorithm leaves the field untouched.
 sub _discover_database_name {
     my ($self) = @_;

     # which field holds the database, keyed by algorithm
     my %source_field_for = ( HMMPFAM   => 'hmm_file',
                              HMMSEARCH => 'sequence_file' );

     my $algorithm    = $self->get_field('algorithm');
     my $source_field = $source_field_for{$algorithm};

     if ($source_field) {
         $self->_fields->{database_name} = $self->get_field($source_field);
     }
 }
 175
 # Find the next result in the report and store an object for it in the
 # next_result field. Only HMMPFAM reports are handled: a HMMSEARCH report
 # or an unrecognised algorithm is reported through throw(). Returns early,
 # leaving next_result alone, when no further result chunk is found.
 sub _discover_next_result {
     my ($self) = @_;

     # NB: asking for the algorithm also sets _after_header if not set
     my $algorithm = $self->get_field('algorithm');

     if ($algorithm eq 'HMMSEARCH') {
         $self->throw("Can't handle hmmsearch yet\n");
     }
     elsif ($algorithm ne 'HMMPFAM') {
         $self->throw("Unknown report type");
     }
     else {
         use Bio::Search::Result::HmmpfamResult;

         # Each result is taken to end with a "//" line
         if ($self->_sequential) {
             # These chunks are deliberately not cached, which means rewind
             # won't work; if we cached them we may as well have used the
             # 'memory' option to -piped_behaviour.
             my $text = $self->_get_chunk_by_end("//\n");
             return unless $text;

             $self->_fields->{next_result} =
                 Bio::Search::Result::HmmpfamResult->new(-chunk  => [$text],
                                                          -parent => $self);
         }
         else {
             # Carry on from where the previous result stopped, or from just
             # after the header when this is the first result.
             my $resume_at = $self->{_end_of_previous_result} || $self->{_after_header};
             $self->_chunk_seek($resume_at);

             my ($from, $to) = $self->_find_chunk_by_end("//\n");
             return if $from == $to;

             # The result object is handed the chunk plus the start and end
             # just found, rather than the text itself.
             my @location = ($self->chunk, $from, $to);
             $self->_fields->{next_result} =
                 Bio::Search::Result::HmmpfamResult->new(-chunk  => [@location],
                                                          -parent => $self);

             $self->{_end_of_previous_result} = $to;
         }
     }
 }
 210
 211 =head2 next_result
 212
 213  Title   : next_result
 214  Usage   : my $result = $searchio->next_result;
 215  Function: Returns the next Result from a search
 216  Returns : Bio::Search::Result::ResultI object
 217  Args    : none
 218
 219 =cut
 220
 # Hand out the result held in the next_result field, or return nothing
 # when there is none. A successful call empties the field and adds one to
 # the running result count.
 sub next_result {
     my ($self) = @_;

     my $result = $self->get_field('next_result');
     return unless $result;

     # Forget the result being handed out (presumably so that the next
     # get_field call goes looking for a new one).
     $self->_fields->{next_result} = undef;

     $self->{'_result_count'}++;

     return $result;
 }
 230
 231 =head2 result_count
 232
 233  Title   : result_count
 234  Usage   : my $count = $searchio->result_count
 235  Function: Returns the number of results we have processed.
 236  Returns : integer
 237  Args    : none
 238
 239 =cut
 240
 # Return how many results next_result() has handed out so far.
 # The counter is only created by the first successful next_result() call,
 # so fall back to 0 to always return an integer, as documented.
 sub result_count {
     my $self = shift;
     return $self->{'_result_count'} || 0;
 }
 245
 246 =head2 rewind
 247
 248  Title   : rewind
 249  Usage   : $searchio->rewind;
 250  Function: Allow one to reset the Result iterator to the beginning, so that
 251            next_result() will subsequently return the first result and so on.
 252
 253            NB: result objects are not cached, so you will get new result objects
 254            each time you rewind. Also, note that result_count() counts the
 255            number of times you have called next_result(), so it will not be able
 256            to tell you how many results there were in the file if you use rewind().
 257
 258  Returns : n/a
 259  Args    : none
 260
 261 =cut
 262
 # Reset the result iterator by dropping the remembered end of the previous
 # result. A warning is issued in sequential mode, where this has no effect;
 # the remembered position is dropped either way.
 sub rewind {
     my ($self) = @_;

     $self->warn("rewind has no effect on piped input when you have chosen 'sequential_read' mode")
         if $self->_sequential;

     delete $self->{_end_of_previous_result};
 }
 270
 271 1;