Bio/SearchIO/hmmer_pull.pm

   1 #
   2 # BioPerl module for Bio::SearchIO::hmmer_pull
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Sendu Bala <bix@sendu.me.uk>
   7 #
   8 # Copyright Sendu Bala
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SearchIO::hmmer_pull - A parser for HMMER output
  17
  18 =head1 SYNOPSIS
  19
  20     # do not use this class directly it is available through Bio::SearchIO
  21     use Bio::SearchIO;
  22     my $in = Bio::SearchIO->new(-format => 'hmmer_pull',
  23                                -file   => 't/data/hmmpfam.bigout');
  24     while (my $result = $in->next_result) {
  25         # this is a Bio::Search::Result::HmmpfamResult object
  26         print $result->query_name(), " for HMM ", $result->hmm_name(), "\n";
  27         while (my $hit = $result->next_hit) {
  28             print $hit->name(), "\n";
  29             while (my $hsp = $hit->next_hsp) {
  30                 print "length is ", $hsp->length(), "\n";
  31             }
  32         }
  33     }
  34
  35 =head1 DESCRIPTION
  36
  37 This object implements a pull-parser for HMMER output. It is fast since it
  38 only does work on request (hence 'pull').
  39
  40 =head1 FEEDBACK
  41
  42 =head2 Mailing Lists
  43
  44 User feedback is an integral part of the evolution of this and other
  45 Bioperl modules. Send your comments and suggestions preferably to
  46 the Bioperl mailing list.  Your participation is much appreciated.
  47
  48   bioperl-l@bioperl.org                  - General discussion
  49   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  50
  51 =head2 Support
  52
  53 Please direct usage questions or support issues to the mailing list:
  54
  55 I<bioperl-l@bioperl.org>
  56
  57 rather than to the module maintainer directly. Many experienced and
   58 responsive experts will be able to look at the problem and quickly
  59 address it. Please include a thorough description of the problem
  60 with code and data examples if at all possible.
  61
  62 =head2 Reporting Bugs
  63
  64 Report bugs to the Bioperl bug tracking system to help us keep track
  65 of the bugs and their resolution. Bug reports can be submitted via the
  66 web:
  67
  68   https://redmine.open-bio.org/projects/bioperl/
  69
  70 =head1 AUTHOR - Sendu Bala
  71
  72 Email bix@sendu.me.uk
  73
  74 =head1 APPENDIX
  75
  76 The rest of the documentation details each of the object methods.
  77 Internal methods are usually preceded with a _
  78
  79 =cut
  80
  81 # Let the code begin...
  82
  83 package Bio::SearchIO::hmmer_pull;
  84
  85 use strict;
  86
  87
  88 use base qw(Bio::SearchIO Bio::PullParserI);
  89
  90 =head2 new
  91
  92  Title   : new
  93  Usage   : my $obj = Bio::SearchIO::hmmer_pull->new();
  94  Function: Builds a new Bio::SearchIO::hmmer_pull object
  95  Returns : Bio::SearchIO::hmmer_pull
  96  Args    : -fh/-file => HMMER output filename
  97            -format   => 'hmmer_pull'
  98            -evalue   => float or scientific notation number to be used
  99                         as an evalue cutoff for hits
 100            -score    => integer or scientific notation number to be used
 101                         as a score value cutoff for hits
 102            -hsps     => integer minimum number of hsps (domains) a hit must have
 103            -piped_behaviour => 'temp_file'|'memory'|'sequential_read'
 104
 105            -piped_behaviour defines what the parser should do if the input is
 106             an unseekable filehandle (eg. piped input), see
 107             Bio::PullParserI::chunk for details. Default is 'sequential_read'.
 108
 109 =cut
 110
 111 sub _initialize {
 112     my ($self, @args) = @_;
 113
 114     # don't do normal SearchIO initialization
 115
 116     my ($writer, $file, $fh, $piped_behaviour, $evalue, $score, $hsps) =
 117                             $self->_rearrange([qw(WRITER
 118                                                   FILE FH
 119                                                   PIPED_BEHAVIOUR
 120                                                   EVALUE
 121                                                   SCORE
 122                                                   HSPS)], @args);
 123     $self->writer($writer) if $writer;
 124
 125     $self->_fields( { ( header => undef,
 126                         algorithm => undef,
 127                         algorithm_version => undef,
 128                         algorithm_reference => '',
 129                         hmm_file => undef,
 130                         hmm_name => undef,
 131                         sequence_file => undef,
 132                         sequence_database => undef,
 133                         database_name => undef,
 134                         database_letters => undef,
 135                         database_entries => undef,
 136                         next_result => undef,
 137                         evalue_cutoff => '[unset]',
 138                         score_cutoff => '[unset]',
 139                         hsps_cutoff => '[unset]' ) } );
 140
 141     $self->_fields->{evalue_cutoff} = $evalue if $evalue;
 142     $self->_fields->{score_cutoff} = $score if $score;
 143     $self->_fields->{hsps_cutoff} = $hsps if $hsps;
 144
 145     $self->_dependencies( { ( algorithm => 'header',
 146                               algorithm_version => 'header',
 147                               hmm_file => 'header',
 148                               hmm_name => 'header',
 149                               sequence_file => 'header',
 150                               sequence_database => 'header' ) } );
 151
 152     $self->chunk($file || $fh || $self->throw("-file or -fh must be supplied"),
 153                  -piped_behaviour => $piped_behaviour || 'sequential_read');
 154 }
 155
 156 sub _discover_header {
 157     my $self = shift;
 158     $self->_chunk_seek(0);
 159     my $header = $self->_get_chunk_by_nol(8);
 160     $self->{_after_header} = $self->_chunk_tell;
 161
 162     my ($algo) = $header =~ /^(hmm\S+) - search/m;
 163     $self->_fields->{algorithm} = uc $algo;
 164
 165     ($self->_fields->{algorithm_version}) = $header =~ /^HMMER\s+?(\S+)/m;
 166
 167     ($self->_fields->{hmm_file}) = $header =~ /^HMM file:\s.+?(\S+)$/m;
 168     $self->_fields->{hmm_name} = $self->_fields->{hmm_file};
 169
 170     ($self->_fields->{sequence_file}) = $header =~ /^Sequence (?:file|database):\s.+?(\S+)$/m;
 171     $self->_fields->{sequence_database} = $self->_fields->{sequence_file};
 172
 173     $self->_fields->{header} = 1;
 174 }
 175
 176 sub _discover_database_name {
 177     my $self = shift;
 178     my $type = $self->get_field('algorithm');
 179
 180     if ($type eq 'HMMPFAM') {
 181         $self->_fields->{database_name} = $self->get_field('hmm_file');
 182     }
 183     elsif ($type eq 'HMMSEARCH') {
 184         $self->_fields->{database_name} = $self->get_field('sequence_file');
 185     }
 186 }
 187
 188 sub _discover_next_result {
 189     my $self = shift;
 190     my $type = $self->get_field('algorithm'); # also sets _after_header if not set
 191
 192     if ($type eq 'HMMPFAM') {
 193         use Bio::Search::Result::HmmpfamResult;
 194
 195         unless ($self->_sequential) {
 196             $self->_chunk_seek($self->{_end_of_previous_result} || $self->{_after_header});
 197
 198             my ($start, $end) = $self->_find_chunk_by_end("//\n");
 199             return if $start == $end;
 200             $self->_fields->{next_result} = Bio::Search::Result::HmmpfamResult->new(-chunk => [($self->chunk, $start, $end)],
 201                                                                                    -parent => $self);
 202
 203             $self->{_end_of_previous_result} = $end;
 204         }
 205         else {
 206             # deliberatly don't cache these, which means rewind won't work;
 207             # if we cached we may as well have used 'memory' option to
 208             # -piped_behaviour
 209             my $chunk = $self->_get_chunk_by_end("//\n");
 210             $chunk || return;
 211             $self->_fields->{next_result} = Bio::Search::Result::HmmpfamResult->new(-chunk => [$chunk],
 212                                                                                    -parent => $self);
 213         }
 214     }
 215     elsif ($type eq 'HMMSEARCH') {
 216         $self->throw("Can't handle hmmsearch yet\n");
 217     }
 218     else {
 219         $self->throw("Unknown report type");
 220     }
 221 }
 222
 223 =head2 next_result
 224
 225  Title   : next_result
  226  Usage   : my $result = $searchio->next_result;
 227  Function: Returns the next Result from a search
 228  Returns : Bio::Search::Result::ResultI object
 229  Args    : none
 230
 231 =cut
 232
 233 sub next_result {
 234     my $self = shift;
 235     my $result = $self->get_field('next_result') || return;
 236
 237     undef $self->_fields->{next_result};
 238
 239     $self->{'_result_count'}++;
 240     return $result;
 241 }
 242
 243 =head2 result_count
 244
 245  Title   : result_count
 246  Usage   : my $count = $searchio->result_count
 247  Function: Returns the number of results we have processed.
 248  Returns : integer
 249  Args    : none
 250
 251 =cut
 252
 253 sub result_count {
 254     my $self = shift;
 255     return $self->{'_result_count'};
 256 }
 257
 258 =head2 rewind
 259
 260  Title   : rewind
 261  Usage   : $searchio->rewind;
 262  Function: Allow one to reset the Result iterator to the beginning, so that
 263            next_result() will subsequently return the first result and so on.
 264
 265            NB: result objects are not cached, so you will get new result objects
 266            each time you rewind. Also, note that result_count() counts the
  267            number of times you have called next_result(), so will not be able
  268            to tell you how many results there were in the file if you use rewind().
 269
 270  Returns : n/a
 271  Args    : none
 272
 273 =cut
 274
 275 sub rewind {
 276         my $self = shift;
 277     if ($self->_sequential) {
 278         $self->warn("rewind has no effect on piped input when you have chosen 'sequential_read' mode");
 279     }
 280         delete $self->{_end_of_previous_result};
 281 }
 282
 283 1;