sync w/ main trunk
[bioperl-live.git] / Bio / Index / Stockholm.pm
blob47622ce6cedcb17935628ebfa2d5ae62df4866ac
1 # $Id$
3 # BioPerl module for Bio::Index::Stockholm
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Chris Fields <cjfields@uiuc.edu>
9 # Copyright Chris Fields
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Index::Stockholm - Indexes Stockholm format alignments (such as those from
18 Pfam and Rfam). Retrieves raw stream data using the ID or a Bio::SimpleAlign
19 (via Bio::AlignIO)
21 =head1 SYNOPSIS
23 use strict;
24 use Bio::Index::Stockholm;
25 my ($indexfile,$file1,$file2,$query);
26 my $index = Bio::Index::Stockholm->new(-filename => $indexfile,
27 -write_flag => 1);
28 $index->make_index($file1,$file2);
30 # get raw data stream starting at alignment position
31 my $fh = $index->get_stream($query);
33 # fetch individual alignment
34 my $align = $index->fetch_aln($query); # alias for fetch_report
35 my $align = $index->fetch_report($query); # same as above
36 print "query is ", $align->display_id, "\n";
38 =head1 DESCRIPTION
40 This object allows one to build an index for any file (or files)
41 containing Stockholm alignment format (such as Rfam and Pfam) and provides
42 quick access to the alignment based on the alignment ID.
44 This also allows for ID parsing using a callback:
46 $inx->id_parser(\&get_id);
47 # make the index
48 $inx->make_index($file_name);
50 # here is where the retrieval key is specified
51 sub get_id {
52 my $line = shift;
53 $line =~ /^(\S+?)(?:\.\d+)?\s*$/; # $line is the text after '#=GF AC'; drop any version suffix
54 $1;
57 The indexer is capable of indexing based on multiple IDs passed back from the
58 callback; this is assuming of course all IDs are unique. The default is to use
59 the alignment ID provided for Rfam/Pfam output.
61 Note: for best results 'use strict'.
63 =head1 TODO
65 - allow using an alternative regex for indexing (for instance, the ID instead of AC)
67 =head1 FEEDBACK
69 =head2 Mailing Lists
71 User feedback is an integral part of the evolution of this and other
72 Bioperl modules. Send your comments and suggestions preferably to
73 the Bioperl mailing list. Your participation is much appreciated.
75 bioperl-l@bioperl.org - General discussion
76 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
78 =head2 Support
80 Please direct usage questions or support issues to the mailing list:
82 L<bioperl-l@bioperl.org>
84 rather than to the module maintainer directly. Many experienced and
85 responsive experts will be able to look at the problem and quickly
86 address it. Please include a thorough description of the problem
87 with code and data examples if at all possible.
89 =head2 Reporting Bugs
91 Report bugs to the Bioperl bug tracking system to help us keep track
92 of the bugs and their resolution. Bug reports can be submitted via the
93 web:
95 http://bugzilla.open-bio.org/
97 =head1 AUTHOR - Chris Fields
99 Email cjfields-at-bioperl-dot-org
101 =head1 APPENDIX
103 The rest of the documentation details each of the object methods.
104 Internal methods are usually preceded with a _
106 =cut
108 # Let the code begin...
110 package Bio::Index::Stockholm;
111 use strict;
112 use Bio::AlignIO;
114 use base qw(Bio::Index::Abstract Bio::Root::Root);
# Returns the version string used to identify which module version built
# an index: the current value of $Bio::Root::Version::VERSION.
sub _version {
    return $Bio::Root::Version::VERSION;
}
120 =head2 new
122 Usage : $index = Bio::Index::Stockholm->new(
123 -filename => $dbm_file,
124 -write_flag => 0,
125 -dbm_package => 'DB_File',
126 -verbose => 0);
128 Function: Returns a new index object. If filename is
129 specified, then open_dbm() is immediately called.
130 Bio::Index::Stockholm->new() will usually be called
131 directly only when opening an existing index.
132 Returns : A new index object
133 Args : -filename The name of the dbm index file.
134 -write_flag TRUE if write access to the dbm file is
135 needed.
136 -dbm_package The Perl dbm module to use for the
137 index.
138 -verbose Print debugging output to STDERR if
139 TRUE.
141 =cut
# Constructor.  Every argument is forwarded unchanged to the parent class
# constructor; the object it builds is returned to the caller.
sub new {
    my $class = shift;

    # Scalar assignment keeps the parent constructor in scalar context.
    my $index = $class->SUPER::new(@_);

    return $index;
}
151 =head2 Bio::Index::Stockholm implemented methods
153 =cut
155 =head2 fetch_report
157 Title : fetch_report
158 Usage : my $align = $idx->fetch_report($id);
159 Function: Returns a Bio::SimpleAlign object
160 for a specific alignment
161 Returns : Bio::SimpleAlign
162 Args : valid id
164 =cut
# Fetches the alignment stored under $id.
#
# Args    : $id - key to look up, passed straight to get_stream()
# Returns : the result of next_aln() on a Bio::AlignIO 'stockholm' parser
#           reading from the handle that get_stream() returned
#
# NOTE(review): -noclose is presumably passed so the parser leaves the
# index's handle open when it goes out of scope - confirm against
# Bio::Root::IO.
sub fetch_report {
    my ($self, $id) = @_;

    # Scalar assignment: the handle is requested in scalar context.
    my $stream = $self->get_stream($id);

    return Bio::AlignIO->new(
        -noclose => 1,
        -format  => 'stockholm',
        -fh      => $stream,
    )->next_aln;
}
175 =head2 fetch_aln
177 Title : fetch_aln
178 Usage : my $align = $idx->fetch_aln($id);
179 Function: Returns a Bio::SimpleAlign object
180 for a specific alignment
181 Returns : Bio::SimpleAlign
182 Args : valid id
183 Note : alias for fetch_report
185 =cut
# Install fetch_aln as a second name for fetch_report: the glob assignment
# makes both symbols refer to the same code reference.
*fetch_aln = \&fetch_report;
189 =head2 Required methods from Bio::Index::Abstract
191 =cut
193 =head2 _index_file
195 Title : _index_file
196 Usage : $index->_index_file( $file_name, $i )
197 Function: Specialist function to index report file(s).
198 Is provided with a filename and an integer
199 by make_index in its SUPER class.
200 Example :
201 Returns :
202 Args :
204 =cut
# Scans one Stockholm file and adds an index record for every alignment ID.
#
# Args    : $file - name of the file to scan
#           $i    - index number of the file, stored with each record
# Returns : nothing
# Throws  : through $self->throw() when the file cannot be opened
#
# A "# STOCKHOLM" header line marks the start of an alignment, so its byte
# offset is remembered.  When a "#=GF AC" line is seen, the text following
# the tag is handed to the id_parser callback and one record
# (id => file number, offset of the enclosing alignment) is added for each
# ID the callback returns.
sub _index_file {
    my ($self, $file, $i) = @_;

    # Always read line by line, whatever the caller has done to $/.
    local $/ = "\n";

    open(my $fh, '<', $file) or $self->throw("cannot open file $file\n");

    my $indexpoint = 0;    # byte offset of the most recent alignment header

    # A lexical line variable avoids clobbering the caller's global $_.
    while (my $line = <$fh>) {
        if ($line =~ m{^#\sSTOCKHOLM}) {
            # tell() is already past this line; step back to its first byte.
            $indexpoint = tell($fh) - length $line;
            $self->debug("Index:$indexpoint\n");
        }
        if ($line =~ m{^#=GF\s+AC\s+(\S[^\n]+)}) {
            # Copy the capture: $1 would be passed by alias and could change
            # under the callback if it runs a regex of its own.
            my $accession_text = $1;
            foreach my $id ($self->id_parser()->($accession_text)) {
                # A callback may hand back nothing usable; skip such keys.
                next unless defined $id && length $id;
                $self->debug("id is $id, begin is $indexpoint\n");
                $self->add_record($id, $i, $indexpoint);
            }
        }
    }

    close $fh;
    return;
}
233 # shamelessly stolen from Bio::Index::Fasta
235 =head2 id_parser
237 Title : id_parser
238 Usage : $index->id_parser( CODE )
239 Function: Stores or returns the code used by _index_file to
240 parse the ID for a record from a string. Useful
241 for (for instance) specifying a different
242 parser for different flavours of IDs (for instance,
243 custom Stockholm-formatted files).
244 Returns \&default_id_parser (see below) if not
245 set. If you supply your own id_parser
246 subroutine, then it should expect the text that
247 follows '#=GF AC' on an accession line. An entry will be added to
248 the index for each string in the list returned.
249 Example : $index->id_parser( \&my_id_parser )
250 Returns : ref to CODE if called without arguments
251 Args : CODE
253 =cut
# Get/set accessor for the ID-parsing callback.
#
# Args    : $code - optional; when true it is stored as the new callback
# Returns : the stored callback, or a reference to default_id_parser when
#           nothing (or only a false value) has been stored
sub id_parser {
    my $self = shift;
    my $code = shift;

    $self->{'_id_parser'} = $code if $code;

    my $stored = $self->{'_id_parser'};
    return $stored ? $stored : \&default_id_parser;
}
264 =head2 default_id_parser
266 Title : default_id_parser
267 Usage : $id = default_id_parser( $header )
268 Function: The default ID parser for Bio::Index::Stockholm.
269 Returns $1 from applying the regexp /^\s*(\S+)/
270 to $header.
271 Returns : ID string
272 Args : a header line string
274 =cut
# Default ID callback: picks the first whitespace-delimited token.
#
# Args    : a string (leading whitespace is ignored)
# Returns : the first run of non-whitespace characters, or nothing
#           (empty list / undef) when the string contains none
sub default_id_parser {
    my ($header) = @_;

    return unless $header =~ /^\s*(\S+)/;
    return $1;
}
285 =head2 Bio::Index::Abstract methods
287 =cut
289 =head2 filename
291 Title : filename
292 Usage : $value = $self->filename();
293 $self->filename($value);
294 Function: Gets or sets the name of the dbm index file.
295 Returns : The current value of filename
296 Args : Value of filename if setting, or none if
297 getting the value.
299 =head2 write_flag
301 Title : write_flag
302 Usage : $value = $self->write_flag();
303 $self->write_flag($value);
304 Function: Gets or sets the value of write_flag, which
305 is whether the dbm file should be opened with
306 write access.
307 Returns : The current value of write_flag (default 0)
308 Args : Value of write_flag if setting, or none if
309 getting the value.
311 =head2 dbm_package
313 Usage : $value = $self->dbm_package();
314 $self->dbm_package($value);
316 Function: Gets or sets the name of the Perl dbm module used.
317 If the value is unset, then it returns the value of
318 the package variable $USE_DBM_TYPE or if that is
319 unset, then it chooses the best available dbm type,
320 choosing 'DB_File' in preference to 'SDBM_File'.
321 Bio::Index::Abstract may work with other dbm file
322 types.
324 Returns : The current value of dbm_package
325 Args : Value of dbm_package if setting, or none if
326 getting the value.
329 =head2 get_stream
331 Title : get_stream
332 Usage : $stream = $index->get_stream( $id );
333 Function: Returns a file handle with the file pointer
334 at the appropriate place
336 This provides for a way to get the actual
337 file contents and not an object
339 WARNING: you must parse the record delimiter
340 *yourself*. Abstract won't do this for you.
341 So this code
343 $fh = $index->get_stream($myid);
344 while( <$fh> ) {
345 # do something
347 will parse the entire file if you do not put
348 a last statement in, like
350 while( <$fh> ) {
351 /^\/\// && last; # end of record
352 # do something
355 Returns : A filehandle object
356 Args : string represents the accession number
357 Notes : This method should not be used without forethought
360 =head2 open_dbm
362 Usage : $index->open_dbm()
363 Function: Opens the dbm file associated with the index
364 object. Write access is only given if explicitly
365 asked for by calling new(-write_flag => 1) or having set
366 the write_flag(1) on the index object. The type of
367 dbm file opened is that returned by dbm_package().
368 The name of the file to be opened is obtained by
369 calling the filename() method.
371 Example : $index->open_dbm()
372 Returns : 1 on success
375 =head2 _version
377 Title : _version
378 Usage : $type = $index->_version()
379 Function: Returns a string which identifies the version of an
380 index module. Used to permanently identify an index
381 file as having been created by a particular version
382 of the index module. Must be provided by the sub class
383 Example :
384 Returns :
385 Args : none
387 =head2 _filename
389 Title : _filename
390 Usage : $index->_filename( FILE INT )
391 Function: Indexes the file
392 Example :
393 Returns :
394 Args :
396 =head2 _file_handle
398 Title : _file_handle
399 Usage : $fh = $index->_file_handle( INT )
400 Function: Returns an open filehandle for the file
401 index INT. On opening a new filehandle it
402 caches it in the @{$index->_filehandle} array.
403 If the requested filehandle is already open,
404 it simply returns it from the array.
405 Example : $first_file_indexed = $index->_file_handle( 0 );
406 Returns : ref to a filehandle
407 Args : INT
409 =head2 _file_count
411 Title : _file_count
412 Usage : $index->_file_count( INT )
413 Function: Used by the index building sub in a sub class to
414 track the number of files indexed. Sets or gets
415 the number of files indexed when called with or
416 without an argument.
417 Example :
418 Returns : INT
419 Args : INT
422 =head2 add_record
424 Title : add_record
425 Usage : $index->add_record( $id, @stuff );
426 Function: Calls pack_record on @stuff, and adds the result
427 of pack_record to the index database under key $id.
428 If $id is a reference to an array, then a new entry
429 is added under a key corresponding to each element
430 of the array.
431 Example : $index->add_record( $id, $fileNumber, $begin, $end )
432 Returns : TRUE on success or FALSE on failure
433 Args : ID LIST
435 =head2 pack_record
437 Title : pack_record
438 Usage : $packed_string = $index->pack_record( LIST )
439 Function: Packs an array of scalars into a single string
440 joined by ASCII 034 (which is unlikely to be used
441 in any of the strings), and returns it.
442 Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
443 Returns : STRING or undef
444 Args : LIST
446 =head2 unpack_record
448 Title : unpack_record
449 Usage : $index->unpack_record( STRING )
450 Function: Splits the string provided into an array,
451 splitting on ASCII 034.
452 Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
453 Returns : A 3 element ARRAY
454 Args : STRING containing ASCII 034
456 =head2 DESTROY
458 Title : DESTROY
459 Usage : Called automatically when index goes out of scope
460 Function: Closes connection to database and handles to
461 sequence files
462 Returns : NEVER
463 Args : NONE
466 =cut