sync w/ main trunk
[bioperl-live.git] / Bio / DB / Flat.pm
blobe09c9981c93d55d3226d28cb363b347be11f31ee
2 # $Id$
4 # BioPerl module for Bio::DB::Flat
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Lincoln Stein <lstein@cshl.org>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::Flat - Interface for indexed flat files
18 =head1 SYNOPSIS
20 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
21 -dbname => 'mydb',
22 -format => 'embl',
23 -index => 'bdb',
24 -write_flag => 1);
25 $db->build_index('/usr/share/embl/primate.embl',
26 '/usr/share/embl/protists.embl');
27 $seq = $db->get_Seq_by_id('BUM');
28 @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
29 $raw = $db->fetch_raw('BUM');
31 =head1 DESCRIPTION
33 This object provides the basic mechanism to associate positions in
34 files with primary and secondary name spaces. Unlike
35 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
36 to work with the "flat index" and BerkeleyDB indexed flat file formats
37 worked out at the 2002 BioHackathon.
39 This object is a general front end to the underlying databases.
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to one
47 of the Bioperl mailing lists. Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 =head2 Support
54 Please direct usage questions or support issues to the mailing list:
56 L<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
63 =head2 Reporting Bugs
65 Report bugs to the Bioperl bug tracking system to help us keep track
66 the bugs and their resolution. Bug reports can be submitted via the
67 web:
69 http://bugzilla.open-bio.org/
71 =head1 AUTHOR - Lincoln Stein
73 Email - lstein@cshl.org
75 =head1 APPENDIX
77 The rest of the documentation details each of the object methods. Internal
78 methods are usually preceded with an "_" (underscore).
80 =cut
83 # Let the code begin...
84 package Bio::DB::Flat;
86 use File::Spec;
88 use base qw(Bio::Root::Root Bio::DB::RandomAccessI);
90 use constant CONFIG_FILE_NAME => 'config.dat';
92 =head2 new
94 Title : new
95 Usage : my $db = Bio::DB::Flat->new(
96 -directory => $root_directory,
97 -dbname => 'mydb',
98 -write_flag => 1,
99 -index => 'bdb',
100 -verbose => 0,
101 -out => 'outputfile',
102 -format => 'genbank');
103 Function: create a new Bio::DB::Flat object
104 Returns : new Bio::DB::Flat object
105 Args : -directory Root directory containing "config.dat"
106 -write_flag If true, allows creation/updating.
107 -verbose Verbose messages
108 -out File to write to when write_seq invoked
109 -index 'bdb' or 'binarysearch'
110 Status : Public
112 The required -directory argument indicates where the flat file indexes
113 will be stored. The build_index() and write_seq() methods will
114 automatically create subdirectories of this root directory. Each
115 subdirectory will contain a human-readable configuration file named
116 "config.dat" that specifies where the individual indexes are stored.
118 The required -dbname argument gives a name to the database index. The
119 index files will actually be stored in a like-named subdirectory
120 underneath the root directory.
122 The -write_flag enables writing new entries into the database as well
123 as the creation of the indexes. By default the indexes will be opened
124 read only.
126 -index is one of "bdb" or "binarysearch" and indicates the type of
127 index to generate. "bdb" corresponds to Berkeley DB. You *must* be
128 using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB
129 extension installed (DB_File will *not* work). "binarysearch"
130 corresponds to the OBDA "flat" indexed file.
132 The -out argument specifies the output file for writing objects created
133 with write_seq().
135 The -format argument specifies the format of the input file or files. If
136 the file suffix is one that Bioperl can already associate with a format
137 then this is optional.
139 =cut
141 sub new {
142 my $class = shift;
143 $class = ref($class) if ref($class);
144 my $self = $class->SUPER::new(@_);
146 # first we initialize ourselves
147 my ($flat_directory,$dbname,$format) =
148 $self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_);
150 defined $flat_directory
151 or $self->throw('Please supply a -directory argument');
152 defined $dbname
153 or $self->throw('Please supply a -dbname argument');
155 # set values from configuration file
156 $self->directory($flat_directory);
157 $self->dbname($dbname);
159 $self->throw("Base directory $flat_directory doesn't exist")
160 unless -e $flat_directory;
161 $self->throw("$flat_directory isn't a directory")
162 unless -d _;
163 my $dbpath = File::Spec->catfile($flat_directory,$dbname);
164 unless (-d $dbpath) {
165 $self->debug("creating db directory $dbpath\n");
166 mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!");
168 $self->_read_config();
170 # but override with initialization values
171 $self->_initialize(@_);
173 $self->throw('you must specify an indexing scheme')
174 unless $self->indexing_scheme;
176 # now we figure out what subclass to instantiate
177 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB'
178 :$self->indexing_scheme eq 'flat/1' ? 'Binary'
179 :$self->throw("unknown indexing scheme: " .
180 $self->indexing_scheme);
181 $format = $self->file_format;
183 # because Michele and Lincoln did it differently
184 # Michele's way is via a standalone concrete class
185 if ($index_type eq 'Binary') {
186 my $child_class = 'Bio::DB::Flat::BinarySearch';
187 eval "use $child_class";
188 $self->throw($@) if $@;
189 push @_, ('-format', $format);
190 return $child_class->new(@_);
193 # Lincoln uses Bio::SeqIO style delegation.
194 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
195 eval "use $child_class";
196 $self->throw($@) if $@;
198 # rebless & reinitialize with the new class
199 # (this prevents subclasses from forgetting to call our own initialization)
200 bless $self,$child_class;
201 $self->_initialize(@_);
202 $self->_set_namespaces(@_);
204 $self;
207 sub _initialize {
208 my $self = shift;
210 my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
211 = $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_);
213 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
215 if (defined $flat_indexing) {
216 # very permissive
217 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/;
218 $flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat|binary)/;
219 $self->indexing_scheme($flat_indexing);
222 $self->verbose($flat_verbose) if defined $flat_verbose;
223 $self->dbname($dbname) if defined $dbname;
224 $self->out_file($flat_outfile) if defined $flat_outfile;
225 $self->file_format($flat_format) if defined $flat_format;
228 sub _set_namespaces {
229 my $self = shift;
231 $self->primary_namespace($self->default_primary_namespace)
232 unless defined $self->{flat_primary_namespace};
234 $self->secondary_namespaces($self->default_secondary_namespaces)
235 unless defined $self->{flat_secondary_namespaces};
237 $self->file_format($self->default_file_format)
238 unless defined $self->{flat_format};
241 =head2 new_from_registry
243 Title : new_from_registry
244 Usage : $db = Bio::DB::Flat->new_from_registry(%config)
245 Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry-
246 compatible fashion
247 Returns : new Bio::DB::Flat
248 Args : provided by the registry, see below
249 Status : Public
251 The following registry-configuration tags are recognized:
253 location Root of the indexed flat file; corresponds to the new() method's
254 -directory argument.
256 =cut
258 sub new_from_registry {
259 my ($self,%config) = @_;
260 my $location = $config{'location'} or
261 $self->throw('location tag must be specified.');
262 my $dbname = $config{'dbname'} or
263 $self->throw('dbname tag must be specified.');
265 my $db = $self->new(-directory => $location,
266 -dbname => $dbname,
268 $db;
271 # accessors
272 sub directory {
273 my $self = shift;
274 my $d = $self->{flat_directory};
275 $self->{flat_directory} = shift if @_;
278 sub write_flag {
279 my $self = shift;
280 my $d = $self->{flat_write_flag};
281 $self->{flat_write_flag} = shift if @_;
284 sub verbose {
285 my $self = shift;
286 my $d = $self->{flat_verbose};
287 $self->{flat_verbose} = shift if @_;
290 sub out_file {
291 my $self = shift;
292 my $d = $self->{flat_outfile};
293 $self->{flat_outfile} = shift if @_;
296 sub dbname {
297 my $self = shift;
298 my $d = $self->{flat_dbname};
299 $self->{flat_dbname} = shift if @_;
302 sub primary_namespace {
303 my $self = shift;
304 my $d = $self->{flat_primary_namespace};
305 $self->{flat_primary_namespace} = shift if @_;
309 # get/set secondary namespace(s)
310 # pass an array ref.
311 # get an array ref in scalar context, list in list context.
312 sub secondary_namespaces {
313 my $self = shift;
314 my $d = $self->{flat_secondary_namespaces};
315 $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_;
316 return unless $d;
317 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
318 return wantarray ? @$d : $d;
321 # return the file format
322 sub file_format {
323 my $self = shift;
324 my $d = $self->{flat_format};
325 $self->{flat_format} = shift if @_;
329 # return the alphabet
330 sub alphabet {
331 my $self = shift;
332 my $d = $self->{flat_alphabet};
333 $self->{flat_alphabet} = shift if @_;
337 sub parse_one_record {
338 my $self = shift;
339 my $fh = shift;
340 my $parser =
341 $self->{cached_parsers}{fileno($fh)}
342 ||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format);
343 my $seq = $parser->next_seq or return;
344 $self->{flat_alphabet} ||= $seq->alphabet;
345 my $ids = $self->seq_to_ids($seq);
346 return $ids;
350 # return the indexing scheme
351 sub indexing_scheme {
352 my $self = shift;
353 my $d = $self->{flat_indexing};
354 $self->{flat_indexing} = shift if @_;
358 sub add_flat_file {
359 my $self = shift;
360 my ($file_path,$file_length,$nf) = @_;
362 # check that file_path is absolute
363 unless (File::Spec->file_name_is_absolute($file_path)) {
364 $file_path = File::Spec->rel2abs($file_path);
367 -r $file_path or $self->throw("flat file $file_path cannot be read: $!");
369 my $current_size = -s _;
370 if (defined $file_length) {
371 $current_size == $file_length
372 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
373 } else {
374 $file_length = $current_size;
377 unless (defined $nf) {
378 $self->{flat_file_index} = 0 unless exists $self->{flat_file_index};
379 $nf = $self->{flat_file_index}++;
381 $self->{flat_flat_file_path}{$nf} = $file_path;
382 $self->{flat_flat_file_no}{$file_path} = $nf;
383 $nf;
386 sub write_config {
387 my $self = shift;
388 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
389 my $path = $self->_config_path;
391 open (my $F,">$path") or $self->throw("open error on $path: $!");
393 my $index_type = $self->indexing_scheme;
394 print $F "index\t$index_type\n";
396 my $format = $self->file_format;
397 my $alphabet = $self->alphabet;
398 my $alpha = $alphabet ? "/$alphabet" : '';
399 print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n";
401 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
402 for my $nf (@filenos) {
403 my $path = $self->{flat_flat_file_path}{$nf};
404 my $size = -s $path;
405 print $F join("\t","fileid_$nf",$path,$size),"\n";
408 # write primary namespace
409 my $primary_ns = $self->primary_namespace
410 or $self->throw('cannot write config file because no primary namespace defined');
412 print $F join("\t",'primary_namespace',$primary_ns),"\n";
414 # write secondary namespaces
415 my @secondary = $self->secondary_namespaces;
416 print $F join("\t",'secondary_namespaces',@secondary),"\n";
418 close $F or $self->throw("close error on $path: $!");
421 sub files {
422 my $self = shift;
423 return unless $self->{flat_flat_file_no};
424 return keys %{$self->{flat_flat_file_no}};
427 sub write_seq {
428 my $self = shift;
429 my $seq = shift;
431 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
433 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
434 my $seqio = $self->{flat_cached_parsers}{$file}
435 ||= Bio::SeqIO->new(-Format => $self->file_format,
436 -file => ">$file")
437 or $self->throw("couldn't create Bio::SeqIO object");
439 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
440 my $offset = tell($fh);
441 $seqio->write_seq($seq);
442 my $length = tell($fh)-$offset;
443 my $ids = $self->seq_to_ids($seq);
444 $self->_store_index($ids,$file,$offset,$length);
446 $self->{flat_outfile_dirty}++;
449 sub close {
450 my $self = shift;
451 return unless $self->{flat_outfile_dirty};
452 $self->write_config;
453 delete $self->{flat_outfile_dirty};
454 delete $self->{flat_cached_parsers}{$self->out_file};
458 sub _filenos {
459 my $self = shift;
460 return unless $self->{flat_flat_file_path};
461 return keys %{$self->{flat_flat_file_path}};
464 # read the configuration file
465 sub _read_config {
466 my $self = shift;
467 my $path = $self->_config_path;
468 return unless -e $path;
470 open (my $F,$path) or $self->throw("open error on $path: $!");
471 my %config;
472 while (<$F>) {
473 chomp;
474 my ($tag,@values) = split "\t";
475 $config{$tag} = \@values;
477 CORE::close $F or $self->throw("close error on $path: $!");
479 $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~
480 or $self->throw("invalid configuration file $path: no index line");
482 $self->indexing_scheme($1);
484 if ($config{format}) {
485 # handle LSID format
486 if ($config{format}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))/) {
487 $self->file_format($1);
488 $self->alphabet($2);
489 } else { # compatibility with older versions
490 $self->file_format($config{format}[0]);
494 # set up primary namespace
495 my $primary_namespace = $config{primary_namespace}[0]
496 or $self->throw("invalid configuration file $path: no primary namespace defined");
497 $self->primary_namespace($primary_namespace);
499 # set up secondary namespaces (may be empty)
500 $self->secondary_namespaces($config{secondary_namespaces});
502 # get file paths and their normalization information
503 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
504 for my $nf (@normalized_files) {
505 my ($file_path,$file_length) = @{$config{"fileid_${nf}"}};
506 $self->add_flat_file($file_path,$file_length,$nf);
512 sub _config_path {
513 my $self = shift;
514 $self->_catfile($self->_config_name);
517 sub _catfile {
518 my $self = shift;
519 my $component = shift;
520 File::Spec->catfile($self->directory,$self->dbname,$component);
523 sub _config_name { CONFIG_FILE_NAME }
525 sub _path2fileno {
526 my $self = shift;
527 my $path = shift;
528 return $self->add_flat_file($path)
529 unless exists $self->{flat_flat_file_no}{$path};
530 $self->{flat_flat_file_no}{$path};
533 sub _fileno2path {
534 my $self = shift;
535 my $fileno = shift;
536 $self->{flat_flat_file_path}{$fileno};
539 sub _files {
540 my $self = shift;
541 my $paths = $self->{flat_flat_file_no};
542 return keys %$paths;
545 =head2 fetch
547 Title : fetch
548 Usage : $index->fetch( $id )
549 Function: Returns a Bio::Seq object from the index
550 Example : $seq = $index->fetch( 'dJ67B12' )
551 Returns : Bio::Seq object
552 Args : ID
554 Deprecated. Use get_Seq_by_id instead.
556 =cut
558 sub fetch { shift->get_Seq_by_id(@_) }
561 =head2 To Be Implemented in Subclasses
563 The following methods MUST be implemented by subclasses.
565 =cut
567 # create real live Bio::Seq object
568 sub get_Seq_by_id {
569 my $self = shift;
570 my $id = shift;
571 $self->throw_not_implemented;
575 # fetch array of Bio::Seq objects
576 sub get_Seq_by_acc {
577 my $self = shift;
578 return $self->get_Seq_by_id(shift) if @_ == 1;
579 my ($ns,$key) = @_;
581 $self->throw_not_implemented;
584 sub fetch_raw {
585 my ($self,$id,$namespace) = @_;
586 $self->throw_not_implemented;
589 sub default_file_format {
590 my $self = shift;
591 $self->throw_not_implemented;
594 sub _store_index {
595 my $self = shift;
596 my ($ids,$file,$offset,$length) = @_;
597 $self->throw_not_implemented;
600 =head2 May Be Overridden in Subclasses
602 The following methods MAY be overridden by subclasses.
604 =cut
606 sub default_primary_namespace {
607 return "ACC";
610 sub default_secondary_namespaces {
611 return;
614 sub seq_to_ids {
615 my $self = shift;
616 my $seq = shift;
617 my %ids;
618 $ids{$self->primary_namespace} = $seq->accession_number;
619 \%ids;
622 sub DESTROY {
623 my $self = shift;
624 $self->close;