fix subversion tags
[bioperl-live.git] / Bio / DB / Flat.pm
blob7c29e88bc7cfe9e7ddb852fe752c152eda224aa5
2 # $Id$
4 # BioPerl module for Bio::DB::Flat
6 # Cared for by Lincoln Stein <lstein@cshl.org>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::DB::Flat - Interface for indexed flat files
16 =head1 SYNOPSIS
18 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
19 -dbname => 'mydb',
20 -format => 'embl',
21 -index => 'bdb',
22 -write_flag => 1);
23 $db->build_index('/usr/share/embl/primate.embl',
24 '/usr/share/embl/protists.embl');
25 $seq = $db->get_Seq_by_id('BUM');
26 @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
27 $raw = $db->fetch_raw('BUM');
29 =head1 DESCRIPTION
31 This object provides the basic mechanism to associate positions in
32 files with primary and secondary name spaces. Unlike
33 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
34 to work with the "flat index" and BerkeleyDB indexed flat file formats
35 worked out at the 2002 BioHackathon.
37 This object is a general front end to the underlying databases.
39 =head1 FEEDBACK
41 =head2 Mailing Lists
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to one
45 of the Bioperl mailing lists. Your participation is much appreciated.
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50 =head2 Reporting Bugs
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 the bugs and their resolution. Bug reports can be submitted via the
54 web:
56 http://bugzilla.open-bio.org/
58 =head1 AUTHOR - Lincoln Stein
60 Email - lstein@cshl.org
62 =head1 APPENDIX
64 The rest of the documentation details each of the object methods. Internal
65 methods are usually preceded with an "_" (underscore).
67 =cut
70 # Let the code begin...
71 package Bio::DB::Flat;
73 use File::Spec;
75 use base qw(Bio::Root::Root Bio::DB::RandomAccessI);
77 use constant CONFIG_FILE_NAME => 'config.dat';
79 =head2 new
81 Title : new
82 Usage : my $db = Bio::DB::Flat->new(
83 -directory => $root_directory,
84 -dbname => 'mydb',
85 -write_flag => 1,
86 -index => 'bdb',
87 -verbose => 0,
88 -out => 'outputfile',
89 -format => 'genbank');
90 Function: create a new Bio::DB::Flat object
91 Returns : new Bio::DB::Flat object
92 Args : -directory Root directory containing "config.dat"
93 -write_flag If true, allows creation/updating.
94 -verbose Verbose messages
95 -out File to write to when write_seq invoked
96 -index 'bdb' or 'binarysearch'
97 Status : Public
99 The required -directory argument indicates where the flat file indexes
100 will be stored. The build_index() and write_seq() methods will
101 automatically create subdirectories of this root directory. Each
102 subdirectory will contain a human-readable configuration file named
103 "config.dat" that specifies where the individual indexes are stored.
105 The required -dbname argument gives a name to the database index. The
106 index files will actually be stored in a like-named subdirectory
107 underneath the root directory.
109 The -write_flag enables writing new entries into the database as well
110 as the creation of the indexes. By default the indexes will be opened
111 read only.
113 -index is one of "bdb" or "binarysearch" and indicates the type of
114 index to generate. "bdb" corresponds to Berkeley DB. You *must* be
115 using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB
116 extension installed (DB_File will *not* work). "binarysearch"
117 corresponds to the OBDA "flat" indexed file.
119 The -out argument specifies the output file for writing objects created
120 with write_seq().
122 The -format argument specifies the format of the input file or files. If
123 the file suffix is one that Bioperl can already associate with a format
124 then this is optional.
126 =cut
128 sub new {
129 my $class = shift;
130 $class = ref($class) if ref($class);
131 my $self = $class->SUPER::new(@_);
133 # first we initialize ourselves
134 my ($flat_directory,$dbname,$format) =
135 $self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_);
137 defined $flat_directory
138 or $self->throw('Please supply a -directory argument');
139 defined $dbname
140 or $self->throw('Please supply a -dbname argument');
142 # set values from configuration file
143 $self->directory($flat_directory);
144 $self->dbname($dbname);
146 $self->throw("Base directory $flat_directory doesn't exist")
147 unless -e $flat_directory;
148 $self->throw("$flat_directory isn't a directory")
149 unless -d _;
150 my $dbpath = File::Spec->catfile($flat_directory,$dbname);
151 unless (-d $dbpath) {
152 $self->debug("creating db directory $dbpath\n");
153 mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!");
155 $self->_read_config();
157 # but override with initialization values
158 $self->_initialize(@_);
160 $self->throw('you must specify an indexing scheme')
161 unless $self->indexing_scheme;
163 # now we figure out what subclass to instantiate
164 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB'
165 :$self->indexing_scheme eq 'flat/1' ? 'Binary'
166 :$self->throw("unknown indexing scheme: " .
167 $self->indexing_scheme);
168 $format = $self->file_format;
170 # because Michele and Lincoln did it differently
171 # Michele's way is via a standalone concrete class
172 if ($index_type eq 'Binary') {
173 my $child_class = 'Bio::DB::Flat::BinarySearch';
174 eval "use $child_class";
175 $self->throw($@) if $@;
176 push @_, ('-format', $format);
177 return $child_class->new(@_);
180 # Lincoln uses Bio::SeqIO style delegation.
181 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
182 eval "use $child_class";
183 $self->throw($@) if $@;
185 # rebless & reinitialize with the new class
186 # (this prevents subclasses from forgetting to call our own initialization)
187 bless $self,$child_class;
188 $self->_initialize(@_);
189 $self->_set_namespaces(@_);
191 $self;
194 sub _initialize {
195 my $self = shift;
197 my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
198 = $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_);
200 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
202 if (defined $flat_indexing) {
203 # very permissive
204 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/;
205 $flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat|binary)/;
206 $self->indexing_scheme($flat_indexing);
209 $self->verbose($flat_verbose) if defined $flat_verbose;
210 $self->dbname($dbname) if defined $dbname;
211 $self->out_file($flat_outfile) if defined $flat_outfile;
212 $self->file_format($flat_format) if defined $flat_format;
215 sub _set_namespaces {
216 my $self = shift;
218 $self->primary_namespace($self->default_primary_namespace)
219 unless defined $self->{flat_primary_namespace};
221 $self->secondary_namespaces($self->default_secondary_namespaces)
222 unless defined $self->{flat_secondary_namespaces};
224 $self->file_format($self->default_file_format)
225 unless defined $self->{flat_format};
228 =head2 new_from_registry
230 Title : new_from_registry
231 Usage : $db = Bio::DB::Flat->new_from_registry(%config)
232 Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry-
233 compatible fashion
234 Returns : new Bio::DB::Flat
235 Args : provided by the registry, see below
236 Status : Public
238 The following registry-configuration tags are recognized:
240 location Root of the indexed flat file; corresponds to the new() method's
241 -directory argument.
243 =cut
245 sub new_from_registry {
246 my ($self,%config) = @_;
247 my $location = $config{'location'} or
248 $self->throw('location tag must be specified.');
249 my $dbname = $config{'dbname'} or
250 $self->throw('dbname tag must be specified.');
252 my $db = $self->new(-directory => $location,
253 -dbname => $dbname,
255 $db;
258 # accessors
259 sub directory {
260 my $self = shift;
261 my $d = $self->{flat_directory};
262 $self->{flat_directory} = shift if @_;
265 sub write_flag {
266 my $self = shift;
267 my $d = $self->{flat_write_flag};
268 $self->{flat_write_flag} = shift if @_;
271 sub verbose {
272 my $self = shift;
273 my $d = $self->{flat_verbose};
274 $self->{flat_verbose} = shift if @_;
277 sub out_file {
278 my $self = shift;
279 my $d = $self->{flat_outfile};
280 $self->{flat_outfile} = shift if @_;
283 sub dbname {
284 my $self = shift;
285 my $d = $self->{flat_dbname};
286 $self->{flat_dbname} = shift if @_;
289 sub primary_namespace {
290 my $self = shift;
291 my $d = $self->{flat_primary_namespace};
292 $self->{flat_primary_namespace} = shift if @_;
296 # get/set secondary namespace(s)
297 # pass an array ref.
298 # get an array ref in scalar context, list in list context.
299 sub secondary_namespaces {
300 my $self = shift;
301 my $d = $self->{flat_secondary_namespaces};
302 $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_;
303 return unless $d;
304 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
305 return wantarray ? @$d : $d;
308 # return the file format
309 sub file_format {
310 my $self = shift;
311 my $d = $self->{flat_format};
312 $self->{flat_format} = shift if @_;
316 # return the alphabet
317 sub alphabet {
318 my $self = shift;
319 my $d = $self->{flat_alphabet};
320 $self->{flat_alphabet} = shift if @_;
324 sub parse_one_record {
325 my $self = shift;
326 my $fh = shift;
327 my $parser =
328 $self->{cached_parsers}{fileno($fh)}
329 ||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format);
330 my $seq = $parser->next_seq or return;
331 $self->{flat_alphabet} ||= $seq->alphabet;
332 my $ids = $self->seq_to_ids($seq);
333 return $ids;
337 # return the indexing scheme
338 sub indexing_scheme {
339 my $self = shift;
340 my $d = $self->{flat_indexing};
341 $self->{flat_indexing} = shift if @_;
345 sub add_flat_file {
346 my $self = shift;
347 my ($file_path,$file_length,$nf) = @_;
349 # check that file_path is absolute
350 unless (File::Spec->file_name_is_absolute($file_path)) {
351 $file_path = File::Spec->rel2abs($file_path);
354 -r $file_path or $self->throw("flat file $file_path cannot be read: $!");
356 my $current_size = -s _;
357 if (defined $file_length) {
358 $current_size == $file_length
359 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
360 } else {
361 $file_length = $current_size;
364 unless (defined $nf) {
365 $self->{flat_file_index} = 0 unless exists $self->{flat_file_index};
366 $nf = $self->{flat_file_index}++;
368 $self->{flat_flat_file_path}{$nf} = $file_path;
369 $self->{flat_flat_file_no}{$file_path} = $nf;
370 $nf;
373 sub write_config {
374 my $self = shift;
375 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
376 my $path = $self->_config_path;
378 open (my $F,">$path") or $self->throw("open error on $path: $!");
380 my $index_type = $self->indexing_scheme;
381 print $F "index\t$index_type\n";
383 my $format = $self->file_format;
384 my $alphabet = $self->alphabet;
385 my $alpha = $alphabet ? "/$alphabet" : '';
386 print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n";
388 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
389 for my $nf (@filenos) {
390 my $path = $self->{flat_flat_file_path}{$nf};
391 my $size = -s $path;
392 print $F join("\t","fileid_$nf",$path,$size),"\n";
395 # write primary namespace
396 my $primary_ns = $self->primary_namespace
397 or $self->throw('cannot write config file because no primary namespace defined');
399 print $F join("\t",'primary_namespace',$primary_ns),"\n";
401 # write secondary namespaces
402 my @secondary = $self->secondary_namespaces;
403 print $F join("\t",'secondary_namespaces',@secondary),"\n";
405 close $F or $self->throw("close error on $path: $!");
408 sub files {
409 my $self = shift;
410 return unless $self->{flat_flat_file_no};
411 return keys %{$self->{flat_flat_file_no}};
414 sub write_seq {
415 my $self = shift;
416 my $seq = shift;
418 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
420 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
421 my $seqio = $self->{flat_cached_parsers}{$file}
422 ||= Bio::SeqIO->new(-Format => $self->file_format,
423 -file => ">$file")
424 or $self->throw("couldn't create Bio::SeqIO object");
426 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
427 my $offset = tell($fh);
428 $seqio->write_seq($seq);
429 my $length = tell($fh)-$offset;
430 my $ids = $self->seq_to_ids($seq);
431 $self->_store_index($ids,$file,$offset,$length);
433 $self->{flat_outfile_dirty}++;
436 sub close {
437 my $self = shift;
438 return unless $self->{flat_outfile_dirty};
439 $self->write_config;
440 delete $self->{flat_outfile_dirty};
441 delete $self->{flat_cached_parsers}{$self->out_file};
445 sub _filenos {
446 my $self = shift;
447 return unless $self->{flat_flat_file_path};
448 return keys %{$self->{flat_flat_file_path}};
451 # read the configuration file
452 sub _read_config {
453 my $self = shift;
454 my $path = $self->_config_path;
455 return unless -e $path;
457 open (my $F,$path) or $self->throw("open error on $path: $!");
458 my %config;
459 while (<$F>) {
460 chomp;
461 my ($tag,@values) = split "\t";
462 $config{$tag} = \@values;
464 CORE::close $F or $self->throw("close error on $path: $!");
466 $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~
467 or $self->throw("invalid configuration file $path: no index line");
469 $self->indexing_scheme($1);
471 if ($config{format}) {
472 # handle LSID format
473 if ($config{format}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))/) {
474 $self->file_format($1);
475 $self->alphabet($2);
476 } else { # compatibility with older versions
477 $self->file_format($config{format}[0]);
481 # set up primary namespace
482 my $primary_namespace = $config{primary_namespace}[0]
483 or $self->throw("invalid configuration file $path: no primary namespace defined");
484 $self->primary_namespace($primary_namespace);
486 # set up secondary namespaces (may be empty)
487 $self->secondary_namespaces($config{secondary_namespaces});
489 # get file paths and their normalization information
490 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
491 for my $nf (@normalized_files) {
492 my ($file_path,$file_length) = @{$config{"fileid_${nf}"}};
493 $self->add_flat_file($file_path,$file_length,$nf);
499 sub _config_path {
500 my $self = shift;
501 $self->_catfile($self->_config_name);
504 sub _catfile {
505 my $self = shift;
506 my $component = shift;
507 File::Spec->catfile($self->directory,$self->dbname,$component);
510 sub _config_name { CONFIG_FILE_NAME }
512 sub _path2fileno {
513 my $self = shift;
514 my $path = shift;
515 return $self->add_flat_file($path)
516 unless exists $self->{flat_flat_file_no}{$path};
517 $self->{flat_flat_file_no}{$path};
520 sub _fileno2path {
521 my $self = shift;
522 my $fileno = shift;
523 $self->{flat_flat_file_path}{$fileno};
526 sub _files {
527 my $self = shift;
528 my $paths = $self->{flat_flat_file_no};
529 return keys %$paths;
532 =head2 fetch
534 Title : fetch
535 Usage : $index->fetch( $id )
536 Function: Returns a Bio::Seq object from the index
537 Example : $seq = $index->fetch( 'dJ67B12' )
538 Returns : Bio::Seq object
539 Args : ID
541 Deprecated. Use get_Seq_by_id instead.
543 =cut
545 sub fetch { shift->get_Seq_by_id(@_) }
548 =head2 To Be Implemented in Subclasses
550 The following methods MUST be implemented by subclasses.
552 =cut
554 # create real live Bio::Seq object
555 sub get_Seq_by_id {
556 my $self = shift;
557 my $id = shift;
558 $self->throw_not_implemented;
562 # fetch array of Bio::Seq objects
563 sub get_Seq_by_acc {
564 my $self = shift;
565 return $self->get_Seq_by_id(shift) if @_ == 1;
566 my ($ns,$key) = @_;
568 $self->throw_not_implemented;
571 sub fetch_raw {
572 my ($self,$id,$namespace) = @_;
573 $self->throw_not_implemented;
576 sub default_file_format {
577 my $self = shift;
578 $self->throw_not_implemented;
581 sub _store_index {
582 my $self = shift;
583 my ($ids,$file,$offset,$length) = @_;
584 $self->throw_not_implemented;
587 =head2 May Be Overridden in Subclasses
589 The following methods MAY be overridden by subclasses.
591 =cut
593 sub default_primary_namespace {
594 return "ACC";
597 sub default_secondary_namespaces {
598 return;
601 sub seq_to_ids {
602 my $self = shift;
603 my $seq = shift;
604 my %ids;
605 $ids{$self->primary_namespace} = $seq->accession_number;
606 \%ids;
609 sub DESTROY {
610 my $self = shift;
611 $self->close;