4 # BioPerl module for Bio::DB::Flat
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Lincoln Stein <lstein@cshl.org>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::DB::Flat - Interface for indexed flat files
20 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
25 $db->build_index('/usr/share/embl/primate.embl',
26 '/usr/share/embl/protists.embl');
27 $seq = $db->get_Seq_by_id('BUM');
28 @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
29 $raw = $db->fetch_raw('BUM');
33 This object provides the basic mechanism to associate positions in
34 files with primary and secondary name spaces. Unlike
35 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
36 to work with the "flat index" and BerkeleyDB indexed flat file formats
37 worked out at the 2002 BioHackathon.
39 This object is a general front end to the underlying databases.
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to one
47 of the Bioperl mailing lists. Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 Please direct usage questions or support issues to the mailing list:
56 L<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
65 Report bugs to the Bioperl bug tracking system to help us keep track
66 the bugs and their resolution. Bug reports can be submitted via the
69 http://bugzilla.open-bio.org/
71 =head1 AUTHOR - Lincoln Stein
73 Email - lstein@cshl.org
77 The rest of the documentation details each of the object methods. Internal
78 methods are usually preceded with an "_" (underscore).
83 # Let the code begin...
84 package Bio
::DB
::Flat
;
88 use base
qw(Bio::Root::Root Bio::DB::RandomAccessI);
90 use constant CONFIG_FILE_NAME
=> 'config.dat';
95 Usage : my $db = Bio::DB::Flat->new(
96 -directory => $root_directory,
101 -out => 'outputfile',
102 -format => 'genbank');
103 Function: create a new Bio::DB::Flat object
104 Returns : new Bio::DB::Flat object
105 Args : -directory Root directory containing "config.dat"
106 -write_flag If true, allows creation/updating.
107 -verbose Verbose messages
108 -out File to write to when write_seq invoked
109 -index 'bdb' or 'binarysearch'
112 The required -directory argument indicates where the flat file indexes
113 will be stored. The build_index() and write_seq() methods will
114 automatically create subdirectories of this root directory. Each
115 subdirectory will contain a human-readable configuration file named
116 "config.dat" that specifies where the individual indexes are stored.
118 The required -dbname argument gives a name to the database index. The
119 index files will actually be stored in a like-named subdirectory
120 underneath the root directory.
122 The -write_flag enables writing new entries into the database as well
123 as the creation of the indexes. By default the indexes will be opened
126 -index is one of "bdb" or "binarysearch" and indicates the type of
127 index to generate. "bdb" corresponds to Berkeley DB. You *must* be
128 using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB
129 extension installed (DB_File will *not* work). "binarysearch"
130 corresponds to the OBDA "flat" indexed file.
132 The -out argument specifies the output file for writing objects created
135 The -format argument specifies the format of the input file or files. If
136 the file suffix is one that Bioperl can already associate with a format
137 then this is optional.
143 $class = ref($class) if ref($class);
144 my $self = $class->SUPER::new
(@_);
146 # first we initialize ourselves
147 my ($flat_directory,$dbname,$format) =
148 $self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_);
150 defined $flat_directory
151 or $self->throw('Please supply a -directory argument');
153 or $self->throw('Please supply a -dbname argument');
155 # set values from configuration file
156 $self->directory($flat_directory);
157 $self->dbname($dbname);
159 $self->throw("Base directory $flat_directory doesn't exist")
160 unless -e
$flat_directory;
161 $self->throw("$flat_directory isn't a directory")
163 my $dbpath = File
::Spec
->catfile($flat_directory,$dbname);
164 unless (-d
$dbpath) {
165 $self->debug("creating db directory $dbpath\n");
166 mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!");
168 $self->_read_config();
170 # but override with initialization values
171 $self->_initialize(@_);
173 $self->throw('you must specify an indexing scheme')
174 unless $self->indexing_scheme;
176 # now we figure out what subclass to instantiate
177 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ?
'BDB'
178 :$self->indexing_scheme eq 'flat/1' ?
'Binary'
179 :$self->throw("unknown indexing scheme: " .
180 $self->indexing_scheme);
181 $format = $self->file_format;
183 # because Michele and Lincoln did it differently
184 # Michele's way is via a standalone concrete class
185 if ($index_type eq 'Binary') {
186 my $child_class = 'Bio::DB::Flat::BinarySearch';
187 eval "use $child_class";
188 $self->throw($@
) if $@
;
189 push @_, ('-format', $format);
190 return $child_class->new(@_);
193 # Lincoln uses Bio::SeqIO style delegation.
194 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
195 eval "use $child_class";
196 $self->throw($@
) if $@
;
198 # rebless & reinitialize with the new class
199 # (this prevents subclasses from forgetting to call our own initialization)
200 bless $self,$child_class;
201 $self->_initialize(@_);
202 $self->_set_namespaces(@_);
210 my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
211 = $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_);
213 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
215 if (defined $flat_indexing) {
217 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb
/;
218 $flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat
|binary
)/;
219 $self->indexing_scheme($flat_indexing);
222 $self->verbose($flat_verbose) if defined $flat_verbose;
223 $self->dbname($dbname) if defined $dbname;
224 $self->out_file($flat_outfile) if defined $flat_outfile;
225 $self->file_format($flat_format) if defined $flat_format;
228 sub _set_namespaces
{
231 $self->primary_namespace($self->default_primary_namespace)
232 unless defined $self->{flat_primary_namespace
};
234 $self->secondary_namespaces($self->default_secondary_namespaces)
235 unless defined $self->{flat_secondary_namespaces
};
237 $self->file_format($self->default_file_format)
238 unless defined $self->{flat_format
};
241 =head2 new_from_registry
243 Title : new_from_registry
244 Usage : $db = Bio::DB::Flat->new_from_registry(%config)
245 Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry-
247 Returns : new Bio::DB::Flat
248 Args : provided by the registry, see below
251 The following registry-configuration tags are recognized:
253 location Root of the indexed flat file; corresponds to the new() method's
258 sub new_from_registry
{
259 my ($self,%config) = @_;
260 my $location = $config{'location'} or
261 $self->throw('location tag must be specified.');
262 my $dbname = $config{'dbname'} or
263 $self->throw('dbname tag must be specified.');
265 my $db = $self->new(-directory
=> $location,
274 my $d = $self->{flat_directory
};
275 $self->{flat_directory
} = shift if @_;
280 my $d = $self->{flat_write_flag
};
281 $self->{flat_write_flag
} = shift if @_;
286 my $d = $self->{flat_verbose
};
287 $self->{flat_verbose
} = shift if @_;
292 my $d = $self->{flat_outfile
};
293 $self->{flat_outfile
} = shift if @_;
298 my $d = $self->{flat_dbname
};
299 $self->{flat_dbname
} = shift if @_;
302 sub primary_namespace
{
304 my $d = $self->{flat_primary_namespace
};
305 $self->{flat_primary_namespace
} = shift if @_;
309 # get/set secondary namespace(s)
311 # get an array ref in scalar context, list in list context.
312 sub secondary_namespaces
{
314 my $d = $self->{flat_secondary_namespaces
};
315 $self->{flat_secondary_namespaces
} = (ref($_[0]) eq 'ARRAY' ?
shift : [@_]) if @_;
317 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
318 return wantarray ? @
$d : $d;
321 # return the file format
324 my $d = $self->{flat_format
};
325 $self->{flat_format
} = shift if @_;
329 # return the alphabet
332 my $d = $self->{flat_alphabet
};
333 $self->{flat_alphabet
} = shift if @_;
337 sub parse_one_record
{
341 $self->{cached_parsers
}{fileno($fh)}
342 ||= Bio
::SeqIO
->new(-fh
=>$fh,-format
=>$self->default_file_format);
343 my $seq = $parser->next_seq or return;
344 $self->{flat_alphabet
} ||= $seq->alphabet;
345 my $ids = $self->seq_to_ids($seq);
350 # return the indexing scheme
351 sub indexing_scheme
{
353 my $d = $self->{flat_indexing
};
354 $self->{flat_indexing
} = shift if @_;
360 my ($file_path,$file_length,$nf) = @_;
362 # check that file_path is absolute
363 unless (File
::Spec
->file_name_is_absolute($file_path)) {
364 $file_path = File
::Spec
->rel2abs($file_path);
367 -r
$file_path or $self->throw("flat file $file_path cannot be read: $!");
369 my $current_size = -s _
;
370 if (defined $file_length) {
371 $current_size == $file_length
372 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
374 $file_length = $current_size;
377 unless (defined $nf) {
378 $self->{flat_file_index
} = 0 unless exists $self->{flat_file_index
};
379 $nf = $self->{flat_file_index
}++;
381 $self->{flat_flat_file_path
}{$nf} = $file_path;
382 $self->{flat_flat_file_no
}{$file_path} = $nf;
388 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
389 my $path = $self->_config_path;
391 open (my $F,">$path") or $self->throw("open error on $path: $!");
393 my $index_type = $self->indexing_scheme;
394 print $F "index\t$index_type\n";
396 my $format = $self->file_format;
397 my $alphabet = $self->alphabet;
398 my $alpha = $alphabet ?
"/$alphabet" : '';
399 print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n";
401 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
402 for my $nf (@filenos) {
403 my $path = $self->{flat_flat_file_path
}{$nf};
405 print $F join("\t","fileid_$nf",$path,$size),"\n";
408 # write primary namespace
409 my $primary_ns = $self->primary_namespace
410 or $self->throw('cannot write config file because no primary namespace defined');
412 print $F join("\t",'primary_namespace',$primary_ns),"\n";
414 # write secondary namespaces
415 my @secondary = $self->secondary_namespaces;
416 print $F join("\t",'secondary_namespaces',@secondary),"\n";
418 close $F or $self->throw("close error on $path: $!");
423 return unless $self->{flat_flat_file_no
};
424 return keys %{$self->{flat_flat_file_no
}};
431 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
433 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
434 my $seqio = $self->{flat_cached_parsers
}{$file}
435 ||= Bio
::SeqIO
->new(-Format
=> $self->file_format,
437 or $self->throw("couldn't create Bio::SeqIO object");
439 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
440 my $offset = tell($fh);
441 $seqio->write_seq($seq);
442 my $length = tell($fh)-$offset;
443 my $ids = $self->seq_to_ids($seq);
444 $self->_store_index($ids,$file,$offset,$length);
446 $self->{flat_outfile_dirty
}++;
451 return unless $self->{flat_outfile_dirty
};
453 delete $self->{flat_outfile_dirty
};
454 delete $self->{flat_cached_parsers
}{$self->out_file};
460 return unless $self->{flat_flat_file_path
};
461 return keys %{$self->{flat_flat_file_path
}};
464 # read the configuration file
467 my $path = $self->_config_path;
468 return unless -e
$path;
470 open (my $F,$path) or $self->throw("open error on $path: $!");
474 my ($tag,@values) = split "\t";
475 $config{$tag} = \
@values;
477 CORE
::close $F or $self->throw("close error on $path: $!");
479 $config{index}[0] =~ m
~(flat
/1|BerkeleyDB/1)~
480 or $self->throw("invalid configuration file $path: no index line");
482 $self->indexing_scheme($1);
484 if ($config{format
}) {
486 if ($config{format
}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w
+))/) {
487 $self->file_format($1);
489 } else { # compatibility with older versions
490 $self->file_format($config{format
}[0]);
494 # set up primary namespace
495 my $primary_namespace = $config{primary_namespace
}[0]
496 or $self->throw("invalid configuration file $path: no primary namespace defined");
497 $self->primary_namespace($primary_namespace);
499 # set up secondary namespaces (may be empty)
500 $self->secondary_namespaces($config{secondary_namespaces
});
502 # get file paths and their normalization information
503 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
504 for my $nf (@normalized_files) {
505 my ($file_path,$file_length) = @
{$config{"fileid_${nf}"}};
506 $self->add_flat_file($file_path,$file_length,$nf);
514 $self->_catfile($self->_config_name);
519 my $component = shift;
520 File
::Spec
->catfile($self->directory,$self->dbname,$component);
523 sub _config_name
{ CONFIG_FILE_NAME
}
528 return $self->add_flat_file($path)
529 unless exists $self->{flat_flat_file_no
}{$path};
530 $self->{flat_flat_file_no
}{$path};
536 $self->{flat_flat_file_path
}{$fileno};
541 my $paths = $self->{flat_flat_file_no
};
548 Usage : $index->fetch( $id )
549 Function: Returns a Bio::Seq object from the index
550 Example : $seq = $index->fetch( 'dJ67B12' )
551 Returns : Bio::Seq object
554 Deprecated. Use get_Seq_by_id instead.
558 sub fetch
{ shift->get_Seq_by_id(@_) }
561 =head2 To Be Implemented in Subclasses
563 The following methods MUST be implemented by subclasses.
567 # create real live Bio::Seq object
571 $self->throw_not_implemented;
575 # fetch array of Bio::Seq objects
578 return $self->get_Seq_by_id(shift) if @_ == 1;
581 $self->throw_not_implemented;
585 my ($self,$id,$namespace) = @_;
586 $self->throw_not_implemented;
589 sub default_file_format
{
591 $self->throw_not_implemented;
596 my ($ids,$file,$offset,$length) = @_;
597 $self->throw_not_implemented;
600 =head2 May Be Overridden in Subclasses
602 The following methods MAY be overridden by subclasses.
606 sub default_primary_namespace
{
610 sub default_secondary_namespaces
{
618 $ids{$self->primary_namespace} = $seq->accession_number;