4 # BioPerl module for Bio::DB::Flat
6 # Cared for by Lincoln Stein <lstein@cshl.org>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
14 Bio::DB::Flat - Interface for indexed flat files
18 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
23 $db->build_index('/usr/share/embl/primate.embl',
24 '/usr/share/embl/protists.embl');
25 $seq = $db->get_Seq_by_id('BUM');
26 @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
27 $raw = $db->fetch_raw('BUM');
31 This object provides the basic mechanism to associate positions in
32 files with primary and secondary name spaces. Unlike
33 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
34 to work with the "flat index" and BerkeleyDB indexed flat file formats
35 worked out at the 2002 BioHackathon.
37 This object is a general front end to the underlying databases.
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to one
45 of the Bioperl mailing lists. Your participation is much appreciated.
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 the bugs and their resolution. Bug reports can be submitted via the
56 http://bugzilla.open-bio.org/
58 =head1 AUTHOR - Lincoln Stein
60 Email - lstein@cshl.org
64 The rest of the documentation details each of the object methods. Internal
65 methods are usually preceded with an "_" (underscore).
70 # Let the code begin...
71 package Bio
::DB
::Flat
;
75 use base
qw(Bio::Root::Root Bio::DB::RandomAccessI);
77 use constant CONFIG_FILE_NAME
=> 'config.dat';
82 Usage : my $db = Bio::DB::Flat->new(
83 -directory => $root_directory,
89 -format => 'genbank');
90 Function: create a new Bio::DB::Flat object
91 Returns : new Bio::DB::Flat object
92 Args : -directory Root directory containing "config.dat"
93 -write_flag If true, allows creation/updating.
94 -verbose Verbose messages
95 -out File to write to when write_seq invoked
96 -index 'bdb' or 'binarysearch'
99 The required -directory argument indicates where the flat file indexes
100 will be stored. The build_index() and write_seq() methods will
101 automatically create subdirectories of this root directory. Each
102 subdirectory will contain a human-readable configuration file named
103 "config.dat" that specifies where the individual indexes are stored.
105 The required -dbname argument gives a name to the database index. The
106 index files will actually be stored in a like-named subdirectory
107 underneath the root directory.
109 The -write_flag enables writing new entries into the database as well
110 as the creation of the indexes. By default the indexes will be opened
113 -index is one of "bdb" or "binarysearch" and indicates the type of
114 index to generate. "bdb" corresponds to Berkeley DB. You *must* be
115 using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB
116 extension installed (DB_File will *not* work). "binarysearch"
117 corresponds to the OBDA "flat" indexed file.
119 The -out argument specifies the output file for writing objects created
122 The -format argument specifies the format of the input file or files. If
123 the file suffix is one that Bioperl can already associate with a format
124 then this is optional.
130 $class = ref($class) if ref($class);
131 my $self = $class->SUPER::new
(@_);
133 # first we initialize ourselves
134 my ($flat_directory,$dbname,$format) =
135 $self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_);
137 defined $flat_directory
138 or $self->throw('Please supply a -directory argument');
140 or $self->throw('Please supply a -dbname argument');
142 # set values from configuration file
143 $self->directory($flat_directory);
144 $self->dbname($dbname);
146 $self->throw("Base directory $flat_directory doesn't exist")
147 unless -e
$flat_directory;
148 $self->throw("$flat_directory isn't a directory")
150 my $dbpath = File
::Spec
->catfile($flat_directory,$dbname);
151 unless (-d
$dbpath) {
152 $self->debug("creating db directory $dbpath\n");
153 mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!");
155 $self->_read_config();
157 # but override with initialization values
158 $self->_initialize(@_);
160 $self->throw('you must specify an indexing scheme')
161 unless $self->indexing_scheme;
163 # now we figure out what subclass to instantiate
164 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ?
'BDB'
165 :$self->indexing_scheme eq 'flat/1' ?
'Binary'
166 :$self->throw("unknown indexing scheme: " .
167 $self->indexing_scheme);
168 $format = $self->file_format;
170 # because Michele and Lincoln did it differently
171 # Michele's way is via a standalone concrete class
172 if ($index_type eq 'Binary') {
173 my $child_class = 'Bio::DB::Flat::BinarySearch';
174 eval "use $child_class";
175 $self->throw($@
) if $@
;
176 push @_, ('-format', $format);
177 return $child_class->new(@_);
180 # Lincoln uses Bio::SeqIO style delegation.
181 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
182 eval "use $child_class";
183 $self->throw($@
) if $@
;
185 # rebless & reinitialize with the new class
186 # (this prevents subclasses from forgetting to call our own initialization)
187 bless $self,$child_class;
188 $self->_initialize(@_);
189 $self->_set_namespaces(@_);
197 my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
198 = $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_);
200 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
202 if (defined $flat_indexing) {
204 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb
/;
205 $flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat
|binary
)/;
206 $self->indexing_scheme($flat_indexing);
209 $self->verbose($flat_verbose) if defined $flat_verbose;
210 $self->dbname($dbname) if defined $dbname;
211 $self->out_file($flat_outfile) if defined $flat_outfile;
212 $self->file_format($flat_format) if defined $flat_format;
215 sub _set_namespaces
{
218 $self->primary_namespace($self->default_primary_namespace)
219 unless defined $self->{flat_primary_namespace
};
221 $self->secondary_namespaces($self->default_secondary_namespaces)
222 unless defined $self->{flat_secondary_namespaces
};
224 $self->file_format($self->default_file_format)
225 unless defined $self->{flat_format
};
228 =head2 new_from_registry
230 Title : new_from_registry
231 Usage : $db = Bio::DB::Flat->new_from_registry(%config)
232 Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry-
234 Returns : new Bio::DB::Flat
235 Args : provided by the registry, see below
238 The following registry-configuration tags are recognized:
240 location Root of the indexed flat file; corresponds to the new() method's
245 sub new_from_registry
{
246 my ($self,%config) = @_;
247 my $location = $config{'location'} or
248 $self->throw('location tag must be specified.');
249 my $dbname = $config{'dbname'} or
250 $self->throw('dbname tag must be specified.');
252 my $db = $self->new(-directory
=> $location,
261 my $d = $self->{flat_directory
};
262 $self->{flat_directory
} = shift if @_;
267 my $d = $self->{flat_write_flag
};
268 $self->{flat_write_flag
} = shift if @_;
273 my $d = $self->{flat_verbose
};
274 $self->{flat_verbose
} = shift if @_;
279 my $d = $self->{flat_outfile
};
280 $self->{flat_outfile
} = shift if @_;
285 my $d = $self->{flat_dbname
};
286 $self->{flat_dbname
} = shift if @_;
289 sub primary_namespace
{
291 my $d = $self->{flat_primary_namespace
};
292 $self->{flat_primary_namespace
} = shift if @_;
296 # get/set secondary namespace(s)
298 # get an array ref in scalar context, list in list context.
299 sub secondary_namespaces
{
301 my $d = $self->{flat_secondary_namespaces
};
302 $self->{flat_secondary_namespaces
} = (ref($_[0]) eq 'ARRAY' ?
shift : [@_]) if @_;
304 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
305 return wantarray ? @
$d : $d;
308 # return the file format
311 my $d = $self->{flat_format
};
312 $self->{flat_format
} = shift if @_;
316 # return the alphabet
319 my $d = $self->{flat_alphabet
};
320 $self->{flat_alphabet
} = shift if @_;
324 sub parse_one_record
{
328 $self->{cached_parsers
}{fileno($fh)}
329 ||= Bio
::SeqIO
->new(-fh
=>$fh,-format
=>$self->default_file_format);
330 my $seq = $parser->next_seq or return;
331 $self->{flat_alphabet
} ||= $seq->alphabet;
332 my $ids = $self->seq_to_ids($seq);
337 # return the indexing scheme
338 sub indexing_scheme
{
340 my $d = $self->{flat_indexing
};
341 $self->{flat_indexing
} = shift if @_;
347 my ($file_path,$file_length,$nf) = @_;
349 # check that file_path is absolute
350 unless (File
::Spec
->file_name_is_absolute($file_path)) {
351 $file_path = File
::Spec
->rel2abs($file_path);
354 -r
$file_path or $self->throw("flat file $file_path cannot be read: $!");
356 my $current_size = -s _
;
357 if (defined $file_length) {
358 $current_size == $file_length
359 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
361 $file_length = $current_size;
364 unless (defined $nf) {
365 $self->{flat_file_index
} = 0 unless exists $self->{flat_file_index
};
366 $nf = $self->{flat_file_index
}++;
368 $self->{flat_flat_file_path
}{$nf} = $file_path;
369 $self->{flat_flat_file_no
}{$file_path} = $nf;
375 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
376 my $path = $self->_config_path;
378 open (my $F,">$path") or $self->throw("open error on $path: $!");
380 my $index_type = $self->indexing_scheme;
381 print $F "index\t$index_type\n";
383 my $format = $self->file_format;
384 my $alphabet = $self->alphabet;
385 my $alpha = $alphabet ?
"/$alphabet" : '';
386 print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n";
388 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
389 for my $nf (@filenos) {
390 my $path = $self->{flat_flat_file_path
}{$nf};
392 print $F join("\t","fileid_$nf",$path,$size),"\n";
395 # write primary namespace
396 my $primary_ns = $self->primary_namespace
397 or $self->throw('cannot write config file because no primary namespace defined');
399 print $F join("\t",'primary_namespace',$primary_ns),"\n";
401 # write secondary namespaces
402 my @secondary = $self->secondary_namespaces;
403 print $F join("\t",'secondary_namespaces',@secondary),"\n";
405 close $F or $self->throw("close error on $path: $!");
410 return unless $self->{flat_flat_file_no
};
411 return keys %{$self->{flat_flat_file_no
}};
418 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
420 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
421 my $seqio = $self->{flat_cached_parsers
}{$file}
422 ||= Bio
::SeqIO
->new(-Format
=> $self->file_format,
424 or $self->throw("couldn't create Bio::SeqIO object");
426 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
427 my $offset = tell($fh);
428 $seqio->write_seq($seq);
429 my $length = tell($fh)-$offset;
430 my $ids = $self->seq_to_ids($seq);
431 $self->_store_index($ids,$file,$offset,$length);
433 $self->{flat_outfile_dirty
}++;
438 return unless $self->{flat_outfile_dirty
};
440 delete $self->{flat_outfile_dirty
};
441 delete $self->{flat_cached_parsers
}{$self->out_file};
447 return unless $self->{flat_flat_file_path
};
448 return keys %{$self->{flat_flat_file_path
}};
451 # read the configuration file
454 my $path = $self->_config_path;
455 return unless -e
$path;
457 open (my $F,$path) or $self->throw("open error on $path: $!");
461 my ($tag,@values) = split "\t";
462 $config{$tag} = \
@values;
464 CORE
::close $F or $self->throw("close error on $path: $!");
466 $config{index}[0] =~ m
~(flat
/1|BerkeleyDB/1)~
467 or $self->throw("invalid configuration file $path: no index line");
469 $self->indexing_scheme($1);
471 if ($config{format
}) {
473 if ($config{format
}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w
+))/) {
474 $self->file_format($1);
476 } else { # compatibility with older versions
477 $self->file_format($config{format
}[0]);
481 # set up primary namespace
482 my $primary_namespace = $config{primary_namespace
}[0]
483 or $self->throw("invalid configuration file $path: no primary namespace defined");
484 $self->primary_namespace($primary_namespace);
486 # set up secondary namespaces (may be empty)
487 $self->secondary_namespaces($config{secondary_namespaces
});
489 # get file paths and their normalization information
490 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
491 for my $nf (@normalized_files) {
492 my ($file_path,$file_length) = @
{$config{"fileid_${nf}"}};
493 $self->add_flat_file($file_path,$file_length,$nf);
501 $self->_catfile($self->_config_name);
506 my $component = shift;
507 File
::Spec
->catfile($self->directory,$self->dbname,$component);
510 sub _config_name
{ CONFIG_FILE_NAME
}
515 return $self->add_flat_file($path)
516 unless exists $self->{flat_flat_file_no
}{$path};
517 $self->{flat_flat_file_no
}{$path};
523 $self->{flat_flat_file_path
}{$fileno};
528 my $paths = $self->{flat_flat_file_no
};
535 Usage : $index->fetch( $id )
536 Function: Returns a Bio::Seq object from the index
537 Example : $seq = $index->fetch( 'dJ67B12' )
538 Returns : Bio::Seq object
541 Deprecated. Use get_Seq_by_id instead.
545 sub fetch
{ shift->get_Seq_by_id(@_) }
548 =head2 To Be Implemented in Subclasses
550 The following methods MUST be implemented by subclasses.
554 # create real live Bio::Seq object
558 $self->throw_not_implemented;
562 # fetch array of Bio::Seq objects
565 return $self->get_Seq_by_id(shift) if @_ == 1;
568 $self->throw_not_implemented;
572 my ($self,$id,$namespace) = @_;
573 $self->throw_not_implemented;
576 sub default_file_format
{
578 $self->throw_not_implemented;
583 my ($ids,$file,$offset,$length) = @_;
584 $self->throw_not_implemented;
587 =head2 May Be Overridden in Subclasses
589 The following methods MAY be overridden by subclasses.
593 sub default_primary_namespace
{
597 sub default_secondary_namespaces
{
605 $ids{$self->primary_namespace} = $seq->accession_number;