sync w/ main trunk
[bioperl-live.git] / Bio / DB / Flat / BinarySearch.pm
blob55613a946d24870a29613d5b42dc320278534c04
1 # $Id$
3 # BioPerl module for Bio::DB::Flat::BinarySearch
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Michele Clamp <michele@sanger.ac.uk>>
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::DB::Flat::BinarySearch - BinarySearch search indexing system for sequence files
17 =head1 SYNOPSIS
21 =head1 DESCRIPTION
23 This module can be used both to index sequence files and also to
24 retrieve sequences from existing sequence files.
26 This object allows indexing of sequence files both by a primary key
27 (say accession) and multiple secondary keys (say ids). This is
28 different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>)
29 which uses DBM files as storage. This module uses a binary search to
30 retrieve sequences which is more efficient for large datasets.
32 =head2 Index creation
34 my $sequencefile; # Some fasta sequence file
36 Patterns have to be entered to define where the keys are to be indexed
37 and also where the start of each record. E.g. for fasta
39 my $start_pattern = '^>';
40 my $primary_pattern = '^>(\S+)';
42 So the start of a record is a line starting with a E<gt> and the
43 primary key is all characters up to the first space after the E<gt>
45 A string also has to be entered to defined what the primary key
46 (primary_namespace) is called.
48 The index can now be created using
50 my $index = Bio::DB::Flat::BinarySearch->new(
51 -directory => "/home/max/",
52 -dbname => "mydb",
53 -start_pattern => $start_pattern,
54 -primary_pattern => $primary_pattern,
55 -primary_namespace => "ID",
56 -format => "fasta" );
58 my @files = ("file1","file2","file3");
60 $index->build_index(@files);
62 The index is now ready to use. For large sequence files the perl way
63 of indexing takes a *long* time and a *huge* amount of memory. For
64 indexing things like dbEST I recommend using the DB_File indexer, BDB.
66 The formats currently supported by this module are fasta, Swissprot,
67 and EMBL.
69 =head2 Creating indices with secondary keys
71 Sometimes just indexing files with one id per entry is not enough. For
72 instance you may want to retrieve sequences from swissprot using
73 their accessions as well as their ids.
75 To be able to do this when creating your index you need to pass in
76 a hash of secondary_patterns which have their namespaces as the keys
77 to the hash.
79 e.g. For Indexing something like
81 ID 1433_CAEEL STANDARD; PRT; 248 AA.
82 AC P41932;
83 DT 01-NOV-1995 (Rel. 32, Created)
84 DT 01-NOV-1995 (Rel. 32, Last sequence update)
85 DT 15-DEC-1998 (Rel. 37, Last annotation update)
86 DE 14-3-3-LIKE PROTEIN 1.
87 GN FTT-1 OR M117.2.
88 OS Caenorhabditis elegans.
89 OC Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
90 OC Rhabditidae; Peloderinae; Caenorhabditis.
91 OX NCBI_TaxID=6239;
92 RN [1]
94 where we want to index the accession (P41932) as the primary key and the
95 id (1433_CAEEL) as the secondary id. The index is created as follows
97 my %secondary_patterns;
99 my $start_pattern = '^ID (\S+)';
100 my $primary_pattern = '^AC (\S+)\;';
102 $secondary_patterns{"ID"} = '^ID (\S+)';
104 my $index = Bio::DB::Flat::BinarySearch->new(
105 -directory => $index_directory,
106 -dbname => "ppp",
107 -write_flag => 1,
108 -verbose => 1,
109 -start_pattern => $start_pattern,
110 -primary_pattern => $primary_pattern,
111 -primary_namespace => 'AC',
112 -secondary_patterns => \%secondary_patterns);
114 $index->build_index($seqfile);
116 Of course having secondary indices makes indexing slower and use more
117 memory.
119 =head2 Index reading
121 To fetch sequences using an existing index first of all create your sequence
122 object
124 my $index = Bio::DB::Flat::BinarySearch->new(
125 -directory => $index_directory);
127 Now you can happily fetch sequences either by the primary key or
128 by the secondary keys.
130 my $entry = $index->get_entry_by_id('HBA_HUMAN');
132 This returns just a string containing the whole entry. This is
133 useful is you just want to print the sequence to screen or write it to a file.
135 Other ways of getting sequences are
137 my $fh = $index->get_stream_by_id('HBA_HUMAN');
139 This can then be passed to a seqio object for output or converting
140 into objects.
142 my $seq = Bio::SeqIO->new(-fh => $fh,
143 -format => 'fasta');
145 The last way is to retrieve a sequence directly. This is the
146 slowest way of extracting as the sequence objects need to be made.
148 my $seq = $index->get_Seq_by_id('HBA_HUMAN');
150 To access the secondary indices the secondary namespace needs to be known
152 $index->secondary_namespaces("ID");
154 Then the following call can be used
156 my $seq = $index->get_Seq_by_secondary('ID','1433_CAEEL');
158 These calls are not yet implemented
160 my $fh = $index->get_stream_by_secondary('ID','1433_CAEEL');
161 my $entry = $index->get_entry_by_secondary('ID','1433_CAEEL');
163 =head1 FEEDBACK
165 =head2 Mailing Lists
167 User feedback is an integral part of the evolution of this and other
168 Bioperl modules. Send your comments and suggestions preferably to one
169 of the Bioperl mailing lists. Your participation is much appreciated.
171 bioperl-l@bioperl.org - General discussion
172 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
174 =head2 Support
176 Please direct usage questions or support issues to the mailing list:
178 L<bioperl-l@bioperl.org>
180 rather than to the module maintainer directly. Many experienced and
181 reponsive experts will be able look at the problem and quickly
182 address it. Please include a thorough description of the problem
183 with code and data examples if at all possible.
185 =head2 Reporting Bugs
187 Report bugs to the Bioperl bug tracking system to help us keep track
188 the bugs and their resolution. Bug reports can be submitted via the
189 web:
191 http://bugzilla.open-bio.org/
193 =head1 AUTHOR - Michele Clamp
195 Email - michele@sanger.ac.uk
197 =head1 CONTRIBUTORS
199 Jason Stajich, jason@bioperl.org
201 =head1 APPENDIX
203 The rest of the documentation details each of the object methods. Internal
204 methods are usually preceded with an "_" (underscore).
206 =cut
208 package Bio::DB::Flat::BinarySearch;
210 use strict;
212 use Fcntl qw(SEEK_END SEEK_CUR);
213 # rather than using tell which might be buffered
214 sub systell{ sysseek($_[0], 0, SEEK_CUR) }
215 sub syseof{ sysseek($_[0], 0, SEEK_END) }
217 use File::Spec;
218 use Bio::Root::RootI;
219 use Bio::SeqIO;
220 use Bio::Seq;
222 use base qw(Bio::DB::RandomAccessI);
224 use constant CONFIG_FILE_NAME => 'config.dat';
225 use constant HEADER_SIZE => 4;
226 use constant DEFAULT_FORMAT => 'fasta';
228 my @formats = ['FASTA','SWISSPROT','EMBL'];
230 =head2 new
232 Title : new
233 Usage : For reading
234 my $index = Bio::DB::Flat::BinarySearch->new(
235 -directory => '/Users/michele/indices/dbest',
236 -dbname => 'mydb',
237 -format => 'fasta');
239 For writing
241 my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
242 my $index = Bio::DB::Flat::BinarySearch->new(
243 -directory => '/Users/michele/indices',
244 -dbname => 'mydb',
245 -primary_pattern => "^>(\\S+)",
246 -secondary_patterns => \%secondary_patterns,
247 -primary_namespace => "ID");
249 my @files = ('file1','file2','file3');
251 $index->build_index(@files);
254 Function: create a new Bio::DB::Flat::BinarySearch object
255 Returns : new Bio::DB::Flat::BinarySearch
256 Args : -directory Root directory for index files
257 -dbname Name of subdirectory containing indices
258 for named database
259 -write_flag Allow building index
260 -primary_pattern Regexp defining the primary id
261 -secondary_patterns A hash ref containing the secondary
262 patterns with the namespaces as keys
263 -primary_namespace A string defining what the primary key
266 Status : Public
268 =cut
270 sub new {
271 my($class, @args) = @_;
273 my $self = $class->SUPER::new(@args);
275 bless $self, $class;
277 my ($index_dir,$dbname,$format,$write_flag,$primary_pattern,
278 $primary_namespace,$start_pattern,$secondary_patterns) =
279 $self->_rearrange([qw(DIRECTORY
280 DBNAME
281 FORMAT
282 WRITE_FLAG
283 PRIMARY_PATTERN
284 PRIMARY_NAMESPACE
285 START_PATTERN
286 SECONDARY_PATTERNS)], @args);
288 $self->index_directory($index_dir);
289 $self->dbname($dbname);
291 if ($self->index_directory && $self->read_config_file) {
293 my $fh = $self->primary_index_filehandle;
294 my $record_width = $self->read_header($fh);
295 $self->record_size($record_width);
297 $format ||= DEFAULT_FORMAT;
298 $self->format ($format);
299 $self->write_flag ($write_flag);
301 if ($self->write_flag && ! $primary_namespace) {
302 ($primary_namespace,$primary_pattern,
303 $start_pattern,$secondary_patterns) =
304 $self->_guess_patterns($self->format);
307 $self->primary_pattern ($primary_pattern);
308 $self->primary_namespace ($primary_namespace);
309 $self->start_pattern ($start_pattern);
310 $self->secondary_patterns($secondary_patterns);
312 return $self;
315 sub new_from_registry {
316 my ($self,%config) = @_;
318 my $dbname = $config{'dbname'};
319 my $location = $config{'location'};
321 my $index = Bio::DB::Flat::BinarySearch->new(-dbname => $dbname,
322 -index_dir => $location,
326 =head2 get_Seq_by_id
328 Title : get_Seq_by_id
329 Usage : $obj->get_Seq_by_id($newval)
330 Function:
331 Example :
332 Returns : value of get_Seq_by_id
333 Args : newvalue (optional)
335 =cut
337 sub get_Seq_by_id {
338 my ($self,$id) = @_;
340 # too many uninit variables...
341 local $^W = 0;
343 my ($fh,$length) = $self->get_stream_by_id($id);
345 unless (defined($self->format)) {
346 $self->throw("Can't create sequence - format is not defined");
349 return unless $fh;
351 unless ( defined($self->{_seqio}) ) {
353 $self->{_seqio} = Bio::SeqIO->new(-fh => $fh,
354 -format => $self->format);
355 } else {
356 $self->{_seqio}->fh($fh);
359 return $self->{_seqio}->next_seq;
362 =head2 get_entry_by_id
364 Title : get_entry_by_id
365 Usage : $obj->get_entry_by_id($newval)
366 Function: Get a Bio::SeqI object for a unique ID
367 Returns : Bio::SeqI
368 Args : string
371 =cut
373 sub get_entry_by_id {
374 my ($self,$id) = @_;
376 my ($fh,$length) = $self->get_stream_by_id($id);
378 my $entry;
380 sysread($fh,$entry,$length);
382 return $entry;
386 =head2 get_stream_by_id
388 Title : get_stream_by_id
389 Usage : $obj->get_stream_by_id($id)
390 Function: Gets a Sequence stream for an id
391 Returns : Bio::SeqIO stream
392 Args : Id to lookup by
395 =cut
397 sub get_stream_by_id {
398 my ($self,$id) = @_;
400 unless( $self->record_size ) {
401 if ($self->index_directory && $self->read_config_file) {
403 my $fh = $self->primary_index_filehandle;
404 my $record_width = $self->read_header($fh);
405 $self->record_size($record_width);
408 my $indexfh = $self->primary_index_filehandle;
409 syseof ($indexfh);
411 my $filesize = systell($indexfh);
413 $self->throw("file was not parsed properly, record size is empty")
414 unless $self->record_size;
416 my $end = ($filesize - $self->{'_start_pos'}) / $self->record_size;
417 my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size);
420 my ($fileid,$pos,$length) = split(/\t/,$rest);
422 #print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
424 if (!$newid) {
425 return;
428 my $file = $self->{_file}{$fileid};
430 open (my $IN,"<$file");
432 my $entry;
434 sysseek($IN,$pos,0);
436 return ($IN,$length);
439 =head2 get_Seq_by_acc
441 Title : get_Seq_by_acc
442 Usage : $obj->get_Seq_by_acc($acc)
443 Function: Gets a Bio::SeqI object by accession number
444 Returns : Bio::SeqI object
445 Args : string representing accession number
448 =cut
450 sub get_Seq_by_acc {
451 my ($self,$acc) = @_;
453 # too many uninit variables...
454 local $^W = 0;
456 if ($self->primary_namespace eq "ACC") {
457 return $self->get_Seq_by_id($acc);
458 } else {
459 return $self->get_Seq_by_secondary("ACC",$acc);
463 =head2 get_Seq_by_version
465 Title : get_Seq_by_version
466 Usage : $obj->get_Seq_by_version($version)
467 Function: Gets a Bio::SeqI object by accession.version number
468 Returns : Bio::SeqI object
469 Args : string representing accession.version number
472 =cut
474 sub get_Seq_by_version {
475 my ($self,$acc) = @_;
477 # too many uninit variables...
478 local $^W = 0;
480 if ($self->primary_namespace eq "VERSION") {
481 return $self->get_Seq_by_id($acc);
482 } else {
483 return $self->get_Seq_by_secondary("VERSION",$acc);
487 =head2 get_Seq_by_secondary
489 Title : get_Seq_by_secondary
490 Usage : $obj->get_Seq_by_secondary($namespace,$acc)
491 Function: Gets a Bio::SeqI object looking up secondary accessions
492 Returns : Bio::SeqI object
493 Args : namespace name to check secondary namespace and an id
496 =cut
498 sub get_Seq_by_secondary {
499 my ($self,$name,$id) = @_;
501 my @names = $self->secondary_namespaces;
503 my $found = 0;
504 foreach my $tmpname (@names) {
505 if ($name eq $tmpname) {
506 $found = 1;
510 if ($found == 0) {
511 $self->throw("Secondary index for $name doesn't exist\n");
514 my $fh = $self->open_secondary_index($name);
516 syseof ($fh);
518 my $filesize = systell($fh);
520 my $recsize = $self->{'_secondary_record_size'}{$name};
521 # print "Name " . $recsize . "\n";
523 my $end = ($filesize - $self->{'_start_pos'})/$recsize;
525 # print "End $end $filesize\n";
526 my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize);
528 sysseek($fh,$pos,0);
530 # print "Found new id $newid $primary_id\n";
531 # We now need to shuffle up the index file to find the top secondary entry
533 my $record = $newid;
535 while ($record =~ /^$newid/ && $pos >= 0) {
537 $record = $self->read_record($fh,$pos,$recsize);
538 $pos = $pos - $recsize;
539 # print "Up record = $record:$newid\n";
542 $pos += $recsize;
544 # print "Top position is $pos\n";
546 # Now we have to shuffle back down again to read all the secondary entries
548 my $current_id = $newid;
549 my %primary_id;
551 $primary_id{$primary_id} = 1;
553 while ($current_id eq $newid) {
554 $record = $self->read_record($fh,$pos,$recsize);
555 # print "Record is :$record:\n";
556 my ($secid,$primary_id) = split(/\t/,$record,2);
557 $current_id = $secid;
559 if ($current_id eq $newid) {
560 $primary_id =~ s/ //g;
561 # print "Primary $primary_id\n";
562 $primary_id{$primary_id} = 1;
564 $pos = $pos + $recsize;
565 # print "Down record = $record\n";
569 if (!defined($newid)) {
570 return;
573 my @entry;
575 foreach my $id (keys %primary_id) {
576 push @entry,$self->get_Seq_by_id($id);
578 return wantarray ? @entry : $entry[0];
582 =head2 read_header
584 Title : read_header
585 Usage : $obj->read_header($fhl)
586 Function: Reads the header from the db file
587 Returns : width of a record
588 Args : filehandle
591 =cut
593 sub read_header {
594 my ($self,$fh) = @_;
596 my $record_width;
598 sysread($fh,$record_width,HEADER_SIZE);
600 $self->{'_start_pos'} = HEADER_SIZE;
601 $record_width =~ s/ //g;
602 $record_width = $record_width * 1;
604 return $record_width;
607 =head2 read_record
609 Title : read_record
610 Usage : $obj->read_record($fh,$pos,$len)
611 Function: Reads a record from a filehandle
612 Returns : String
613 Args : filehandle, offset, and length
616 =cut
618 sub read_record {
619 my ($self,$fh,$pos,$len) = @_;
621 sysseek($fh,$pos,0);
623 my $record;
625 sysread($fh,$record,$len);
627 return $record;
631 =head2 get_all_primary_ids
633 Title : get_all_primary_ids
634 Usage : @ids = $seqdb->get_all_primary_ids()
635 Function: gives an array of all the primary_ids of the
636 sequence objects in the database.
637 Returns : an array of strings
638 Args : none
640 =cut
642 sub get_all_primary_ids {
643 my $self = shift;
645 my $fh = $self->primary_index_filehandle;
646 syseof($fh);
647 my $filesize = systell($fh);
648 my $recsize = $self->record_size;
649 my $end = $filesize;
651 my @ids;
652 for (my $pos=$self->{'_start_pos'}; $pos < $end; $pos += $recsize) {
653 my $record = $self->read_record($fh,$pos,$recsize);
654 my ($entryid) = split(/\t/,$record);
655 push @ids,$entryid;
657 @ids;
661 =head2 find_entry
663 Title : find_entry
664 Usage : $obj->find_entry($fh,$start,$end,$id,$recsize)
665 Function: Extract an entry based on the start,end,id and record size
666 Returns : string
667 Args : filehandle, start, end, id, recordsize
670 =cut
672 sub find_entry {
673 my ($self,$fh,$start,$end,$id,$recsize) = @_;
675 my $mid = int( ($end+1+$start) / 2);
676 my $pos = ($mid-1)*$recsize + $self->{'_start_pos'};
678 my ($record) = $self->read_record($fh,$pos,$recsize);
679 my ($entryid,$rest) = split(/\t/,$record,2);
680 $rest =~ s/\s+$//;
682 # print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
683 # print "Entry :$id:$entryid:$rest\n";
685 my ($first,$second) = $id le $entryid ? ($id,$entryid) : ($entryid,$id);
687 if ($id eq $entryid) {
689 return ($id,$rest,$pos-$recsize);
691 } elsif ($first eq $id) {
693 if ($end-$start <= 1) {
694 return;
696 my $end = $mid;
697 # print "Moving up $entryid $id\n";
698 $self->find_entry($fh,$start,$end,$id,$recsize);
700 } elsif ($second eq $id ) {
701 # print "Moving down $entryid $id\n";
702 if ($end-$start <= 1) {
703 return;
706 $start = $mid;
708 $self->find_entry($fh,$start,$end,$id,$recsize);
714 =head2 build_index
716 Title : build_index
717 Usage : $obj->build_index(@files)
718 Function: Build the index based on a set of files
719 Returns : count of the number of entries
720 Args : List of filenames
723 =cut
725 sub build_index {
726 my ($self,@files) = @_;
727 $self->write_flag or
728 $self->throw('Cannot build index unless -write_flag is true');
730 my $rootdir = $self->index_directory;
732 if (!defined($rootdir)) {
733 $self->throw("No index directory set - can't build indices");
736 if (! -d $rootdir) {
737 $self->throw("Index directory [$rootdir] is not a directory. Cant' build indices");
740 my $dbpath = File::Spec->catfile($rootdir,$self->dbname);
741 if (! -d $dbpath) {
742 warn "Creating directory $dbpath\n";
743 mkdir $dbpath,0777 or $self->throw("Couldn't create $dbpath: $!");
746 unless (@files ) {
747 $self->throw("Must enter an array of filenames to index");
750 foreach my $file (@files) {
751 $file = File::Spec->rel2abs($file)
752 unless File::Spec->file_name_is_absolute($file);
753 unless ( -e $file) {
754 $self->throw("Can't index file [$file] as it doesn't exist");
758 if (my $filehash = $self->{_dbfile}) {
759 push @files,keys %$filehash;
762 my %seen;
763 @files = grep {!$seen{$_}++} @files;
765 # Lets index
766 $self->make_config_file(\@files);
767 my $entries = 0;
768 foreach my $file (@files) {
769 $entries += $self->_index_file($file);
772 # update alphabet if necessary
773 $self->make_config_file(\@files);
775 # And finally write out the indices
776 $self->write_primary_index;
777 $self->write_secondary_indices;
779 $entries;
782 =head2 _index_file
784 Title : _index_file
785 Usage : $obj->_index_file($newval)
786 Function:
787 Example :
788 Returns : value of _index_file
789 Args : newvalue (optional)
792 =cut
794 sub _index_file {
795 my ($self,$file) = @_;
796 my $v = $self->verbose;
797 open(my $FILE,"<", $file) || $self->throw("Can't open file [$file]");
799 my $recstart = 0;
800 my $fileid = $self->get_fileid_by_filename($file);
801 my $found = 0;
802 my $id;
803 my $count = 0;
805 my $primary = $self->primary_pattern;
806 my $start_pattern = $self->start_pattern;
808 my $pos = 0;
810 my $new_primary_entry;
812 my $length;
814 my $fh = $FILE;
816 my $done = -1;
818 my @secondary_names = $self->secondary_namespaces;
819 my %secondary_id;
820 my $last_one;
822 while (<$fh>) {
823 $last_one = $_;
824 $self->{alphabet} ||= $self->guess_alphabet($_);
825 if ($_ =~ /$start_pattern/) {
826 if ($done == 0) {
827 $id = $new_primary_entry;
828 $self->{alphabet} ||= $self->guess_alphabet($_);
830 my $tmplen = (tell $fh) - length($_);
832 $length = $tmplen - $pos;
834 unless( defined($id)) {
835 $self->throw("No id defined for sequence");
837 unless( defined($fileid)) {
838 $self->throw("No fileid defined for file $file");
840 unless( defined($pos)) {
841 $self->throw("No position defined for " . $id . "\n");
843 unless( defined($length)) {
844 $self->throw("No length defined for " . $id . "\n");
846 $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
848 $pos = $tmplen;
850 if ($count > 0 && $count%1000 == 0) {
851 $self->debug( "Indexed $count ids\n") if $v > 0;
854 $count++;
855 } else {
856 $done = 0;
860 if ($_ =~ /$primary/) {
861 $new_primary_entry = $1;
864 my $secondary_patterns = $self->secondary_patterns;
866 foreach my $sec (@secondary_names) {
867 my $pattern = $secondary_patterns->{$sec};
869 if ($_ =~ /$pattern/) {
870 $secondary_id{$sec} = $1;
876 # Remember to add in the last one
878 $id = $new_primary_entry;
879 # my $tmplen = (tell $fh) - length($last_one);
880 my $tmplen = (tell $fh);
882 $length = $tmplen - $pos;
884 if (!defined($id)) {
885 $self->throw("No id defined for sequence");
887 if (!defined($fileid)) {
888 $self->throw("No fileid defined for file $file");
890 if (!defined($pos)) {
891 $self->throw("No position defined for " . $id . "\n");
893 if (!defined($length)) {
894 $self->throw("No length defined for " . $id . "\n");
897 $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
898 $count++;
900 close(FILE);
901 $count;
904 =head2 write_primary_index
906 Title : write_primary_index
907 Usage : $obj->write_primary_index($newval)
908 Function:
909 Example :
910 Returns : value of write_primary_index
911 Args : newvalue (optional)
914 =cut
916 sub write_primary_index {
917 my ($self) = @_;
919 my @ids = keys %{$self->{_id}};
921 @ids = sort {$a cmp $b} @ids;
923 open (my $INDEX,">" . $self->primary_index_file) ||
924 $self->throw("Can't open primary index file [" .
925 $self->primary_index_file . "]");
927 my $recordlength = $self->{_maxidlength} +
928 $self->{_maxfileidlength} +
929 $self->{_maxposlength} +
930 $self->{_maxlengthlength} + 3;
932 print $INDEX sprintf("%04d",$recordlength);
934 foreach my $id (@ids) {
936 if (!defined($self->{_id}{$id}{_fileid})) {
937 $self->throw("No fileid for $id\n");
939 if (!defined($self->{_id}{$id}{_pos})) {
940 $self->throw("No position for $id\n");
942 if (!defined($self->{_id}{$id}{_length})) {
943 $self->throw("No length for $id");
946 my $record = $id . "\t" .
947 $self->{_id}{$id}{_fileid} . "\t" .
948 $self->{_id}{$id}{_pos} . "\t" .
949 $self->{_id}{$id}{_length};
951 print $INDEX sprintf("%-${recordlength}s",$record);
956 =head2 write_secondary_indices
958 Title : write_secondary_indices
959 Usage : $obj->write_secondary_indices($newval)
960 Function:
961 Example :
962 Returns : value of write_secondary_indices
963 Args : newvalue (optional)
966 =cut
968 sub write_secondary_indices {
969 my ($self) = @_;
971 # These are the different
972 my @names = keys (%{$self->{_secondary_id}});
975 foreach my $name (@names) {
977 my @seconds = keys %{$self->{_secondary_id}{$name}};
979 # First we need to loop over to get the longest record.
980 my $length = 0;
982 foreach my $second (@seconds) {
983 my $tmplen = length($second) + 1;
984 my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
986 foreach my $prim (@prims) {
987 my $recordlen = $tmplen + length($prim);
989 if ($recordlen > $length) {
990 $length = $recordlen;
995 # Now we can print the index
997 my $fh = $self->new_secondary_filehandle($name);
999 print $fh sprintf("%04d",$length);
1000 @seconds = sort @seconds;
1002 foreach my $second (@seconds) {
1004 my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
1005 my $tmp = $second;
1007 foreach my $prim (@prims) {
1008 my $record = $tmp . "\t" . $prim;
1009 if (length($record) > $length) {
1010 $self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n");
1011 } else {
1012 print $fh sprintf("%-${length}s",$record);
1013 print $fh sprintf("%-${length}s",$record);
1018 close($fh);
1022 =head2 new_secondary_filehandle
1024 Title : new_secondary_filehandle
1025 Usage : $obj->new_secondary_filehandle($newval)
1026 Function:
1027 Example :
1028 Returns : value of new_secondary_filehandle
1029 Args : newvalue (optional)
1032 =cut
1034 sub new_secondary_filehandle {
1035 my ($self,$name) = @_;
1037 my $indexdir = $self->_config_path;
1039 my $secindex = File::Spec->catfile($indexdir,"id_$name.index");
1041 open(my $fh,">", $secindex) || $self->throw($!);
1042 return $fh;
1045 =head2 open_secondary_index
1047 Title : open_secondary_index
1048 Usage : $obj->open_secondary_index($newval)
1049 Function:
1050 Example :
1051 Returns : value of open_secondary_index
1052 Args : newvalue (optional)
1055 =cut
1057 sub open_secondary_index {
1058 my ($self,$name) = @_;
1060 if (!defined($self->{_secondary_filehandle}{$name})) {
1062 my $indexdir = $self->_config_path;
1063 my $secindex = $indexdir . "/id_$name.index";
1065 if (! -e $secindex) {
1066 $self->throw("Index is not present for namespace [$name]\n");
1069 open(my $newfh,"<", $secindex) || $self->throw($!);
1070 my $reclen = $self->read_header($newfh);
1072 $self->{_secondary_filehandle} {$name} = $newfh;
1073 $self->{_secondary_record_size}{$name} = $reclen;
1076 return $self->{_secondary_filehandle}{$name};
1080 =head2 _add_id_position
1082 Title : _add_id_position
1083 Usage : $obj->_add_id_position($newval)
1084 Function:
1085 Example :
1086 Returns : value of _add_id_position
1087 Args : newvalue (optional)
1090 =cut
1092 sub _add_id_position {
1093 my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_;
1095 if (!defined($id)) {
1096 $self->throw("No id defined. Can't add id position");
1098 if (!defined($pos)) {
1099 $self->throw("No position defined. Can't add id position");
1101 if ( ! defined($fileid)) {
1102 $self->throw("No fileid defined. Can't add id position");
1104 if (! defined($length) || $length <= 0) {
1105 $self->throw("No length defined or <= 0 [$length]. Can't add id position");
1108 $self->{_id}{$id}{_pos} = $pos;
1109 $self->{_id}{$id}{_length} = $length;
1110 $self->{_id}{$id}{_fileid} = $fileid;
1112 # Now the secondary ids
1114 foreach my $sec (keys (%$secondary_id)) {
1115 my $value = $secondary_id->{$sec};
1116 $self->{_secondary_id}{$sec}{$value}{$id} = 1;
1119 $self->{_maxidlength} = length($id)
1120 if !exists $self->{_maxidlength} or
1121 length($id) >= $self->{_maxidlength};
1123 $self->{_maxfileidlength} = length($fileid)
1124 if !exists $self->{_maxfileidlength} or
1125 length($fileid) >= $self->{_maxfileidlength};
1127 $self->{_maxposlength} = length($pos)
1128 if !exists $self->{_maxposlength} or
1129 length($pos) >= $self->{_maxposlength};
1131 $self->{_maxlengthlength} = length($length)
1132 if !exists $self->{_maxlengthlength} or
1133 length($length) >= $self->{_maxlengthlength};
1136 =head2 make_config_file
1138 Title : make_config_file
1139 Usage : $obj->make_config_file($newval)
1140 Function:
1141 Example :
1142 Returns : value of make_config_file
1143 Args : newvalue (optional)
1145 =cut
1147 sub make_config_file {
1148 my ($self,$files) = @_;
1150 my @files = @$files;
1152 my $configfile = $self->_config_file;
1154 open(my $CON,">", $configfile) || $self->throw("Can't create config file [$configfile]");
1156 # First line must be the type of index - in this case flat
1157 print $CON "index\tflat/1\n";
1159 # Now the fileids
1161 my $count = 0;
1163 foreach my $file (@files) {
1165 my $size = -s $file;
1167 print $CON "fileid_$count\t$file\t$size\n";
1169 my $fh;
1170 open($fh,"<", $file) || $self->throw($!);
1171 $self->{_file} {$count} = $file;
1172 $self->{_dbfile}{$file} = $count;
1173 $self->{_size}{$count} = $size;
1174 $count++;
1177 # Now the namespaces
1179 print $CON "primary_namespace\t" .$self->primary_namespace. "\n";
1181 # Needs fixing for the secondary stuff
1183 my $second_patterns = $self->secondary_patterns;
1185 my @second = keys %$second_patterns;
1187 if ((@second)) {
1188 print $CON "secondary_namespaces";
1190 foreach my $second (@second) {
1191 print $CON "\t$second";
1193 print $CON "\n";
1196 # Now the config format
1198 unless (defined ($self->format) ) {
1199 $self->throw("Format does not exist in module - can't write config file");
1200 } else {
1201 my $format = $self->format;
1202 my $alphabet = $self->alphabet;
1203 my $alpha = $alphabet ? "/$alphabet" : '';
1204 print $CON "format\t" . "$format\n";
1206 close($CON);
1209 =head2 read_config_file
1211 Title : read_config_file
1212 Usage : $obj->read_config_file($newval)
1213 Function:
1214 Example :
1215 Returns : value of read_config_file
1216 Args : newvalue (optional)
1218 =cut
1220 sub read_config_file {
1221 my ($self) = @_;
1222 my $configfile = $self->_config_file;
1223 return unless -e $configfile;
1225 open(my $CON,"<", $configfile) || $self->throw("Can't open configfile [$configfile]");
1227 # First line must be type
1228 my $line = <$CON>;
1229 chomp($line);
1230 my $version;
1232 # This is hard coded as we only index flatfiles here
1233 if ($line =~ m{index\tflat/(\d+)}) {
1234 $version = $1;
1235 } else {
1236 $self->throw("First line not compatible with flat file index. Should be something like\n\nindex\tflat/1");
1239 $self->index_type("flat");
1240 $self->index_version($version);
1242 while (<$CON>) {
1243 chomp;
1245 # Look for fileid lines
1246 if ($_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/) {
1247 my $fileid = $1;
1248 my $filename = $2;
1249 my $filesize = $3;
1251 if (! -e $filename) {
1252 $self->throw("File [$filename] does not exist!");
1254 if (-s $filename != $filesize) {
1255 $self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s $filename) . "] Index thinks it is [" . $filesize . "]");
1258 my $fh;
1259 open($fh,"<", $filename) || $self->throw($!);
1260 $self->{_file} {$fileid} = $filename;
1261 $self->{_dbfile}{$filename} = $fileid;
1262 $self->{_size} {$fileid} = $filesize;
1265 # Look for namespace lines
1266 if ( /(.*)_namespaces?\t(.+)/ ) {
1267 if ($1 eq "primary") {
1268 $self->primary_namespace($2);
1269 } elsif ($1 eq "secondary") {
1270 $self->secondary_namespaces(split "\t",$2);
1271 } else {
1272 $self->throw("Unknown namespace name in config file [$1");
1276 # Look for format lines
1277 if ($_ =~ /format\t(\S+)/) {
1278 # Check the format here?
1279 my $format = $1;
1281 # handle LSID format
1282 if ($format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))?/) {
1283 $self->format($1);
1284 $self->alphabet($2);
1285 } else { # compatibility with older versions
1286 $self->format($1);
1291 close($CON);
1293 # Now check we have all that we need
1295 my @fileid_keys = keys (%{$self->{_file}});
1297 if (!(@fileid_keys)) {
1298 $self->throw("No flatfile fileid files in config - check the index has been made correctly");
1301 if (!defined($self->primary_namespace)) {
1302 $self->throw("No primary namespace exists");
1305 if (! -e $self->primary_index_file) {
1306 $self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist");
1312 =head2 get_fileid_by_filename
1314 Title : get_fileid_by_filename
1315 Usage : $obj->get_fileid_by_filename($newval)
1316 Function:
1317 Example :
1318 Returns : value of get_fileid_by_filename
1319 Args : newvalue (optional)
1321 =cut
1323 sub get_fileid_by_filename {
1324 my ($self,$file) = @_;
1326 if (!defined($self->{_dbfile})) {
1327 $self->throw("No file to fileid mapping present. Has the fileid file been read?");
1331 return $self->{_dbfile}{$file};
1334 =head2 get_filehandle_by_fileid
1336 Title : get_filehandle_by_fileid
1337 Usage : $obj->get_filehandle_by_fileid($newval)
1338 Function:
1339 Example :
1340 Returns : value of get_filehandle_by_fileid
1341 Args : newvalue (optional)
1343 =cut
1345 sub get_filehandle_by_fileid {
1346 my ($self,$fileid) = @_;
1348 if (!defined($self->{_file}{$fileid})) {
1349 $self->throw("ERROR: undefined fileid in index [$fileid]");
1352 my $fh;
1353 open($fh,"<", $self->{_file}{$fileid}) || $self->throw($!);
1354 return $fh;
1357 =head2 primary_index_file
1359 Title : primary_index_file
1360 Usage : $obj->primary_index_file($newval)
1361 Function:
1362 Example :
1363 Returns : value of primary_index_file
1364 Args : newvalue (optional)
1367 =cut
1369 sub primary_index_file {
1370 my ($self) = @_;
1372 return File::Spec->catfile($self->_config_path,"key_" . $self->primary_namespace . ".key");
1375 =head2 primary_index_filehandle
1377 Title : primary_index_filehandle
1378 Usage : $obj->primary_index_filehandle($newval)
1379 Function:
1380 Example :
1381 Returns : value of primary_index_filehandle
1382 Args : newvalue (optional)
1385 =cut
1387 sub primary_index_filehandle {
1388 my ($self) = @_;
1390 unless (defined ($self->{'_primary_index_handle'})) {
1391 open($self->{'_primary_index_handle'}, "<" . $self->primary_index_file) || self->throw($@);
1393 return $self->{'_primary_index_handle'};
1396 =head2 format
1398 Title : format
1399 Usage : $obj->format($newval)
1400 Function:
1401 Example :
1402 Returns : value of format
1403 Args : newvalue (optional)
1406 =cut
1408 sub format{
1409 my ($obj,$value) = @_;
1410 if( defined $value) {
1411 $obj->{'format'} = $value;
1413 return $obj->{'format'};
1417 sub alphabet{
1418 my ($obj,$value) = @_;
1419 if( defined $value) {
1420 $obj->{alphabet} = $value;
1422 return $obj->{alphabet};
1425 =head2 write_flag
1427 Title : write_flag
1428 Usage : $obj->write_flag($newval)
1429 Function:
1430 Example :
1431 Returns : value of write_flag
1432 Args : newvalue (optional)
1435 =cut
1437 sub write_flag{
1438 my ($obj,$value) = @_;
1439 if( defined $value) {
1440 $obj->{'write_flag'} = $value;
1442 return $obj->{'write_flag'};
1446 =head2 dbname
1448 Title : dbname
1449 Usage : $obj->dbname($newval)
1450 Function: get/set database name
1451 Example :
1452 Returns : value of dbname
1453 Args : newvalue (optional)
1455 =cut
1457 sub dbname {
1458 my $self = shift;
1459 my $d = $self->{flat_dbname};
1460 $self->{flat_dbname} = shift if @_;
1464 =head2 index_directory
1466 Title : index_directory
1467 Usage : $obj->index_directory($newval)
1468 Function:
1469 Example :
1470 Returns : value of index_directory
1471 Args : newvalue (optional)
1474 =cut
1476 sub index_directory {
1477 my ($self,$arg) = @_;
1479 if (defined($arg)) {
1480 if ($arg !~ m{/$}) {
1481 $arg .= "/";
1483 $self->{_index_directory} = $arg;
1485 return $self->{_index_directory};
1489 sub _config_path {
1490 my $self = shift;
1491 my $root = $self->index_directory;
1492 my $dbname = $self->dbname;
1493 File::Spec->catfile($root,$dbname);
1496 sub _config_file {
1497 my $self = shift;
1498 my $path = $self->_config_path;
1499 File::Spec->catfile($path,CONFIG_FILE_NAME);
1502 =head2 record_size
1504 Title : record_size
1505 Usage : $obj->record_size($newval)
1506 Function:
1507 Example :
1508 Returns : value of record_size
1509 Args : newvalue (optional)
1512 =cut
1514 sub record_size {
1515 my $self = shift;
1516 $self->{_record_size} = shift if @_;
1517 return $self->{_record_size};
1520 =head2 primary_namespace
1522 Title : primary_namespace
1523 Usage : $obj->primary_namespace($newval)
1524 Function:
1525 Example :
1526 Returns : value of primary_namespace
1527 Args : newvalue (optional)
1529 =cut
1531 sub primary_namespace {
1532 my $self = shift;
1533 $self->{_primary_namespace} = shift if @_;
1534 return $self->{_primary_namespace};
1537 =head2 index_type
1539 Title : index_type
1540 Usage : $obj->index_type($newval)
1541 Function:
1542 Example :
1543 Returns : value of index_type
1544 Args : newvalue (optional)
1547 =cut
1549 sub index_type {
1550 my $self = shift;
1551 $self->{_index_type} = shift if @_;
1552 return $self->{_index_type};
1555 =head2 index_version
1557 Title : index_version
1558 Usage : $obj->index_version($newval)
1559 Function:
1560 Example :
1561 Returns : value of index_version
1562 Args : newvalue (optional)
1565 =cut
1567 sub index_version {
1568 my $self = shift;
1569 $self->{_index_version} = shift if @_;
1570 return $self->{_index_version};
1573 =head2 primary_pattern
1575 Title : primary_pattern
1576 Usage : $obj->primary_pattern($newval)
1577 Function:
1578 Example :
1579 Returns : value of primary_pattern
1580 Args : newvalue (optional)
1583 =cut
1585 sub primary_pattern{
1586 my $obj = shift;
1587 $obj->{'primary_pattern'} = shift if @_;
1588 return $obj->{'primary_pattern'};
1591 =head2 start_pattern
1593 Title : start_pattern
1594 Usage : $obj->start_pattern($newval)
1595 Function:
1596 Example :
1597 Returns : value of start_pattern
1598 Args : newvalue (optional)
1601 =cut
1603 sub start_pattern{
1604 my $obj = shift;
1605 $obj->{'start_pattern'} = shift if @_;
1606 return $obj->{'start_pattern'};
1609 =head2 secondary_patterns
1611 Title : secondary_patterns
1612 Usage : $obj->secondary_patterns($newval)
1613 Function:
1614 Example :
1615 Returns : value of secondary_patterns
1616 Args : newvalue (optional)
1619 =cut
1621 sub secondary_patterns{
1622 my ($obj,$value) = @_;
1623 if( defined $value) {
1624 $obj->{'secondary_patterns'} = $value;
1626 my @names = keys %$value;
1628 foreach my $name (@names) {
1629 $obj->secondary_namespaces($name);
1632 return $obj->{'secondary_patterns'};
1636 =head2 secondary_namespaces
1638 Title : secondary_namespaces
1639 Usage : $obj->secondary_namespaces($newval)
1640 Function:
1641 Example :
1642 Returns : value of secondary_namespaces
1643 Args : newvalue (optional)
1646 =cut
1648 sub secondary_namespaces {
1649 my ($obj,@values) = @_;
1651 if (@values) {
1652 push(@{$obj->{'secondary_namespaces'}},@values);
1654 return @{$obj->{'secondary_namespaces'} || []};
1659 ## These are indexing routines to index commonly used format - fasta
1660 ## swissprot and embl
1662 sub new_SWISSPROT_index {
1663 my ($self,$index_dir,@files) = @_;
1665 my %secondary_patterns;
1667 my $start_pattern = "^ID (\\S+)";
1668 my $primary_pattern = "^AC (\\S+)\\;";
1670 $secondary_patterns{"ID"} = $start_pattern;
1672 my $index = Bio::DB::Flat::BinarySearch->new
1673 (-index_dir => $index_dir,
1674 -format => 'swissprot',
1675 -primary_pattern => $primary_pattern,
1676 -primary_namespace => "ACC",
1677 -start_pattern => $start_pattern,
1678 -secondary_patterns => \%secondary_patterns);
1680 $index->build_index(@files);
1683 sub new_EMBL_index {
1684 my ($self,$index_dir,@files) = @_;
1686 my %secondary_patterns;
1688 my $start_pattern = "^ID (\\S+)";
1689 my $primary_pattern = "^AC (\\S+)\\;";
1690 my $primary_namespace = "ACC";
1692 $secondary_patterns{"ID"} = $start_pattern;
1694 my $index = Bio::DB::Flat::BinarySearch->new
1695 (-index_dir => $index_dir,
1696 -format => 'embl',
1697 -primary_pattern => $primary_pattern,
1698 -primary_namespace => "ACC",
1699 -start_pattern => $start_pattern,
1700 -secondary_patterns => \%secondary_patterns);
1702 $index->build_index(@files);
1704 return $index;
1707 sub new_FASTA_index {
1708 my ($self,$index_dir,@files) = @_;
1710 my %secondary_patterns;
1712 my $start_pattern = "^>";
1713 my $primary_pattern = "^>(\\S+)";
1714 my $primary_namespace = "ACC";
1716 $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1718 my $index = Bio::DB::Flat::BinarySearch->new
1719 (-index_dir => $index_dir,
1720 -format => 'fasta',
1721 -primary_pattern => $primary_pattern,
1722 -primary_namespace => "ACC",
1723 -start_pattern => $start_pattern,
1724 -secondary_patterns => \%secondary_patterns);
1726 $index->build_index(@files);
1728 return $index;
1731 # EVERYTHING THAT FOLLOWS THIS
1732 # is an awful hack - in reality Michele's code needs to be rewritten
1733 # to use Bio::SeqIO, but I have too little time to do this -- LS
1734 sub guess_alphabet {
1735 my $self = shift;
1736 my $line = shift;
1738 my $format = $self->format;
1739 return 'protein' if $format eq 'swissprot';
1741 if ($format eq 'genbank') {
1742 return unless $line =~ /^LOCUS/;
1743 return 'dna' if $line =~ /\s+\d+\s+bp/i;
1744 return 'protein';
1747 if ($format eq 'embl') {
1748 return unless $line =~ /^ID/;
1749 return 'dna' if $line =~ / DNA;/i;
1750 return 'rna' if $line =~ / RNA;/i;
1751 return 'protein';
1754 return;
1757 # return (namespace,primary_pattern,start_pattern,secondary_pattern)
1758 sub _guess_patterns {
1759 my $self = shift;
1760 my $format = shift;
1761 if ($format =~ /swiss(prot)?/i) {
1762 return ('ID',
1763 "^ID (\\S+)",
1764 "^ID (\\S+)",
1766 ACC => "^AC (\\S+);"
1770 if ($format =~ /embl/i) {
1771 return ('ID',
1772 "^ID (\\S+)",
1773 "^ID (\\S+)",
1775 ACC => q/^AC (\S+);/,
1776 VERSION => q/^SV\s+(\S+)/
1780 if ($format =~ /genbank/i) {
1781 return ('ID',
1782 q/^LOCUS\s+(\S+)/,
1783 q/^LOCUS/,
1785 ACC => q/^ACCESSION\s+(\S+)/,
1786 VERSION => q/^VERSION\s+(\S+)/
1790 if ($format =~ /fasta/i) {
1791 return ('ACC',
1792 '^>(\S+)',
1793 '^>(\S+)',
1797 $self->throw("I can't handle format $format");