3 # BioPerl module for Bio::DB::Flat::BinarySearch
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Michele Clamp <michele@sanger.ac.uk>>
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::DB::Flat::BinarySearch - BinarySearch search indexing system for sequence files
23 This module can be used both to index sequence files and also to
24 retrieve sequences from existing sequence files.
26 This object allows indexing of sequence files both by a primary key
27 (say accession) and multiple secondary keys (say ids). This is
28 different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>)
29 which uses DBM files as storage. This module uses a binary search to
30 retrieve sequences which is more efficient for large datasets.
34 my $sequencefile; # Some fasta sequence file
36 Patterns have to be entered to define where the keys are to be indexed
37 and also where the start of each record. E.g. for fasta
39 my $start_pattern = '^>';
40 my $primary_pattern = '^>(\S+)';
42 So the start of a record is a line starting with a E<gt> and the
43 primary key is all characters up to the first space after the E<gt>
45 A string also has to be entered to defined what the primary key
46 (primary_namespace) is called.
48 The index can now be created using
50 my $index = Bio::DB::Flat::BinarySearch->new(
51 -directory => "/home/max/",
53 -start_pattern => $start_pattern,
54 -primary_pattern => $primary_pattern,
55 -primary_namespace => "ID",
58 my @files = ("file1","file2","file3");
60 $index->build_index(@files);
62 The index is now ready to use. For large sequence files the perl way
63 of indexing takes a *long* time and a *huge* amount of memory. For
64 indexing things like dbEST I recommend using the DB_File indexer, BDB.
66 The formats currently supported by this module are fasta, Swissprot,
69 =head2 Creating indices with secondary keys
71 Sometimes just indexing files with one id per entry is not enough. For
72 instance you may want to retrieve sequences from swissprot using
73 their accessions as well as their ids.
75 To be able to do this when creating your index you need to pass in
76 a hash of secondary_patterns which have their namespaces as the keys
79 e.g. For Indexing something like
81 ID 1433_CAEEL STANDARD; PRT; 248 AA.
83 DT 01-NOV-1995 (Rel. 32, Created)
84 DT 01-NOV-1995 (Rel. 32, Last sequence update)
85 DT 15-DEC-1998 (Rel. 37, Last annotation update)
86 DE 14-3-3-LIKE PROTEIN 1.
88 OS Caenorhabditis elegans.
89 OC Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
90 OC Rhabditidae; Peloderinae; Caenorhabditis.
94 where we want to index the accession (P41932) as the primary key and the
95 id (1433_CAEEL) as the secondary id. The index is created as follows
97 my %secondary_patterns;
99 my $start_pattern = '^ID (\S+)';
100 my $primary_pattern = '^AC (\S+)\;';
102 $secondary_patterns{"ID"} = '^ID (\S+)';
104 my $index = Bio::DB::Flat::BinarySearch->new(
105 -directory => $index_directory,
109 -start_pattern => $start_pattern,
110 -primary_pattern => $primary_pattern,
111 -primary_namespace => 'AC',
112 -secondary_patterns => \%secondary_patterns);
114 $index->build_index($seqfile);
116 Of course having secondary indices makes indexing slower and use more
121 To fetch sequences using an existing index first of all create your sequence
124 my $index = Bio::DB::Flat::BinarySearch->new(
125 -directory => $index_directory);
127 Now you can happily fetch sequences either by the primary key or
128 by the secondary keys.
130 my $entry = $index->get_entry_by_id('HBA_HUMAN');
132 This returns just a string containing the whole entry. This is
133 useful is you just want to print the sequence to screen or write it to a file.
135 Other ways of getting sequences are
137 my $fh = $index->get_stream_by_id('HBA_HUMAN');
139 This can then be passed to a seqio object for output or converting
142 my $seq = Bio::SeqIO->new(-fh => $fh,
145 The last way is to retrieve a sequence directly. This is the
146 slowest way of extracting as the sequence objects need to be made.
148 my $seq = $index->get_Seq_by_id('HBA_HUMAN');
150 To access the secondary indices the secondary namespace needs to be known
152 $index->secondary_namespaces("ID");
154 Then the following call can be used
156 my $seq = $index->get_Seq_by_secondary('ID','1433_CAEEL');
158 These calls are not yet implemented
160 my $fh = $index->get_stream_by_secondary('ID','1433_CAEEL');
161 my $entry = $index->get_entry_by_secondary('ID','1433_CAEEL');
167 User feedback is an integral part of the evolution of this and other
168 Bioperl modules. Send your comments and suggestions preferably to one
169 of the Bioperl mailing lists. Your participation is much appreciated.
171 bioperl-l@bioperl.org - General discussion
172 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
176 Please direct usage questions or support issues to the mailing list:
178 L<bioperl-l@bioperl.org>
180 rather than to the module maintainer directly. Many experienced and
181 reponsive experts will be able look at the problem and quickly
182 address it. Please include a thorough description of the problem
183 with code and data examples if at all possible.
185 =head2 Reporting Bugs
187 Report bugs to the Bioperl bug tracking system to help us keep track
188 the bugs and their resolution. Bug reports can be submitted via the
191 http://bugzilla.open-bio.org/
193 =head1 AUTHOR - Michele Clamp
195 Email - michele@sanger.ac.uk
199 Jason Stajich, jason@bioperl.org
203 The rest of the documentation details each of the object methods. Internal
204 methods are usually preceded with an "_" (underscore).
208 package Bio
::DB
::Flat
::BinarySearch
;
212 use Fcntl
qw(SEEK_END SEEK_CUR);
213 # rather than using tell which might be buffered
214 sub systell
{ sysseek($_[0], 0, SEEK_CUR
) }
215 sub syseof
{ sysseek($_[0], 0, SEEK_END
) }
218 use Bio
::Root
::RootI
;
222 use base
qw(Bio::DB::RandomAccessI);
224 use constant CONFIG_FILE_NAME
=> 'config.dat';
225 use constant HEADER_SIZE
=> 4;
226 use constant DEFAULT_FORMAT
=> 'fasta';
228 my @formats = ['FASTA','SWISSPROT','EMBL'];
234 my $index = Bio::DB::Flat::BinarySearch->new(
235 -directory => '/Users/michele/indices/dbest',
241 my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
242 my $index = Bio::DB::Flat::BinarySearch->new(
243 -directory => '/Users/michele/indices',
245 -primary_pattern => "^>(\\S+)",
246 -secondary_patterns => \%secondary_patterns,
247 -primary_namespace => "ID");
249 my @files = ('file1','file2','file3');
251 $index->build_index(@files);
254 Function: create a new Bio::DB::Flat::BinarySearch object
255 Returns : new Bio::DB::Flat::BinarySearch
256 Args : -directory Root directory for index files
257 -dbname Name of subdirectory containing indices
259 -write_flag Allow building index
260 -primary_pattern Regexp defining the primary id
261 -secondary_patterns A hash ref containing the secondary
262 patterns with the namespaces as keys
263 -primary_namespace A string defining what the primary key
271 my($class, @args) = @_;
273 my $self = $class->SUPER::new
(@args);
277 my ($index_dir,$dbname,$format,$write_flag,$primary_pattern,
278 $primary_namespace,$start_pattern,$secondary_patterns) =
279 $self->_rearrange([qw(DIRECTORY
286 SECONDARY_PATTERNS)], @args);
288 $self->index_directory($index_dir);
289 $self->dbname($dbname);
291 if ($self->index_directory && $self->read_config_file) {
293 my $fh = $self->primary_index_filehandle;
294 my $record_width = $self->read_header($fh);
295 $self->record_size($record_width);
297 $format ||= DEFAULT_FORMAT
;
298 $self->format ($format);
299 $self->write_flag ($write_flag);
301 if ($self->write_flag && ! $primary_namespace) {
302 ($primary_namespace,$primary_pattern,
303 $start_pattern,$secondary_patterns) =
304 $self->_guess_patterns($self->format);
307 $self->primary_pattern ($primary_pattern);
308 $self->primary_namespace ($primary_namespace);
309 $self->start_pattern ($start_pattern);
310 $self->secondary_patterns($secondary_patterns);
315 sub new_from_registry
{
316 my ($self,%config) = @_;
318 my $dbname = $config{'dbname'};
319 my $location = $config{'location'};
321 my $index = Bio
::DB
::Flat
::BinarySearch
->new(-dbname
=> $dbname,
322 -index_dir
=> $location,
328 Title : get_Seq_by_id
329 Usage : $obj->get_Seq_by_id($newval)
332 Returns : value of get_Seq_by_id
333 Args : newvalue (optional)
340 # too many uninit variables...
343 my ($fh,$length) = $self->get_stream_by_id($id);
345 unless (defined($self->format)) {
346 $self->throw("Can't create sequence - format is not defined");
351 unless ( defined($self->{_seqio
}) ) {
353 $self->{_seqio
} = Bio
::SeqIO
->new(-fh
=> $fh,
354 -format
=> $self->format);
356 $self->{_seqio
}->fh($fh);
359 return $self->{_seqio
}->next_seq;
362 =head2 get_entry_by_id
364 Title : get_entry_by_id
365 Usage : $obj->get_entry_by_id($newval)
366 Function: Get a Bio::SeqI object for a unique ID
373 sub get_entry_by_id
{
376 my ($fh,$length) = $self->get_stream_by_id($id);
380 sysread($fh,$entry,$length);
386 =head2 get_stream_by_id
388 Title : get_stream_by_id
389 Usage : $obj->get_stream_by_id($id)
390 Function: Gets a Sequence stream for an id
391 Returns : Bio::SeqIO stream
392 Args : Id to lookup by
397 sub get_stream_by_id
{
400 unless( $self->record_size ) {
401 if ($self->index_directory && $self->read_config_file) {
403 my $fh = $self->primary_index_filehandle;
404 my $record_width = $self->read_header($fh);
405 $self->record_size($record_width);
408 my $indexfh = $self->primary_index_filehandle;
411 my $filesize = systell
($indexfh);
413 $self->throw("file was not parsed properly, record size is empty")
414 unless $self->record_size;
416 my $end = ($filesize - $self->{'_start_pos'}) / $self->record_size;
417 my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size);
420 my ($fileid,$pos,$length) = split(/\t/,$rest);
422 #print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
428 my $file = $self->{_file
}{$fileid};
430 open (my $IN,"<$file");
436 return ($IN,$length);
439 =head2 get_Seq_by_acc
441 Title : get_Seq_by_acc
442 Usage : $obj->get_Seq_by_acc($acc)
443 Function: Gets a Bio::SeqI object by accession number
444 Returns : Bio::SeqI object
445 Args : string representing accession number
451 my ($self,$acc) = @_;
453 # too many uninit variables...
456 if ($self->primary_namespace eq "ACC") {
457 return $self->get_Seq_by_id($acc);
459 return $self->get_Seq_by_secondary("ACC",$acc);
463 =head2 get_Seq_by_version
465 Title : get_Seq_by_version
466 Usage : $obj->get_Seq_by_version($version)
467 Function: Gets a Bio::SeqI object by accession.version number
468 Returns : Bio::SeqI object
469 Args : string representing accession.version number
474 sub get_Seq_by_version
{
475 my ($self,$acc) = @_;
477 # too many uninit variables...
480 if ($self->primary_namespace eq "VERSION") {
481 return $self->get_Seq_by_id($acc);
483 return $self->get_Seq_by_secondary("VERSION",$acc);
487 =head2 get_Seq_by_secondary
489 Title : get_Seq_by_secondary
490 Usage : $obj->get_Seq_by_secondary($namespace,$acc)
491 Function: Gets a Bio::SeqI object looking up secondary accessions
492 Returns : Bio::SeqI object
493 Args : namespace name to check secondary namespace and an id
498 sub get_Seq_by_secondary
{
499 my ($self,$name,$id) = @_;
501 my @names = $self->secondary_namespaces;
504 foreach my $tmpname (@names) {
505 if ($name eq $tmpname) {
511 $self->throw("Secondary index for $name doesn't exist\n");
514 my $fh = $self->open_secondary_index($name);
518 my $filesize = systell
($fh);
520 my $recsize = $self->{'_secondary_record_size'}{$name};
521 # print "Name " . $recsize . "\n";
523 my $end = ($filesize - $self->{'_start_pos'})/$recsize;
525 # print "End $end $filesize\n";
526 my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize);
530 # print "Found new id $newid $primary_id\n";
531 # We now need to shuffle up the index file to find the top secondary entry
535 while ($record =~ /^$newid/ && $pos >= 0) {
537 $record = $self->read_record($fh,$pos,$recsize);
538 $pos = $pos - $recsize;
539 # print "Up record = $record:$newid\n";
544 # print "Top position is $pos\n";
546 # Now we have to shuffle back down again to read all the secondary entries
548 my $current_id = $newid;
551 $primary_id{$primary_id} = 1;
553 while ($current_id eq $newid) {
554 $record = $self->read_record($fh,$pos,$recsize);
555 # print "Record is :$record:\n";
556 my ($secid,$primary_id) = split(/\t/,$record,2);
557 $current_id = $secid;
559 if ($current_id eq $newid) {
560 $primary_id =~ s/ //g;
561 # print "Primary $primary_id\n";
562 $primary_id{$primary_id} = 1;
564 $pos = $pos + $recsize;
565 # print "Down record = $record\n";
569 if (!defined($newid)) {
575 foreach my $id (keys %primary_id) {
576 push @entry,$self->get_Seq_by_id($id);
578 return wantarray ?
@entry : $entry[0];
585 Usage : $obj->read_header($fhl)
586 Function: Reads the header from the db file
587 Returns : width of a record
598 sysread($fh,$record_width,HEADER_SIZE
);
600 $self->{'_start_pos'} = HEADER_SIZE
;
601 $record_width =~ s/ //g;
602 $record_width = $record_width * 1;
604 return $record_width;
610 Usage : $obj->read_record($fh,$pos,$len)
611 Function: Reads a record from a filehandle
613 Args : filehandle, offset, and length
619 my ($self,$fh,$pos,$len) = @_;
625 sysread($fh,$record,$len);
631 =head2 get_all_primary_ids
633 Title : get_all_primary_ids
634 Usage : @ids = $seqdb->get_all_primary_ids()
635 Function: gives an array of all the primary_ids of the
636 sequence objects in the database.
637 Returns : an array of strings
642 sub get_all_primary_ids
{
645 my $fh = $self->primary_index_filehandle;
647 my $filesize = systell
($fh);
648 my $recsize = $self->record_size;
652 for (my $pos=$self->{'_start_pos'}; $pos < $end; $pos += $recsize) {
653 my $record = $self->read_record($fh,$pos,$recsize);
654 my ($entryid) = split(/\t/,$record);
664 Usage : $obj->find_entry($fh,$start,$end,$id,$recsize)
665 Function: Extract an entry based on the start,end,id and record size
667 Args : filehandle, start, end, id, recordsize
673 my ($self,$fh,$start,$end,$id,$recsize) = @_;
675 my $mid = int( ($end+1+$start) / 2);
676 my $pos = ($mid-1)*$recsize + $self->{'_start_pos'};
678 my ($record) = $self->read_record($fh,$pos,$recsize);
679 my ($entryid,$rest) = split(/\t/,$record,2);
682 # print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
683 # print "Entry :$id:$entryid:$rest\n";
685 my ($first,$second) = $id le $entryid ?
($id,$entryid) : ($entryid,$id);
687 if ($id eq $entryid) {
689 return ($id,$rest,$pos-$recsize);
691 } elsif ($first eq $id) {
693 if ($end-$start <= 1) {
697 # print "Moving up $entryid $id\n";
698 $self->find_entry($fh,$start,$end,$id,$recsize);
700 } elsif ($second eq $id ) {
701 # print "Moving down $entryid $id\n";
702 if ($end-$start <= 1) {
708 $self->find_entry($fh,$start,$end,$id,$recsize);
717 Usage : $obj->build_index(@files)
718 Function: Build the index based on a set of files
719 Returns : count of the number of entries
720 Args : List of filenames
726 my ($self,@files) = @_;
728 $self->throw('Cannot build index unless -write_flag is true');
730 my $rootdir = $self->index_directory;
732 if (!defined($rootdir)) {
733 $self->throw("No index directory set - can't build indices");
737 $self->throw("Index directory [$rootdir] is not a directory. Cant' build indices");
740 my $dbpath = File
::Spec
->catfile($rootdir,$self->dbname);
742 warn "Creating directory $dbpath\n";
743 mkdir $dbpath,0777 or $self->throw("Couldn't create $dbpath: $!");
747 $self->throw("Must enter an array of filenames to index");
750 foreach my $file (@files) {
751 $file = File
::Spec
->rel2abs($file)
752 unless File
::Spec
->file_name_is_absolute($file);
754 $self->throw("Can't index file [$file] as it doesn't exist");
758 if (my $filehash = $self->{_dbfile
}) {
759 push @files,keys %$filehash;
763 @files = grep {!$seen{$_}++} @files;
766 $self->make_config_file(\
@files);
768 foreach my $file (@files) {
769 $entries += $self->_index_file($file);
772 # update alphabet if necessary
773 $self->make_config_file(\
@files);
775 # And finally write out the indices
776 $self->write_primary_index;
777 $self->write_secondary_indices;
785 Usage : $obj->_index_file($newval)
788 Returns : value of _index_file
789 Args : newvalue (optional)
795 my ($self,$file) = @_;
796 my $v = $self->verbose;
797 open(my $FILE,"<", $file) || $self->throw("Can't open file [$file]");
800 my $fileid = $self->get_fileid_by_filename($file);
805 my $primary = $self->primary_pattern;
806 my $start_pattern = $self->start_pattern;
810 my $new_primary_entry;
818 my @secondary_names = $self->secondary_namespaces;
824 $self->{alphabet
} ||= $self->guess_alphabet($_);
825 if ($_ =~ /$start_pattern/) {
827 $id = $new_primary_entry;
828 $self->{alphabet
} ||= $self->guess_alphabet($_);
830 my $tmplen = (tell $fh) - length($_);
832 $length = $tmplen - $pos;
834 unless( defined($id)) {
835 $self->throw("No id defined for sequence");
837 unless( defined($fileid)) {
838 $self->throw("No fileid defined for file $file");
840 unless( defined($pos)) {
841 $self->throw("No position defined for " . $id . "\n");
843 unless( defined($length)) {
844 $self->throw("No length defined for " . $id . "\n");
846 $self->_add_id_position($id,$pos,$fileid,$length,\
%secondary_id);
850 if ($count > 0 && $count%1000 == 0) {
851 $self->debug( "Indexed $count ids\n") if $v > 0;
860 if ($_ =~ /$primary/) {
861 $new_primary_entry = $1;
864 my $secondary_patterns = $self->secondary_patterns;
866 foreach my $sec (@secondary_names) {
867 my $pattern = $secondary_patterns->{$sec};
869 if ($_ =~ /$pattern/) {
870 $secondary_id{$sec} = $1;
876 # Remember to add in the last one
878 $id = $new_primary_entry;
879 # my $tmplen = (tell $fh) - length($last_one);
880 my $tmplen = (tell $fh);
882 $length = $tmplen - $pos;
885 $self->throw("No id defined for sequence");
887 if (!defined($fileid)) {
888 $self->throw("No fileid defined for file $file");
890 if (!defined($pos)) {
891 $self->throw("No position defined for " . $id . "\n");
893 if (!defined($length)) {
894 $self->throw("No length defined for " . $id . "\n");
897 $self->_add_id_position($id,$pos,$fileid,$length,\
%secondary_id);
904 =head2 write_primary_index
906 Title : write_primary_index
907 Usage : $obj->write_primary_index($newval)
910 Returns : value of write_primary_index
911 Args : newvalue (optional)
916 sub write_primary_index
{
919 my @ids = keys %{$self->{_id
}};
921 @ids = sort {$a cmp $b} @ids;
923 open (my $INDEX,">" . $self->primary_index_file) ||
924 $self->throw("Can't open primary index file [" .
925 $self->primary_index_file . "]");
927 my $recordlength = $self->{_maxidlength
} +
928 $self->{_maxfileidlength
} +
929 $self->{_maxposlength
} +
930 $self->{_maxlengthlength
} + 3;
932 print $INDEX sprintf("%04d",$recordlength);
934 foreach my $id (@ids) {
936 if (!defined($self->{_id
}{$id}{_fileid
})) {
937 $self->throw("No fileid for $id\n");
939 if (!defined($self->{_id
}{$id}{_pos
})) {
940 $self->throw("No position for $id\n");
942 if (!defined($self->{_id
}{$id}{_length
})) {
943 $self->throw("No length for $id");
946 my $record = $id . "\t" .
947 $self->{_id
}{$id}{_fileid
} . "\t" .
948 $self->{_id
}{$id}{_pos
} . "\t" .
949 $self->{_id
}{$id}{_length
};
951 print $INDEX sprintf("%-${recordlength}s",$record);
956 =head2 write_secondary_indices
958 Title : write_secondary_indices
959 Usage : $obj->write_secondary_indices($newval)
962 Returns : value of write_secondary_indices
963 Args : newvalue (optional)
968 sub write_secondary_indices
{
971 # These are the different
972 my @names = keys (%{$self->{_secondary_id
}});
975 foreach my $name (@names) {
977 my @seconds = keys %{$self->{_secondary_id
}{$name}};
979 # First we need to loop over to get the longest record.
982 foreach my $second (@seconds) {
983 my $tmplen = length($second) + 1;
984 my @prims = keys %{$self->{_secondary_id
}{$name}{$second}};
986 foreach my $prim (@prims) {
987 my $recordlen = $tmplen + length($prim);
989 if ($recordlen > $length) {
990 $length = $recordlen;
995 # Now we can print the index
997 my $fh = $self->new_secondary_filehandle($name);
999 print $fh sprintf("%04d",$length);
1000 @seconds = sort @seconds;
1002 foreach my $second (@seconds) {
1004 my @prims = keys %{$self->{_secondary_id
}{$name}{$second}};
1007 foreach my $prim (@prims) {
1008 my $record = $tmp . "\t" . $prim;
1009 if (length($record) > $length) {
1010 $self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n");
1012 print $fh sprintf("%-${length}s",$record);
1013 print $fh sprintf("%-${length}s",$record);
1022 =head2 new_secondary_filehandle
1024 Title : new_secondary_filehandle
1025 Usage : $obj->new_secondary_filehandle($newval)
1028 Returns : value of new_secondary_filehandle
1029 Args : newvalue (optional)
1034 sub new_secondary_filehandle
{
1035 my ($self,$name) = @_;
1037 my $indexdir = $self->_config_path;
1039 my $secindex = File
::Spec
->catfile($indexdir,"id_$name.index");
1041 open(my $fh,">", $secindex) || $self->throw($!);
1045 =head2 open_secondary_index
1047 Title : open_secondary_index
1048 Usage : $obj->open_secondary_index($newval)
1051 Returns : value of open_secondary_index
1052 Args : newvalue (optional)
1057 sub open_secondary_index
{
1058 my ($self,$name) = @_;
1060 if (!defined($self->{_secondary_filehandle
}{$name})) {
1062 my $indexdir = $self->_config_path;
1063 my $secindex = $indexdir . "/id_$name.index";
1065 if (! -e
$secindex) {
1066 $self->throw("Index is not present for namespace [$name]\n");
1069 open(my $newfh,"<", $secindex) || $self->throw($!);
1070 my $reclen = $self->read_header($newfh);
1072 $self->{_secondary_filehandle
} {$name} = $newfh;
1073 $self->{_secondary_record_size
}{$name} = $reclen;
1076 return $self->{_secondary_filehandle
}{$name};
1080 =head2 _add_id_position
1082 Title : _add_id_position
1083 Usage : $obj->_add_id_position($newval)
1086 Returns : value of _add_id_position
1087 Args : newvalue (optional)
1092 sub _add_id_position
{
1093 my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_;
1095 if (!defined($id)) {
1096 $self->throw("No id defined. Can't add id position");
1098 if (!defined($pos)) {
1099 $self->throw("No position defined. Can't add id position");
1101 if ( ! defined($fileid)) {
1102 $self->throw("No fileid defined. Can't add id position");
1104 if (! defined($length) || $length <= 0) {
1105 $self->throw("No length defined or <= 0 [$length]. Can't add id position");
1108 $self->{_id
}{$id}{_pos
} = $pos;
1109 $self->{_id
}{$id}{_length
} = $length;
1110 $self->{_id
}{$id}{_fileid
} = $fileid;
1112 # Now the secondary ids
1114 foreach my $sec (keys (%$secondary_id)) {
1115 my $value = $secondary_id->{$sec};
1116 $self->{_secondary_id
}{$sec}{$value}{$id} = 1;
1119 $self->{_maxidlength
} = length($id)
1120 if !exists $self->{_maxidlength
} or
1121 length($id) >= $self->{_maxidlength
};
1123 $self->{_maxfileidlength
} = length($fileid)
1124 if !exists $self->{_maxfileidlength
} or
1125 length($fileid) >= $self->{_maxfileidlength
};
1127 $self->{_maxposlength
} = length($pos)
1128 if !exists $self->{_maxposlength
} or
1129 length($pos) >= $self->{_maxposlength
};
1131 $self->{_maxlengthlength
} = length($length)
1132 if !exists $self->{_maxlengthlength
} or
1133 length($length) >= $self->{_maxlengthlength
};
1136 =head2 make_config_file
1138 Title : make_config_file
1139 Usage : $obj->make_config_file($newval)
1142 Returns : value of make_config_file
1143 Args : newvalue (optional)
1147 sub make_config_file
{
1148 my ($self,$files) = @_;
1150 my @files = @
$files;
1152 my $configfile = $self->_config_file;
1154 open(my $CON,">", $configfile) || $self->throw("Can't create config file [$configfile]");
1156 # First line must be the type of index - in this case flat
1157 print $CON "index\tflat/1\n";
1163 foreach my $file (@files) {
1165 my $size = -s
$file;
1167 print $CON "fileid_$count\t$file\t$size\n";
1170 open($fh,"<", $file) || $self->throw($!);
1171 $self->{_file
} {$count} = $file;
1172 $self->{_dbfile
}{$file} = $count;
1173 $self->{_size
}{$count} = $size;
1177 # Now the namespaces
1179 print $CON "primary_namespace\t" .$self->primary_namespace. "\n";
1181 # Needs fixing for the secondary stuff
1183 my $second_patterns = $self->secondary_patterns;
1185 my @second = keys %$second_patterns;
1188 print $CON "secondary_namespaces";
1190 foreach my $second (@second) {
1191 print $CON "\t$second";
1196 # Now the config format
1198 unless (defined ($self->format) ) {
1199 $self->throw("Format does not exist in module - can't write config file");
1201 my $format = $self->format;
1202 my $alphabet = $self->alphabet;
1203 my $alpha = $alphabet ?
"/$alphabet" : '';
1204 print $CON "format\t" . "$format\n";
1209 =head2 read_config_file
1211 Title : read_config_file
1212 Usage : $obj->read_config_file($newval)
1215 Returns : value of read_config_file
1216 Args : newvalue (optional)
1220 sub read_config_file
{
1222 my $configfile = $self->_config_file;
1223 return unless -e
$configfile;
1225 open(my $CON,"<", $configfile) || $self->throw("Can't open configfile [$configfile]");
1227 # First line must be type
1232 # This is hard coded as we only index flatfiles here
1233 if ($line =~ m{index\tflat/(\d+)}) {
1236 $self->throw("First line not compatible with flat file index. Should be something like\n\nindex\tflat/1");
1239 $self->index_type("flat");
1240 $self->index_version($version);
1245 # Look for fileid lines
1246 if ($_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/) {
1251 if (! -e
$filename) {
1252 $self->throw("File [$filename] does not exist!");
1254 if (-s
$filename != $filesize) {
1255 $self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s
$filename) . "] Index thinks it is [" . $filesize . "]");
1259 open($fh,"<", $filename) || $self->throw($!);
1260 $self->{_file
} {$fileid} = $filename;
1261 $self->{_dbfile
}{$filename} = $fileid;
1262 $self->{_size
} {$fileid} = $filesize;
1265 # Look for namespace lines
1266 if ( /(.*)_namespaces?\t(.+)/ ) {
1267 if ($1 eq "primary") {
1268 $self->primary_namespace($2);
1269 } elsif ($1 eq "secondary") {
1270 $self->secondary_namespaces(split "\t",$2);
1272 $self->throw("Unknown namespace name in config file [$1");
1276 # Look for format lines
1277 if ($_ =~ /format\t(\S+)/) {
1278 # Check the format here?
1281 # handle LSID format
1282 if ($format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w
+))?
/) {
1284 $self->alphabet($2);
1285 } else { # compatibility with older versions
1293 # Now check we have all that we need
1295 my @fileid_keys = keys (%{$self->{_file
}});
1297 if (!(@fileid_keys)) {
1298 $self->throw("No flatfile fileid files in config - check the index has been made correctly");
1301 if (!defined($self->primary_namespace)) {
1302 $self->throw("No primary namespace exists");
1305 if (! -e
$self->primary_index_file) {
1306 $self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist");
1312 =head2 get_fileid_by_filename
1314 Title : get_fileid_by_filename
1315 Usage : $obj->get_fileid_by_filename($newval)
1318 Returns : value of get_fileid_by_filename
1319 Args : newvalue (optional)
1323 sub get_fileid_by_filename
{
1324 my ($self,$file) = @_;
1326 if (!defined($self->{_dbfile
})) {
1327 $self->throw("No file to fileid mapping present. Has the fileid file been read?");
1331 return $self->{_dbfile
}{$file};
1334 =head2 get_filehandle_by_fileid
1336 Title : get_filehandle_by_fileid
1337 Usage : $obj->get_filehandle_by_fileid($newval)
1340 Returns : value of get_filehandle_by_fileid
1341 Args : newvalue (optional)
1345 sub get_filehandle_by_fileid
{
1346 my ($self,$fileid) = @_;
1348 if (!defined($self->{_file
}{$fileid})) {
1349 $self->throw("ERROR: undefined fileid in index [$fileid]");
1353 open($fh,"<", $self->{_file
}{$fileid}) || $self->throw($!);
1357 =head2 primary_index_file
1359 Title : primary_index_file
1360 Usage : $obj->primary_index_file($newval)
1363 Returns : value of primary_index_file
1364 Args : newvalue (optional)
1369 sub primary_index_file
{
1372 return File
::Spec
->catfile($self->_config_path,"key_" . $self->primary_namespace . ".key");
1375 =head2 primary_index_filehandle
1377 Title : primary_index_filehandle
1378 Usage : $obj->primary_index_filehandle($newval)
1381 Returns : value of primary_index_filehandle
1382 Args : newvalue (optional)
1387 sub primary_index_filehandle
{
1390 unless (defined ($self->{'_primary_index_handle'})) {
1391 open($self->{'_primary_index_handle'}, "<" . $self->primary_index_file) || self
->throw($@
);
1393 return $self->{'_primary_index_handle'};
1399 Usage : $obj->format($newval)
1402 Returns : value of format
1403 Args : newvalue (optional)
1409 my ($obj,$value) = @_;
1410 if( defined $value) {
1411 $obj->{'format'} = $value;
1413 return $obj->{'format'};
1418 my ($obj,$value) = @_;
1419 if( defined $value) {
1420 $obj->{alphabet
} = $value;
1422 return $obj->{alphabet
};
1428 Usage : $obj->write_flag($newval)
1431 Returns : value of write_flag
1432 Args : newvalue (optional)
1438 my ($obj,$value) = @_;
1439 if( defined $value) {
1440 $obj->{'write_flag'} = $value;
1442 return $obj->{'write_flag'};
1449 Usage : $obj->dbname($newval)
1450 Function: get/set database name
1452 Returns : value of dbname
1453 Args : newvalue (optional)
1459 my $d = $self->{flat_dbname
};
1460 $self->{flat_dbname
} = shift if @_;
1464 =head2 index_directory
1466 Title : index_directory
1467 Usage : $obj->index_directory($newval)
1470 Returns : value of index_directory
1471 Args : newvalue (optional)
1476 sub index_directory
{
1477 my ($self,$arg) = @_;
1479 if (defined($arg)) {
1480 if ($arg !~ m{/$}) {
1483 $self->{_index_directory
} = $arg;
1485 return $self->{_index_directory
};
1491 my $root = $self->index_directory;
1492 my $dbname = $self->dbname;
1493 File
::Spec
->catfile($root,$dbname);
1498 my $path = $self->_config_path;
1499 File
::Spec
->catfile($path,CONFIG_FILE_NAME
);
1505 Usage : $obj->record_size($newval)
1508 Returns : value of record_size
1509 Args : newvalue (optional)
1516 $self->{_record_size
} = shift if @_;
1517 return $self->{_record_size
};
1520 =head2 primary_namespace
1522 Title : primary_namespace
1523 Usage : $obj->primary_namespace($newval)
1526 Returns : value of primary_namespace
1527 Args : newvalue (optional)
1531 sub primary_namespace
{
1533 $self->{_primary_namespace
} = shift if @_;
1534 return $self->{_primary_namespace
};
1540 Usage : $obj->index_type($newval)
1543 Returns : value of index_type
1544 Args : newvalue (optional)
1551 $self->{_index_type
} = shift if @_;
1552 return $self->{_index_type
};
1555 =head2 index_version
1557 Title : index_version
1558 Usage : $obj->index_version($newval)
1561 Returns : value of index_version
1562 Args : newvalue (optional)
1569 $self->{_index_version
} = shift if @_;
1570 return $self->{_index_version
};
1573 =head2 primary_pattern
1575 Title : primary_pattern
1576 Usage : $obj->primary_pattern($newval)
1579 Returns : value of primary_pattern
1580 Args : newvalue (optional)
1585 sub primary_pattern
{
1587 $obj->{'primary_pattern'} = shift if @_;
1588 return $obj->{'primary_pattern'};
1591 =head2 start_pattern
1593 Title : start_pattern
1594 Usage : $obj->start_pattern($newval)
1597 Returns : value of start_pattern
1598 Args : newvalue (optional)
1605 $obj->{'start_pattern'} = shift if @_;
1606 return $obj->{'start_pattern'};
1609 =head2 secondary_patterns
1611 Title : secondary_patterns
1612 Usage : $obj->secondary_patterns($newval)
1615 Returns : value of secondary_patterns
1616 Args : newvalue (optional)
1621 sub secondary_patterns
{
1622 my ($obj,$value) = @_;
1623 if( defined $value) {
1624 $obj->{'secondary_patterns'} = $value;
1626 my @names = keys %$value;
1628 foreach my $name (@names) {
1629 $obj->secondary_namespaces($name);
1632 return $obj->{'secondary_patterns'};
1636 =head2 secondary_namespaces
1638 Title : secondary_namespaces
1639 Usage : $obj->secondary_namespaces($newval)
1642 Returns : value of secondary_namespaces
1643 Args : newvalue (optional)
1648 sub secondary_namespaces
{
1649 my ($obj,@values) = @_;
1652 push(@
{$obj->{'secondary_namespaces'}},@values);
1654 return @
{$obj->{'secondary_namespaces'} || []};
1659 ## These are indexing routines to index commonly used format - fasta
1660 ## swissprot and embl
1662 sub new_SWISSPROT_index
{
1663 my ($self,$index_dir,@files) = @_;
1665 my %secondary_patterns;
1667 my $start_pattern = "^ID (\\S+)";
1668 my $primary_pattern = "^AC (\\S+)\\;";
1670 $secondary_patterns{"ID"} = $start_pattern;
1672 my $index = Bio
::DB
::Flat
::BinarySearch
->new
1673 (-index_dir
=> $index_dir,
1674 -format
=> 'swissprot',
1675 -primary_pattern
=> $primary_pattern,
1676 -primary_namespace
=> "ACC",
1677 -start_pattern
=> $start_pattern,
1678 -secondary_patterns
=> \
%secondary_patterns);
1680 $index->build_index(@files);
1683 sub new_EMBL_index
{
1684 my ($self,$index_dir,@files) = @_;
1686 my %secondary_patterns;
1688 my $start_pattern = "^ID (\\S+)";
1689 my $primary_pattern = "^AC (\\S+)\\;";
1690 my $primary_namespace = "ACC";
1692 $secondary_patterns{"ID"} = $start_pattern;
1694 my $index = Bio
::DB
::Flat
::BinarySearch
->new
1695 (-index_dir
=> $index_dir,
1697 -primary_pattern
=> $primary_pattern,
1698 -primary_namespace
=> "ACC",
1699 -start_pattern
=> $start_pattern,
1700 -secondary_patterns
=> \
%secondary_patterns);
1702 $index->build_index(@files);
1707 sub new_FASTA_index
{
1708 my ($self,$index_dir,@files) = @_;
1710 my %secondary_patterns;
1712 my $start_pattern = "^>";
1713 my $primary_pattern = "^>(\\S+)";
1714 my $primary_namespace = "ACC";
1716 $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1718 my $index = Bio
::DB
::Flat
::BinarySearch
->new
1719 (-index_dir
=> $index_dir,
1721 -primary_pattern
=> $primary_pattern,
1722 -primary_namespace
=> "ACC",
1723 -start_pattern
=> $start_pattern,
1724 -secondary_patterns
=> \
%secondary_patterns);
1726 $index->build_index(@files);
1731 # EVERYTHING THAT FOLLOWS THIS
1732 # is an awful hack - in reality Michele's code needs to be rewritten
1733 # to use Bio::SeqIO, but I have too little time to do this -- LS
1734 sub guess_alphabet
{
1738 my $format = $self->format;
1739 return 'protein' if $format eq 'swissprot';
1741 if ($format eq 'genbank') {
1742 return unless $line =~ /^LOCUS/;
1743 return 'dna' if $line =~ /\s+\d+\s+bp/i;
1747 if ($format eq 'embl') {
1748 return unless $line =~ /^ID/;
1749 return 'dna' if $line =~ / DNA;/i;
1750 return 'rna' if $line =~ / RNA;/i;
1757 # return (namespace,primary_pattern,start_pattern,secondary_pattern)
1758 sub _guess_patterns
{
1761 if ($format =~ /swiss(prot)?/i) {
1766 ACC
=> "^AC (\\S+);"
1770 if ($format =~ /embl/i) {
1775 ACC
=> q
/^AC (\S+);/,
1776 VERSION
=> q
/^SV\s+(\S+)/
1780 if ($format =~ /genbank/i) {
1785 ACC
=> q
/^ACCESSION\s+(\S+)/,
1786 VERSION
=> q
/^VERSION\s+(\S+)/
1790 if ($format =~ /fasta/i) {
1797 $self->throw("I can't handle format $format");