Bio/DB/Flat/BinarySearch.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::DB::Flat::BinarySearch
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Michele Clamp <michele@sanger.ac.uk>>
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::DB::Flat::BinarySearch - BinarySearch search indexing system for sequence files
  16
  17 =head1 SYNOPSIS
  18
  19
  20
  21 =head1 DESCRIPTION
  22
  23 This module can be used both to index sequence files and also to
  24 retrieve sequences from existing sequence files.
  25
  26 This object allows indexing of sequence files both by a primary key
  27 (say accession) and multiple secondary keys (say ids).  This is
  28 different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>)
  29 which uses DBM files as storage.  This module uses a binary search to
  30 retrieve sequences which is more efficient for large datasets.
  31
  32 =head2 Index creation
  33
  34     my $sequencefile;  # Some fasta sequence file
  35
  36 Patterns have to be entered to define where the keys are to be indexed
  37 and also where the start of each record.  E.g. for fasta
  38
  39     my $start_pattern   = '^>';
  40     my $primary_pattern = '^>(\S+)';
  41
  42 So the start of a record is a line starting with a E<gt> and the
  43 primary key is all characters up to the first space after the E<gt>
  44
  45 A string also has to be entered to defined what the primary key
  46 (primary_namespace) is called.
  47
  48 The index can now be created using
  49
  50     my $index = Bio::DB::Flat::BinarySearch->new(
  51              -directory         => "/home/max/",
  52              -dbname            => "mydb",
  53                   -start_pattern     => $start_pattern,
  54                   -primary_pattern   => $primary_pattern,
  55              -primary_namespace => "ID",
  56                   -format            => "fasta" );
  57
  58     my @files = ("file1","file2","file3");
  59
  60     $index->build_index(@files);
  61
  62 The index is now ready to use.  For large sequence files the perl way
  63 of indexing takes a *long* time and a *huge* amount of memory.  For
  64 indexing things like dbEST I recommend using the DB_File indexer, BDB.
  65
  66 The formats currently supported by this module are fasta, Swissprot,
  67 and EMBL.
  68
  69 =head2 Creating indices with secondary keys
  70
  71 Sometimes just indexing files with one id per entry is not enough.  For
  72 instance you may want to retrieve sequences from swissprot using
  73 their accessions as well as their ids.
  74
  75 To be able to do this when creating your index you need to pass in
  76 a hash of secondary_patterns which have their namespaces as the keys
  77 to the hash.
  78
  79 e.g. For Indexing something like
  80
  81 ID   1433_CAEEL     STANDARD;      PRT;   248 AA.
  82 AC   P41932;
  83 DT   01-NOV-1995 (Rel. 32, Created)
  84 DT   01-NOV-1995 (Rel. 32, Last sequence update)
  85 DT   15-DEC-1998 (Rel. 37, Last annotation update)
  86 DE   14-3-3-LIKE PROTEIN 1.
  87 GN   FTT-1 OR M117.2.
  88 OS   Caenorhabditis elegans.
  89 OC   Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
  90 OC   Rhabditidae; Peloderinae; Caenorhabditis.
  91 OX   NCBI_TaxID=6239;
  92 RN   [1]
  93
  94 where we want to index the accession (P41932) as the primary key and the
  95 id (1433_CAEEL) as the secondary id.  The index is created as follows
  96
  97     my %secondary_patterns;
  98
  99     my $start_pattern   = '^ID   (\S+)';
 100     my $primary_pattern = '^AC   (\S+)\;';
 101
 102     $secondary_patterns{"ID"} = '^ID   (\S+)';
 103
 104     my $index = Bio::DB::Flat::BinarySearch->new(
 105                 -directory          => $index_directory,
 106                           -dbname             => "ppp",
 107                           -write_flag         => 1,
 108                 -verbose            => 1,
 109                 -start_pattern      => $start_pattern,
 110                 -primary_pattern    => $primary_pattern,
 111                 -primary_namespace  => 'AC',
 112                 -secondary_patterns => \%secondary_patterns);
 113
 114     $index->build_index($seqfile);
 115
 116 Of course having secondary indices makes indexing slower and use more
 117 memory.
 118
 119 =head2 Index reading
 120
 121 To fetch sequences using an existing index first of all create your sequence
 122 object
 123
 124     my $index = Bio::DB::Flat::BinarySearch->new(
 125                   -directory => $index_directory);
 126
 127 Now you can happily fetch sequences either by the primary key or
 128 by the secondary keys.
 129
 130     my $entry = $index->get_entry_by_id('HBA_HUMAN');
 131
 132 This returns just a string containing the whole entry.  This is
 133 useful is you just want to print the sequence to screen or write it to a file.
 134
 135 Other ways of getting sequences are
 136
 137     my $fh = $index->get_stream_by_id('HBA_HUMAN');
 138
 139 This can then be passed to a seqio object for output or converting
 140 into objects.
 141
 142     my $seq = Bio::SeqIO->new(-fh     => $fh,
 143                                             -format => 'fasta');
 144
 145 The last way is to retrieve a sequence directly.  This is the
 146 slowest way of extracting as the sequence objects need to be made.
 147
 148     my $seq = $index->get_Seq_by_id('HBA_HUMAN');
 149
 150 To access the secondary indices the secondary namespace needs to be known
 151
 152     $index->secondary_namespaces("ID");
 153
 154 Then the following call can be used
 155
 156     my $seq   = $index->get_Seq_by_secondary('ID','1433_CAEEL');
 157
 158 These calls are not yet implemented
 159
 160     my $fh    = $index->get_stream_by_secondary('ID','1433_CAEEL');
 161     my $entry = $index->get_entry_by_secondary('ID','1433_CAEEL');
 162
 163 =head1 FEEDBACK
 164
 165 =head2 Mailing Lists
 166
 167 User feedback is an integral part of the evolution of this and other
 168 Bioperl modules. Send your comments and suggestions preferably to one
 169 of the Bioperl mailing lists.  Your participation is much appreciated.
 170
 171   bioperl-l@bioperl.org                  - General discussion
 172   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 173
 174 =head2 Support
 175
 176 Please direct usage questions or support issues to the mailing list:
 177
 178 L<bioperl-l@bioperl.org>
 179
 180 rather than to the module maintainer directly. Many experienced and
 181 reponsive experts will be able look at the problem and quickly
 182 address it. Please include a thorough description of the problem
 183 with code and data examples if at all possible.
 184
 185 =head2 Reporting Bugs
 186
 187 Report bugs to the Bioperl bug tracking system to help us keep track
 188 the bugs and their resolution.  Bug reports can be submitted via the
 189 web:
 190
 191   http://bugzilla.open-bio.org/
 192
 193 =head1 AUTHOR - Michele Clamp
 194
 195 Email - michele@sanger.ac.uk
 196
 197 =head1 CONTRIBUTORS
 198
 199 Jason Stajich, jason@bioperl.org
 200
 201 =head1 APPENDIX
 202
 203 The rest of the documentation details each of the object methods. Internal
 204 methods are usually preceded with an "_" (underscore).
 205
 206 =cut
 207
 208 package Bio::DB::Flat::BinarySearch;
 209
 210 use strict;
 211
 212 use Fcntl qw(SEEK_END SEEK_CUR);
 213 # rather than using tell which might be buffered
 214 sub systell{ sysseek($_[0], 0, SEEK_CUR) }
 215 sub syseof{ sysseek($_[0], 0, SEEK_END) }
 216
 217 use File::Spec;
 218 use Bio::Root::RootI;
 219 use Bio::SeqIO;
 220 use Bio::Seq;
 221
 222 use base qw(Bio::DB::RandomAccessI);
 223
 224 use constant CONFIG_FILE_NAME => 'config.dat';
 225 use constant HEADER_SIZE      => 4;
 226 use constant DEFAULT_FORMAT   => 'fasta';
 227
 228 my @formats = ['FASTA','SWISSPROT','EMBL'];
 229
 230 =head2 new
 231
 232  Title   : new
 233  Usage   : For reading
 234              my $index = Bio::DB::Flat::BinarySearch->new(
 235                      -directory => '/Users/michele/indices/dbest',
 236                      -dbname    => 'mydb',
 237                      -format    => 'fasta');
 238
 239            For writing
 240
 241              my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
 242              my $index = Bio::DB::Flat::BinarySearch->new(
 243                      -directory          => '/Users/michele/indices',
 244                      -dbname             => 'mydb',
 245                      -primary_pattern    => "^>(\\S+)",
 246                      -secondary_patterns => \%secondary_patterns,
 247                      -primary_namespace  => "ID");
 248
 249              my @files = ('file1','file2','file3');
 250
 251              $index->build_index(@files);
 252
 253
 254  Function: create a new Bio::DB::Flat::BinarySearch object
 255  Returns : new Bio::DB::Flat::BinarySearch
 256  Args    : -directory          Root directory for index files
 257            -dbname             Name of subdirectory containing indices
 258                                for named database
 259            -write_flag         Allow building index
 260            -primary_pattern    Regexp defining the primary id
 261            -secondary_patterns A hash ref containing the secondary
 262                                patterns with the namespaces as keys
 263            -primary_namespace  A string defining what the primary key
 264                                is
 265
 266  Status  : Public
 267
 268 =cut
 269
 270 sub new {
 271     my($class, @args) = @_;
 272
 273     my $self = $class->SUPER::new(@args);
 274
 275     bless $self, $class;
 276
 277     my ($index_dir,$dbname,$format,$write_flag,$primary_pattern,
 278         $primary_namespace,$start_pattern,$secondary_patterns) =
 279             $self->_rearrange([qw(DIRECTORY
 280                                   DBNAME
 281                                   FORMAT
 282                                   WRITE_FLAG
 283                                   PRIMARY_PATTERN
 284                                   PRIMARY_NAMESPACE
 285                                   START_PATTERN
 286                                   SECONDARY_PATTERNS)], @args);
 287
 288     $self->index_directory($index_dir);
 289     $self->dbname($dbname);
 290
 291     if ($self->index_directory && $self->read_config_file) {
 292
 293         my $fh = $self->primary_index_filehandle;
 294         my $record_width = $self->read_header($fh);
 295         $self->record_size($record_width);
 296     }
 297     $format ||= DEFAULT_FORMAT;
 298     $self->format            ($format);
 299     $self->write_flag        ($write_flag);
 300
 301     if ($self->write_flag && ! $primary_namespace) {
 302       ($primary_namespace,$primary_pattern,
 303        $start_pattern,$secondary_patterns) =
 304         $self->_guess_patterns($self->format);
 305     }
 306
 307     $self->primary_pattern   ($primary_pattern);
 308     $self->primary_namespace ($primary_namespace);
 309     $self->start_pattern     ($start_pattern);
 310     $self->secondary_patterns($secondary_patterns);
 311
 312     return $self;
 313 }
 314
 315 sub new_from_registry {
 316     my ($self,%config) =  @_;
 317
 318     my $dbname   = $config{'dbname'};
 319     my $location = $config{'location'};
 320
 321     my $index =  Bio::DB::Flat::BinarySearch->new(-dbname    => $dbname,
 322                                                  -index_dir => $location,
 323                                                  );
 324 }
 325
 326 =head2 get_Seq_by_id
 327
 328  Title   : get_Seq_by_id
 329  Usage   : $obj->get_Seq_by_id($newval)
 330  Function:
 331  Example :
 332  Returns : value of get_Seq_by_id
 333  Args    : newvalue (optional)
 334
 335 =cut
 336
 337 sub get_Seq_by_id {
 338     my ($self,$id) = @_;
 339
 340     # too many uninit variables...
 341     local $^W = 0;
 342
 343     my ($fh,$length) = $self->get_stream_by_id($id);
 344
 345     unless (defined($self->format)) {
 346         $self->throw("Can't create sequence - format is not defined");
 347     }
 348
 349     return unless $fh;
 350
 351     unless ( defined($self->{_seqio}) ) {
 352
 353         $self->{_seqio} = Bio::SeqIO->new(-fh => $fh,
 354                                          -format => $self->format);
 355     } else {
 356         $self->{_seqio}->fh($fh);
 357     }
 358
 359     return $self->{_seqio}->next_seq;
 360 }
 361
 362 =head2 get_entry_by_id
 363
 364  Title   : get_entry_by_id
 365  Usage   : $obj->get_entry_by_id($newval)
 366  Function: Get a Bio::SeqI object for a unique ID
 367  Returns : Bio::SeqI
 368  Args    : string
 369
 370
 371 =cut
 372
 373 sub get_entry_by_id {
 374     my ($self,$id) = @_;
 375
 376     my ($fh,$length) = $self->get_stream_by_id($id);
 377
 378     my $entry;
 379
 380     sysread($fh,$entry,$length);
 381
 382     return $entry;
 383 }
 384
 385
 386 =head2 get_stream_by_id
 387
 388  Title   : get_stream_by_id
 389  Usage   : $obj->get_stream_by_id($id)
 390  Function: Gets a Sequence stream for an id
 391  Returns : Bio::SeqIO stream
 392  Args    : Id to lookup by
 393
 394
 395 =cut
 396
 397 sub get_stream_by_id {
 398     my ($self,$id) = @_;
 399
 400     unless( $self->record_size ) {
 401         if ($self->index_directory && $self->read_config_file) {
 402
 403             my $fh = $self->primary_index_filehandle;
 404             my $record_width = $self->read_header($fh);
 405             $self->record_size($record_width);
 406         }
 407     }
 408     my $indexfh = $self->primary_index_filehandle;
 409     syseof ($indexfh);
 410
 411     my $filesize = systell($indexfh);
 412
 413     $self->throw("file was not parsed properly, record size is empty")
 414         unless $self->record_size;
 415
 416     my $end = ($filesize - $self->{'_start_pos'}) / $self->record_size;
 417     my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size);
 418
 419
 420     my ($fileid,$pos,$length) = split(/\t/,$rest);
 421
 422     #print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
 423
 424     if (!$newid) {
 425       return;
 426     }
 427
 428     my $file = $self->{_file}{$fileid};
 429
 430     open (my $IN,"<$file");
 431
 432     my $entry;
 433
 434     sysseek($IN,$pos,0);
 435
 436     return ($IN,$length);
 437 }
 438
 439 =head2 get_Seq_by_acc
 440
 441  Title   : get_Seq_by_acc
 442  Usage   : $obj->get_Seq_by_acc($acc)
 443  Function: Gets a Bio::SeqI object by accession number
 444  Returns : Bio::SeqI object
 445  Args    : string representing accession number
 446
 447
 448 =cut
 449
 450 sub get_Seq_by_acc {
 451     my ($self,$acc) = @_;
 452
 453     # too many uninit variables...
 454     local $^W = 0;
 455
 456     if ($self->primary_namespace eq "ACC") {
 457        return $self->get_Seq_by_id($acc);
 458     } else {
 459       return $self->get_Seq_by_secondary("ACC",$acc);
 460     }
 461 }
 462
 463 =head2 get_Seq_by_version
 464
 465  Title   : get_Seq_by_version
 466  Usage   : $obj->get_Seq_by_version($version)
 467  Function: Gets a Bio::SeqI object by accession.version number
 468  Returns : Bio::SeqI object
 469  Args    : string representing accession.version number
 470
 471
 472 =cut
 473
 474 sub get_Seq_by_version {
 475     my ($self,$acc) = @_;
 476
 477     # too many uninit variables...
 478     local $^W = 0;
 479
 480     if ($self->primary_namespace eq "VERSION") {
 481        return $self->get_Seq_by_id($acc);
 482     } else {
 483       return $self->get_Seq_by_secondary("VERSION",$acc);
 484     }
 485 }
 486
 487 =head2 get_Seq_by_secondary
 488
 489  Title   : get_Seq_by_secondary
 490  Usage   : $obj->get_Seq_by_secondary($namespace,$acc)
 491  Function: Gets a Bio::SeqI object looking up secondary accessions
 492  Returns : Bio::SeqI object
 493  Args    : namespace name to check secondary namespace and an id
 494
 495
 496 =cut
 497
 498 sub get_Seq_by_secondary {
 499     my ($self,$name,$id) = @_;
 500
 501     my @names = $self->secondary_namespaces;
 502
 503     my $found = 0;
 504     foreach my $tmpname (@names) {
 505         if ($name eq $tmpname) {
 506             $found = 1;
 507         }
 508     }
 509
 510     if ($found == 0) {
 511         $self->throw("Secondary index for $name doesn't exist\n");
 512     }
 513
 514     my $fh = $self->open_secondary_index($name);
 515
 516     syseof ($fh);
 517
 518     my $filesize = systell($fh);
 519
 520     my $recsize = $self->{'_secondary_record_size'}{$name};
 521 #    print "Name " . $recsize . "\n";
 522
 523     my $end = ($filesize - $self->{'_start_pos'})/$recsize;
 524
 525 #    print "End $end $filesize\n";
 526     my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize);
 527
 528     sysseek($fh,$pos,0);
 529
 530 #    print "Found new id $newid $primary_id\n";
 531     # We now need to shuffle up the index file to find the top secondary entry
 532
 533     my $record = $newid;
 534
 535     while ($record =~ /^$newid/ && $pos >= 0) {
 536
 537         $record = $self->read_record($fh,$pos,$recsize);
 538         $pos = $pos - $recsize;
 539 #       print "Up record = $record:$newid\n";
 540     }
 541
 542     $pos += $recsize;
 543
 544 #    print "Top position is $pos\n";
 545
 546     # Now we have to shuffle back down again to read all the secondary entries
 547
 548     my $current_id = $newid;
 549     my %primary_id;
 550
 551     $primary_id{$primary_id} = 1;
 552
 553     while ($current_id eq $newid) {
 554         $record = $self->read_record($fh,$pos,$recsize);
 555         # print "Record is :$record:\n";
 556         my ($secid,$primary_id) = split(/\t/,$record,2);
 557         $current_id = $secid;
 558
 559         if ($current_id eq $newid) {
 560             $primary_id =~ s/ //g;
 561         #    print "Primary $primary_id\n";
 562             $primary_id{$primary_id} = 1;
 563
 564             $pos = $pos + $recsize;
 565          #   print "Down record = $record\n";
 566         }
 567     }
 568
 569     if (!defined($newid)) {
 570       return;
 571     }
 572
 573     my @entry;
 574
 575     foreach my $id (keys %primary_id) {
 576       push @entry,$self->get_Seq_by_id($id);
 577     }
 578     return wantarray ? @entry : $entry[0];
 579
 580 }
 581
 582 =head2 read_header
 583
 584  Title   : read_header
 585  Usage   : $obj->read_header($fhl)
 586  Function: Reads the header from the db file
 587  Returns : width of a record
 588  Args    : filehandle
 589
 590
 591 =cut
 592
 593 sub read_header {
 594     my ($self,$fh) = @_;
 595
 596     my $record_width;
 597
 598     sysread($fh,$record_width,HEADER_SIZE);
 599
 600     $self->{'_start_pos'} = HEADER_SIZE;
 601     $record_width =~ s/ //g;
 602     $record_width = $record_width * 1;
 603
 604     return $record_width;
 605 }
 606
 607 =head2 read_record
 608
 609  Title   : read_record
 610  Usage   : $obj->read_record($fh,$pos,$len)
 611  Function: Reads a record from a filehandle
 612  Returns : String
 613  Args    : filehandle, offset, and length
 614
 615
 616 =cut
 617
 618 sub read_record {
 619   my ($self,$fh,$pos,$len) = @_;
 620
 621   sysseek($fh,$pos,0);
 622
 623   my $record;
 624
 625   sysread($fh,$record,$len);
 626
 627   return $record;
 628
 629 }
 630
 631 =head2 get_all_primary_ids
 632
 633  Title   : get_all_primary_ids
 634  Usage   : @ids = $seqdb->get_all_primary_ids()
 635  Function: gives an array of all the primary_ids of the
 636            sequence objects in the database.
 637  Returns : an array of strings
 638  Args    : none
 639
 640 =cut
 641
 642 sub get_all_primary_ids {
 643   my $self = shift;
 644
 645   my $fh = $self->primary_index_filehandle;
 646   syseof($fh);
 647   my $filesize = systell($fh);
 648   my $recsize  = $self->record_size;
 649   my $end = $filesize;
 650
 651   my @ids;
 652   for (my $pos=$self->{'_start_pos'}; $pos < $end; $pos += $recsize) {
 653     my $record = $self->read_record($fh,$pos,$recsize);
 654     my ($entryid)  = split(/\t/,$record);
 655     push @ids,$entryid;
 656   }
 657   @ids;
 658 }
 659
 660
 661 =head2 find_entry
 662
 663  Title   : find_entry
 664  Usage   : $obj->find_entry($fh,$start,$end,$id,$recsize)
 665  Function: Extract an entry based on the start,end,id and record size
 666  Returns : string
 667  Args    : filehandle, start, end, id, recordsize
 668
 669
 670 =cut
 671
 672 sub find_entry {
 673     my ($self,$fh,$start,$end,$id,$recsize) = @_;
 674
 675     my $mid = int( ($end+1+$start) / 2);
 676     my $pos = ($mid-1)*$recsize + $self->{'_start_pos'};
 677
 678     my ($record) = $self->read_record($fh,$pos,$recsize);
 679     my ($entryid,$rest)  = split(/\t/,$record,2);
 680     $rest =~ s/\s+$//;
 681
 682 #    print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
 683 #    print "Entry :$id:$entryid:$rest\n";
 684
 685     my ($first,$second) = $id le $entryid ? ($id,$entryid) : ($entryid,$id);
 686
 687     if ($id eq $entryid) {
 688
 689       return ($id,$rest,$pos-$recsize);
 690
 691     } elsif ($first eq $id) {
 692
 693       if ($end-$start <= 1) {
 694         return;
 695       }
 696       my $end = $mid;
 697 #      print "Moving up $entryid $id\n";
 698       $self->find_entry($fh,$start,$end,$id,$recsize);
 699
 700     } elsif ($second eq $id ) {
 701 #       print "Moving down $entryid $id\n";
 702       if ($end-$start <= 1) {
 703         return;
 704       }
 705
 706       $start = $mid;
 707
 708       $self->find_entry($fh,$start,$end,$id,$recsize);
 709     }
 710
 711  }
 712
 713
 714 =head2 build_index
 715
 716  Title   : build_index
 717  Usage   : $obj->build_index(@files)
 718  Function: Build the index based on a set of files
 719  Returns : count of the number of entries
 720  Args    : List of filenames
 721
 722
 723 =cut
 724
 725 sub build_index {
 726     my ($self,@files) = @_;
 727     $self->write_flag or
 728         $self->throw('Cannot build index unless -write_flag is true');
 729
 730     my $rootdir = $self->index_directory;
 731
 732     if (!defined($rootdir)) {
 733         $self->throw("No index directory set - can't build indices");
 734     }
 735
 736     if (! -d $rootdir) {
 737         $self->throw("Index directory [$rootdir] is not a directory. Cant' build indices");
 738     }
 739
 740     my $dbpath = File::Spec->catfile($rootdir,$self->dbname);
 741     if (! -d $dbpath) {
 742       warn "Creating directory $dbpath\n";
 743       mkdir $dbpath,0777 or $self->throw("Couldn't create $dbpath: $!");
 744     }
 745
 746     unless (@files ) {
 747         $self->throw("Must enter an array of filenames to index");
 748     }
 749
 750     foreach my $file (@files) {
 751         $file = File::Spec->rel2abs($file)
 752             unless File::Spec->file_name_is_absolute($file);
 753         unless ( -e $file) {
 754             $self->throw("Can't index file [$file] as it doesn't exist");
 755         }
 756     }
 757
 758     if (my $filehash = $self->{_dbfile}) {
 759       push @files,keys %$filehash;
 760     }
 761
 762     my %seen;
 763     @files = grep {!$seen{$_}++} @files;
 764
 765     # Lets index
 766     $self->make_config_file(\@files);
 767     my $entries = 0;
 768     foreach my $file (@files) {
 769       $entries += $self->_index_file($file);
 770     }
 771
 772     # update alphabet if necessary
 773     $self->make_config_file(\@files);
 774
 775     # And finally write out the indices
 776     $self->write_primary_index;
 777     $self->write_secondary_indices;
 778
 779     $entries;
 780 }
 781
 782 =head2 _index_file
 783
 784  Title   : _index_file
 785  Usage   : $obj->_index_file($newval)
 786  Function:
 787  Example :
 788  Returns : value of _index_file
 789  Args    : newvalue (optional)
 790
 791
 792 =cut
 793
 794 sub _index_file {
 795         my ($self,$file) = @_;
 796         my $v = $self->verbose;
 797         open(my $FILE,"<", $file) || $self->throw("Can't open file [$file]");
 798
 799         my $recstart = 0;
 800         my $fileid = $self->get_fileid_by_filename($file);
 801         my $found = 0;
 802         my $id;
 803         my $count = 0;
 804
 805         my $primary       = $self->primary_pattern;
 806         my $start_pattern = $self->start_pattern;
 807
 808         my $pos = 0;
 809
 810         my $new_primary_entry;
 811
 812         my $length;
 813
 814         my $fh = $FILE;
 815
 816         my $done = -1;
 817
 818         my @secondary_names = $self->secondary_namespaces;
 819         my %secondary_id;
 820         my $last_one;
 821
 822         while (<$fh>) {
 823       $last_one = $_;
 824       $self->{alphabet} ||= $self->guess_alphabet($_);
 825       if ($_ =~ /$start_pattern/) {
 826                         if ($done == 0) {
 827                                 $id = $new_primary_entry;
 828                                 $self->{alphabet} ||= $self->guess_alphabet($_);
 829
 830                                 my $tmplen = (tell $fh) - length($_);
 831
 832                                 $length = $tmplen  - $pos;
 833
 834                                 unless( defined($id)) {
 835                                         $self->throw("No id defined for sequence");
 836                                 }
 837                                 unless( defined($fileid)) {
 838                                         $self->throw("No fileid defined for file $file");
 839                                 }
 840                                 unless( defined($pos)) {
 841                                         $self->throw("No position defined for " . $id . "\n");
 842                                 }
 843                                 unless( defined($length)) {
 844                                         $self->throw("No length defined for " . $id . "\n");
 845                                 }
 846                                 $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
 847
 848                                 $pos   = $tmplen;
 849
 850                                 if ($count > 0 && $count%1000 == 0) {
 851                                         $self->debug( "Indexed $count ids\n") if $v > 0;
 852                                 }
 853
 854                                 $count++;
 855                         } else {
 856                                 $done = 0;
 857                         }
 858       }
 859
 860       if ($_ =~ /$primary/) {
 861                         $new_primary_entry = $1;
 862       }
 863
 864       my $secondary_patterns = $self->secondary_patterns;
 865
 866       foreach my $sec (@secondary_names) {
 867                         my $pattern = $secondary_patterns->{$sec};
 868
 869                         if ($_ =~ /$pattern/) {
 870                                 $secondary_id{$sec} = $1;
 871                         }
 872       }
 873
 874         }
 875
 876         # Remember to add in the last one
 877
 878         $id = $new_primary_entry;
 879         # my $tmplen = (tell $fh) - length($last_one);
 880         my $tmplen = (tell $fh);
 881
 882         $length = $tmplen - $pos;
 883
 884         if (!defined($id)) {
 885                 $self->throw("No id defined for sequence");
 886         }
 887         if (!defined($fileid)) {
 888                 $self->throw("No fileid defined for file $file");
 889         }
 890         if (!defined($pos)) {
 891                 $self->throw("No position defined for " . $id . "\n");
 892         }
 893         if (!defined($length)) {
 894                 $self->throw("No length defined for " . $id . "\n");
 895         }
 896
 897         $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
 898         $count++;
 899
 900         close(FILE);
 901         $count;
 902 }
 903
 904 =head2 write_primary_index
 905
 906  Title   : write_primary_index
 907  Usage   : $obj->write_primary_index($newval)
 908  Function:
 909  Example :
 910  Returns : value of write_primary_index
 911  Args    : newvalue (optional)
 912
 913
 914 =cut
 915
 916 sub write_primary_index {
 917         my ($self) = @_;
 918
 919         my @ids = keys %{$self->{_id}};
 920
 921         @ids = sort {$a cmp $b} @ids;
 922
 923         open (my $INDEX,">" . $self->primary_index_file) ||
 924           $self->throw("Can't open primary index file [" .
 925                                                 $self->primary_index_file . "]");
 926
 927         my $recordlength = $self->{_maxidlength} +
 928                            $self->{_maxfileidlength} +
 929                                 $self->{_maxposlength} +
 930                                      $self->{_maxlengthlength} + 3;
 931
 932         print $INDEX sprintf("%04d",$recordlength);
 933
 934         foreach my $id (@ids) {
 935
 936                 if (!defined($self->{_id}{$id}{_fileid})) {
 937                         $self->throw("No fileid for $id\n");
 938                 }
 939                 if (!defined($self->{_id}{$id}{_pos})) {
 940                         $self->throw("No position for $id\n");
 941                 }
 942                 if (!defined($self->{_id}{$id}{_length})) {
 943                         $self->throw("No length for $id");
 944                 }
 945
 946                 my $record =  $id              . "\t" .
 947                   $self->{_id}{$id}{_fileid} . "\t" .
 948                          $self->{_id}{$id}{_pos}    . "\t" .
 949                                 $self->{_id}{$id}{_length};
 950
 951                 print $INDEX sprintf("%-${recordlength}s",$record);
 952
 953     }
 954 }
 955
 956 =head2 write_secondary_indices
 957
 958  Title   : write_secondary_indices
 959  Usage   : $obj->write_secondary_indices($newval)
 960  Function:
 961  Example :
 962  Returns : value of write_secondary_indices
 963  Args    : newvalue (optional)
 964
 965
 966 =cut
 967
 968 sub write_secondary_indices {
 969     my ($self) = @_;
 970
 971     # These are the different
 972     my @names = keys (%{$self->{_secondary_id}});
 973
 974
 975     foreach my $name (@names) {
 976
 977         my @seconds = keys %{$self->{_secondary_id}{$name}};
 978
 979         # First we need to loop over to get the longest record.
 980         my $length = 0;
 981
 982         foreach my $second (@seconds) {
 983             my $tmplen = length($second) + 1;
 984             my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
 985
 986             foreach my $prim (@prims) {
 987                 my $recordlen = $tmplen + length($prim);
 988
 989                 if ($recordlen > $length) {
 990                     $length = $recordlen;
 991                 }
 992             }
 993         }
 994
 995         # Now we can print the index
 996
 997         my $fh = $self->new_secondary_filehandle($name);
 998
 999         print $fh sprintf("%04d",$length);
1000         @seconds = sort @seconds;
1001
1002         foreach my $second (@seconds) {
1003
1004             my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
1005             my $tmp = $second;
1006
1007             foreach my $prim (@prims) {
1008                 my $record = $tmp . "\t" . $prim;
1009                 if (length($record) > $length) {
1010                     $self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n");
1011                 } else {
1012                     print $fh sprintf("%-${length}s",$record);
1013                     print $fh sprintf("%-${length}s",$record);
1014                 }
1015             }
1016         }
1017
1018         close($fh);
1019     }
1020 }
1021
1022 =head2 new_secondary_filehandle
1023
1024  Title   : new_secondary_filehandle
1025  Usage   : $obj->new_secondary_filehandle($newval)
1026  Function:
1027  Example :
1028  Returns : value of new_secondary_filehandle
1029  Args    : newvalue (optional)
1030
1031
1032 =cut
1033
1034 sub new_secondary_filehandle {
1035     my ($self,$name) = @_;
1036
1037     my $indexdir = $self->_config_path;
1038
1039     my $secindex = File::Spec->catfile($indexdir,"id_$name.index");
1040
1041     open(my $fh,">", $secindex) || $self->throw($!);
1042     return $fh;
1043 }
1044
1045 =head2 open_secondary_index
1046
1047  Title   : open_secondary_index
1048  Usage   : $obj->open_secondary_index($newval)
1049  Function:
1050  Example :
1051  Returns : value of open_secondary_index
1052  Args    : newvalue (optional)
1053
1054
1055 =cut
1056
1057 sub open_secondary_index {
1058     my ($self,$name) = @_;
1059
1060     if (!defined($self->{_secondary_filehandle}{$name})) {
1061
1062         my $indexdir = $self->_config_path;
1063         my $secindex = $indexdir . "/id_$name.index";
1064
1065         if (! -e $secindex) {
1066             $self->throw("Index is not present for namespace [$name]\n");
1067         }
1068
1069         open(my $newfh,"<", $secindex) || $self->throw($!);
1070         my $reclen = $self->read_header($newfh);
1071
1072         $self->{_secondary_filehandle} {$name} = $newfh;
1073         $self->{_secondary_record_size}{$name} = $reclen;
1074     }
1075
1076     return $self->{_secondary_filehandle}{$name};
1077
1078 }
1079
1080 =head2 _add_id_position
1081
1082  Title   : _add_id_position
1083  Usage   : $obj->_add_id_position($newval)
1084  Function:
1085  Example :
1086  Returns : value of _add_id_position
1087  Args    : newvalue (optional)
1088
1089
1090 =cut
1091
1092 sub _add_id_position {
1093         my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_;
1094
1095         if (!defined($id)) {
1096                 $self->throw("No id defined. Can't add id position");
1097         }
1098         if (!defined($pos)) {
1099                 $self->throw("No position defined. Can't add id position");
1100         }
1101         if ( ! defined($fileid)) {
1102                 $self->throw("No fileid defined. Can't add id position");
1103         }
1104         if (! defined($length) || $length <= 0) {
1105                 $self->throw("No length defined or <= 0 [$length]. Can't add id position");
1106         }
1107
1108         $self->{_id}{$id}{_pos}    = $pos;
1109         $self->{_id}{$id}{_length} = $length;
1110         $self->{_id}{$id}{_fileid} = $fileid;
1111
1112         # Now the secondary ids
1113
1114         foreach my $sec (keys (%$secondary_id)) {
1115                 my $value = $secondary_id->{$sec};
1116                 $self->{_secondary_id}{$sec}{$value}{$id} = 1;
1117         }
1118
1119         $self->{_maxidlength} = length($id)
1120           if !exists $self->{_maxidlength} or
1121                  length($id) >= $self->{_maxidlength};
1122
1123         $self->{_maxfileidlength} = length($fileid)
1124           if !exists $self->{_maxfileidlength} or
1125                  length($fileid) >= $self->{_maxfileidlength};
1126
1127         $self->{_maxposlength} = length($pos)
1128           if !exists $self->{_maxposlength} or
1129                  length($pos) >= $self->{_maxposlength};
1130
1131         $self->{_maxlengthlength} = length($length)
1132           if !exists $self->{_maxlengthlength} or
1133                  length($length) >= $self->{_maxlengthlength};
1134 }
1135
1136 =head2 make_config_file
1137
1138  Title   : make_config_file
1139  Usage   : $obj->make_config_file($newval)
1140  Function:
1141  Example :
1142  Returns : value of make_config_file
1143  Args    : newvalue (optional)
1144
1145 =cut
1146
1147 sub make_config_file {
1148     my ($self,$files) = @_;
1149
1150     my @files = @$files;
1151
1152     my $configfile = $self->_config_file;
1153
1154     open(my $CON,">", $configfile) || $self->throw("Can't create config file [$configfile]");
1155
1156     # First line must be the type of index - in this case flat
1157     print $CON "index\tflat/1\n";
1158
1159     # Now the fileids
1160
1161     my $count = 0;
1162
1163     foreach my $file (@files) {
1164
1165         my $size = -s $file;
1166
1167         print $CON "fileid_$count\t$file\t$size\n";
1168
1169         my $fh;
1170         open($fh,"<", $file) || $self->throw($!);
1171         $self->{_file}  {$count}   = $file;
1172         $self->{_dbfile}{$file}    = $count;
1173         $self->{_size}{$count}     = $size;
1174         $count++;
1175     }
1176
1177     # Now the namespaces
1178
1179     print $CON "primary_namespace\t" .$self->primary_namespace. "\n";
1180
1181     # Needs fixing for the secondary stuff
1182
1183     my $second_patterns = $self->secondary_patterns;
1184
1185     my @second = keys %$second_patterns;
1186
1187     if ((@second))  {
1188         print $CON "secondary_namespaces";
1189
1190         foreach my $second (@second) {
1191             print $CON "\t$second";
1192         }
1193         print $CON "\n";
1194     }
1195
1196     # Now the config format
1197
1198     unless (defined ($self->format) ) {
1199         $self->throw("Format does not exist in module - can't write config file");
1200     } else {
1201         my $format = $self->format;
1202         my $alphabet = $self->alphabet;
1203         my $alpha    = $alphabet ? "/$alphabet" : '';
1204         print $CON "format\t" . "$format\n";
1205      }
1206     close($CON);
1207 }
1208
1209 =head2 read_config_file
1210
1211  Title   : read_config_file
1212  Usage   : $obj->read_config_file($newval)
1213  Function:
1214  Example :
1215  Returns : value of read_config_file
1216  Args    : newvalue (optional)
1217
1218 =cut
1219
1220 sub read_config_file {
1221         my ($self) = @_;
1222         my $configfile = $self->_config_file;
1223         return unless -e $configfile;
1224
1225         open(my $CON,"<", $configfile) || $self->throw("Can't open configfile [$configfile]");
1226
1227         # First line must be type
1228         my $line = <$CON>;
1229         chomp($line);
1230         my $version;
1231
1232         # This is hard coded as we only index flatfiles here
1233         if ($line =~ m{index\tflat/(\d+)}) {
1234                 $version = $1;
1235         } else {
1236                 $self->throw("First line not compatible with flat file index.  Should be something like\n\nindex\tflat/1");
1237         }
1238
1239         $self->index_type("flat");
1240         $self->index_version($version);
1241
1242         while (<$CON>) {
1243                 chomp;
1244
1245                 # Look for fileid lines
1246                 if ($_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/) {
1247                         my $fileid   = $1;
1248                         my $filename = $2;
1249                         my $filesize = $3;
1250
1251                         if (! -e $filename) {
1252                                 $self->throw("File [$filename] does not exist!");
1253                         }
1254                         if (-s $filename != $filesize) {
1255                                 $self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s $filename) . "] Index thinks it is [" . $filesize  . "]");
1256                         }
1257
1258                         my $fh;
1259                         open($fh,"<", $filename) || $self->throw($!);
1260                         $self->{_file}  {$fileid}   = $filename;
1261                         $self->{_dbfile}{$filename} = $fileid;
1262                         $self->{_size}  {$fileid}   = $filesize;
1263                 }
1264
1265                 # Look for namespace lines
1266                 if ( /(.*)_namespaces?\t(.+)/ ) {
1267               if ($1 eq "primary") {
1268                        $self->primary_namespace($2);
1269               } elsif ($1 eq "secondary") {
1270                        $self->secondary_namespaces(split "\t",$2);
1271               } else {
1272                        $self->throw("Unknown namespace name in config file [$1");
1273               }
1274            }
1275
1276            # Look for format lines
1277            if ($_ =~ /format\t(\S+)/) {
1278               # Check the format here?
1279               my $format = $1;
1280
1281               # handle LSID format
1282               if ($format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))?/) {
1283                  $self->format($1);
1284                  $self->alphabet($2);
1285               } else {  # compatibility with older versions
1286                  $self->format($1);
1287               }
1288             }
1289     }
1290
1291     close($CON);
1292
1293     # Now check we have all that we need
1294
1295     my @fileid_keys = keys (%{$self->{_file}});
1296
1297     if (!(@fileid_keys)) {
1298              $self->throw("No flatfile fileid files in config - check the index has been made correctly");
1299     }
1300
1301     if (!defined($self->primary_namespace)) {
1302             $self->throw("No primary namespace exists");
1303     }
1304
1305     if (! -e $self->primary_index_file) {
1306             $self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist");
1307     }
1308
1309     1;
1310 }
1311
1312 =head2 get_fileid_by_filename
1313
1314  Title   : get_fileid_by_filename
1315  Usage   : $obj->get_fileid_by_filename($newval)
1316  Function:
1317  Example :
1318  Returns : value of get_fileid_by_filename
1319  Args    : newvalue (optional)
1320
1321 =cut
1322
1323 sub get_fileid_by_filename {
1324     my ($self,$file) = @_;
1325
1326     if (!defined($self->{_dbfile})) {
1327         $self->throw("No file to fileid mapping present.  Has the fileid file been read?");
1328     }
1329
1330
1331     return $self->{_dbfile}{$file};
1332 }
1333
1334 =head2 get_filehandle_by_fileid
1335
1336  Title   : get_filehandle_by_fileid
1337  Usage   : $obj->get_filehandle_by_fileid($newval)
1338  Function:
1339  Example :
1340  Returns : value of get_filehandle_by_fileid
1341  Args    : newvalue (optional)
1342
1343 =cut
1344
1345 sub get_filehandle_by_fileid {
1346     my ($self,$fileid) = @_;
1347
1348     if (!defined($self->{_file}{$fileid})) {
1349         $self->throw("ERROR: undefined fileid in index [$fileid]");
1350     }
1351
1352         my $fh;
1353         open($fh,"<", $self->{_file}{$fileid}) || $self->throw($!);
1354     return $fh;
1355 }
1356
1357 =head2 primary_index_file
1358
1359  Title   : primary_index_file
1360  Usage   : $obj->primary_index_file($newval)
1361  Function:
1362  Example :
1363  Returns : value of primary_index_file
1364  Args    : newvalue (optional)
1365
1366
1367 =cut
1368
1369 sub primary_index_file {
1370     my ($self) = @_;
1371
1372     return File::Spec->catfile($self->_config_path,"key_" . $self->primary_namespace . ".key");
1373 }
1374
1375 =head2 primary_index_filehandle
1376
1377  Title   : primary_index_filehandle
1378  Usage   : $obj->primary_index_filehandle($newval)
1379  Function:
1380  Example :
1381  Returns : value of primary_index_filehandle
1382  Args    : newvalue (optional)
1383
1384
1385 =cut
1386
1387 sub primary_index_filehandle {
1388     my ($self) = @_;
1389
1390     unless (defined ($self->{'_primary_index_handle'})) {
1391         open($self->{'_primary_index_handle'}, "<" . $self->primary_index_file) || self->throw($@);
1392     }
1393     return $self->{'_primary_index_handle'};
1394 }
1395
1396 =head2 format
1397
1398  Title   : format
1399  Usage   : $obj->format($newval)
1400  Function:
1401  Example :
1402  Returns : value of format
1403  Args    : newvalue (optional)
1404
1405
1406 =cut
1407
1408 sub format{
1409    my ($obj,$value) = @_;
1410    if( defined $value) {
1411       $obj->{'format'} = $value;
1412     }
1413     return $obj->{'format'};
1414
1415 }
1416
1417 sub alphabet{
1418    my ($obj,$value) = @_;
1419    if( defined $value) {
1420       $obj->{alphabet} = $value;
1421     }
1422     return $obj->{alphabet};
1423 }
1424
1425 =head2 write_flag
1426
1427  Title   : write_flag
1428  Usage   : $obj->write_flag($newval)
1429  Function:
1430  Example :
1431  Returns : value of write_flag
1432  Args    : newvalue (optional)
1433
1434
1435 =cut
1436
1437 sub write_flag{
1438    my ($obj,$value) = @_;
1439    if( defined $value) {
1440       $obj->{'write_flag'} = $value;
1441     }
1442     return $obj->{'write_flag'};
1443
1444 }
1445
1446 =head2 dbname
1447
1448  Title   : dbname
1449  Usage   : $obj->dbname($newval)
1450  Function: get/set database name
1451  Example :
1452  Returns : value of dbname
1453  Args    : newvalue (optional)
1454
1455 =cut
1456
1457 sub dbname {
1458   my $self = shift;
1459   my $d = $self->{flat_dbname};
1460   $self->{flat_dbname} = shift if @_;
1461   $d;
1462 }
1463
1464 =head2 index_directory
1465
1466  Title   : index_directory
1467  Usage   : $obj->index_directory($newval)
1468  Function:
1469  Example :
1470  Returns : value of index_directory
1471  Args    : newvalue (optional)
1472
1473
1474 =cut
1475
1476 sub index_directory {
1477     my ($self,$arg) = @_;
1478
1479     if (defined($arg)) {
1480         if ($arg !~ m{/$}) {
1481             $arg .= "/";
1482         }
1483         $self->{_index_directory} = $arg;
1484     }
1485     return $self->{_index_directory};
1486
1487 }
1488
1489 sub _config_path {
1490   my $self = shift;
1491   my $root = $self->index_directory;
1492   my $dbname = $self->dbname;
1493   File::Spec->catfile($root,$dbname);
1494 }
1495
1496 sub _config_file {
1497   my $self = shift;
1498   my $path = $self->_config_path;
1499   File::Spec->catfile($path,CONFIG_FILE_NAME);
1500 }
1501
1502 =head2 record_size
1503
1504  Title   : record_size
1505  Usage   : $obj->record_size($newval)
1506  Function:
1507  Example :
1508  Returns : value of record_size
1509  Args    : newvalue (optional)
1510
1511
1512 =cut
1513
1514 sub record_size {
1515     my $self = shift;
1516     $self->{_record_size} = shift if @_;
1517     return $self->{_record_size};
1518 }
1519
1520 =head2 primary_namespace
1521
1522  Title   : primary_namespace
1523  Usage   : $obj->primary_namespace($newval)
1524  Function:
1525  Example :
1526  Returns : value of primary_namespace
1527  Args    : newvalue (optional)
1528
1529 =cut
1530
1531 sub primary_namespace {
1532     my $self = shift;
1533     $self->{_primary_namespace} =  shift if @_;
1534     return $self->{_primary_namespace};
1535 }
1536
1537 =head2 index_type
1538
1539  Title   : index_type
1540  Usage   : $obj->index_type($newval)
1541  Function:
1542  Example :
1543  Returns : value of index_type
1544  Args    : newvalue (optional)
1545
1546
1547 =cut
1548
1549 sub index_type {
1550     my $self = shift;
1551     $self->{_index_type} = shift if @_;
1552     return $self->{_index_type};
1553 }
1554
1555 =head2 index_version
1556
1557  Title   : index_version
1558  Usage   : $obj->index_version($newval)
1559  Function:
1560  Example :
1561  Returns : value of index_version
1562  Args    : newvalue (optional)
1563
1564
1565 =cut
1566
1567 sub index_version {
1568     my $self = shift;
1569     $self->{_index_version} = shift if @_;
1570     return $self->{_index_version};
1571 }
1572
1573 =head2 primary_pattern
1574
1575  Title   : primary_pattern
1576  Usage   : $obj->primary_pattern($newval)
1577  Function:
1578  Example :
1579  Returns : value of primary_pattern
1580  Args    : newvalue (optional)
1581
1582
1583 =cut
1584
1585 sub primary_pattern{
1586     my $obj = shift;
1587     $obj->{'primary_pattern'} = shift if @_;
1588     return $obj->{'primary_pattern'};
1589 }
1590
1591 =head2 start_pattern
1592
1593  Title   : start_pattern
1594  Usage   : $obj->start_pattern($newval)
1595  Function:
1596  Example :
1597  Returns : value of start_pattern
1598  Args    : newvalue (optional)
1599
1600
1601 =cut
1602
1603 sub start_pattern{
1604     my $obj = shift;
1605     $obj->{'start_pattern'} = shift if @_;
1606     return $obj->{'start_pattern'};
1607 }
1608
1609 =head2 secondary_patterns
1610
1611  Title   : secondary_patterns
1612  Usage   : $obj->secondary_patterns($newval)
1613  Function:
1614  Example :
1615  Returns : value of secondary_patterns
1616  Args    : newvalue (optional)
1617
1618
1619 =cut
1620
1621 sub secondary_patterns{
1622    my ($obj,$value) = @_;
1623    if( defined $value) {
1624       $obj->{'secondary_patterns'} = $value;
1625
1626       my @names = keys %$value;
1627
1628       foreach my $name (@names) {
1629           $obj->secondary_namespaces($name);
1630       }
1631     }
1632     return $obj->{'secondary_patterns'};
1633
1634 }
1635
1636 =head2 secondary_namespaces
1637
1638  Title   : secondary_namespaces
1639  Usage   : $obj->secondary_namespaces($newval)
1640  Function:
1641  Example :
1642  Returns : value of secondary_namespaces
1643  Args    : newvalue (optional)
1644
1645
1646 =cut
1647
1648 sub secondary_namespaces {
1649     my ($obj,@values) = @_;
1650
1651     if (@values) {
1652         push(@{$obj->{'secondary_namespaces'}},@values);
1653     }
1654     return @{$obj->{'secondary_namespaces'} || []};
1655 }
1656
1657
1658
1659 ## These are indexing routines to index commonly used format - fasta
1660 ## swissprot and embl
1661
1662 sub new_SWISSPROT_index {
1663     my ($self,$index_dir,@files) = @_;
1664
1665     my %secondary_patterns;
1666
1667     my $start_pattern = "^ID   (\\S+)";
1668     my $primary_pattern = "^AC   (\\S+)\\;";
1669
1670     $secondary_patterns{"ID"} = $start_pattern;
1671
1672     my $index =  Bio::DB::Flat::BinarySearch->new
1673         (-index_dir          => $index_dir,
1674          -format             => 'swissprot',
1675          -primary_pattern    => $primary_pattern,
1676          -primary_namespace  => "ACC",
1677          -start_pattern      => $start_pattern,
1678          -secondary_patterns => \%secondary_patterns);
1679
1680     $index->build_index(@files);
1681 }
1682
1683 sub new_EMBL_index {
1684    my ($self,$index_dir,@files) = @_;
1685
1686    my %secondary_patterns;
1687
1688    my $start_pattern = "^ID   (\\S+)";
1689    my $primary_pattern = "^AC   (\\S+)\\;";
1690    my $primary_namespace = "ACC";
1691
1692    $secondary_patterns{"ID"} = $start_pattern;
1693
1694    my $index = Bio::DB::Flat::BinarySearch->new
1695        (-index_dir          => $index_dir,
1696         -format             => 'embl',
1697         -primary_pattern    => $primary_pattern,
1698         -primary_namespace  => "ACC",
1699         -start_pattern      => $start_pattern,
1700         -secondary_patterns => \%secondary_patterns);
1701
1702     $index->build_index(@files);
1703
1704    return $index;
1705 }
1706
1707 sub new_FASTA_index {
1708    my ($self,$index_dir,@files) =  @_;
1709
1710    my %secondary_patterns;
1711
1712    my $start_pattern = "^>";
1713    my $primary_pattern = "^>(\\S+)";
1714    my $primary_namespace = "ACC";
1715
1716    $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1717
1718    my $index =  Bio::DB::Flat::BinarySearch->new
1719        (-index_dir          => $index_dir,
1720         -format             => 'fasta',
1721         -primary_pattern    => $primary_pattern,
1722         -primary_namespace  => "ACC",
1723         -start_pattern      => $start_pattern,
1724         -secondary_patterns => \%secondary_patterns);
1725
1726    $index->build_index(@files);
1727
1728    return $index;
1729 }
1730
1731 # EVERYTHING THAT FOLLOWS THIS
1732 # is an awful hack - in reality Michele's code needs to be rewritten
1733 # to use Bio::SeqIO, but I have too little time to do this -- LS
1734 sub guess_alphabet {
1735   my $self = shift;
1736   my $line = shift;
1737
1738   my $format = $self->format;
1739   return 'protein' if $format eq 'swissprot';
1740
1741   if ($format eq 'genbank') {
1742     return unless $line =~ /^LOCUS/;
1743     return 'dna' if $line =~ /\s+\d+\s+bp/i;
1744     return 'protein';
1745   }
1746
1747   if ($format eq 'embl') {
1748     return unless $line =~ /^ID/;
1749     return 'dna' if $line =~ / DNA;/i;
1750     return 'rna' if $line =~ / RNA;/i;
1751     return 'protein';
1752   }
1753
1754   return;
1755 }
1756
1757 # return (namespace,primary_pattern,start_pattern,secondary_pattern)
1758 sub _guess_patterns {
1759   my $self = shift;
1760   my $format = shift;
1761   if ($format =~ /swiss(prot)?/i) {
1762     return ('ID',
1763             "^ID   (\\S+)",
1764             "^ID   (\\S+)",
1765             {
1766              ACC  => "^AC   (\\S+);"
1767             });
1768   }
1769
1770   if ($format =~ /embl/i) {
1771     return ('ID',
1772             "^ID   (\\S+)",
1773             "^ID   (\\S+)",
1774             {
1775              ACC     => q/^AC   (\S+);/,
1776              VERSION => q/^SV\s+(\S+)/
1777             });
1778   }
1779
1780   if ($format =~ /genbank/i) {
1781     return ('ID',
1782             q/^LOCUS\s+(\S+)/,
1783             q/^LOCUS/,
1784             {
1785              ACC     => q/^ACCESSION\s+(\S+)/,
1786              VERSION => q/^VERSION\s+(\S+)/
1787             });
1788   }
1789
1790   if ($format =~ /fasta/i) {
1791     return ('ACC',
1792             '^>(\S+)',
1793             '^>(\S+)',
1794            );
1795   }
1796
1797   $self->throw("I can't handle format $format");
1798
1799 }
1800
1801 1;