Bio/DB/Flat/BinarySearch.pm

   1 #
   2 # BioPerl module for Bio::DB::Flat::BinarySearch
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Michele Clamp <michele@sanger.ac.uk>>
   7 #
   8 # You may distribute this module under the same terms as perl itself
   9
  10 # POD documentation - main docs before the code
  11
  12 =head1 NAME
  13
  14 Bio::DB::Flat::BinarySearch - BinarySearch search indexing system for sequence files
  15
  16 =head1 SYNOPSIS
  17
  18   TODO: SYNOPSIS NEEDED!
  19
  20 =head1 DESCRIPTION
  21
  22 This module can be used both to index sequence files and also to
  23 retrieve sequences from existing sequence files.
  24
  25 This object allows indexing of sequence files both by a primary key
  26 (say accession) and multiple secondary keys (say ids).  This is
  27 different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>)
  28 which uses DBM files as storage.  This module uses a binary search to
  29 retrieve sequences which is more efficient for large datasets.
  30
  31 =head2 Index creation
  32
  33     my $sequencefile;  # Some fasta sequence file
  34
  35 Patterns have to be entered to define where the keys are to be indexed
  36 and also where the start of each record.  E.g. for fasta
  37
  38     my $start_pattern   = '^>';
  39     my $primary_pattern = '^>(\S+)';
  40
  41 So the start of a record is a line starting with a E<gt> and the
  42 primary key is all characters up to the first space after the E<gt>
  43
  44 A string also has to be entered to defined what the primary key
  45 (primary_namespace) is called.
  46
  47 The index can now be created using
  48
  49     my $index = Bio::DB::Flat::BinarySearch->new(
  50              -directory         => "/home/max/",
  51              -dbname            => "mydb",
  52               -start_pattern     => $start_pattern,
  53               -primary_pattern   => $primary_pattern,
  54              -primary_namespace => "ID",
  55               -format            => "fasta" );
  56
  57     my @files = ("file1","file2","file3");
  58
  59     $index->build_index(@files);
  60
  61 The index is now ready to use.  For large sequence files the perl way
  62 of indexing takes a *long* time and a *huge* amount of memory.  For
  63 indexing things like dbEST I recommend using the DB_File indexer, BDB.
  64
  65 The formats currently supported by this module are fasta, Swissprot,
  66 and EMBL.
  67
  68 =head2 Creating indices with secondary keys
  69
  70 Sometimes just indexing files with one id per entry is not enough.  For
  71 instance you may want to retrieve sequences from swissprot using
  72 their accessions as well as their ids.
  73
  74 To be able to do this when creating your index you need to pass in
  75 a hash of secondary_patterns which have their namespaces as the keys
  76 to the hash.
  77
  78 e.g. For Indexing something like
  79
  80 ID   1433_CAEEL     STANDARD;      PRT;   248 AA.
  81 AC   P41932;
  82 DT   01-NOV-1995 (Rel. 32, Created)
  83 DT   01-NOV-1995 (Rel. 32, Last sequence update)
  84 DT   15-DEC-1998 (Rel. 37, Last annotation update)
  85 DE   14-3-3-LIKE PROTEIN 1.
  86 GN   FTT-1 OR M117.2.
  87 OS   Caenorhabditis elegans.
  88 OC   Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
  89 OC   Rhabditidae; Peloderinae; Caenorhabditis.
  90 OX   NCBI_TaxID=6239;
  91 RN   [1]
  92
  93 where we want to index the accession (P41932) as the primary key and the
  94 id (1433_CAEEL) as the secondary id.  The index is created as follows
  95
  96     my %secondary_patterns;
  97
  98     my $start_pattern   = '^ID   (\S+)';
  99     my $primary_pattern = '^AC   (\S+)\;';
 100
 101     $secondary_patterns{"ID"} = '^ID   (\S+)';
 102
 103     my $index = Bio::DB::Flat::BinarySearch->new(
 104                 -directory          => $index_directory,
 105                   -dbname             => "ppp",
 106                   -write_flag         => 1,
 107                 -verbose            => 1,
 108                 -start_pattern      => $start_pattern,
 109                 -primary_pattern    => $primary_pattern,
 110                 -primary_namespace  => 'AC',
 111                 -secondary_patterns => \%secondary_patterns);
 112
 113     $index->build_index($seqfile);
 114
 115 Of course having secondary indices makes indexing slower and use more
 116 memory.
 117
 118 =head2 Index reading
 119
 120 To fetch sequences using an existing index first of all create your sequence
 121 object
 122
 123     my $index = Bio::DB::Flat::BinarySearch->new(
 124                   -directory => $index_directory);
 125
 126 Now you can happily fetch sequences either by the primary key or
 127 by the secondary keys.
 128
 129     my $entry = $index->get_entry_by_id('HBA_HUMAN');
 130
 131 This returns just a string containing the whole entry.  This is
 132 useful is you just want to print the sequence to screen or write it to a file.
 133
 134 Other ways of getting sequences are
 135
 136     my $fh = $index->get_stream_by_id('HBA_HUMAN');
 137
 138 This can then be passed to a seqio object for output or converting
 139 into objects.
 140
 141     my $seq = Bio::SeqIO->new(-fh     => $fh,
 142                                 -format => 'fasta');
 143
 144 The last way is to retrieve a sequence directly.  This is the
 145 slowest way of extracting as the sequence objects need to be made.
 146
 147     my $seq = $index->get_Seq_by_id('HBA_HUMAN');
 148
 149 To access the secondary indices the secondary namespace needs to be known
 150
 151     $index->secondary_namespaces("ID");
 152
 153 Then the following call can be used
 154
 155     my $seq   = $index->get_Seq_by_secondary('ID','1433_CAEEL');
 156
 157 These calls are not yet implemented
 158
 159     my $fh    = $index->get_stream_by_secondary('ID','1433_CAEEL');
 160     my $entry = $index->get_entry_by_secondary('ID','1433_CAEEL');
 161
 162 =head1 FEEDBACK
 163
 164 =head2 Mailing Lists
 165
 166 User feedback is an integral part of the evolution of this and other
 167 Bioperl modules. Send your comments and suggestions preferably to one
 168 of the Bioperl mailing lists.  Your participation is much appreciated.
 169
 170   bioperl-l@bioperl.org                  - General discussion
 171   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 172
 173 =head2 Support
 174
 175 Please direct usage questions or support issues to the mailing list:
 176
 177 I<bioperl-l@bioperl.org>
 178
 179 rather than to the module maintainer directly. Many experienced and
 180 reponsive experts will be able look at the problem and quickly
 181 address it. Please include a thorough description of the problem
 182 with code and data examples if at all possible.
 183
 184 =head2 Reporting Bugs
 185
 186 Report bugs to the Bioperl bug tracking system to help us keep track
 187 the bugs and their resolution.  Bug reports can be submitted via the
 188 web:
 189
 190   https://redmine.open-bio.org/projects/bioperl/
 191
 192 =head1 AUTHOR - Michele Clamp
 193
 194 Email - michele@sanger.ac.uk
 195
 196 =head1 CONTRIBUTORS
 197
 198 Jason Stajich, jason@bioperl.org
 199
 200 =head1 APPENDIX
 201
 202 The rest of the documentation details each of the object methods. Internal
 203 methods are usually preceded with an "_" (underscore).
 204
 205 =cut
 206
 207 package Bio::DB::Flat::BinarySearch;
 208
 209 use strict;
 210
 211 use Fcntl qw(SEEK_END SEEK_CUR);
 212
 213 # rather than using tell which might be buffered
 214 sub systell { sysseek( $_[0], 0, SEEK_CUR ) }
 215 sub syseof  { sysseek( $_[0], 0, SEEK_END ) }
 216
 217 use File::Spec;
 218 use Bio::Root::RootI;
 219 use Bio::SeqIO;
 220 use Bio::Seq;
 221
 222 use base qw(Bio::DB::RandomAccessI);
 223
 224 use constant CONFIG_FILE_NAME => 'config.dat';
 225 use constant HEADER_SIZE      => 4;
 226 use constant DEFAULT_FORMAT   => 'fasta';
 227
 228 my @formats = [ 'FASTA', 'SWISSPROT', 'EMBL' ];
 229
 230 =head2 new
 231
 232  Title   : new
 233  Usage   : For reading
 234              my $index = Bio::DB::Flat::BinarySearch->new(
 235                      -directory => '/Users/michele/indices/dbest',
 236              -dbname    => 'mydb',
 237                      -format    => 'fasta');
 238
 239            For writing
 240
 241              my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
 242              my $index = Bio::DB::Flat::BinarySearch->new(
 243              -directory          => '/Users/michele/indices',
 244                      -dbname             => 'mydb',
 245              -primary_pattern    => "^>(\\S+)",
 246                      -secondary_patterns => \%secondary_patterns,
 247              -primary_namespace  => "ID");
 248
 249              my @files = ('file1','file2','file3');
 250
 251              $index->build_index(@files);
 252
 253
 254  Function: create a new Bio::DB::Flat::BinarySearch object
 255  Returns : new Bio::DB::Flat::BinarySearch
 256  Args    : -directory          Root directory for index files
 257            -dbname             Name of subdirectory containing indices
 258                                for named database
 259            -write_flag         Allow building index
 260            -primary_pattern    Regexp defining the primary id
 261            -secondary_patterns A hash ref containing the secondary
 262                                patterns with the namespaces as keys
 263            -primary_namespace  A string defining what the primary key
 264                                is
 265
 266  Status  : Public
 267
 268 =cut
 269
 270 sub new {
 271     my ( $class, @args ) = @_;
 272
 273     my $self = $class->SUPER::new(@args);
 274
 275     bless $self, $class;
 276
 277     my ( $index_dir, $dbname, $format, $write_flag, $primary_pattern,
 278         $primary_namespace, $start_pattern, $secondary_patterns )
 279       = $self->_rearrange(
 280         [
 281             qw(DIRECTORY
 282               DBNAME
 283               FORMAT
 284               WRITE_FLAG
 285               PRIMARY_PATTERN
 286               PRIMARY_NAMESPACE
 287               START_PATTERN
 288               SECONDARY_PATTERNS)
 289         ],
 290         @args
 291       );
 292
 293     $self->index_directory($index_dir);
 294     $self->dbname($dbname);
 295
 296     if ( $self->index_directory && $self->read_config_file ) {
 297
 298         my $fh           = $self->primary_index_filehandle;
 299         my $record_width = $self->read_header($fh);
 300         $self->record_size($record_width);
 301     }
 302     $format ||= DEFAULT_FORMAT;
 303     $self->format($format);
 304     $self->write_flag($write_flag);
 305
 306     if ( $self->write_flag && !$primary_namespace ) {
 307         (
 308             $primary_namespace, $primary_pattern,
 309             $start_pattern,     $secondary_patterns
 310         ) = $self->_guess_patterns( $self->format );
 311     }
 312
 313     $self->primary_pattern($primary_pattern);
 314     $self->primary_namespace($primary_namespace);
 315     $self->start_pattern($start_pattern);
 316     $self->secondary_patterns($secondary_patterns);
 317
 318     return $self;
 319 }
 320
 321 sub new_from_registry {
 322     my ( $self, %config ) = @_;
 323
 324     my $dbname   = $config{'dbname'};
 325     my $location = $config{'location'};
 326
 327     my $index = Bio::DB::Flat::BinarySearch->new(
 328         -dbname    => $dbname,
 329         -index_dir => $location,
 330     );
 331 }
 332
 333 =head2 get_Seq_by_id
 334
 335  Title   : get_Seq_by_id
 336  Usage   : $obj->get_Seq_by_id($newval)
 337  Function:
 338  Example :
 339  Returns : value of get_Seq_by_id
 340  Args    : newvalue (optional)
 341
 342 =cut
 343
 344 sub get_Seq_by_id {
 345     my ( $self, $id ) = @_;
 346
 347     # too many uninit variables...
 348     local $^W = 0;
 349
 350     my ( $fh, $length ) = $self->get_stream_by_id($id);
 351
 352     unless ( defined( $self->format ) ) {
 353         $self->throw("Can't create sequence - format is not defined");
 354     }
 355
 356     return unless $fh;
 357
 358     unless ( defined( $self->{_seqio} ) ) {
 359
 360         $self->{_seqio} = Bio::SeqIO->new(
 361             -fh     => $fh,
 362             -format => $self->format
 363         );
 364     }
 365     else {
 366         $self->{_seqio}->fh($fh);
 367     }
 368
 369     return $self->{_seqio}->next_seq;
 370 }
 371
 372 =head2 get_entry_by_id
 373
 374  Title   : get_entry_by_id
 375  Usage   : $obj->get_entry_by_id($newval)
 376  Function: Get a Bio::SeqI object for a unique ID
 377  Returns : Bio::SeqI
 378  Args    : string
 379
 380
 381 =cut
 382
 383 sub get_entry_by_id {
 384     my ( $self, $id ) = @_;
 385
 386     my ( $fh, $length ) = $self->get_stream_by_id($id);
 387
 388     my $entry;
 389
 390     sysread( $fh, $entry, $length );
 391
 392     return $entry;
 393 }
 394
 395 =head2 get_stream_by_id
 396
 397  Title   : get_stream_by_id
 398  Usage   : $obj->get_stream_by_id($id)
 399  Function: Gets a Sequence stream for an id
 400  Returns : Bio::SeqIO stream
 401  Args    : Id to lookup by
 402
 403
 404 =cut
 405
 406 sub get_stream_by_id {
 407     my ( $self, $id ) = @_;
 408
 409     unless ( $self->record_size ) {
 410         if ( $self->index_directory && $self->read_config_file ) {
 411
 412             my $fh           = $self->primary_index_filehandle;
 413             my $record_width = $self->read_header($fh);
 414             $self->record_size($record_width);
 415         }
 416     }
 417     my $indexfh = $self->primary_index_filehandle;
 418     syseof($indexfh);
 419
 420     my $filesize = systell($indexfh);
 421
 422     $self->throw("file was not parsed properly, record size is empty")
 423       unless $self->record_size;
 424
 425     my $end = ( $filesize - $self->{'_start_pos'} ) / $self->record_size;
 426     my ( $newid, $rest, $fhpos ) =
 427       $self->find_entry( $indexfh, 0, $end, $id, $self->record_size );
 428
 429     my ( $fileid, $pos, $length ) = split( /\t/, $rest );
 430
 431 #print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
 432
 433     if ( !$newid ) {
 434         return;
 435     }
 436
 437     my $file = $self->{_file}{$fileid};
 438
 439     open my $IN, '<', $file or $self->throw("Could not read file '$file': $!");
 440
 441     my $entry;
 442
 443     sysseek( $IN, $pos, 0 );
 444
 445     return ( $IN, $length );
 446 }
 447
 448 =head2 get_Seq_by_acc
 449
 450  Title   : get_Seq_by_acc
 451  Usage   : $obj->get_Seq_by_acc($acc)
 452  Function: Gets a Bio::SeqI object by accession number
 453  Returns : Bio::SeqI object
 454  Args    : string representing accession number
 455
 456
 457 =cut
 458
 459 sub get_Seq_by_acc {
 460     my ( $self, $acc ) = @_;
 461
 462     # too many uninit variables...
 463     local $^W = 0;
 464
 465     if ( $self->primary_namespace eq "ACC" ) {
 466         return $self->get_Seq_by_id($acc);
 467     }
 468     else {
 469         return $self->get_Seq_by_secondary( "ACC", $acc );
 470     }
 471 }
 472
 473 =head2 get_Seq_by_version
 474
 475  Title   : get_Seq_by_version
 476  Usage   : $obj->get_Seq_by_version($version)
 477  Function: Gets a Bio::SeqI object by accession.version number
 478  Returns : Bio::SeqI object
 479  Args    : string representing accession.version number
 480
 481
 482 =cut
 483
 484 sub get_Seq_by_version {
 485     my ( $self, $acc ) = @_;
 486
 487     # too many uninit variables...
 488     local $^W = 0;
 489
 490     if ( $self->primary_namespace eq "VERSION" ) {
 491         return $self->get_Seq_by_id($acc);
 492     }
 493     else {
 494         return $self->get_Seq_by_secondary( "VERSION", $acc );
 495     }
 496 }
 497
 498 =head2 get_Seq_by_secondary
 499
 500  Title   : get_Seq_by_secondary
 501  Usage   : $obj->get_Seq_by_secondary($namespace,$acc)
 502  Function: Gets a Bio::SeqI object looking up secondary accessions
 503  Returns : Bio::SeqI object
 504  Args    : namespace name to check secondary namespace and an id
 505
 506
 507 =cut
 508
 509 sub get_Seq_by_secondary {
 510     my ( $self, $name, $id ) = @_;
 511
 512     my @names = $self->secondary_namespaces;
 513
 514     my $found = 0;
 515     foreach my $tmpname (@names) {
 516         if ( $name eq $tmpname ) {
 517             $found = 1;
 518         }
 519     }
 520
 521     if ( $found == 0 ) {
 522         $self->throw("Secondary index for $name doesn't exist\n");
 523     }
 524
 525     my $fh = $self->open_secondary_index($name);
 526
 527     syseof($fh);
 528
 529     my $filesize = systell($fh);
 530
 531     my $recsize = $self->{'_secondary_record_size'}{$name};
 532
 533     #    print "Name " . $recsize . "\n";
 534
 535     my $end = ( $filesize - $self->{'_start_pos'} ) / $recsize;
 536
 537     #    print "End $end $filesize\n";
 538     my ( $newid, $primary_id, $pos ) =
 539       $self->find_entry( $fh, 0, $end, $id, $recsize );
 540
 541     sysseek( $fh, $pos, 0 );
 542
 543     #    print "Found new id $newid $primary_id\n";
 544     # We now need to shuffle up the index file to find the top secondary entry
 545
 546     my $record = $newid;
 547
 548     while ( $record =~ /^$newid/ && $pos >= 0 ) {
 549
 550         $record = $self->read_record( $fh, $pos, $recsize );
 551         $pos = $pos - $recsize;
 552
 553         #       print "Up record = $record:$newid\n";
 554     }
 555
 556     $pos += $recsize;
 557
 558     #    print "Top position is $pos\n";
 559
 560     # Now we have to shuffle back down again to read all the secondary entries
 561
 562     my $current_id = $newid;
 563     my %primary_id;
 564
 565     $primary_id{$primary_id} = 1;
 566
 567     while ( $current_id eq $newid ) {
 568         $record = $self->read_record( $fh, $pos, $recsize );
 569
 570         # print "Record is :$record:\n";
 571         my ( $secid, $primary_id ) = split( /\t/, $record, 2 );
 572         $current_id = $secid;
 573
 574         if ( $current_id eq $newid ) {
 575             $primary_id =~ s/ //g;
 576
 577             #    print "Primary $primary_id\n";
 578             $primary_id{$primary_id} = 1;
 579
 580             $pos = $pos + $recsize;
 581
 582             #   print "Down record = $record\n";
 583         }
 584     }
 585
 586     if ( !defined($newid) ) {
 587         return;
 588     }
 589
 590     my @entry;
 591
 592     foreach my $id ( keys %primary_id ) {
 593         push @entry, $self->get_Seq_by_id($id);
 594     }
 595     return wantarray ? @entry : $entry[0];
 596
 597 }
 598
 599 =head2 read_header
 600
 601  Title   : read_header
 602  Usage   : $obj->read_header($fhl)
 603  Function: Reads the header from the db file
 604  Returns : width of a record
 605  Args    : filehandle
 606
 607
 608 =cut
 609
 610 sub read_header {
 611     my ( $self, $fh ) = @_;
 612
 613     my $record_width;
 614
 615     sysread( $fh, $record_width, HEADER_SIZE );
 616
 617     $self->{'_start_pos'} = HEADER_SIZE;
 618     $record_width =~ s/ //g;
 619     $record_width = $record_width * 1;
 620
 621     return $record_width;
 622 }
 623
 624 =head2 read_record
 625
 626  Title   : read_record
 627  Usage   : $obj->read_record($fh,$pos,$len)
 628  Function: Reads a record from a filehandle
 629  Returns : String
 630  Args    : filehandle, offset, and length
 631
 632
 633 =cut
 634
 635 sub read_record {
 636     my ( $self, $fh, $pos, $len ) = @_;
 637
 638     sysseek( $fh, $pos, 0 );
 639
 640     my $record;
 641
 642     sysread( $fh, $record, $len );
 643
 644     return $record;
 645
 646 }
 647
 648 =head2 get_all_primary_ids
 649
 650  Title   : get_all_primary_ids
 651  Usage   : @ids = $seqdb->get_all_primary_ids()
 652  Function: gives an array of all the primary_ids of the
 653            sequence objects in the database.
 654  Returns : an array of strings
 655  Args    : none
 656
 657 =cut
 658
 659 sub get_all_primary_ids {
 660     my $self = shift;
 661
 662     my $fh = $self->primary_index_filehandle;
 663     syseof($fh);
 664     my $filesize = systell($fh);
 665     my $recsize  = $self->record_size;
 666     my $end      = $filesize;
 667
 668     my @ids;
 669     for ( my $pos = $self->{'_start_pos'} ; $pos < $end ; $pos += $recsize ) {
 670         my $record = $self->read_record( $fh, $pos, $recsize );
 671         my ($entryid) = split( /\t/, $record );
 672         push @ids, $entryid;
 673     }
 674     @ids;
 675 }
 676
 677 =head2 find_entry
 678
 679  Title   : find_entry
 680  Usage   : $obj->find_entry($fh,$start,$end,$id,$recsize)
 681  Function: Extract an entry based on the start,end,id and record size
 682  Returns : string
 683  Args    : filehandle, start, end, id, recordsize
 684
 685
 686 =cut
 687
 688 sub find_entry {
 689     my ( $self, $fh, $start, $end, $id, $recsize ) = @_;
 690
 691     my $mid = int( ( $end + 1 + $start ) / 2 );
 692     my $pos = ( $mid - 1 ) * $recsize + $self->{'_start_pos'};
 693
 694     my ($record) = $self->read_record( $fh, $pos, $recsize );
 695     my ( $entryid, $rest ) = split( /\t/, $record, 2 );
 696     $rest =~ s/\s+$//;
 697
 698     #    print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
 699     #    print "Entry :$id:$entryid:$rest\n";
 700
 701     my ( $first, $second ) =
 702       $id le $entryid ? ( $id, $entryid ) : ( $entryid, $id );
 703
 704     if ( $id eq $entryid ) {
 705
 706         return ( $id, $rest, $pos - $recsize );
 707
 708     }
 709     elsif ( $first eq $id ) {
 710
 711         if ( $end - $start <= 1 ) {
 712             return;
 713         }
 714         my $end = $mid;
 715
 716         #      print "Moving up $entryid $id\n";
 717         $self->find_entry( $fh, $start, $end, $id, $recsize );
 718
 719     }
 720     elsif ( $second eq $id ) {
 721
 722         #       print "Moving down $entryid $id\n";
 723         if ( $end - $start <= 1 ) {
 724             return;
 725         }
 726
 727         $start = $mid;
 728
 729         $self->find_entry( $fh, $start, $end, $id, $recsize );
 730     }
 731
 732 }
 733
 734 =head2 build_index
 735
 736  Title   : build_index
 737  Usage   : $obj->build_index(@files)
 738  Function: Build the index based on a set of files
 739  Returns : count of the number of entries
 740  Args    : List of filenames
 741
 742
 743 =cut
 744
 745 sub build_index {
 746     my ( $self, @files ) = @_;
 747     $self->write_flag
 748       or $self->throw('Cannot build index unless -write_flag is true');
 749
 750     my $rootdir = $self->index_directory;
 751
 752     if ( !defined($rootdir) ) {
 753         $self->throw("No index directory set - can't build indices");
 754     }
 755
 756     if ( !-d $rootdir ) {
 757         $self->throw(
 758             "Index directory [$rootdir] is not a directory. Cant' build indices"
 759         );
 760     }
 761
 762     my $dbpath = File::Spec->catfile( $rootdir, $self->dbname );
 763     if ( !-d $dbpath ) {
 764         warn "Creating directory $dbpath\n";
 765         mkdir $dbpath, 0777 or $self->throw("Couldn't create $dbpath: $!");
 766     }
 767
 768     unless (@files) {
 769         $self->throw("Must enter an array of filenames to index");
 770     }
 771
 772     foreach my $file (@files) {
 773         $file = File::Spec->rel2abs($file)
 774           unless File::Spec->file_name_is_absolute($file);
 775         unless ( -e $file ) {
 776             $self->throw("Can't index file [$file] as it doesn't exist");
 777         }
 778     }
 779
 780     if ( my $filehash = $self->{_dbfile} ) {
 781         push @files, keys %$filehash;
 782     }
 783
 784     my %seen;
 785     @files = grep { !$seen{$_}++ } @files;
 786
 787     # Lets index
 788     $self->make_config_file( \@files );
 789     my $entries = 0;
 790     foreach my $file (@files) {
 791         $entries += $self->_index_file($file);
 792     }
 793
 794     # update alphabet if necessary
 795     $self->make_config_file( \@files );
 796
 797     # And finally write out the indices
 798     $self->write_primary_index;
 799     $self->write_secondary_indices;
 800
 801     $entries;
 802 }
 803
 804 =head2 _index_file
 805
 806  Title   : _index_file
 807  Usage   : $obj->_index_file($newval)
 808  Function:
 809  Example :
 810  Returns : value of _index_file
 811  Args    : newvalue (optional)
 812
 813 =cut
 814
 815 sub _index_file {
 816     my ( $self, $file ) = @_;
 817     my $v = $self->verbose;
 818     open my $FILE, '<', $file or $self->throw("Could not read file '$file': $!");
 819
 820     my $recstart = 0;
 821     my $fileid   = $self->get_fileid_by_filename($file);
 822     my $found    = 0;
 823     my $id;
 824     my $count = 0;
 825
 826     my $primary       = $self->primary_pattern;
 827     my $start_pattern = $self->start_pattern;
 828
 829     my $pos = 0;
 830
 831     my $new_primary_entry;
 832
 833     my $length;
 834
 835     my $fh = $FILE;
 836
 837     my $done = -1;
 838
 839     my @secondary_names = $self->secondary_namespaces;
 840     my %secondary_id;
 841     my $last_one;
 842
 843     # In Windows, text files have '\r\n' as line separator, but when reading in
 844     # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
 845     # "length $_" will report 4 although the line is 5 bytes in length.
 846     # We assume that all lines have the same line separator and only read current line.
 847     my $init_pos   = tell($fh);
 848     my $curr_line  = <$fh>;
 849     my $pos_diff   = tell($fh) - $init_pos;
 850     my $correction = $pos_diff - length $curr_line;
 851     seek $fh, $init_pos, 0; # Rewind position to proceed to read the file
 852
 853     while (<$fh>) {
 854         $last_one = $_;
 855         $self->{alphabet} ||= $self->guess_alphabet($_);
 856         if ( $_ =~ /$start_pattern/ ) {
 857             if ( $done == 0 ) {
 858                 $id = $new_primary_entry;
 859                 $self->{alphabet} ||= $self->guess_alphabet($_);
 860
 861                 my $tmplen = ( tell $fh ) - length($_) - $correction;
 862
 863                 $length = $tmplen - $pos;
 864
 865                 unless ( defined($id) ) {
 866                     $self->throw("No id defined for sequence");
 867                 }
 868                 unless ( defined($fileid) ) {
 869                     $self->throw("No fileid defined for file $file");
 870                 }
 871                 unless ( defined($pos) ) {
 872                     $self->throw( "No position defined for " . $id . "\n" );
 873                 }
 874                 unless ( defined($length) ) {
 875                     $self->throw( "No length defined for " . $id . "\n" );
 876                 }
 877                 $self->_add_id_position( $id, $pos, $fileid, $length,
 878                     \%secondary_id );
 879
 880                 $pos = $tmplen;
 881
 882                 if ( $count > 0 && $count % 1000 == 0 ) {
 883                     $self->debug("Indexed $count ids\n") if $v > 0;
 884                 }
 885
 886                 $count++;
 887             }
 888             else {
 889                 $done = 0;
 890             }
 891         }
 892
 893         if ( $_ =~ /$primary/ ) {
 894             $new_primary_entry = $1;
 895         }
 896
 897         my $secondary_patterns = $self->secondary_patterns;
 898
 899         foreach my $sec (@secondary_names) {
 900             my $pattern = $secondary_patterns->{$sec};
 901
 902             if ( $_ =~ /$pattern/ ) {
 903                 $secondary_id{$sec} = $1;
 904             }
 905         }
 906
 907     }
 908
 909     # Remember to add in the last one
 910
 911     $id = $new_primary_entry;
 912
 913     # my $tmplen = (tell $fh) - length($last_one);
 914     my $tmplen = ( tell $fh );
 915
 916     $length = $tmplen - $pos;
 917
 918     if ( !defined($id) ) {
 919         $self->throw("No id defined for sequence");
 920     }
 921     if ( !defined($fileid) ) {
 922         $self->throw("No fileid defined for file $file");
 923     }
 924     if ( !defined($pos) ) {
 925         $self->throw( "No position defined for " . $id . "\n" );
 926     }
 927     if ( !defined($length) ) {
 928         $self->throw( "No length defined for " . $id . "\n" );
 929     }
 930
 931     $self->_add_id_position( $id, $pos, $fileid, $length, \%secondary_id );
 932     $count++;
 933
 934     close $FILE;
 935     $count;
 936 }
 937
 938 =head2 write_primary_index
 939
 940  Title   : write_primary_index
 941  Usage   : $obj->write_primary_index($newval)
 942  Function:
 943  Example :
 944  Returns : value of write_primary_index
 945  Args    : newvalue (optional)
 946
 947
 948 =cut
 949
 950 sub write_primary_index {
 951     my ($self) = @_;
 952
 953     my @ids = keys %{ $self->{_id} };
 954
 955     @ids = sort { $a cmp $b } @ids;
 956
 957     open my $INDEX, '>', $self->primary_index_file
 958       or $self->throw(
 959         "Could not write primary index file '" . $self->primary_index_file . "': $!" );
 960
 961     my $recordlength =
 962       $self->{_maxidlength} +
 963       $self->{_maxfileidlength} +
 964       $self->{_maxposlength} +
 965       $self->{_maxlengthlength} + 3;
 966
 967     print $INDEX sprintf( "%04d", $recordlength );
 968
 969     foreach my $id (@ids) {
 970
 971         if ( !defined( $self->{_id}{$id}{_fileid} ) ) {
 972             $self->throw("No fileid for $id\n");
 973         }
 974         if ( !defined( $self->{_id}{$id}{_pos} ) ) {
 975             $self->throw("No position for $id\n");
 976         }
 977         if ( !defined( $self->{_id}{$id}{_length} ) ) {
 978             $self->throw("No length for $id");
 979         }
 980
 981         my $record =
 982             $id . "\t"
 983           . $self->{_id}{$id}{_fileid} . "\t"
 984           . $self->{_id}{$id}{_pos} . "\t"
 985           . $self->{_id}{$id}{_length};
 986
 987         print $INDEX sprintf( "%-${recordlength}s", $record );
 988
 989     }
 990 }
 991
 992 =head2 write_secondary_indices
 993
 994  Title   : write_secondary_indices
 995  Usage   : $obj->write_secondary_indices($newval)
 996  Function:
 997  Example :
 998  Returns : value of write_secondary_indices
 999  Args    : newvalue (optional)
1000
1001
1002 =cut
1003
1004 sub write_secondary_indices {
1005     my ($self) = @_;
1006
1007     # These are the different
1008     my @names = keys( %{ $self->{_secondary_id} } );
1009
1010     foreach my $name (@names) {
1011
1012         my @seconds = keys %{ $self->{_secondary_id}{$name} };
1013
1014         # First we need to loop over to get the longest record.
1015         my $length = 0;
1016
1017         foreach my $second (@seconds) {
1018             my $tmplen = length($second) + 1;
1019             my @prims  = keys %{ $self->{_secondary_id}{$name}{$second} };
1020
1021             foreach my $prim (@prims) {
1022                 my $recordlen = $tmplen + length($prim);
1023
1024                 if ( $recordlen > $length ) {
1025                     $length = $recordlen;
1026                 }
1027             }
1028         }
1029
1030         # Now we can print the index
1031
1032         my $fh = $self->new_secondary_filehandle($name);
1033
1034         print $fh sprintf( "%04d", $length );
1035         @seconds = sort @seconds;
1036
1037         foreach my $second (@seconds) {
1038
1039             my @prims = keys %{ $self->{_secondary_id}{$name}{$second} };
1040             my $tmp   = $second;
1041
1042             foreach my $prim (@prims) {
1043                 my $record = $tmp . "\t" . $prim;
1044                 if ( length($record) > $length ) {
1045                     $self->throw(
1046 "Something has gone horribly wrong - length of record is more than we thought [$length]\n"
1047                     );
1048                 }
1049                 else {
1050                     print $fh sprintf( "%-${length}s", $record );
1051                 }
1052             }
1053         }
1054
1055         close($fh);
1056     }
1057 }
1058
1059 =head2 new_secondary_filehandle
1060
1061  Title   : new_secondary_filehandle
1062  Usage   : $obj->new_secondary_filehandle($newval)
1063  Function:
1064  Example :
1065  Returns : value of new_secondary_filehandle
1066  Args    : newvalue (optional)
1067
1068
1069 =cut
1070
1071 sub new_secondary_filehandle {
1072     my ( $self, $name ) = @_;
1073
1074     my $indexdir = $self->_config_path;
1075
1076     my $secindex = File::Spec->catfile( $indexdir, "id_$name.index" );
1077
1078     open my $fh, '>', $secindex or $self->throw("Could not write file '$secindex': $!");
1079     return $fh;
1080 }
1081
1082 =head2 open_secondary_index
1083
1084  Title   : open_secondary_index
1085  Usage   : $obj->open_secondary_index($newval)
1086  Function:
1087  Example :
1088  Returns : value of open_secondary_index
1089  Args    : newvalue (optional)
1090
1091
1092 =cut
1093
1094 sub open_secondary_index {
1095     my ( $self, $name ) = @_;
1096
1097     if ( !defined( $self->{_secondary_filehandle}{$name} ) ) {
1098
1099         my $indexdir = $self->_config_path;
1100         my $secindex = $indexdir . "/id_$name.index";
1101
1102         if ( !-e $secindex ) {
1103             $self->throw("Index is not present for namespace [$name]\n");
1104         }
1105
1106         open my $newfh, '<', $secindex or $self->throw("Could not read file '$secindex': $!");
1107         my $reclen = $self->read_header($newfh);
1108
1109         $self->{_secondary_filehandle}{$name}  = $newfh;
1110         $self->{_secondary_record_size}{$name} = $reclen;
1111     }
1112
1113     return $self->{_secondary_filehandle}{$name};
1114
1115 }
1116
1117 =head2 _add_id_position
1118
1119  Title   : _add_id_position
1120  Usage   : $obj->_add_id_position($newval)
1121  Function:
1122  Example :
1123  Returns : value of _add_id_position
1124  Args    : newvalue (optional)
1125
1126
1127 =cut
1128
1129 sub _add_id_position {
1130     my ( $self, $id, $pos, $fileid, $length, $secondary_id ) = @_;
1131
1132     if ( !defined($id) ) {
1133         $self->throw("No id defined. Can't add id position");
1134     }
1135     if ( !defined($pos) ) {
1136         $self->throw("No position defined. Can't add id position");
1137     }
1138     if ( !defined($fileid) ) {
1139         $self->throw("No fileid defined. Can't add id position");
1140     }
1141     if ( !defined($length) || $length <= 0 ) {
1142         $self->throw(
1143             "No length defined or <= 0 [$length]. Can't add id position");
1144     }
1145
1146     $self->{_id}{$id}{_pos}    = $pos;
1147     $self->{_id}{$id}{_length} = $length;
1148     $self->{_id}{$id}{_fileid} = $fileid;
1149
1150     # Now the secondary ids
1151
1152     foreach my $sec ( keys(%$secondary_id) ) {
1153         my $value = $secondary_id->{$sec};
1154         $self->{_secondary_id}{$sec}{$value}{$id} = 1;
1155     }
1156
1157     $self->{_maxidlength} = length($id)
1158       if !exists $self->{_maxidlength}
1159           or length($id) >= $self->{_maxidlength};
1160
1161     $self->{_maxfileidlength} = length($fileid)
1162       if !exists $self->{_maxfileidlength}
1163           or length($fileid) >= $self->{_maxfileidlength};
1164
1165     $self->{_maxposlength} = length($pos)
1166       if !exists $self->{_maxposlength}
1167           or length($pos) >= $self->{_maxposlength};
1168
1169     $self->{_maxlengthlength} = length($length)
1170       if !exists $self->{_maxlengthlength}
1171           or length($length) >= $self->{_maxlengthlength};
1172 }
1173
1174 =head2 make_config_file
1175
1176  Title   : make_config_file
1177  Usage   : $obj->make_config_file($newval)
1178  Function:
1179  Example :
1180  Returns : value of make_config_file
1181  Args    : newvalue (optional)
1182
1183 =cut
1184
1185 sub make_config_file {
1186     my ( $self, $files ) = @_;
1187
1188     my @files = @$files;
1189
1190     my $configfile = $self->_config_file;
1191
1192     open my $CON, '>', $configfile
1193       or $self->throw("Could not write config file '$configfile': $!");
1194
1195     # First line must be the type of index - in this case flat
1196     print $CON "index\tflat/1\n";
1197
1198     # Now the fileids
1199
1200     my $count = 0;
1201
1202     foreach my $file (@files) {
1203
1204         my $size = -s $file;
1205
1206         print $CON "fileid_$count\t$file\t$size\n";
1207
1208         $self->{_file}{$count}  = $file;
1209         $self->{_dbfile}{$file} = $count;
1210         $self->{_size}{$count}  = $size;
1211         $count++;
1212     }
1213
1214     # Now the namespaces
1215
1216     print $CON "primary_namespace\t" . $self->primary_namespace . "\n";
1217
1218     # Needs fixing for the secondary stuff
1219
1220     my $second_patterns = $self->secondary_patterns;
1221
1222     my @second = keys %$second_patterns;
1223
1224     if ( (@second) ) {
1225         print $CON "secondary_namespaces";
1226
1227         foreach my $second (@second) {
1228             print $CON "\t$second";
1229         }
1230         print $CON "\n";
1231     }
1232
1233     # Now the config format
1234
1235     unless ( defined( $self->format ) ) {
1236         $self->throw(
1237             "Format does not exist in module - can't write config file");
1238     }
1239     else {
1240         my $format   = $self->format;
1241         my $alphabet = $self->alphabet;
1242         my $alpha    = $alphabet ? "/$alphabet" : '';
1243         print $CON "format\t" . "$format\n";
1244     }
1245     close $CON;
1246 }
1247
1248 =head2 read_config_file
1249
1250  Title   : read_config_file
1251  Usage   : $obj->read_config_file($newval)
1252  Function:
1253  Example :
1254  Returns : value of read_config_file
1255  Args    : newvalue (optional)
1256
1257 =cut
1258
1259 sub read_config_file {
1260     my ($self) = @_;
1261     my $configfile = $self->_config_file;
1262     return unless -e $configfile;
1263
1264     open my $CON, '<', $configfile
1265       or $self->throw("Could not read config file '$configfile': $!");
1266
1267     # First line must be type
1268     my $line = <$CON>;
1269     chomp($line);
1270     my $version;
1271
1272     # This is hard coded as we only index flatfiles here
1273     if ( $line =~ m{index\tflat/(\d+)} ) {
1274         $version = $1;
1275     }
1276     else {
1277         $self->throw(
1278 "First line not compatible with flat file index.  Should be something like\n\nindex\tflat/1"
1279         );
1280     }
1281
1282     $self->index_type("flat");
1283     $self->index_version($version);
1284
1285     while (<$CON>) {
1286         chomp;
1287
1288         # Look for fileid lines
1289         if ( $_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/ ) {
1290             my $fileid   = $1;
1291             my $filename = $2;
1292             my $filesize = $3;
1293
1294             if ( !-e $filename ) {
1295                 $self->throw("File [$filename] does not exist!");
1296             }
1297             if ( -s $filename != $filesize ) {
1298                 $self->throw(
1299 "Flatfile size for $filename differs from what the index thinks it is. Real size ["
1300                       . ( -s $filename )
1301                       . "] Index thinks it is ["
1302                       . $filesize
1303                       . "]" );
1304             }
1305
1306             $self->{_file}{$fileid}     = $filename;
1307             $self->{_dbfile}{$filename} = $fileid;
1308             $self->{_size}{$fileid}     = $filesize;
1309         }
1310
1311         # Look for namespace lines
1312         if (/(.*)_namespaces?\t(.+)/) {
1313             if ( $1 eq "primary" ) {
1314                 $self->primary_namespace($2);
1315             }
1316             elsif ( $1 eq "secondary" ) {
1317                 $self->secondary_namespaces( split "\t", $2 );
1318             }
1319             else {
1320                 $self->throw("Unknown namespace name in config file [$1");
1321             }
1322         }
1323
1324         # Look for format lines
1325         if ( $_ =~ /format\t(\S+)/ ) {
1326
1327             # Check the format here?
1328             my $format = $1;
1329
1330             # handle LSID format
1331             if ( $format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))?/ ) {
1332                 $self->format($1);
1333                 $self->alphabet($2);
1334             }
1335             else {    # compatibility with older versions
1336                 $self->format($1);
1337             }
1338         }
1339     }
1340
1341     close($CON);
1342
1343     # Now check we have all that we need
1344
1345     my @fileid_keys = keys( %{ $self->{_file} } );
1346
1347     if ( !(@fileid_keys) ) {
1348         $self->throw(
1349 "No flatfile fileid files in config - check the index has been made correctly"
1350         );
1351     }
1352
1353     if ( !defined( $self->primary_namespace ) ) {
1354         $self->throw("No primary namespace exists");
1355     }
1356
1357     if ( !-e $self->primary_index_file ) {
1358         $self->throw( "Primary index file ["
1359               . $self->primary_index_file
1360               . "] doesn't exist" );
1361     }
1362
1363     1;
1364 }
1365
1366 =head2 get_fileid_by_filename
1367
1368  Title   : get_fileid_by_filename
1369  Usage   : $obj->get_fileid_by_filename($newval)
1370  Function:
1371  Example :
1372  Returns : value of get_fileid_by_filename
1373  Args    : newvalue (optional)
1374
1375 =cut
1376
1377 sub get_fileid_by_filename {
1378     my ( $self, $file ) = @_;
1379
1380     if ( !defined( $self->{_dbfile} ) ) {
1381         $self->throw(
1382             "No file to fileid mapping present.  Has the fileid file been read?"
1383         );
1384     }
1385
1386     return $self->{_dbfile}{$file};
1387 }
1388
1389 =head2 get_filehandle_by_fileid
1390
1391  Title   : get_filehandle_by_fileid
1392  Usage   : $obj->get_filehandle_by_fileid($newval)
1393  Function:
1394  Example :
1395  Returns : value of get_filehandle_by_fileid
1396  Args    : newvalue (optional)
1397
1398 =cut
1399
1400 sub get_filehandle_by_fileid {
1401     my ( $self, $fileid ) = @_;
1402
1403     if ( !defined( $self->{_file}{$fileid} ) ) {
1404         $self->throw("ERROR: undefined fileid in index [$fileid]");
1405     }
1406
1407     open my $fh, '<', $self->{_file}{$fileid} or $self->throw("Could not read file '$self->{_file}{$fileid}': $!");
1408     return $fh;
1409 }
1410
1411 =head2 primary_index_file
1412
1413  Title   : primary_index_file
1414  Usage   : $obj->primary_index_file($newval)
1415  Function:
1416  Example :
1417  Returns : value of primary_index_file
1418  Args    : newvalue (optional)
1419
1420
1421 =cut
1422
1423 sub primary_index_file {
1424     my ($self) = @_;
1425
1426     return File::Spec->catfile( $self->_config_path,
1427         "key_" . $self->primary_namespace . ".key" );
1428 }
1429
1430 =head2 primary_index_filehandle
1431
1432  Title   : primary_index_filehandle
1433  Usage   : $obj->primary_index_filehandle($newval)
1434  Function:
1435  Example :
1436  Returns : value of primary_index_filehandle
1437  Args    : newvalue (optional)
1438
1439
1440 =cut
1441
1442 sub primary_index_filehandle {
1443     my ($self) = @_;
1444
1445     unless ( defined( $self->{'_primary_index_handle'} ) ) {
1446         my $primary_file = $self->primary_index_file;
1447         open $self->{'_primary_index_handle'}, '<', $primary_file
1448           or self->throw("Could not read file '$primary_file': $!\n");
1449     }
1450     return $self->{'_primary_index_handle'};
1451 }
1452
1453 =head2 format
1454
1455  Title   : format
1456  Usage   : $obj->format($newval)
1457  Function:
1458  Example :
1459  Returns : value of format
1460  Args    : newvalue (optional)
1461
1462
1463 =cut
1464
1465 sub format {
1466     my ( $obj, $value ) = @_;
1467     if ( defined $value ) {
1468         $obj->{'format'} = $value;
1469     }
1470     return $obj->{'format'};
1471
1472 }
1473
1474 sub alphabet {
1475     my ( $obj, $value ) = @_;
1476     if ( defined $value ) {
1477         $obj->{alphabet} = $value;
1478     }
1479     return $obj->{alphabet};
1480 }
1481
1482 =head2 write_flag
1483
1484  Title   : write_flag
1485  Usage   : $obj->write_flag($newval)
1486  Function:
1487  Example :
1488  Returns : value of write_flag
1489  Args    : newvalue (optional)
1490
1491
1492 =cut
1493
1494 sub write_flag {
1495     my ( $obj, $value ) = @_;
1496     if ( defined $value ) {
1497         $obj->{'write_flag'} = $value;
1498     }
1499     return $obj->{'write_flag'};
1500
1501 }
1502
1503 =head2 dbname
1504
1505  Title   : dbname
1506  Usage   : $obj->dbname($newval)
1507  Function: get/set database name
1508  Example :
1509  Returns : value of dbname
1510  Args    : newvalue (optional)
1511
1512 =cut
1513
1514 sub dbname {
1515     my $self = shift;
1516     my $d    = $self->{flat_dbname};
1517     $self->{flat_dbname} = shift if @_;
1518     $d;
1519 }
1520
1521 =head2 index_directory
1522
1523  Title   : index_directory
1524  Usage   : $obj->index_directory($newval)
1525  Function:
1526  Example :
1527  Returns : value of index_directory
1528  Args    : newvalue (optional)
1529
1530
1531 =cut
1532
1533 sub index_directory {
1534     my ( $self, $arg ) = @_;
1535
1536     if ( defined($arg) ) {
1537         if ( $arg !~ m{/$} ) {
1538             $arg .= "/";
1539         }
1540         $self->{_index_directory} = $arg;
1541     }
1542     return $self->{_index_directory};
1543
1544 }
1545
1546 sub _config_path {
1547     my $self   = shift;
1548     my $root   = $self->index_directory;
1549     my $dbname = $self->dbname;
1550     File::Spec->catfile( $root, $dbname );
1551 }
1552
1553 sub _config_file {
1554     my $self = shift;
1555     my $path = $self->_config_path;
1556     File::Spec->catfile( $path, CONFIG_FILE_NAME );
1557 }
1558
1559 =head2 record_size
1560
1561  Title   : record_size
1562  Usage   : $obj->record_size($newval)
1563  Function:
1564  Example :
1565  Returns : value of record_size
1566  Args    : newvalue (optional)
1567
1568
1569 =cut
1570
1571 sub record_size {
1572     my $self = shift;
1573     $self->{_record_size} = shift if @_;
1574     return $self->{_record_size};
1575 }
1576
1577 =head2 primary_namespace
1578
1579  Title   : primary_namespace
1580  Usage   : $obj->primary_namespace($newval)
1581  Function:
1582  Example :
1583  Returns : value of primary_namespace
1584  Args    : newvalue (optional)
1585
1586 =cut
1587
1588 sub primary_namespace {
1589     my $self = shift;
1590     $self->{_primary_namespace} = shift if @_;
1591     return $self->{_primary_namespace};
1592 }
1593
1594 =head2 index_type
1595
1596  Title   : index_type
1597  Usage   : $obj->index_type($newval)
1598  Function:
1599  Example :
1600  Returns : value of index_type
1601  Args    : newvalue (optional)
1602
1603
1604 =cut
1605
1606 sub index_type {
1607     my $self = shift;
1608     $self->{_index_type} = shift if @_;
1609     return $self->{_index_type};
1610 }
1611
1612 =head2 index_version
1613
1614  Title   : index_version
1615  Usage   : $obj->index_version($newval)
1616  Function:
1617  Example :
1618  Returns : value of index_version
1619  Args    : newvalue (optional)
1620
1621
1622 =cut
1623
1624 sub index_version {
1625     my $self = shift;
1626     $self->{_index_version} = shift if @_;
1627     return $self->{_index_version};
1628 }
1629
1630 =head2 primary_pattern
1631
1632  Title   : primary_pattern
1633  Usage   : $obj->primary_pattern($newval)
1634  Function:
1635  Example :
1636  Returns : value of primary_pattern
1637  Args    : newvalue (optional)
1638
1639
1640 =cut
1641
1642 sub primary_pattern {
1643     my $obj = shift;
1644     $obj->{'primary_pattern'} = shift if @_;
1645     return $obj->{'primary_pattern'};
1646 }
1647
1648 =head2 start_pattern
1649
1650  Title   : start_pattern
1651  Usage   : $obj->start_pattern($newval)
1652  Function:
1653  Example :
1654  Returns : value of start_pattern
1655  Args    : newvalue (optional)
1656
1657
1658 =cut
1659
1660 sub start_pattern {
1661     my $obj = shift;
1662     $obj->{'start_pattern'} = shift if @_;
1663     return $obj->{'start_pattern'};
1664 }
1665
1666 =head2 secondary_patterns
1667
1668  Title   : secondary_patterns
1669  Usage   : $obj->secondary_patterns($newval)
1670  Function:
1671  Example :
1672  Returns : value of secondary_patterns
1673  Args    : newvalue (optional)
1674
1675
1676 =cut
1677
1678 sub secondary_patterns {
1679     my ( $obj, $value ) = @_;
1680     if ( defined $value ) {
1681         $obj->{'secondary_patterns'} = $value;
1682
1683         my @names = keys %$value;
1684
1685         foreach my $name (@names) {
1686             $obj->secondary_namespaces($name);
1687         }
1688     }
1689     return $obj->{'secondary_patterns'};
1690
1691 }
1692
1693 =head2 secondary_namespaces
1694
1695  Title   : secondary_namespaces
1696  Usage   : $obj->secondary_namespaces($newval)
1697  Function:
1698  Example :
1699  Returns : value of secondary_namespaces
1700  Args    : newvalue (optional)
1701
1702
1703 =cut
1704
1705 sub secondary_namespaces {
1706     my ( $obj, @values ) = @_;
1707
1708     if (@values) {
1709         push( @{ $obj->{'secondary_namespaces'} }, @values );
1710     }
1711     return @{ $obj->{'secondary_namespaces'} || [] };
1712 }
1713
1714 ## These are indexing routines to index commonly used format - fasta
1715 ## swissprot and embl
1716
1717 sub new_SWISSPROT_index {
1718     my ( $self, $index_dir, @files ) = @_;
1719
1720     my %secondary_patterns;
1721
1722     my $start_pattern   = "^ID   (\\S+)";
1723     my $primary_pattern = "^AC   (\\S+)\\;";
1724
1725     $secondary_patterns{"ID"} = $start_pattern;
1726
1727     my $index = Bio::DB::Flat::BinarySearch->new(
1728         -index_dir          => $index_dir,
1729         -format             => 'swissprot',
1730         -primary_pattern    => $primary_pattern,
1731         -primary_namespace  => "ACC",
1732         -start_pattern      => $start_pattern,
1733         -secondary_patterns => \%secondary_patterns
1734     );
1735
1736     $index->build_index(@files);
1737 }
1738
1739 sub new_EMBL_index {
1740     my ( $self, $index_dir, @files ) = @_;
1741
1742     my %secondary_patterns;
1743
1744     my $start_pattern     = "^ID   (\\S+)";
1745     my $primary_pattern   = "^AC   (\\S+)\\;";
1746     my $primary_namespace = "ACC";
1747
1748     $secondary_patterns{"ID"} = $start_pattern;
1749
1750     my $index = Bio::DB::Flat::BinarySearch->new(
1751         -index_dir          => $index_dir,
1752         -format             => 'embl',
1753         -primary_pattern    => $primary_pattern,
1754         -primary_namespace  => "ACC",
1755         -start_pattern      => $start_pattern,
1756         -secondary_patterns => \%secondary_patterns
1757     );
1758
1759     $index->build_index(@files);
1760
1761     return $index;
1762 }
1763
1764 sub new_FASTA_index {
1765     my ( $self, $index_dir, @files ) = @_;
1766
1767     my %secondary_patterns;
1768
1769     my $start_pattern     = "^>";
1770     my $primary_pattern   = "^>(\\S+)";
1771     my $primary_namespace = "ACC";
1772
1773     $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1774
1775     my $index = Bio::DB::Flat::BinarySearch->new(
1776         -index_dir          => $index_dir,
1777         -format             => 'fasta',
1778         -primary_pattern    => $primary_pattern,
1779         -primary_namespace  => "ACC",
1780         -start_pattern      => $start_pattern,
1781         -secondary_patterns => \%secondary_patterns
1782     );
1783
1784     $index->build_index(@files);
1785
1786     return $index;
1787 }
1788
1789 # EVERYTHING THAT FOLLOWS THIS
1790 # is an awful hack - in reality Michele's code needs to be rewritten
1791 # to use Bio::SeqIO, but I have too little time to do this -- LS
1792 sub guess_alphabet {
1793     my $self = shift;
1794     my $line = shift;
1795
1796     my $format = $self->format;
1797     return 'protein' if $format eq 'swissprot';
1798
1799     if ( $format eq 'genbank' ) {
1800         return unless $line =~ /^LOCUS/;
1801         return 'dna' if $line =~ /\s+\d+\s+bp/i;
1802         return 'protein';
1803     }
1804
1805     if ( $format eq 'embl' ) {
1806         return unless $line =~ /^ID/;
1807         return 'dna' if $line =~ / DNA;/i;
1808         return 'rna' if $line =~ / RNA;/i;
1809         return 'protein';
1810     }
1811
1812     return;
1813 }
1814
1815 # return (namespace,primary_pattern,start_pattern,secondary_pattern)
1816 sub _guess_patterns {
1817     my $self   = shift;
1818     my $format = shift;
1819     if ( $format =~ /swiss(prot)?/i ) {
1820         return ( 'ID', "^ID   (\\S+)", "^ID   (\\S+)",
1821             { ACC => "^AC   (\\S+);" } );
1822     }
1823
1824     if ($format =~ /embl/i) {
1825         return ('ID',
1826             "^ID   (\\S+[^; ])",
1827             "^ID   (\\S+[^; ])",
1828             {
1829              ACC     => q/^AC   (\S+);/,
1830              VERSION => q/^SV\s+(\S+)/
1831             });
1832      }
1833
1834     if ( $format =~ /genbank/i ) {
1835         return (
1836             'ID',
1837             q/^LOCUS\s+(\S+)/,
1838             q/^LOCUS/,
1839             {
1840                 ACC     => q/^ACCESSION\s+(\S+)/,
1841                 VERSION => q/^VERSION\s+(\S+)/
1842             }
1843         );
1844     }
1845
1846     if ( $format =~ /fasta/i ) {
1847         return ( 'ACC', '^>(\S+)', '^>(\S+)', );
1848     }
1849
1850     $self->throw("I can't handle format $format");
1851
1852 }
1853
1854 1;