Bio/DB/TFBS/transfac_pro.pm

   1 # $Id: transfac_pro.pm,v 1.15 2006/08/12 11:00:03 sendu Exp $
   2 #
   3 # BioPerl module for Bio::DB::TFBS::transfac_pro
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Sendu Bala <bix@sendu.me.uk>
   8 #
   9 # Copyright Sendu Bala
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::DB::TFBS::transfac_pro - An implementation of Bio::DB::TFBS
  18 which uses local flat files for transfac pro
  19
  20 =head1 SYNOPSIS
  21
  22   use Bio::DB::Taxonomy;
  23
  24   my $db = new Bio::DB::Taxonomy(-source => 'transfac_pro'
  25                                  -dat_dir => $directory);
  26
  27   # we're interested in the gene P5
  28   my ($gene_id) = $db->get_gene_ids(-name => 'P5'); # G000001
  29
  30   # we want all the transcription factors that bind to our gene
  31   my @factor_ids = $db->get_factor_ids(-gene => $gene_id);
  32
  33   # get info about those TFs
  34   foreach my $factor_id (@factor_ids) {
  35     my $factor = $db->get_factor($factor_id);
  36     my $name = $factor->universal_name;
  37     # etc. - see Bio::Map::TranscriptionFactor, eg. find out where it binds
  38   }
  39
  40   # get a matrix
  41   my $matrix = $db->get_matrix('M00001');
  42
  43   # get a binding site sequence
  44   my $seq = $db->get_site('R00001');
  45
  46 =head1 DESCRIPTION
  47
  48 This is an implementation which uses local flat files and the DB_File
  49 module RECNO data structures to manage a local copy of the Transfac Pro TFBS
  50 database.
  51
  52 Required database files require a license which can be obtained via
  53 http://www.biobase-international.com/pages/index.php?id=170
  54
  55 Within the linux installation tarball you will find a cgibin tar ball, and
  56 inside that is a data directory containing the .dat files needed by this
  57 module. Point to that data directory with -dat_dir
  58
  59 =head1 FEEDBACK
  60
  61 =head2 Mailing Lists
  62
  63 User feedback is an integral part of the evolution of this and other
  64 Bioperl modules. Send your comments and suggestions preferably to
  65 the Bioperl mailing list.  Your participation is much appreciated.
  66
  67   bioperl-l@bioperl.org                  - General discussion
  68   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  69
  70 =head2 Support
  71
  72 Please direct usage questions or support issues to the mailing list:
  73
  74 I<bioperl-l@bioperl.org>
  75
  76 rather than to the module maintainer directly. Many experienced and
  77 reponsive experts will be able look at the problem and quickly
  78 address it. Please include a thorough description of the problem
  79 with code and data examples if at all possible.
  80
  81 =head2 Reporting Bugs
  82
  83 Report bugs to the Bioperl bug tracking system to help us keep track
  84 of the bugs and their resolution. Bug reports can be submitted via
  85 the web:
  86
  87   https://redmine.open-bio.org/projects/bioperl/
  88
  89 =head1 AUTHOR - Sendu Bala
  90
  91 Email bix@sendu.me.uk
  92
  93 =head1 CONTRIBUTORS
  94
  95 Based on Bio::DB::Taxonomy::flatfile by Jason Stajich
  96
  97 =head1 APPENDIX
  98
  99 The rest of the documentation details each of the object methods.
 100 Internal methods are usually preceded with a _
 101
 102 =cut
 103
 104 # Let the code begin...
 105
 106 package Bio::DB::TFBS::transfac_pro;
 107 use strict;
 108 use Bio::Annotation::Reference;
 109 use Bio::Annotation::SimpleValue;
 110 use Bio::LocatableSeq;
 111 use Bio::SimpleAlign;
 112 use Bio::Matrix::PSM::SiteMatrix;
 113 use Bio::AlignIO;
 114 use Bio::Map::GeneMap;
 115 use Bio::Map::TranscriptionFactor;
 116 use Bio::Map::Position;
 117 use Bio::Map::Relative;
 118 use DB_File;
 119
 120 use constant SEPARATOR => ':!:';
 121 use constant INTERNAL_SEPARATOR => '!:!';
 122
 123 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
 124
 125 use base qw(Bio::DB::TFBS);
 126
 127 =head2 new
 128
 129  Title   : new
 130  Usage   : my $obj = new Bio::DB::TFBS::transfac_pro();
 131  Function: Builds a new Bio::DB::TFBS::transfac_pro object
 132  Returns : an instance of Bio::DB::TTFBS::transfac_pro
 133  Args    : -dat_dir   => name of directory where Transfac Pro .dat files
 134                          (required to initially build indexes)
 135            -tax_db    => Bio::DB::Taxonomy object, used when initially building
 136                          indexes, gives better results for species information
 137                          but not required.
 138            -index_dir => name of directory where index files should be created
 139                          or already exist. (defaults to -dat_dir, required if
 140                          -dat_dir not supplied)
 141            -force     => 1 replace current indexes even if they exist
 142
 143 =cut
 144
 145 sub new {
 146     my ($class, @args) = @_;
 147
 148     my $self = $class->SUPER::new(@args);
 149
 150     my ($dat_dir, $index_dir, $tax_db, $force) = $self->_rearrange([qw(DAT_DIR INDEX_DIR TAX_DB FORCE)], @args);
 151     $self->throw("At least one of -dat_dir and -index_dir must be supplied") unless ($dat_dir || $index_dir);
 152
 153     $self->index_directory($index_dir || $dat_dir);
 154     $self->{_tax_db} = $tax_db if $tax_db;
 155
 156     if ($dat_dir) {
 157         $self->_build_index($dat_dir, $force);
 158     }
 159
 160     $self->_db_connect;
 161     return $self;
 162 }
 163
 164 =head2 Bio::DB::TFBS Interface implementation
 165
 166 =cut
 167
 168 sub _get_ids {
 169     my ($self, $dat, @args) = @_;
 170     @args % 2 == 0 || $self->throw("Must provide key => value pairs");
 171     my $hash = $self->{$dat} || $self->throw("Unknown .dat type '$dat'");
 172
 173     if (@args) {
 174         # get a subset corresponding to args
 175         my @final;
 176         my %args = @args;
 177         my $multiple = 0;
 178         while (my ($type, $value) = each %args) {
 179             unless ($value) {
 180                 $self->warn("Arguement '$type' has no value, ignored");
 181                 next;
 182             }
 183             $type =~ s/-//;
 184             $type = lc($type);
 185             my $converter = $hash->{$type};
 186             unless ($converter) {
 187                 $self->warn("Unknown search type '$type' for .dat type '$dat'");
 188                 next;
 189             }
 190
 191             my @ids = $converter->get_dup($value);
 192             unless (@ids) {
 193                 @ids = $converter->get_dup(lc($value));
 194             }
 195
 196             if ($multiple) {
 197                 # we can have multiple types given at once, find the ids that
 198                 # satisfy all criteria
 199                 @final || return;
 200                 my %final = map { $_ => 1 } @final;
 201                 @final = grep { $final{$_} } @ids;
 202             }
 203             else {
 204                 @final = @ids;
 205                 $multiple++;
 206             }
 207         }
 208
 209         return @final;
 210     }
 211     else {
 212         # get them all
 213         my $db_file_hash = $self->{$dat}->{id};
 214
 215         my ($key, $prev_key, $value) = ('_!_', '!_!');
 216         my @ids;
 217         while (1) {
 218             $db_file_hash->seq($key, $value, R_NEXT);
 219             last if $prev_key eq $key;
 220             push(@ids, $value); # confusing? when creating objects we store
 221                                 # $value as accession and $key as id, but from
 222                                 # this method we return $value as id given $id!
 223             $prev_key = $key;
 224         }
 225
 226         return @ids;
 227     }
 228 }
 229
 230 =head2 get_reference
 231
 232  Title   : get_reference
 233  Usage   : my $ref = $obj->get_reference($id);
 234  Function: Get a literature reference.
 235  Returns : Bio::Annotation::Reference
 236  Args    : string - a reference id ('RE...')
 237
 238 =cut
 239
 240 sub get_reference {
 241     my ($self, $id) = @_;
 242     $id || return;
 243     my $data = $self->{reference}->{data}->{$id} || return;
 244     my @data = split(SEPARATOR, $data);
 245
 246     return Bio::Annotation::Reference->new(-pubmed   => $data[0],
 247                                            -authors  => $data[1],
 248                                            -title    => $data[2],
 249                                            -location => $data[3] );
 250 }
 251
 252 =head2 get_genemap
 253
 254  Title   : get_genemap
 255  Usage   : my $map = $obj->get_genemap($id);
 256  Function: Get a GeneMap for a gene.
 257  Returns : Bio::Map::GeneMap
 258  Args    : string - a gene id ('G...'), and optionally int (number of bp
 259            upstream)
 260
 261 =cut
 262
 263 sub get_genemap {
 264     my ($self, $id, $upstream) = @_;
 265     $id || return;
 266     return $self->{got_map}->{$id} if defined $self->{got_map}->{$id};
 267     $upstream ||= 1000;
 268     my $data = $self->{gene}->{data}->{$id} || return;
 269     my @data = split(SEPARATOR, $data);
 270
 271     # accession = id name description species_tax_id_or_raw_string
 272     my $taxon = $self->{_tax_db} ? $self->{_tax_db}->get_taxon($data[3]) || $data[3] : $data[3];
 273     my $map = Bio::Map::GeneMap->get(-uid => $id,
 274                                      -gene => $data[1],
 275                                      -species => $taxon,
 276                                      -description => $data[2],
 277                                      -upstream => $upstream);
 278     $self->{got_map}->{$id} = $map; # prevents infinite recurse when we call get_factor below
 279
 280     # spawn all the factors that belong on this gene map
 281     # get_factor_ids(-gene => ...) only works for genes that encode factors;
 282     # have to go via sites
 283     foreach my $sid ($self->get_site_ids(-gene => $id)) {
 284         foreach my $fid ($self->get_factor_ids(-site => $sid)) {
 285             # it is quite deliberate that we deeply recurse to arrive at the
 286             # correct answer, which involves pulling in most of the database
 287             no warnings "recursion";
 288             $self->get_factor($fid);
 289         }
 290     }
 291
 292     return $map;
 293 }
 294
 295 =head2 get_seq
 296
 297  Title   : get_seq
 298  Usage   : my $seq = $obj->get_seq($id);
 299  Function: Get the sequence of a site. The sequence will be annotated with the
 300            the tags 'relative_start', 'relative_end', 'relative_type' and
 301            'relative_to'.
 302  Returns : Bio::Seq
 303  Args    : string - a site id ('R...')
 304
 305 =cut
 306
 307 sub get_seq {
 308     my ($self, $id) = @_;
 309     $id || return;
 310     my $data = $self->{site}->{data}->{$id} || return;
 311     my @data = split(SEPARATOR, $data);
 312
 313     my $seq = Bio::Seq->new(-seq              => $data[2],
 314                             -accession_number => $id,
 315                             -description      => $data[6] ? 'Genomic sequence' : 'Consensus or artificial sequence',
 316                             -id               => $data[0],
 317                             -strand           => 1,
 318                             -alphabet         => $data[7] || 'dna',
 319                             -species          => $data[6]);
 320
 321     my $annot = $seq->annotation;
 322     my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_start', -value => $data[4] || 1);
 323     $annot->add_Annotation($sv);
 324     $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_end', -value => $data[5] || ($data[4] || 1 + length($data[2]) - 1));
 325     $annot->add_Annotation($sv);
 326     $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_type', -value => $data[3] || 'artificial');
 327     $annot->add_Annotation($sv);
 328     $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_to', -value => $data[1]);
 329     $annot->add_Annotation($sv);
 330
 331     return $seq;
 332 }
 333
 334 =head2 get_fragment
 335
 336  Title   : get_fragment
 337  Usage   : my $seq = $obj->get_fragment($id);
 338  Function: Get the sequence of a fragment.
 339  Returns : Bio::Seq
 340  Args    : string - a site id ('FR...')
 341
 342 =cut
 343
 344 sub get_fragment {
 345     my ($self, $id) = @_;
 346     $id || return;
 347     my $data = $self->{fragment}->{data}->{$id} || return;
 348     my @data = split(SEPARATOR, $data);
 349
 350     # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
 351     return new Bio::Seq( -seq              => $data[4],
 352                          -accession_number => $id,
 353                          -description      => 'Between genes '.$data[1].' and '.$data[2],
 354                          -species          => $data[3],
 355                          -id               => $data[0],
 356                          -alphabet         => 'dna' );
 357 }
 358
 359 =head2 get_matrix
 360
 361  Title   : get_matrix
 362  Usage   : my $matrix = $obj->get_matrix($id);
 363  Function: Get a matrix that describes a binding site.
 364  Returns : Bio::Matrix::PSM::SiteMatrix
 365  Args    : string - a matrix id ('M...'), optionally a sequence string from
 366            which base frequencies will be calcualted for the matrix model
 367            (default 0.25 each)
 368
 369 =cut
 370
 371 sub get_matrix {
 372     my ($self, $id, $seq) = @_;
 373     $id || return;
 374     $seq ||= 'atgc';
 375     $seq = lc($seq);
 376     my $data = $self->{matrix}->{data}->{$id} || return;
 377     my @data = split(SEPARATOR, $data);
 378     $data[4] || $self->throw("Matrix data missing for $id");
 379
 380     my ($a, $c, $g, $t);
 381     foreach my $position (split(INTERNAL_SEPARATOR, $data[4])) {
 382         my ($a_count, $c_count, $g_count, $t_count) = split("\t", $position);
 383         push(@{$a}, $a_count);
 384         push(@{$c}, $c_count);
 385         push(@{$g}, $g_count);
 386         push(@{$t}, $t_count);
 387     }
 388
 389     # our psms include a simple background model so we can use
 390     # sequence_match_weight() if desired
 391     my $a_freq = ($seq =~ tr/a//) / length($seq);
 392     my $c_freq = ($seq =~ tr/c//) / length($seq);
 393     my $g_freq = ($seq =~ tr/g//) / length($seq);
 394     my $t_freq = ($seq =~ tr/t//) / length($seq);
 395
 396     my $psm = Bio::Matrix::PSM::SiteMatrix->new(-pA => $a,
 397                                                 -pC => $c,
 398                                                 -pG => $g,
 399                                                 -pT => $t,
 400                                                 -id => $data[0],
 401                                                 -accession_number => $id,
 402                                                 -sites => $data[3],
 403                                                 -width => scalar(@{$a}),
 404                                                 -correction => 1,
 405                                                 -model => { A => $a_freq, C => $c_freq, G => $g_freq, T => $t_freq } );
 406
 407     #*** used to make a Bio::Matrix::PSM::Psm and add references, but it
 408     #    didn't seem worth it. You can get references from the database by:
 409     #foreach my $ref_id ($db->get_reference_ids(-matrix => $id)) {
 410     #    my $ref = $db->get_reference($ref_id);
 411     #}
 412
 413     return $psm;
 414 }
 415
 416 =head2 get_aln
 417
 418  Title   : get_aln
 419  Usage   : my $aln = $obj->get_aln($id);
 420  Function: Get the alignment that was used to generate a matrix. Each sequence
 421            in the alignment will have an accession_number corresponding to the
 422            Transfac site id, and id() based on that but unique within the
 423            alignment.
 424  Returns : Bio::SimpleAlign
 425  Args    : string - a matrix id ('M...'), optionally true to, when a matrix
 426            lists no sequences, search for sequences via the matrix's factors,
 427            picking the sites that best match the matrix
 428
 429 =cut
 430
 431 my %VALID_STRAND = map {$_ => 1} qw(-1 0 1);
 432
 433 sub get_aln {
 434     my ($self, $id, $via_factors) = @_;
 435     $id || return;
 436     my $data = $self->{matrix}->{data}->{$id} || $self->throw("matrix '$id' had no data in DB_File");
 437     my @data = split(SEPARATOR, $data);
 438
 439     if (! $data[5] && $via_factors) {
 440         # This is a matrix with no site sequences given in matrix.dat.
 441         # Find some matching site sequences via factors.
 442
 443         # First, check its factors for sites
 444         my %site_seqs;
 445         my %factor_ids;
 446         foreach my $factor_id ($self->get_factor_ids(-matrix => $id)) {
 447             $factor_ids{$factor_id} = 1;
 448             foreach my $site_id ($self->get_site_ids(-factor => $factor_id)) {
 449                 next if defined $site_seqs{$site_id};
 450                 my $seq = $self->get_seq($site_id);
 451
 452                 # skip sites that have no sequence, or have IUPAC symbols in
 453                 # their sequence (most probably the 'consensus' sequence itself
 454                 # that was used to make and exactly corresponds to the matrix)
 455                 my $seq_str = $seq->seq || next;
 456                 $seq_str =~ /[MRWSYKVHDB]/ and next;
 457
 458                 $site_seqs{$site_id} = $seq;
 459             }
 460         }
 461         my @seqs = values %site_seqs;
 462
 463         if (@seqs > 1) {
 464             # pick the sub-seqs that match to the matrix with the best scores
 465             my $matrix = $self->get_matrix($id);
 466             my $desired_sequences = $matrix->sites;
 467             return if @seqs < $desired_sequences;
 468
 469             my $desired_length = $matrix->width;
 470             my %best_seqs;
 471             foreach my $seq (@seqs) {
 472                 my $for_str = $seq->seq;
 473                 next if length($for_str) < $desired_length;
 474                 my $rev_str = $seq->revcom->seq;
 475
 476                 my $best_score = 0;
 477                 my $best_subseq = '';
 478                 my $best_i = 0;
 479                 my $best_subseq_caps = 0;
 480                 my $best_revcom;
 481                 my $revcom = 0;
 482                 foreach my $seq_str ($for_str, $rev_str) {
 483                     for my $i (0..(length($seq_str) - $desired_length)) {
 484                         my $subseq = substr($seq_str, $i, $desired_length);
 485                         $subseq =~ s/[^ACGTacgt]//g; # can only score atcg
 486                         next unless length($subseq) == $desired_length; # short or 0-length seqs could get the highest scores!
 487                         my $score = $matrix->sequence_match_weight($subseq);
 488
 489                         # caps represent the author-chosen bit of a site
 490                         # sequence so we would prefer to choose a subseq that
 491                         # contains it
 492                         my $caps = $subseq =~ tr/ACGT//;
 493
 494                         #*** (don't know why numeric == fails for comparing
 495                         #     scores, when the string eq works)
 496                         if ($score > $best_score || ("$score" eq "$best_score" && $caps > $best_subseq_caps)) {
 497                             $best_score = $score;
 498                             $best_subseq_caps = $caps;
 499                             $best_subseq = $subseq;
 500                             $best_i = $i;
 501                             $best_revcom = $revcom;
 502                         }
 503                     }
 504                     $revcom++;
 505                 }
 506
 507                 if ($best_score) {
 508                     $best_seqs{$seq->accession_number} = [$best_subseq, $seq->accession_number, ($best_i + 1), $revcom ? -1 : 1, $best_score];
 509                 }
 510             }
 511             my @sorted = sort { $best_seqs{$b}->[-1] <=> $best_seqs{$a}->[-1] } keys %best_seqs;
 512             return if @sorted < $desired_sequences;
 513             splice(@sorted, $desired_sequences);
 514             my %wanted = map { $_ => 1 } @sorted;
 515
 516             my @site_data;
 517             foreach my $seq (@seqs) {
 518                 next unless exists $wanted{$seq->accession_number};
 519                 my @data = @{$best_seqs{$seq->accession_number}};
 520                 pop(@data);
 521                 push(@site_data, join('_', @data));
 522             }
 523
 524             $data[5] = join(INTERNAL_SEPARATOR, @site_data);
 525             $self->{matrix}->{data}->{$id} = join(SEPARATOR, @data);
 526         }
 527     }
 528     $data[5] || return;
 529
 530     my @blocks = split(INTERNAL_SEPARATOR, $data[5]);
 531
 532     # append gap chars to all sequences to make them the same length
 533     # (applies to sequences found via factors, presumably, since we already
 534     # do this for matrix alignments in transfac_pro.pm)
 535     my $longest = 0;
 536     foreach (@blocks) {
 537         my ($seq) = split('_', $_);
 538         my $length = length($seq);
 539         if ($length > $longest) {
 540             $longest = $length;
 541         }
 542     }
 543     foreach my $i (0..$#blocks) {
 544         my $block = $blocks[$i];
 545         my ($seq, $seq_id) = split('_', $block);
 546         my $length = length($seq);
 547         if ($length < $longest) {
 548             my $orig_seq = $seq;
 549             $seq .= '-'x($longest - $length);
 550             $block =~ s/^${orig_seq}_/${seq}_/;
 551             $blocks[$i] = $block;
 552         }
 553     }
 554
 555     # build the alignment
 556     my $aln = Bio::SimpleAlign->new(-source => 'transfac_pro');
 557     my %done_ids;
 558     foreach (@blocks) {
 559         my ($seq, $seq_acc, $start, $strand) = split('_', $_);
 560
 561         $self->throw("Invalid strand $strand found in block $_")
 562             unless exists $VALID_STRAND{$strand};
 563         # we can get back multiple different subparts of the same site (sequence),
 564         # so $seq_acc isn't unique across this loop. Can't use it as the seq id
 565         # of the alignment (ids must be unique in SimpleAlign), so we
 566         # uniquify the id and store the original id as the accession_number
 567         my $seq_id;
 568         $done_ids{$seq_acc}++;
 569         if ($done_ids{$seq_acc} > 1) {
 570             $seq_id = $seq_acc.'_'.$done_ids{$seq_acc};
 571         }
 572         else {
 573             $seq_id = $seq_acc;
 574         }
 575
 576         my $gaps = $seq =~ tr/-//;
 577         my $length = length($seq) - $gaps;
 578         $self->throw("seq '$seq_id' for matrix '$id' had seq '$seq'") unless $length;
 579         $aln->add_seq(Bio::LocatableSeq->new(-seq    => $seq,
 580                                              -id     => $seq_id,
 581                                              -accession_number => $seq_acc,
 582                                              -start  => $start,
 583                                              -end    => $start + $length - 1,
 584                                              -strand => $strand));
 585     }
 586     $aln->id($id);
 587     # could also store score? of?
 588
 589     return $aln;
 590 }
 591
 592 =head2 get_factor
 593
 594  Title   : get_factor
 595  Usage   : my $factor = $obj->get_factor($id);
 596  Function: Get the details of a transcription factor.
 597  Returns : Bio::Map::TranscriptionFactor
 598  Args    : string - a factor id ('T...')
 599
 600 =cut
 601
 602 sub get_factor {
 603     my ($self, $id) = @_;
 604     $id || return;
 605     return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
 606     my $data = $self->{factor}->{data}->{$id} || return;
 607     my @data = split(SEPARATOR, $data);
 608
 609     # accession = id name species sequence
 610     my $tf = Bio::Map::TranscriptionFactor->get(-id => $id,
 611                                                 -universal_name => $data[1]);
 612     #*** not sure what to do with species and sequence, since we don't want to
 613     # confuse the idea that a TF is a general thing that could bind to any
 614     # species... then again, you might want to model species-specific variants
 615     # of a TF with different binding abilities...
 616     #*** idea of having inclusion and exclusion species so you can prevent/
 617     # ignore a tf that binds to the wrong species (a species that doesn't even
 618     # have the tf), and associating sequence with each species/tf combo so you
 619     # can see how diverged the tf is and make assumptions about site difference
 620     # allowance
 621
 622     # place it on all its genemaps
 623     foreach my $sid ($self->get_site_ids(-factor => $id)) {
 624         my $s_data = $self->{site}->{data}->{$sid} || next;
 625         my @s_data = split(SEPARATOR, $s_data);
 626
 627         # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string
 628         $s_data[1] || next; # site isn't relative to a gene, meaningless
 629         $s_data[4] || next; # don't know where its supposed to be, can't model it
 630         $s_data[5] ||= $s_data[4] + ($s_data[2] ? length($s_data[2]) - 1 : 0);
 631
 632         # it is quite deliberate that we deeply recurse to arrive at the
 633         # correct answer, which involves pulling in most of the database
 634         no warnings "recursion";
 635         my $gene_map = $self->get_genemap($s_data[1]) || next;
 636         return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
 637
 638         #*** not always relative to gene start...
 639         #    we need Bio::Map::Gene s to have some default tss and atg positions
 640         #    that we can be relative to
 641         my $rel = Bio::Map::Relative->new(-element => $gene_map->gene, -description => $s_data[3]);
 642         Bio::Map::Position->new(-map => $gene_map, -element => $tf, -start => $s_data[4], -end => $s_data[5], -relative => $rel);
 643     }
 644
 645     $self->{got_factor}->{$id} = $tf;
 646     return $tf;
 647 }
 648
 649 # since get_factor() is uncertain, just have direct access methods to factor
 650 # information
 651 sub get_factor_name {
 652     my ($self, $id) = @_;
 653     my $details = $self->_get_factor_details($id) || return;
 654     return $details->{name};
 655 }
 656 sub get_factor_species {
 657     my ($self, $id) = @_;
 658     my $details = $self->_get_factor_details($id) || return;
 659     return $details->{species};
 660 }
 661 sub get_factor_sequence {
 662     my ($self, $id) = @_;
 663     my $details = $self->_get_factor_details($id) || return;
 664     return $details->{sequence};
 665 }
 666 sub _get_factor_details {
 667     my ($self, $id) = @_;
 668     $id || return;
 669
 670     return $self->{factor_details}->{$id} if defined $self->{factor_details}->{$id};
 671
 672     my $data = $self->{factor}->{data}->{$id} || return;
 673     my @data = split(SEPARATOR, $data);
 674
 675     # accession = id name species sequence
 676
 677     my %details = (name => $data[1], species => $data[2], sequence => $data[3]);
 678     $self->{factor_details}->{$id} = \%details;
 679
 680     return \%details;
 681 }
 682
 683 =head2 get_reference_ids
 684
 685  Title   : get_reference_ids
 686  Usage   : my @ids = $obj->get_reference_ids(-key => $value);
 687  Function: Get all the reference ids that are associated with the supplied
 688            args.
 689  Returns : list of strings (ids)
 690  Args    : -key => value, where value is a string id, and key is one of:
 691            -pubmed -site -gene -matrix -factor
 692
 693 =cut
 694
 695 sub get_reference_ids {
 696     my $self = shift;
 697     return $self->_get_ids('reference', @_);
 698 }
 699
 700 # -id -name -species -site -factor -reference
 701 sub get_gene_ids {
 702     my $self = shift;
 703     return $self->_get_ids('gene', @_);
 704 }
 705
 706 =head2 get_site_ids
 707
 708  Title   : get_site_ids
 709  Usage   : my @ids = $obj->get_site_ids(-key => $value);
 710  Function: Get all the site ids that are associated with the supplied
 711            args.
 712  Returns : list of strings (ids)
 713  Args    : -key => value, where value is a string id, and key is one of:
 714            -id -species -gene -matrix -factor -reference
 715
 716 =cut
 717
 718 sub get_site_ids {
 719     my $self = shift;
 720     return $self->_get_ids('site', @_);
 721 }
 722
 723 =head2 get_matrix_ids
 724
 725  Title   : get_matrix_ids
 726  Usage   : my @ids = $obj->get_matrix_ids(-key => $value);
 727  Function: Get all the matrix ids that are associated with the supplied
 728            args.
 729  Returns : list of strings (ids)
 730  Args    : -key => value, where value is a string id, and key is one of:
 731            -id -name -site -factor -reference
 732
 733 =cut
 734
 735 sub get_matrix_ids {
 736     my $self = shift;
 737     return $self->_get_ids('matrix', @_);
 738 }
 739
 740 =head2 get_factor_ids
 741
 742  Title   : get_factor_ids
 743  Usage   : my @ids = $obj->get_factor_ids(-key => $value);
 744  Function: Get all the factor ids that are associated with the supplied
 745            args.
 746  Returns : list of strings (ids)
 747  Args    : -key => value, where value is a string id, and key is one of:
 748            -id -name -species -interactors -gene -matrix -site -reference
 749            NB: -gene only gets factor ids for genes that encode factors
 750
 751 =cut
 752
 753 sub get_factor_ids {
 754     my $self = shift;
 755     return $self->_get_ids('factor', @_);
 756 }
 757
 758 =head2 get_fragment_ids
 759
 760  Title   : get_fragment_ids
 761  Usage   : my @ids = $obj->get_fragment_ids(-key => $value);
 762  Function: Get all the fragment ids that are associated with the supplied
 763            args.
 764  Returns : list of strings (ids)
 765  Args    : -key => value, where value is a string id, and key is one of:
 766            -id -species -gene -factor -reference
 767
 768 =cut
 769
 770 sub get_fragment_ids {
 771     my $self = shift;
 772     return $self->_get_ids('fragment', @_);
 773 }
 774
 775 =head2 Helper methods
 776
 777 =cut
 778
 779 # internal method which does the indexing
 780 sub _build_index {
 781     my ($self, $dat_dir, $force) = @_;
 782
 783     # MLDBM would give us transparent complex data structures with DB_File,
 784     # allowing just one index file, but its yet another requirment and we
 785     # don't strictly need it
 786
 787     my $index_dir = $self->index_directory;
 788     my $gene_index      = "$index_dir/gene.dat.index";
 789     my $reference_index = "$index_dir/reference.dat.index";
 790     my $matrix_index    = "$index_dir/matrix.dat.index";
 791     my $factor_index    = "$index_dir/factor.dat.index";
 792     my $fragment_index  = "$index_dir/fragment.dat.index";
 793     my $site_index      = "$index_dir/site.dat.index";
 794
 795     my $reference_dat = "$dat_dir/reference.dat";
 796     if (! -e $reference_index || $force) {
 797         open my $REF, '<', $reference_dat or $self->throw("Could not read reference file '$reference_dat': $!");
 798
 799         my %references;
 800         unlink $reference_index;
 801         my $ref = tie(%references, 'DB_File', $reference_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
 802             or $self->throw("CCould not open file '$reference_index': $!");
 803
 804         my %pubmed;
 805         my $reference_pubmed = $reference_index.'.pubmed';
 806         unlink $reference_pubmed;
 807         my $pub = tie(%pubmed, 'DB_File', $reference_pubmed, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 808             or $self->throw("Could not open file '$reference_pubmed': $!");
 809
 810         my %gene;
 811         my $reference_gene = $gene_index.'.reference';
 812         unlink $reference_gene;
 813         my $gene = tie(%gene, 'DB_File', $reference_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 814             or $self->throw("Could not open file '$reference_gene': $!");
 815
 816         my %site;
 817         my $reference_site = $site_index.'.reference';
 818         unlink $reference_site;
 819         my $site = tie(%site, 'DB_File', $reference_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 820             or $self->throw("Could not open file '$reference_site': $!");
 821
 822         my %fragment;
 823         my $reference_fragment = $fragment_index.'.reference';
 824         unlink $reference_fragment;
 825         my $fragment = tie(%fragment, 'DB_File', $reference_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 826             or $self->throw("Could not open file '$reference_fragment': $!");
 827
 828         my %factor;
 829         my $reference_factor = $factor_index.'.reference';
 830         unlink $reference_factor;
 831         my $factor = tie(%factor, 'DB_File', $reference_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 832             or $self->throw("Could not open file '$reference_factor': $!");
 833
 834         my %matrix;
 835         my $reference_matrix = $matrix_index.'.reference';
 836         unlink $reference_matrix;
 837         my $matrix = tie(%matrix, 'DB_File', $reference_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 838             or $self->throw("Could not open file '$reference_matrix': $!");
 839
 840         # skip the first three header lines
 841         <$REF>; <$REF>; <$REF>;
 842
 843         my @data;
 844         while (<$REF>) {
 845             if (/^AC  (\S+)/) {
 846                 $data[0] = $1;
 847             }
 848             elsif (/^RX  PUBMED: (\d+)/) {
 849                 $data[1] = $1;
 850                 $pub->put("$1", $data[0]);
 851             }
 852             elsif (/^RA  (.+)\n$/) {
 853                 $data[2] = $1;
 854             }
 855             elsif (/^RT  (.+?)\.?\n$/) {
 856                 $data[3] = $1;
 857             }
 858             elsif (/^RL  (.+?)\.?\n$/) {
 859                 $data[4] = $1;
 860             }
 861             elsif (/^GE  TRANSFAC: (\w\d+)/) {
 862                 $gene->put($data[0], "$1");
 863             }
 864             elsif (/^BS  TRANSFAC: (\w\d+)/) {
 865                 $site->put($data[0], "$1");
 866             }
 867             elsif (/^FA  TRANSFAC: (\w\d+)/) {
 868                 $factor->put($data[0], "$1");
 869             }
 870             elsif (/^FR  TRANSFAC: (FR\d+)/) {
 871                 $fragment->put($data[0], "$1");
 872             }
 873             elsif (/^MX  TRANSFAC: (\w\d+)/) {
 874                 $matrix->put($data[0], "$1");
 875             }
 876             elsif (/^\/\//) {
 877                 # end of a record, store previous data and reset
 878
 879                 # accession = pubmed authors title location
 880                 $references{$data[0]} = join(SEPARATOR, ($data[1] || '',
 881                                                          $data[2] || '',
 882                                                          $data[3] || '',
 883                                                          $data[4] || ''));
 884
 885                 @data = ();
 886             }
 887         }
 888         close $REF;
 889
 890         $ref = $pub = $gene = $site = $fragment = $factor = $matrix = undef;
 891         untie %references;
 892         untie %pubmed;
 893         untie %gene;
 894         untie %site;
 895         untie %fragment;
 896         untie %factor;
 897         untie %matrix;
 898     }
 899
 900     my $gene_dat = "$dat_dir/gene.dat";
 901     if (! -e $gene_index || $force) {
 902         open my $GEN, '<', $gene_dat or $self->throw("Could not read gene file '$gene_dat': $!");
 903
 904         my %genes;
 905         unlink $gene_index;
 906         my $gene = tie(%genes, 'DB_File', $gene_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
 907             or $self->throw("Could not open file '$gene_index': $!");
 908
 909         my %id;
 910         my $gene_id = $gene_index.'.id';
 911         unlink $gene_id;
 912         my $id = tie(%id, 'DB_File', $gene_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 913             or $self->throw("Could not open file '$gene_id': $!");
 914
 915         my %name;
 916         my $gene_name = $gene_index.'.name';
 917         unlink $gene_name;
 918         my $name = tie(%name, 'DB_File', $gene_name, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 919             or $self->throw("Could not open file '$gene_name': $!");
 920
 921         my %species;
 922         my $gene_species = $gene_index.'.species';
 923         unlink $gene_species;
 924         my $species = tie(%species, 'DB_File', $gene_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 925             or $self->throw("Could not open file '$gene_species': $!");
 926
 927         my %site;
 928         my $gene_site = $site_index.'.gene';
 929         unlink $gene_site;
 930         my $site = tie(%site, 'DB_File', $gene_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 931             or $self->throw("Could not open file '$gene_site': $!");
 932
 933         my %factor;
 934         my $gene_factor = $factor_index.'.gene';
 935         unlink $gene_factor;
 936         my $factor = tie(%factor, 'DB_File', $gene_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 937             or $self->throw("Could not open file '$gene_factor': $!");
 938
 939         my %fragment;
 940         my $gene_fragment = $fragment_index.'.gene';
 941         unlink $gene_fragment;
 942         my $fragment = tie(%fragment, 'DB_File', $gene_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 943             or $self->throw("Could not open file '$gene_fragment': $!");
 944
 945         my %reference;
 946         my $gene_reference = $reference_index.'.gene';
 947         unlink $gene_reference;
 948         my $reference = tie(%reference, 'DB_File', $gene_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
 949             or $self->throw("Could not open file '$gene_reference': $!");
 950
 951         # skip the first three header lines
 952         <$GEN>; <$GEN>; <$GEN>;
 953
 954         my @data;
 955         while (<$GEN>) {
 956             if (/^AC  (\S+)/) {
 957                 $data[0] = $1;
 958             }
 959             elsif (/^ID  (\S+)/) {
 960                 $data[1] = $1;
 961                 $id->put("$1", $data[0]);
 962             }
 963             elsif (/^SD  (.+)$/) {
 964                 $data[2] = lc("$1");
 965                 $name->put(lc("$1"), $data[0]);
 966             }
 967             elsif (/^SY  (.+)\.$/) {
 968                 foreach (split('; ', lc("$1"))) {
 969                     $name->put($_, $data[0]);
 970                 }
 971             }
 972             elsif (/^DE  (.+)$/) {
 973                 $data[3] = $1;
 974             }
 975             elsif (/^OS  (.+)$/) {
 976                 my $raw_species = $1;
 977                 my $taxid = $self->_species_to_taxid($raw_species);
 978                 $data[4] = $taxid || $raw_species;
 979                 $species->put($data[4], $data[0]);
 980             }
 981             elsif (/^RN  .+?(RE\d+)/) {
 982                 $reference->put($data[0], "$1");
 983             }
 984             elsif (/^BS  .+?(R\d+)/) {
 985                 $site->put($data[0], "$1");
 986             }
 987             elsif (/^FA  (T\d+)/) {
 988                 $factor->put($data[0], "$1");
 989             }
 990             elsif (/^BR  (FR\d+)/) {
 991                 $fragment->put($data[0], "$1");
 992             }
 993             elsif (/^\/\//) {
 994                 # end of a record, store previous data and reset
 995
 996                 # accession = id name description species_tax_id_or_raw_string
 997                 $genes{$data[0]} = join(SEPARATOR, ($data[1] || '',
 998                                                     $data[2] || '',
 999                                                     $data[3] || '',
1000                                                     $data[4] || ''));
1001
1002                 @data = ();
1003             }
1004         }
1005         close $GEN;
1006
1007         $gene = $id = $name = $species = $site = $factor = $reference = undef;
1008         untie %genes;
1009         untie %id;
1010         untie %name;
1011         untie %species;
1012         untie %site;
1013         untie %factor;
1014         untie %reference;
1015     }
1016
1017     my $site_dat = "$dat_dir/site.dat";
1018     if (! -e $site_index || $force) {
1019         open my $SIT, '<', $site_dat or $self->throw("Could not read site file '$site_dat': $!");
1020
1021         my %sites;
1022         unlink $site_index;
1023         my $site = tie(%sites, 'DB_File', $site_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1024             or $self->throw("Could not open file '$site_index': $!");
1025
1026         my %id;
1027         my $site_id = $site_index.'.id';
1028         unlink $site_id;
1029         my $id = tie(%id, 'DB_File', $site_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1030             or $self->throw("Could not open file '$site_id': $!");
1031
1032         my %species;
1033         my $site_species = $site_index.'.species';
1034         unlink $site_species;
1035         my $species = tie(%species, 'DB_File', $site_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1036             or $self->throw("Could not open file '$site_species': $!");
1037
1038         my %qualities;
1039         my $site_qualities = $site_index.'.qual';
1040         unlink $site_qualities;
1041         my $quality = tie(%qualities, 'DB_File', $site_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH)
1042             or $self->throw("Could not open file '$site_qualities': $!");
1043
1044         my %gene;
1045         my $site_gene = $gene_index.'.site';
1046         unlink $site_gene;
1047         my $gene = tie(%gene, 'DB_File', $site_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1048             or $self->throw("Could not open file '$site_gene': $!");
1049
1050         my %matrix;
1051         my $site_matrix = $matrix_index.'.site';
1052         unlink $site_matrix;
1053         my $matrix = tie(%matrix, 'DB_File', $site_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1054             or $self->throw("Could not open file '$site_matrix': $!");
1055
1056         my %factor;
1057         my $site_factor = $factor_index.'.site';
1058         unlink $site_factor;
1059         my $factor = tie(%factor, 'DB_File', $site_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1060             or $self->throw("Could not open file '$site_factor': $!");
1061
1062         my %reference;
1063         my $site_reference = $reference_index.'.site';
1064         unlink $site_reference;
1065         my $reference = tie(%reference, 'DB_File', $site_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1066             or $self->throw("Could not open file '$site_reference': $!");
1067
1068         # skip the first three header lines
1069         <$SIT>; <$SIT>; <$SIT>;
1070
1071         my @data;
1072         while (<$SIT>) {
1073             if (/^AC  (\S+)/) {
1074                 $data[0] = $1;
1075             }
1076             elsif (/^ID  (\S+)/) {
1077                 $data[1] = $1;
1078                 $id->put("$1", $data[0]);
1079             }
1080             elsif (/^TY  (.+)$/) {
1081                 $data[8] = $1;
1082             }
1083             elsif (/^DE  .*Gene: (G\d+)/) {
1084                 $data[2] = $1;
1085                 $gene->put($data[0], "$1");
1086
1087                 # if it has no gene it is an artificial sequence, unless it
1088                 # has a species (OS line), in which case it is unassigned
1089                 # genomic; either way we won't be able to make a
1090                 # Bio::Map::PositionI later on, so such sites won't be
1091                 # on any MapI.
1092             }
1093             elsif (/^OS  (.+)$/) {
1094                 # Since not all sites in site.dat with a species have a gene,
1095                 # (small handful are unassigned 'genomic') can't delegate to
1096                 # gene.dat and must parse species here (effectively again)
1097                 my $raw_species = $1;
1098                 my $taxid = $self->_species_to_taxid($raw_species);
1099                 $data[7] = $taxid || $raw_species;
1100                 $species->put($data[7], $data[0]);
1101             }
1102             elsif (/^SQ  (.+)\.$/) {
1103                 $data[3] = $1;
1104                 # there can actually be more than one SQ line, seemingly with
1105                 # variations of the sequence (not a long sequence split over
1106                 # two lines); not sure what to do with data; currently we end
1107                 # up storing only the last variant.
1108             }
1109             elsif (/^S1  (.+)$/) {
1110                 $data[4] = $1;
1111                 # if S1 not present, means transcriptional start
1112             }
1113             elsif (/^SF  (.+)$/) {
1114                 $data[5] = $1;
1115             }
1116             elsif (/^ST  (.+)$/) {
1117                 $data[6] = $1;
1118             }
1119             elsif (/^RN  .+?(RE\d+)/) {
1120                 $reference->put($data[0], "$1");
1121             }
1122             elsif (/^MX  (M\d+)/) {
1123                 $matrix->put($data[0], "$1");
1124             }
1125             elsif (/^BF  (T\d+); .+?; Quality: (\d)/) {
1126                 $factor->put($data[0], "$1");
1127                 $qualities{$data[0].SEPARATOR.$1} = $2;
1128             }
1129             elsif (/^\/\//) {
1130                 # end of a record, store previous data and reset
1131
1132                 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string type
1133                 $sites{$data[0]} = join(SEPARATOR, ($data[1] || '',
1134                                                     $data[2] || '',
1135                                                     $data[3] || '',
1136                                                     $data[4] || 'TSS',
1137                                                     $data[5] || '',
1138                                                     $data[6] || '',
1139                                                     $data[7] || '',
1140                                                     $data[8] || ''));
1141
1142                 @data = ();
1143             }
1144         }
1145         close $SIT;
1146
1147         $site = $id = $species = $quality = $gene = $matrix = $factor = $reference = undef;
1148         untie %sites;
1149         untie %id;
1150         untie %species;
1151         untie %qualities;
1152         untie %gene;
1153         untie %matrix;
1154         untie %factor;
1155         untie %reference;
1156     }
1157
1158     my $matrix_dat = "$dat_dir/matrix.dat";
1159     if (! -e $matrix_index || $force) {
1160         open my $MAT, '<', $matrix_dat or $self->throw("Could not read matrix file '$matrix_dat': $!");
1161
1162         my %matrices;
1163         unlink $matrix_index;
1164         my $matrix = tie(%matrices, 'DB_File', $matrix_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1165             or $self->throw("Could not open file '$matrix_index': $!");
1166
1167         my %id;
1168         my $matrix_id = $matrix_index.'.id';
1169         unlink $matrix_id;
1170         my $id = tie(%id, 'DB_File', $matrix_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1171             or $self->throw("Could not open file '$matrix_id': $!");
1172
1173         my %name;
1174         my $matrix_name = $matrix_index.'.name';
1175         unlink $matrix_name;
1176         my $name = tie(%name, 'DB_File', $matrix_name, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1177             or $self->throw("Could not open file '$matrix_name': $!");
1178
1179         my %site;
1180         my $matrix_site = $site_index.'.matrix';
1181         unlink $matrix_site;
1182         my $site = tie(%site, 'DB_File', $matrix_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1183             or $self->throw("Could not open file '$matrix_site': $!");
1184
1185         my %factor;
1186         my $matrix_factor = $factor_index.'.matrix';
1187         unlink $matrix_factor;
1188         my $factor = tie(%factor, 'DB_File', $matrix_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1189             or $self->throw("Could not open file '$matrix_factor': $!");
1190
1191         my %reference;
1192         my $matrix_reference = $reference_index.'.matrix';
1193         unlink $matrix_reference;
1194         my $reference = tie(%reference, 'DB_File', $matrix_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1195             or $self->throw("Could not open file '$matrix_reference': $!");
1196
1197         # skip the first three header lines
1198         <$MAT>; <$MAT>; <$MAT>;
1199
1200         my @data;
1201         my @matrix_data;
1202         my @site_data;
1203         while (<$MAT>) {
1204             if (/^AC  (\S+)/) {
1205                 $data[0] = $1;
1206             }
1207             elsif (/^ID  (\S+)/) {
1208                 $data[1] = $1;
1209                 $id->put("$1", $data[0]);
1210             }
1211             elsif (/^NA  (.+)$/) {
1212                 $data[2] = $1;
1213                 $name->put("$1", $data[0]);
1214             }
1215             elsif (/^DE  (.+)$/) {
1216                 $data[3] = $1;
1217             }
1218             elsif (/^\d\d  \s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
1219                 # a, c, g, t counts/weights
1220                 push(@matrix_data, join("\t", ($1, $2, $3, $4)));
1221
1222                 # Work out the number of sites as the largest number of
1223                 # sites amongst all positions in the sequences. (The BA
1224                 # line isn't reliable for telling us the correct number of
1225                 # sites all the time)
1226                 my $num = $1 + $2 + $3 + $4;
1227                 $data[4] ||= 0;
1228                 if ($num > $data[4]) {
1229                     $data[4] = $num;
1230                 }
1231             }
1232             elsif (/^BS  ([\sa-zA-Z]+); (.+?); (-?\d+); \d+;.*; ([np])/) {
1233                 # sequence id start strand
1234                 push(@site_data, join('_', ($1, $2, $3, $4 eq 'p' ? 1 : -1)));
1235                 $site->put($data[0], $2);
1236             }
1237             elsif (/^BF  (T\d+)/) {
1238                 $factor->put($data[0], "$1");
1239             }
1240             elsif (/^RN  .+?(RE\d+)/) {
1241                 $reference->put($data[0], "$1");
1242             }
1243             elsif (/^\/\//) {
1244                 # end of a record, store previous data and reset
1245                 my $matrix_data = join(INTERNAL_SEPARATOR, @matrix_data) || '';
1246
1247                 # sites of a matrix are pre-aligned but padded with spaces on
1248                 # the left and no padding on the right; pad with -s both sides
1249                 my $longest_seq = 0;
1250
1251                 # For all the work, does anything meaningful actually get passed
1252                 # on here? Commenting out fixes the latest crashes on trunk.
1253                 # 5-10-10 cjfields
1254
1255                 #foreach my $site_seq (map {my ($seq) = split("_", $_ ,2); $seq;} @site_data) {
1256                 #    $site_seq =~ s/ /-/g;
1257                 #    my $length = length($site_seq);
1258                 #    if ($length > $longest_seq) {
1259                 #        $longest_seq = $length;
1260                 #    }
1261                 #}
1262                 #foreach my $site (@site_data) {
1263                 #    my ($site_seq) = split("_", $site ,2);
1264                 #    my $length = length($site_seq);
1265                 #    if ($length < $longest_seq) {
1266                 #        $site_seq .= '-' x ($longest_seq - $length);
1267                 #    }
1268                 #}
1269
1270                 my $site_data = join(INTERNAL_SEPARATOR, @site_data) || '';
1271
1272                 # accession = id name description num_of_sites matrix_data site_data
1273                 $matrices{$data[0]} = join(SEPARATOR, ($data[1] || '',
1274                                                        $data[2] || '',
1275                                                        $data[3] || '',
1276                                                        $data[4],
1277                                                        $matrix_data,
1278                                                        $site_data));
1279
1280                 @data = @matrix_data = @site_data = ();
1281             }
1282         }
1283         close $MAT;
1284
1285         $matrix = $id = $name = $site = $factor = $reference = undef;
1286         untie %matrices;
1287         untie %id;
1288         untie %name;
1289         untie %site;
1290         untie %factor;
1291         untie %reference;
1292     }
1293
1294     my $factor_dat = "$dat_dir/factor.dat";
1295     if (! -e $factor_index || $force) {
1296         open my $FAC, '<', $factor_dat or $self->throw("Could not read factor file '$factor_dat': $!");
1297
1298         my %factors;
1299         unlink $factor_index;
1300         my $factor = tie(%factors, 'DB_File', $factor_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1301             or $self->throw("Could not open file '$factor_index': $!");
1302
1303         my %id;
1304         my $factor_id = $factor_index.'.id';
1305         unlink $factor_id;
1306         my $id = tie(%id, 'DB_File', $factor_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1307             or $self->throw("Could not open file '$factor_id': $!");
1308
1309         my %name;
1310         my $factor_name = $factor_index.'.name';
1311         unlink $factor_name;
1312         my $name = tie(%name, 'DB_File', $factor_name, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1313             or $self->throw("Could not open file '$factor_name': $!");
1314
1315         my %species;
1316         my $factor_species = $factor_index.'.species';
1317         unlink $factor_species;
1318         my $species = tie(%species, 'DB_File', $factor_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1319             or $self->throw("Could not open file '$factor_species': $!");
1320
1321         my %interactors;
1322         my $factor_interactors = $factor_index.'.interactors';
1323         unlink $factor_interactors;
1324         my $interact = tie(%interactors, 'DB_File', $factor_interactors, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1325             or $self->throw("Could not open file '$factor_interactors': $!");
1326
1327         my %gene;
1328         my $factor_gene = $gene_index.'.factor';
1329         unlink $factor_gene;
1330         my $gene = tie(%gene, 'DB_File', $factor_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1331             or $self->throw("Could not open file '$factor_gene': $!");
1332
1333         my %matrix;
1334         my $factor_matrix = $matrix_index.'.factor';
1335         unlink $factor_matrix;
1336         my $matrix = tie(%matrix, 'DB_File', $factor_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1337             or $self->throw("Could not open file '$factor_matrix': $!");
1338
1339         my %site;
1340         my $factor_site = $site_index.'.factor';
1341         unlink $factor_site;
1342         my $site = tie(%site, 'DB_File', $factor_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1343             or $self->throw("Could not open file '$factor_site': $!");
1344
1345         my %fragment;
1346         my $factor_fragment = $fragment_index.'.factor';
1347         unlink $factor_fragment;
1348         my $fragment = tie(%fragment, 'DB_File', $factor_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1349             or $self->throw("Could not open file '$factor_fragment': $!");
1350
1351         my %reference;
1352         my $factor_reference = $reference_index.'.factor';
1353         unlink $factor_reference;
1354         my $reference = tie(%reference, 'DB_File', $factor_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1355             or $self->throw("Could not open file '$factor_reference': $!");
1356
1357         # skip the first three header lines
1358         <$FAC>; <$FAC>; <$FAC>;
1359
1360         my @data;
1361         my $sequence = '';
1362         while (<$FAC>) {
1363             if (/^AC  (\S+)/) {
1364                 $data[0] = $1;
1365             }
1366             elsif (/^ID  (\S+)/) {
1367                 # IDs are always the same as AC? Is this needed?
1368                 $data[1] = $1;
1369                 $id->put("$1", $data[0]);
1370             }
1371             elsif (/^FA  (.+)$/) {
1372                 $data[2] = $1;
1373                 $name->put("$1", $data[0]);
1374             }
1375             elsif (/^OS  (.+)$/) {
1376                 # This is the species the actual factor came from, which may
1377                 # differ from the species of any sequences it is described as
1378                 # binding to. Not all factors that have a species have a gene,
1379                 # so can't delegate species to a gene lookup.
1380                 my $raw_species = $1;
1381                 my $taxid = $self->_species_to_taxid($raw_species);
1382                 $data[3] = $taxid || $raw_species;
1383                 $species->put($data[3], $data[0]);
1384             }
1385             elsif (/^GE  (G\d+)/) {
1386                 $gene->put($data[0], "$1");
1387             }
1388             elsif (/^SQ  (.+)$/) {
1389                 $sequence .= $1;
1390             }
1391             elsif (/^IN  (T\d+)/) {
1392                 $interact->put($data[0], "$1");
1393             }
1394             elsif (/^MX  (M\d+)/) {
1395                 $matrix->put($data[0], "$1");
1396             }
1397             elsif (/^BS  (R\d+)/) {
1398                 $site->put($data[0], "$1");
1399             }
1400             elsif (/^BR  (FR\d+)/) {
1401                 $fragment->put($data[0], "$1");
1402             }
1403             elsif (/^RN  .+?(RE\d+)/) {
1404                 $reference->put($data[0], "$1");
1405             }
1406             elsif (/^\/\//) {
1407                 # end of a record, store previous data and reset
1408
1409                 # accession = id name species sequence
1410                 $factors{$data[0]} = join(SEPARATOR, ($data[1] || '',
1411                                                       $data[2] || '',
1412                                                       $data[3] || '',
1413                                                       $sequence));
1414
1415                 @data = ();
1416                 $sequence = '';
1417             }
1418         }
1419         close $FAC;
1420
1421         $factor = $id = $name = $species = $interact = $gene = $matrix = $site = $fragment = $reference = undef;
1422         untie %factors;
1423         untie %id;
1424         untie %name;
1425         untie %species;
1426         untie %interactors;
1427         untie %gene;
1428         untie %matrix;
1429         untie %site;
1430         untie %fragment;
1431         untie %reference;
1432     }
1433
1434     my $fragment_dat = "$dat_dir/fragment.dat";
1435     if (! -e $fragment_index || $force) {
1436         if (open my $FRA, '<', $fragment_dat) {
1437             my %fragments;
1438             unlink $fragment_index;
1439             my $fragment = tie(%fragments, 'DB_File', $fragment_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1440                 or $self->throw("Could not open file '$fragment_index': $!");
1441
1442             my %id;
1443             my $fragment_id = $fragment_index.'.id';
1444             unlink $fragment_id;
1445             my $id = tie(%id, 'DB_File', $fragment_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1446                 or $self->throw("Could not open file '$fragment_id': $!");
1447
1448             my %qualities;
1449             my $fragment_qualities = $fragment_index.'.qual';
1450             unlink $fragment_qualities;
1451             my $quality = tie(%qualities, 'DB_File', $fragment_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH)
1452                 or $self->throw("Could not open file '$fragment_qualities': $!");
1453
1454             my %species;
1455             my $fragment_species = $fragment_index.'.species';
1456             unlink $fragment_species;
1457             my $species = tie(%species, 'DB_File', $fragment_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1458                 or $self->throw("Could not open file '$fragment_species': $!");
1459
1460             my %gene;
1461             my $fragment_gene = $gene_index.'.fragment';
1462             unlink $fragment_gene;
1463             my $gene = tie(%gene, 'DB_File', $fragment_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1464                 or $self->throw("Could not open file '$fragment_gene': $!");
1465
1466             my %factor;
1467             my $fragment_factor = $factor_index.'.fragment';
1468             unlink $fragment_factor;
1469             my $factor = tie(%factor, 'DB_File', $fragment_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1470                 or $self->throw("Could not open file '$fragment_factor': $!");
1471
1472             my %reference;
1473             my $fragment_reference = $reference_index.'.fragment';
1474             unlink $fragment_reference;
1475             my $reference = tie(%reference, 'DB_File', $fragment_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1476                 or $self->throw("Could not open file '$fragment_reference': $!");
1477
1478             # skip the first three header lines
1479             <$FRA>; <$FRA>; <$FRA>;
1480
1481             my @data;
1482             while (<$FRA>) {
1483                 if (/^AC  (\S+)/) {
1484                     $data[0] = $1;
1485                 }
1486                 elsif (/^ID  (\S+)/) {
1487                     # IDs are always the same as AC? Is this needed?
1488                     $data[1] = $1;
1489                     $id->put("$1", $data[0]);
1490                 }
1491                 elsif (/^DE  Gene: (G\d+)(?:.+Gene: (G\d+))?/) {
1492                     my ($gene1, $gene2) = ($1, $2);
1493                     $data[2] = $gene1;
1494                     $data[3] = $gene2; # could be undef
1495                     $gene->put($data[0], $gene1);
1496                     $gene->put($data[0], $gene2) if $gene2;
1497                 }
1498                 elsif (/^OS  (.+)$/) {
1499                     # As per the site.dat parsing
1500                     my $raw_species = $1;
1501                     my $taxid = $self->_species_to_taxid($raw_species);
1502                     $data[4] = $taxid || $raw_species;
1503                     $species->put($data[4], $data[0]);
1504                 }
1505                 elsif (/^SQ  [atcgn]*([ATCGN]+)[atcgn]*/) {
1506                     $data[5] .= $1;
1507                     # there can be (usually are) multiple SQ lines with a single
1508                     # long seq split over them. The 'real' sequence is in caps
1509                 }
1510                 elsif (/^SC  Build (\S+):$/) {
1511                     $data[6] = $1;
1512                     # maybe parse it out a little more? We have build,
1513                     # chromosomal coords and strand, eg.
1514                     # SC  Build HSA_May2004: Chr.2 43976692..43978487 (FORWARD).
1515                 }
1516                 elsif (/^RN  .+?(RE\d+)/) {
1517                     $reference->put($data[0], "$1");
1518                 }
1519                 elsif (/^BF  (T\d+); .+?; Quality: (\d)/) {
1520                     $factor->put($data[0], "$1");
1521                     $qualities{$data[0].SEPARATOR.$1} = $2;
1522                 }
1523                 elsif (/^\/\//) {
1524                     # end of a record, store previous data and reset
1525
1526                     # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
1527                     $fragments{$data[0]} = join(SEPARATOR, ($data[1] || '',
1528                                                             $data[2] || '',
1529                                                             $data[3] || '',
1530                                                             $data[4] || '',
1531                                                             $data[5] || '',
1532                                                             $data[6] || ''));
1533
1534                     @data = ();
1535                 }
1536             }
1537             close $FRA;
1538
1539             $fragment = $id = $species = $quality = $gene = $factor = $reference = undef;
1540             untie %fragments;
1541             untie %id;
1542             untie %species;
1543             untie %qualities;
1544             untie %gene;
1545             untie %factor;
1546             untie %reference;
1547         }
1548         else {
1549             $self->warn("Could not read fragment file '$fragment_dat', assuming you have an old version of Transfac Pro with no fragment.dat file");
1550         }
1551     }
1552 }
1553
1554 # connect the internal db handle
1555 sub _db_connect {
1556     my $self = shift;
1557     return if $self->{'_initialized'};
1558
1559     my $index_dir = $self->index_directory;
1560     my $gene_index = "$index_dir/gene.dat.index";
1561     my $reference_index = "$index_dir/reference.dat.index";
1562     my $matrix_index = "$index_dir/matrix.dat.index";
1563     my $factor_index = "$index_dir/factor.dat.index";
1564     my $site_index = "$index_dir/site.dat.index";
1565     my $fragment_index = "$index_dir/fragment.dat.index";
1566
1567     foreach ($gene_index, $reference_index, $matrix_index, $factor_index, $site_index, $fragment_index) {
1568         if (! -e $_) {
1569             #$self->warn("Index files have not been created");
1570             #return 0;
1571         }
1572     }
1573
1574     # reference
1575     {
1576         $self->{reference}->{data} = {};
1577         tie (%{$self->{reference}->{data}}, 'DB_File', $reference_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
1578
1579         my $reference_pubmed = $reference_index.'.pubmed';
1580         $self->{reference}->{pubmed} = tie (%{$self->{reference}->{pubmed}}, 'DB_File', $reference_pubmed, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
1581
1582         my $reference_gene = $gene_index.'.reference';
1583         $self->{gene}->{reference} = tie (%{$self->{gene}->{reference}}, 'DB_File', $reference_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
1584
1585         my $reference_site = $site_index.'.reference';
1586         $self->{site}->{reference} = tie (%{$self->{site}->{reference}}, 'DB_File', $reference_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
1587
1588         my $reference_fragment = $fragment_index.'.reference';
1589         $self->{fragment}->{reference} = tie (%{$self->{fragment}->{reference}}, 'DB_File', $reference_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
1590
1591         my $reference_factor = $factor_index.'.reference';
1592         $self->{factor}->{reference} = tie (%{$self->{factor}->{reference}}, 'DB_File', $reference_factor, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
1593
1594         my $reference_matrix = $matrix_index.'.reference';
1595         $self->{matrix}->{reference} = tie (%{$self->{matrix}->{reference}}, 'DB_File', $reference_matrix, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
1596     }
1597
1598     # gene
1599     {
1600         $self->{gene}->{data} = {};
1601         tie (%{$self->{gene}->{data}}, 'DB_File', $gene_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
1602
1603         my $gene_id = $gene_index.'.id';
1604         $self->{gene}->{id} = tie(%{$self->{gene}->{id}}, 'DB_File', $gene_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
1605
1606         my $gene_name = $gene_index.'.name';
1607         $self->{gene}->{name} = tie(%{$self->{gene}->{name}}, 'DB_File', $gene_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
1608
1609         my $gene_species = $gene_index.'.species';
1610         $self->{gene}->{species} = tie(%{$self->{gene}->{species}}, 'DB_File', $gene_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
1611
1612         my $gene_site = $site_index.'.gene';
1613         $self->{site}->{gene} = tie(%{$self->{site}->{gene}}, 'DB_File', $gene_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
1614
1615         my $gene_fragment = $fragment_index.'.gene';
1616         $self->{fragment}->{gene} = tie(%{$self->{fragment}->{gene}}, 'DB_File', $gene_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
1617
1618         my $gene_factor = $factor_index.'.gene';
1619         $self->{factor}->{gene} = tie(%{$self->{factor}->{gene}}, 'DB_File', $gene_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
1620
1621         my $gene_reference = $reference_index.'.gene';
1622         $self->{reference}->{gene} = tie(%{$self->{reference}->{gene}}, 'DB_File', $gene_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
1623     }
1624
1625     # site
1626     {
1627         $self->{site}->{data} = {};
1628         tie (%{$self->{site}->{data}}, 'DB_File', $site_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1629
1630         my $site_id = $site_index.'.id';
1631         $self->{site}->{id} = tie(%{$self->{site}->{id}}, 'DB_File', $site_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1632
1633         my $site_species = $site_index.'.species';
1634         $self->{site}->{species} = tie(%{$self->{site}->{species}}, 'DB_File', $site_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $site_species': $!");
1635
1636         #*** quality not actually used by anything (yet)
1637         my $site_qualities = $site_index.'.qual';
1638         $self->{quality} = {};
1639         tie(%{$self->{quality}}, 'DB_File', $site_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1640
1641         my $site_gene = $gene_index.'.site';
1642         $self->{gene}->{site} = tie(%{$self->{gene}->{site}}, 'DB_File', $site_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1643
1644         my $site_matrix = $matrix_index.'.site';
1645         $self->{matrix}->{site} = tie(%{$self->{matrix}->{site}}, 'DB_File', $site_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1646
1647         my $site_factor = $factor_index.'.site';
1648         $self->{factor}->{site} = tie(%{$self->{factor}->{site}}, 'DB_File', $site_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1649
1650         my $site_reference = $reference_index.'.site';
1651         $self->{reference}->{site} = tie(%{$self->{reference}->{site}}, 'DB_File', $site_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1652     }
1653
1654     # fragment (may not be in older databases)
1655     if (-e $fragment_index) {
1656         $self->{fragment}->{data} = {};
1657         tie (%{$self->{fragment}->{data}}, 'DB_File', $fragment_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1658
1659         my $fragment_id = $fragment_index.'.id';
1660         $self->{fragment}->{id} = tie(%{$self->{fragment}->{id}}, 'DB_File', $fragment_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1661
1662         my $fragment_species = $fragment_index.'.species';
1663         $self->{fragment}->{species} = tie(%{$self->{fragment}->{species}}, 'DB_File', $fragment_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $fragment_species': $!");
1664
1665         #*** quality not actually used by anything (yet)
1666         my $fragment_qualities = $fragment_index.'.qual';
1667         $self->{fragment_quality} = {};
1668         tie(%{$self->{fragment_quality}}, 'DB_File', $fragment_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1669
1670         my $fragment_gene = $gene_index.'.fragment';
1671         $self->{gene}->{fragment} = tie(%{$self->{gene}->{fragment}}, 'DB_File', $fragment_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1672
1673         my $fragment_factor = $factor_index.'.fragment';
1674         $self->{factor}->{fragment} = tie(%{$self->{factor}->{fragment}}, 'DB_File', $fragment_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1675
1676         my $fragment_reference = $reference_index.'.fragment';
1677         $self->{reference}->{fragment} = tie(%{$self->{reference}->{fragment}}, 'DB_File', $fragment_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1678     }
1679     else {
1680         die "no fragment_index at '$fragment_index'\n";
1681     }
1682
1683     # matrix
1684     {
1685         $self->{matrix}->{data} = {};
1686         tie (%{$self->{matrix}->{data}}, 'DB_File', $matrix_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1687
1688         my $matrix_id = $matrix_index.'.id';
1689         $self->{matrix}->{id} = tie(%{$self->{matrix}->{id}}, 'DB_File', $matrix_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1690
1691         my $matrix_name = $matrix_index.'.name';
1692         $self->{matrix}->{name} = tie(%{$self->{matrix}->{name}}, 'DB_File', $matrix_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1693
1694         my $matrix_site = $site_index.'.matrix';
1695         $self->{site}->{matrix} = tie(%{$self->{site}->{matrix}}, 'DB_File', $matrix_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1696
1697         my $matrix_factor = $factor_index.'.matrix';
1698         $self->{factor}->{matrix} = tie(%{$self->{factor}->{matrix}}, 'DB_File', $matrix_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1699
1700         my $matrix_reference = $reference_index.'.matrix';
1701         $self->{reference}->{matrix} = tie(%{$self->{reference}->{matrix}}, 'DB_File', $matrix_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1702     }
1703
1704     # factor
1705     {
1706         $self->{factor}->{data} = {};
1707         tie (%{$self->{factor}->{data}}, 'DB_File', $factor_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1708
1709         my $factor_id = $factor_index.'.id';
1710         $self->{factor}->{id} = tie(%{$self->{factor}->{id}}, 'DB_File', $factor_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1711
1712         my $factor_name = $factor_index.'.name';
1713         $self->{factor}->{name} = tie(%{$self->{factor}->{name}}, 'DB_File', $factor_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1714
1715         my $factor_species = $factor_index.'.species';
1716         $self->{factor}->{species} = tie(%{$self->{factor}->{species}}, 'DB_File', $factor_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1717
1718         my $factor_interactors = $factor_index.'.interactors';
1719         $self->{factor}->{interactors} = tie(%{$self->{factor}->{interactors}}, 'DB_File', $factor_interactors, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1720
1721         my $factor_gene = $gene_index.'.factor';
1722         $self->{gene}->{factor} = tie(%{$self->{gene}->{factor}}, 'DB_File', $factor_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1723
1724         my $factor_matrix = $matrix_index.'.factor';
1725         $self->{matrix}->{factor} = tie(%{$self->{matrix}->{factor}}, 'DB_File', $factor_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1726
1727         my $factor_site = $site_index.'.factor';
1728         $self->{site}->{factor} = tie(%{$self->{site}->{factor}}, 'DB_File', $factor_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1729
1730         my $factor_fragment = $fragment_index.'.factor';
1731         $self->{fragment}->{factor} = tie(%{$self->{fragment}->{factor}}, 'DB_File', $factor_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1732
1733         my $factor_reference = $reference_index.'.factor';
1734         $self->{reference}->{factor} = tie(%{$self->{reference}->{factor}}, 'DB_File', $factor_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1735     }
1736
1737     $self->{'_initialized'}  = 1;
1738 }
1739
1740 =head2 index_directory
1741
1742  Title   : index_directory
1743  Funtion : Get/set the location that index files are stored. (this module
1744            will index the supplied database)
1745  Usage   : $obj->index_directory($newval)
1746  Returns : value of index_directory (a scalar)
1747  Args    : on set, new value (a scalar or undef, optional)
1748
1749 =cut
1750
1751 sub index_directory {
1752     my $self = shift;
1753     return $self->{'index_directory'} = shift if @_;
1754     return $self->{'index_directory'};
1755 }
1756
1757 # resolve a transfac species string into an ncbi taxid
1758 sub _species_to_taxid {
1759     my ($self, $raw_species) = @_;
1760     $raw_species or return;
1761
1762     my $species_string;
1763     my @split = split(', ', $raw_species);
1764     (@split > 1) ? ($species_string = $split[1]) : ($species_string = $split[0]);
1765
1766     my $ncbi_taxid;
1767     if ($species_string =~ /^[A-Z]\S+ \S+$/) {
1768         SWITCH: for ($species_string) {
1769             # some species don't classify so custom handling
1770             /^Darnel ryegrass/ && do { $ncbi_taxid = 34176; last; };
1771             /^Coix lacryma/ && do { $ncbi_taxid = 4505; last; };
1772             /^Rattus spec/ && do { $ncbi_taxid = 10116; last; };
1773             /^Mus spec/ && do { $ncbi_taxid = 10090; last; };
1774             /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1775             /^Cavia sp/ && do { $ncbi_taxid = 10141; last; };
1776             /^Marsh marigold/ && do { $ncbi_taxid = 3449; last; };
1777             /^Phalaenopsis sp/ && do { $ncbi_taxid = 36900; last; };
1778             /^Anthirrhinum majus/ && do { $ncbi_taxid = 4151; last; };
1779             /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1780             /^Lycopodium spec/ && do { $ncbi_taxid = 13840; last; };
1781             /^Autographa californica/ && do { $ncbi_taxid = 307456; last; };
1782             /^E26 AEV/ && do { $ncbi_taxid = 31920; last; };
1783             /^Pseudocentrotus miliaris/ && do { $ncbi_taxid = 7677; last; }; # the genus is 7677 but this species isn't there
1784             /^SL3-3 (?:retro)?virus/ && do { $ncbi_taxid = 53454; last; }; # 53454 is unclassified MLV-related, SL3-3 a variant of that?
1785             /^Petunia sp/ && do { $ncbi_taxid = 4104; last; };
1786         }
1787         if (! $ncbi_taxid && defined $self->{_tax_db}) {
1788             ($ncbi_taxid) = $self->{_tax_db}->get_taxonids($species_string);
1789         }
1790     }
1791     else {
1792         # some species lines are poorly formated so custom handling
1793         SWITCH: for ($raw_species) {
1794             # for speed, go by common first letters
1795             my $first_letter = substr($raw_species, 0, 1);
1796
1797             $first_letter eq 'A' && do {
1798                 /^Adiantum raddianum/ && do { $ncbi_taxid = 32168; last; };
1799                 /^Avian sarcoma virus \(strain 17\)/ && do { $ncbi_taxid = 11877; last; };
1800                 /^AMV/ && do { $ncbi_taxid = 11866; last; };
1801                 /^AEV/ && do { $ncbi_taxid = 11861; last; };
1802                 /^AS42|^Avian musculoaponeurotic/ && do { $ncbi_taxid = 11873; last; };
1803                 /^Avian myelocytomatosis/ && do { $ncbi_taxid = 11869; last; };
1804                 /^ASV 31/ && do { $ncbi_taxid = 35270; last; };
1805                 /^A-MuLV/ && do { $ncbi_taxid = 188539; last; };
1806                 /^Asparagus officinalis/ && do { $ncbi_taxid = 4686; last; };
1807                 /^Agrobacterium tumefaciens/ && do { $ncbi_taxid = 358; last; };
1808                 /^ALV/ && do { $ncbi_taxid = 11864; last; };
1809                 /^AAV/ && do { $ncbi_taxid = 272636; last; };
1810                 /^AKV MLV/ && do { $ncbi_taxid = 11791; last; };
1811                 last;
1812             };
1813
1814             $first_letter eq 'B' && do {
1815                 /^BPV-1/ && do { $ncbi_taxid = 10559; last; };
1816                 /^BKV/ && do { $ncbi_taxid = 10629; last; };
1817                 /^Bolivian squirrel monkey/ && do { $ncbi_taxid = 39432; last; };
1818                 last;
1819             };
1820
1821             $first_letter eq 'C' && do {
1822                 /^Cauliflower/ && do { $ncbi_taxid = 3715; last; };
1823                 /^Chamek/ && do { $ncbi_taxid = 118643; last; };
1824                 /^Candida albicans/ && do { $ncbi_taxid = 5476; last; };
1825                 /^CaMV/ && do { $ncbi_taxid = 10641; last; };
1826                 last;
1827             };
1828
1829             $first_letter eq 'E' && do {
1830                 /^Eucalyptus gunnii/ && do { $ncbi_taxid = 3933; last; };
1831                 /^EBV, Epstein-Barr virus/ && do { $ncbi_taxid = 10376; last; };
1832                 /^Eucalyptus globulus subsp. bicostata/ && do { $ncbi_taxid = 71272; last; };
1833                 /^Eucalyptus globulus subsp. globulus/ && do { $ncbi_taxid = 71271; last; };
1834                 last;
1835             };
1836
1837             $first_letter eq 'F' && do {
1838                 /^FBR MuLV/ && do { $ncbi_taxid = 11806; last; };
1839                 /^FBJ MuLV/ && do { $ncbi_taxid = 11805; last; };
1840                 /^FeLV|Feline leukemia/ && do { $ncbi_taxid = 11923; last; };
1841                 /^Flaveria trinervia/ && do { $ncbi_taxid = 4227; last; };
1842                 /^FSV/ && do { $ncbi_taxid = 11885; last; };
1843                 /^F-MuLV/ && do { $ncbi_taxid = 11795; last; };
1844                 last;
1845             };
1846
1847             $first_letter eq 'H' && do {
1848                 /^HSV-1/ && do { $ncbi_taxid = 10298; last; };
1849                 /^HTLV-I/ && do { $ncbi_taxid = 11908; last; };
1850                 /^HIV-1/ && do { $ncbi_taxid = 11676; last; };
1851                 /^HPV-16/ && do { $ncbi_taxid = 333760; last; };
1852                 /^HBV/ && do { $ncbi_taxid = 10407; last; };
1853                 /^HBI/ && do { $ncbi_taxid = 11867; last; };
1854                 /^HPV-8/ && do { $ncbi_taxid = 10579; last; };
1855                 /^HPV-11/ && do { $ncbi_taxid = 10580; last; };
1856                 /^HPV-18/ && do { $ncbi_taxid = 333761; last; };
1857                 /^HCMV/ && do { $ncbi_taxid = 10359; last; };
1858                 /^HSV/ && do { $ncbi_taxid = 126283; last; };
1859                 /^HSV-2/ && do { $ncbi_taxid = 10310; last; };
1860                 /^HCV/ && do { $ncbi_taxid = 11108; last; };
1861                 /^HIV-2/ && do { $ncbi_taxid = 11709; last; };
1862                 last;
1863             };
1864
1865             $first_letter eq 'M' && do {
1866                 /^MMTV/ && do { $ncbi_taxid = 11757; last; };
1867                 /^Mo-MuLV/ && do { $ncbi_taxid = 11801; last; };
1868                 /^MuLV/ && do { $ncbi_taxid = 11786; last; };
1869                 /^MSV/ && do { $ncbi_taxid = 11802; last; };
1870                 /^MC29/ && do { $ncbi_taxid = 11868; last; };
1871                 /^MVM/ && do { $ncbi_taxid = 10794; last; };
1872                 /^MH2E21/ && do { $ncbi_taxid = 11955; last; }; # 11955 is a species, presumably MH2E21 is the strain
1873                 last;
1874             };
1875
1876             $first_letter eq 'R' && do {
1877                 /^Raphanus sativus/ && do { $ncbi_taxid = 3726; last; };
1878                 /^REV-T/ && do { $ncbi_taxid = 11636; last; };
1879                 /^RAV-0/ && do { $ncbi_taxid = 11867; last; }; # should be rous-associated virus 0 variant
1880                 /^RSV/ && do { $ncbi_taxid = 11886; last; };
1881                 /^RadLV/ && do { $ncbi_taxid = 31689; last; };
1882                 /^RTBV/ && do { $ncbi_taxid = 10654; last; };
1883                 last;
1884             };
1885
1886             $first_letter eq 'S' && do {
1887                 /^SV40/ && do { $ncbi_taxid = 10633; last; };
1888                 /^Sesbania rostrata/ && do { $ncbi_taxid = 3895; last; };
1889                 /^SIV/ && do { $ncbi_taxid = 11723; last; };
1890                 /^Spinacia oleracea/ && do { $ncbi_taxid = 3562; last; };
1891                 /^SCMV/ && do { $ncbi_taxid = 10364; last; }; # supposed to be AGM isolate
1892                 last;
1893             };
1894
1895             # and lower case
1896             $first_letter eq 'a' && do {
1897                 /^adenovirus type 5/ && do { $ncbi_taxid = 28285; last; };
1898                 /^adenovirus type 2/ && do { $ncbi_taxid = 10515; last; };
1899                 /^adenovirus/ && do { $ncbi_taxid = 189831; last; }; # 189831 ('unclassified Adenoviridae') is the closest I can get, but this has no genus and is not a species
1900                 last;
1901             };
1902
1903             $first_letter eq 'b' && do {
1904                 /^bell pepper/ && do { $ncbi_taxid = 4072; last; };
1905                 /^baculovirus, Autographa californica/ && do { $ncbi_taxid = 46015; last; };
1906                 /^broccoli/ && do { $ncbi_taxid = 36774; last; };
1907                 /^barley/ && do { $ncbi_taxid = 112509; last; };
1908                 last;
1909             };
1910
1911             $first_letter eq 'c' && do {
1912                 /^clawed frog/ && do { $ncbi_taxid = 8355; last; };
1913                 /^chipmunk/ && do { $ncbi_taxid = 64680; last; };
1914                 /^common tree shrew/ && do { $ncbi_taxid = 37347; last; };
1915                 /^cat/ && do { $ncbi_taxid = 9685; last; };
1916                 last;
1917             };
1918
1919             # and misc
1920             /^NK24/ && do { $ncbi_taxid = 11955; last; };
1921             /^OK10/ && do { $ncbi_taxid = 11871; last; };
1922             /^Dendrobium grex/ && do { $ncbi_taxid = 84618; last; };
1923             /^KSHV/ && do { $ncbi_taxid = 37296; last; };
1924             /^Oncidium/ && do { $ncbi_taxid = 96474; last; };
1925             /^Japanese quail/ && do { $ncbi_taxid = 93934; last; };
1926             /^Nile tilapia/ && do { $ncbi_taxid = 8128; last; };
1927             /^GALV/ && do { $ncbi_taxid = 11840; last; };
1928             /^JCV/ && do { $ncbi_taxid = 10632; last; };
1929             /^LPV/ && do { $ncbi_taxid = 10574; last; };
1930             /^Py,/ && do { $ncbi_taxid = 36362; last; };
1931             /^DHBV/ && do { $ncbi_taxid = 12639; last; };
1932             /^VZV/ && do { $ncbi_taxid = 10335; last; };
1933             /^Vicia faba/ && do { $ncbi_taxid = 3906; last; };
1934
1935             /^hamster/ && do { $ncbi_taxid = 10029; last; };
1936             /^sea urchin/ && do { $ncbi_taxid = 7668; last; };
1937             /^fruit fly/ && do { $ncbi_taxid = 7227; last; };
1938             /^halibut/ && do { $ncbi_taxid = 8267; last; };
1939             /^vaccinia virus/ && do { $ncbi_taxid = 10245; last; };
1940             /^taxonomic class Mammalia/ && do { $ncbi_taxid = 40674; last; }; # not a species
1941             /^taxonomic class Vertebrata/ && do { $ncbi_taxid = 7742; last; }; # not a species
1942             /^dog/ && do { $ncbi_taxid = 9615; last; };
1943             /^parsley/ && do { $ncbi_taxid = 4043; last; };
1944             /^mouse, Mus domesticus Torino/ && do { $ncbi_taxid = 10092; last; }; # 10092 is domesticus subspecies, but not the Torino strain
1945             /^lemur, Eulemur fulvus collaris/ && do { $ncbi_taxid = 47178; last; };
1946             /^red sea bream/ && do { $ncbi_taxid = 143350; last; };
1947             /^zebra finch/ && do { $ncbi_taxid = 59729; last; };
1948             /^mung bean/ && do { $ncbi_taxid = 3916; last; };
1949             /^soybean/ && do { $ncbi_taxid = 3847; last; };
1950             /^oat/ && do { $ncbi_taxid = 4498; last; };
1951             /^pseudorabies virus/ && do { $ncbi_taxid = 10345; last; };
1952         }
1953     }
1954
1955     $self->warn("Didn't know what species '$raw_species' was, unable to classify") unless $ncbi_taxid;
1956     return $ncbi_taxid;
1957 }
1958
1959 sub DESTROY {
1960     my $self = shift;
1961     # Destroy tied references to close filehandles
1962     # and allow proper temporary files deletion
1963     undef $self->{_tax_db}->{'_nodes'};
1964     undef $self->{_tax_db}->{'_id2name'};
1965     undef $self->{_tax_db}->{'_name2id'};
1966     undef $self->{_tax_db}->{'_parent2children'};
1967     undef $self->{_tax_db}->{'_parentbtree'};
1968 }
1969
1970 1;