branch-1-6/Bio/DB/TFBS/transfac_pro.pm

   1 # $Id: transfac_pro.pm,v 1.15 2006/08/12 11:00:03 sendu Exp $
   2 #
   3 # BioPerl module for Bio::DB::TFBS::transfac_pro
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Sendu Bala <bix@sendu.me.uk>
   8 #
   9 # Copyright Sendu Bala
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::DB::TFBS::transfac_pro - An implementation of Bio::DB::TFBS
  18 which uses local flat files for transfac pro
  19
  20 =head1 SYNOPSIS
  21
  22   use Bio::DB::Taxonomy;
  23
  24   my $db = new Bio::DB::Taxonomy(-source => 'transfac_pro'
  25                                  -dat_dir => $directory);
  26
  27   # we're interested in the gene P5
  28   my ($gene_id) = $db->get_gene_ids(-name => 'P5'); # G000001
  29
  30   # we want all the transcription factors that bind to our gene
  31   my @factor_ids = $db->get_factor_ids(-gene => $gene_id);
  32
  33   # get info about those TFs
  34   foreach my $factor_id (@factor_ids) {
  35     my $factor = $db->get_factor($factor_id);
  36     my $name = $factor->universal_name;
  37     # etc. - see Bio::Map::TranscriptionFactor, eg. find out where it binds
  38   }
  39
  40   # get a matrix
  41   my $matrix = $db->get_matrix('M00001');
  42
  43   # get a binding site sequence
  44   my $seq = $db->get_site('R00001');
  45
  46 =head1 DESCRIPTION
  47
  48 This is an implementation which uses local flat files and the DB_File
  49 module RECNO data structures to manage a local copy of the Transfac Pro TFBS
  50 database.
  51
  52 Required database files require a license which can be obtained via
  53 http://www.biobase-international.com/pages/index.php?id=170
  54
  55 Within the linux installation tarball you will find a cgibin tar ball, and
  56 inside that is a data directory containing the .dat files needed by this
  57 module. Point to that data directory with -dat_dir
  58
  59 =head1 FEEDBACK
  60
  61 =head2 Mailing Lists
  62
  63 User feedback is an integral part of the evolution of this and other
  64 Bioperl modules. Send your comments and suggestions preferably to
  65 the Bioperl mailing list.  Your participation is much appreciated.
  66
  67   bioperl-l@bioperl.org                  - General discussion
  68   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  69
  70 =head2 Support
  71
  72 Please direct usage questions or support issues to the mailing list:
  73
  74 I<bioperl-l@bioperl.org>
  75
  76 rather than to the module maintainer directly. Many experienced and
  77 reponsive experts will be able look at the problem and quickly
  78 address it. Please include a thorough description of the problem
  79 with code and data examples if at all possible.
  80
  81 =head2 Reporting Bugs
  82
  83 Report bugs to the Bioperl bug tracking system to help us keep track
  84 of the bugs and their resolution. Bug reports can be submitted via
  85 the web:
  86
  87   http://bugzilla.open-bio.org/
  88
  89 =head1 AUTHOR - Sendu Bala
  90
  91 Email bix@sendu.me.uk
  92
  93 =head1 CONTRIBUTORS
  94
  95 Based on Bio::DB::Taxonomy::flatfile by Jason Stajich
  96
  97 =head1 APPENDIX
  98
  99 The rest of the documentation details each of the object methods.
 100 Internal methods are usually preceded with a _
 101
 102 =cut
 103
 104 # Let the code begin...
 105
 106 package Bio::DB::TFBS::transfac_pro;
 107 use strict;
 108 use Bio::Annotation::Reference;
 109 use Bio::Annotation::SimpleValue;
 110 use Bio::LocatableSeq;
 111 use Bio::SimpleAlign;
 112 use Bio::Matrix::PSM::SiteMatrix;
 113 use Bio::AlignIO;
 114 use Bio::Map::GeneMap;
 115 use Bio::Map::TranscriptionFactor;
 116 use Bio::Map::Position;
 117 use Bio::Map::Relative;
 118 use DB_File;
 119
 120 use constant SEPARATOR => ':!:';
 121 use constant INTERNAL_SEPARATOR => '!:!';
 122
 123 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
 124
 125 use base qw(Bio::DB::TFBS);
 126
 127 =head2 new
 128
 129  Title   : new
 130  Usage   : my $obj = new Bio::DB::TFBS::transfac_pro();
 131  Function: Builds a new Bio::DB::TFBS::transfac_pro object
 132  Returns : an instance of Bio::DB::TTFBS::transfac_pro
 133  Args    : -dat_dir   => name of directory where Transfac Pro .dat files
 134                          (required to initially build indexes)
 135            -tax_db    => Bio::DB::Taxonomy object, used when initially building
 136                          indexes, gives better results for species information
 137                          but not required.
 138            -index_dir => name of directory where index files should be created
 139                          or already exist. (defaults to -dat_dir, required if
 140                          -dat_dir not supplied)
 141            -force     => 1 replace current indexes even if they exist
 142
 143 =cut
 144
 145 sub new {
 146     my ($class, @args) = @_;
 147
 148     my $self = $class->SUPER::new(@args);
 149
 150     my ($dat_dir, $index_dir, $tax_db, $force) = $self->_rearrange([qw(DAT_DIR INDEX_DIR TAX_DB FORCE)], @args);
 151     $self->throw("At least one of -dat_dir and -index_dir must be supplied") unless ($dat_dir || $index_dir);
 152
 153     $self->index_directory($index_dir || $dat_dir);
 154     $self->{_tax_db} = $tax_db if $tax_db;
 155
 156     if ($dat_dir) {
 157         $self->_build_index($dat_dir, $force);
 158     }
 159
 160     $self->_db_connect;
 161     return $self;
 162 }
 163
 164 =head2 Bio::DB::TFBS Interface implementation
 165
 166 =cut
 167
 168 sub _get_ids {
 169     my ($self, $dat, @args) = @_;
 170     @args % 2 == 0 || $self->throw("Must provide key => value pairs");
 171     my $hash = $self->{$dat} || $self->throw("Unknown .dat type '$dat'");
 172
 173     if (@args) {
 174         # get a subset corresponding to args
 175         my @final;
 176         my %args = @args;
 177         my $multiple = 0;
 178         while (my ($type, $value) = each %args) {
 179             unless ($value) {
 180                 $self->warn("Arguement '$type' has no value, ignored");
 181                 next;
 182             }
 183             $type =~ s/-//;
 184             $type = lc($type);
 185             my $converter = $hash->{$type};
 186             unless ($converter) {
 187                 $self->warn("Unknown search type '$type' for .dat type '$dat'");
 188                 next;
 189             }
 190
 191             my @ids = $converter->get_dup($value);
 192             unless (@ids) {
 193                 @ids = $converter->get_dup(lc($value));
 194             }
 195
 196             if ($multiple) {
 197                 # we can have multiple types given at once, find the ids that
 198                 # satisfy all criteria
 199                 @final || return;
 200                 my %final = map { $_ => 1 } @final;
 201                 @final = grep { $final{$_} } @ids;
 202             }
 203             else {
 204                 @final = @ids;
 205                 $multiple++;
 206             }
 207         }
 208
 209         return @final;
 210     }
 211     else {
 212         # get them all
 213         my $db_file_hash = $self->{$dat}->{id};
 214
 215         my ($key, $prev_key, $value) = ('_!_', '!_!');
 216         my @ids;
 217         while (1) {
 218             $db_file_hash->seq($key, $value, R_NEXT);
 219             last if $prev_key eq $key;
 220             push(@ids, $value); # confusing? when creating objects we store
 221                                 # $value as accession and $key as id, but from
 222                                 # this method we return $value as id given $id!
 223             $prev_key = $key;
 224         }
 225
 226         return @ids;
 227     }
 228 }
 229
 230 =head2 get_reference
 231
 232  Title   : get_reference
 233  Usage   : my $ref = $obj->get_reference($id);
 234  Function: Get a literature reference.
 235  Returns : Bio::Annotation::Reference
 236  Args    : string - a reference id ('RE...')
 237
 238 =cut
 239
 240 sub get_reference {
 241     my ($self, $id) = @_;
 242     $id || return;
 243     my $data = $self->{reference}->{data}->{$id} || return;
 244     my @data = split(SEPARATOR, $data);
 245
 246     return Bio::Annotation::Reference->new(-pubmed   => $data[0],
 247                                            -authors  => $data[1],
 248                                            -title    => $data[2],
 249                                            -location => $data[3] );
 250 }
 251
 252 =head2 get_genemap
 253
 254  Title   : get_genemap
 255  Usage   : my $map = $obj->get_genemap($id);
 256  Function: Get a GeneMap for a gene.
 257  Returns : Bio::Map::GeneMap
 258  Args    : string - a gene id ('G...'), and optionally int (number of bp
 259            upstream)
 260
 261 =cut
 262
 263 sub get_genemap {
 264     my ($self, $id, $upstream) = @_;
 265     $id || return;
 266     return $self->{got_map}->{$id} if defined $self->{got_map}->{$id};
 267     $upstream ||= 1000;
 268     my $data = $self->{gene}->{data}->{$id} || return;
 269     my @data = split(SEPARATOR, $data);
 270
 271     # accession = id name description species_tax_id_or_raw_string
 272     my $taxon = $self->{_tax_db} ? $self->{_tax_db}->get_taxon($data[3]) || $data[3] : $data[3];
 273     my $map = Bio::Map::GeneMap->get(-uid => $id,
 274                                      -gene => $data[1],
 275                                      -species => $taxon,
 276                                      -description => $data[2],
 277                                      -upstream => $upstream);
 278     $self->{got_map}->{$id} = $map; # prevents infinite recurse when we call get_factor below
 279
 280     # spawn all the factors that belong on this gene map
 281     # get_factor_ids(-gene => ...) only works for genes that encode factors;
 282     # have to go via sites
 283     foreach my $sid ($self->get_site_ids(-gene => $id)) {
 284         foreach my $fid ($self->get_factor_ids(-site => $sid)) {
 285             # it is quite deliberate that we deeply recurse to arrive at the
 286             # correct answer, which involves pulling in most of the database
 287             no warnings "recursion";
 288             $self->get_factor($fid);
 289         }
 290     }
 291
 292     return $map;
 293 }
 294
 295 =head2 get_seq
 296
 297  Title   : get_seq
 298  Usage   : my $seq = $obj->get_seq($id);
 299  Function: Get the sequence of a site. The sequence will be annotated with the
 300            the tags 'relative_start', 'relative_end', 'relative_type' and
 301            'relative_to'.
 302  Returns : Bio::Seq
 303  Args    : string - a site id ('R...')
 304
 305 =cut
 306
 307 sub get_seq {
 308     my ($self, $id) = @_;
 309     $id || return;
 310     my $data = $self->{site}->{data}->{$id} || return;
 311     my @data = split(SEPARATOR, $data);
 312
 313     my $seq = Bio::Seq->new(-seq              => $data[2],
 314                             -accession_number => $id,
 315                             -description      => $data[6] ? 'Genomic sequence' : 'Consensus or artificial sequence',
 316                             -id               => $data[0],
 317                             -strand           => 1,
 318                             -alphabet         => $data[7] || 'dna',
 319                             -species          => $data[6]);
 320
 321     my $annot = $seq->annotation;
 322     my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_start', -value => $data[4] || 1);
 323     $annot->add_Annotation($sv);
 324     $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_end', -value => $data[5] || ($data[4] || 1 + length($data[2]) - 1));
 325     $annot->add_Annotation($sv);
 326     $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_type', -value => $data[3] || 'artificial');
 327     $annot->add_Annotation($sv);
 328     $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_to', -value => $data[1]);
 329     $annot->add_Annotation($sv);
 330
 331     return $seq;
 332 }
 333
 334 =head2 get_fragment
 335
 336  Title   : get_fragment
 337  Usage   : my $seq = $obj->get_fragment($id);
 338  Function: Get the sequence of a fragment.
 339  Returns : Bio::Seq
 340  Args    : string - a site id ('FR...')
 341
 342 =cut
 343
 344 sub get_fragment {
 345     my ($self, $id) = @_;
 346     $id || return;
 347     my $data = $self->{fragment}->{data}->{$id} || return;
 348     my @data = split(SEPARATOR, $data);
 349
 350     # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
 351     return new Bio::Seq( -seq              => $data[4],
 352                          -accession_number => $id,
 353                          -description      => 'Between genes '.$data[1].' and '.$data[2],
 354                          -species          => $data[3],
 355                          -id               => $data[0],
 356                          -alphabet         => 'dna' );
 357 }
 358
 359 =head2 get_matrix
 360
 361  Title   : get_matrix
 362  Usage   : my $matrix = $obj->get_matrix($id);
 363  Function: Get a matrix that describes a binding site.
 364  Returns : Bio::Matrix::PSM::SiteMatrix
 365  Args    : string - a matrix id ('M...'), optionally a sequence string from
 366            which base frequencies will be calcualted for the matrix model
 367            (default 0.25 each)
 368
 369 =cut
 370
 371 sub get_matrix {
 372     my ($self, $id, $seq) = @_;
 373     $id || return;
 374     $seq ||= 'atgc';
 375     $seq = lc($seq);
 376     my $data = $self->{matrix}->{data}->{$id} || return;
 377     my @data = split(SEPARATOR, $data);
 378     $data[4] || $self->throw("Matrix data missing for $id");
 379
 380     my ($a, $c, $g, $t);
 381     foreach my $position (split(INTERNAL_SEPARATOR, $data[4])) {
 382         my ($a_count, $c_count, $g_count, $t_count) = split("\t", $position);
 383         push(@{$a}, $a_count);
 384         push(@{$c}, $c_count);
 385         push(@{$g}, $g_count);
 386         push(@{$t}, $t_count);
 387     }
 388
 389     # our psms include a simple background model so we can use
 390     # sequence_match_weight() if desired
 391     my $a_freq = ($seq =~ tr/a//) / length($seq);
 392     my $c_freq = ($seq =~ tr/c//) / length($seq);
 393     my $g_freq = ($seq =~ tr/g//) / length($seq);
 394     my $t_freq = ($seq =~ tr/t//) / length($seq);
 395
 396     my $psm = new Bio::Matrix::PSM::SiteMatrix( -pA => $a,
 397                                                 -pC => $c,
 398                                                 -pG => $g,
 399                                                 -pT => $t,
 400                                                 -id => $data[0],
 401                                                 -accession_number => $id,
 402                                                 -sites => $data[3],
 403                                                 -width => scalar(@{$a}),
 404                                                 -correction => 1,
 405                                                 -model => { A => $a_freq, C => $c_freq, G => $g_freq, T => $t_freq } );
 406
 407     #*** used to make a Bio::Matrix::PSM::Psm and add references, but it
 408     #    didn't seem worth it. You can get references from the database by:
 409     #foreach my $ref_id ($db->get_reference_ids(-matrix => $id)) {
 410     #    my $ref = $db->get_reference($ref_id);
 411     #}
 412
 413     return $psm;
 414 }
 415
 416 =head2 get_aln
 417
 418  Title   : get_aln
 419  Usage   : my $aln = $obj->get_aln($id);
 420  Function: Get the alignment that was used to generate a matrix. Each sequence
 421            in the alignment will have an accession_number corresponding to the
 422            Transfac site id, and id() based on that but unique within the
 423            alignment.
 424  Returns : Bio::SimpleAlign
 425  Args    : string - a matrix id ('M...'), optionally true to, when a matrix
 426            lists no sequences, search for sequences via the matrix's factors,
 427            picking the sites that best match the matrix
 428
 429 =cut
 430
 431 sub get_aln {
 432     my ($self, $id, $via_factors) = @_;
 433     $id || return;
 434     my $data = $self->{matrix}->{data}->{$id} || $self->throw("matrix '$id' had no data in DB_File");
 435     my @data = split(SEPARATOR, $data);
 436
 437     if (! $data[5] && $via_factors) {
 438         # This is a matrix with no site sequences given in matrix.dat.
 439         # Find some matching site sequences via factors.
 440
 441         # First, check its factors for sites
 442         my %site_seqs;
 443         my %factor_ids;
 444         foreach my $factor_id ($self->get_factor_ids(-matrix => $id)) {
 445             $factor_ids{$factor_id} = 1;
 446             foreach my $site_id ($self->get_site_ids(-factor => $factor_id)) {
 447                 next if defined $site_seqs{$site_id};
 448                 my $seq = $self->get_seq($site_id);
 449
 450                 # skip sites that have no sequence, or have IUPAC symbols in
 451                 # their sequence (most probably the 'consensus' sequence itself
 452                 # that was used to make and exactly corresponds to the matrix)
 453                 my $seq_str = $seq->seq || next;
 454                 $seq_str =~ /[MRWSYKVHDB]/ and next;
 455
 456                 $site_seqs{$site_id} = $seq;
 457             }
 458         }
 459         my @seqs = values %site_seqs;
 460
 461         if (@seqs > 1) {
 462             # pick the sub-seqs that match to the matrix with the best scores
 463             my $matrix = $self->get_matrix($id);
 464             my $desired_sequences = $matrix->sites;
 465             return if @seqs < $desired_sequences;
 466
 467             my $desired_length = $matrix->width;
 468             my %best_seqs;
 469             foreach my $seq (@seqs) {
 470                 my $for_str = $seq->seq;
 471                 next if length($for_str) < $desired_length;
 472                 my $rev_str = $seq->revcom->seq;
 473
 474                 my $best_score = 0;
 475                 my $best_subseq = '';
 476                 my $best_i = 0;
 477                 my $best_subseq_caps = 0;
 478                 my $best_revcom;
 479                 my $revcom = 0;
 480                 foreach my $seq_str ($for_str, $rev_str) {
 481                     for my $i (0..(length($seq_str) - $desired_length)) {
 482                         my $subseq = substr($seq_str, $i, $desired_length);
 483                         $subseq =~ s/[^ACGTacgt]//g; # can only score atcg
 484                         next unless length($subseq) == $desired_length; # short or 0-length seqs could get the highest scores!
 485                         my $score = $matrix->sequence_match_weight($subseq);
 486
 487                         # caps represent the author-chosen bit of a site
 488                         # sequence so we would prefer to choose a subseq that
 489                         # contains it
 490                         my $caps = $subseq =~ tr/ACGT//;
 491
 492                         #*** (don't know why numeric == fails for comparing
 493                         #     scores, when the string eq works)
 494                         if ($score > $best_score || ("$score" eq "$best_score" && $caps > $best_subseq_caps)) {
 495                             $best_score = $score;
 496                             $best_subseq_caps = $caps;
 497                             $best_subseq = $subseq;
 498                             $best_i = $i;
 499                             $best_revcom = $revcom;
 500                         }
 501                     }
 502                     $revcom++;
 503                 }
 504
 505                 if ($best_score) {
 506                     $best_seqs{$seq->accession_number} = [$best_subseq, $seq->accession_number, ($best_i + 1), $revcom ? -1 : 1, $best_score];
 507                 }
 508             }
 509             my @sorted = sort { $best_seqs{$b}->[-1] <=> $best_seqs{$a}->[-1] } keys %best_seqs;
 510             return if @sorted < $desired_sequences;
 511             splice(@sorted, $desired_sequences);
 512             my %wanted = map { $_ => 1 } @sorted;
 513
 514             my @site_data;
 515             foreach my $seq (@seqs) {
 516                 next unless exists $wanted{$seq->accession_number};
 517                 my @data = @{$best_seqs{$seq->accession_number}};
 518                 pop(@data);
 519                 push(@site_data, join('_', @data));
 520             }
 521
 522             $data[5] = join(INTERNAL_SEPARATOR, @site_data);
 523             $self->{matrix}->{data}->{$id} = join(SEPARATOR, @data);
 524         }
 525     }
 526     $data[5] || return;
 527
 528     my @blocks = split(INTERNAL_SEPARATOR, $data[5]);
 529
 530     # append gap chars to all sequences to make them the same length
 531     # (applies to sequences found via factors, presumably, since we already
 532     # do this for matrix alignments in transfac_pro.pm)
 533     my $longest = 0;
 534     foreach (@blocks) {
 535         my ($seq) = split('_', $_);
 536         my $length = length($seq);
 537         if ($length > $longest) {
 538             $longest = $length;
 539         }
 540     }
 541     foreach my $i (0..$#blocks) {
 542         my $block = $blocks[$i];
 543         my ($seq, $seq_id) = split('_', $block);
 544         my $length = length($seq);
 545         if ($length < $longest) {
 546             my $orig_seq = $seq;
 547             $seq .= '-'x($longest - $length);
 548             $block =~ s/^${orig_seq}_/${seq}_/;
 549             $blocks[$i] = $block;
 550         }
 551     }
 552
 553     # build the alignment
 554     my $aln = Bio::SimpleAlign->new(-source => 'transfac_pro');
 555     my %done_ids;
 556     foreach (@blocks) {
 557         my ($seq, $seq_acc, $start, $strand) = split('_', $_);
 558
 559         # we can get back multiple different subparts of the same site (sequence),
 560         # so $seq_acc isn't unique across this loop. Can't use it as the seq id
 561         # of the alignment (ids must be unique in SimpleAlign), so we
 562         # uniquify the id and store the original id as the accession_number
 563         my $seq_id;
 564         $done_ids{$seq_acc}++;
 565         if ($done_ids{$seq_acc} > 1) {
 566             $seq_id = $seq_acc.'_'.$done_ids{$seq_acc};
 567         }
 568         else {
 569             $seq_id = $seq_acc;
 570         }
 571
 572         my $gaps = $seq =~ tr/-//;
 573         my $length = length($seq) - $gaps;
 574         $self->throw("seq '$seq_id' for matrix '$id' had seq '$seq'") unless $length;
 575         $aln->add_seq(Bio::LocatableSeq->new(-seq    => $seq,
 576                                              -id     => $seq_id,
 577                                              -accession_number => $seq_acc,
 578                                              -start  => $start,
 579                                              -end    => $start + $length - 1,
 580                                              -strand => $strand));
 581     }
 582     $aln->id($id);
 583     # could also store score? of?
 584
 585     return $aln;
 586 }
 587
 588 =head2 get_factor
 589
 590  Title   : get_factor
 591  Usage   : my $factor = $obj->get_factor($id);
 592  Function: Get the details of a transcription factor.
 593  Returns : Bio::Map::TranscriptionFactor
 594  Args    : string - a factor id ('T...')
 595
 596 =cut
 597
 598 sub get_factor {
 599     my ($self, $id) = @_;
 600     $id || return;
 601     return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
 602     my $data = $self->{factor}->{data}->{$id} || return;
 603     my @data = split(SEPARATOR, $data);
 604
 605     # accession = id name species sequence
 606     my $tf = Bio::Map::TranscriptionFactor->get(-id => $id,
 607                                                 -universal_name => $data[1]);
 608     #*** not sure what to do with species and sequence, since we don't want to
 609     # confuse the idea that a TF is a general thing that could bind to any
 610     # species... then again, you might want to model species-specific variants
 611     # of a TF with different binding abilities...
 612     #*** idea of having inclusion and exclusion species so you can prevent/
 613     # ignore a tf that binds to the wrong species (a species that doesn't even
 614     # have the tf), and associating sequence with each species/tf combo so you
 615     # can see how diverged the tf is and make assumptions about site difference
 616     # allowance
 617
 618     # place it on all its genemaps
 619     foreach my $sid ($self->get_site_ids(-factor => $id)) {
 620         my $s_data = $self->{site}->{data}->{$sid} || next;
 621         my @s_data = split(SEPARATOR, $s_data);
 622
 623         # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string
 624         $s_data[1] || next; # site isn't relative to a gene, meaningless
 625         $s_data[4] || next; # don't know where its supposed to be, can't model it
 626         $s_data[5] ||= $s_data[4] + ($s_data[2] ? length($s_data[2]) - 1 : 0);
 627
 628         # it is quite deliberate that we deeply recurse to arrive at the
 629         # correct answer, which involves pulling in most of the database
 630         no warnings "recursion";
 631         my $gene_map = $self->get_genemap($s_data[1]) || next;
 632         return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
 633
 634         #*** not always relative to gene start...
 635         #    we need Bio::Map::Gene s to have some default tss and atg positions
 636         #    that we can be relative to
 637         my $rel = new Bio::Map::Relative(-element => $gene_map->gene, -description => $s_data[3]);
 638         Bio::Map::Position->new(-map => $gene_map, -element => $tf, -start => $s_data[4], -end => $s_data[5], -relative => $rel);
 639     }
 640
 641     $self->{got_factor}->{$id} = $tf;
 642     return $tf;
 643 }
 644
 645 # since get_factor() is uncertain, just have direct access methods to factor
 646 # information
 647 sub get_factor_name {
 648     my ($self, $id) = @_;
 649     my $details = $self->_get_factor_details($id) || return;
 650     return $details->{name};
 651 }
 652 sub get_factor_species {
 653     my ($self, $id) = @_;
 654     my $details = $self->_get_factor_details($id) || return;
 655     return $details->{species};
 656 }
 657 sub get_factor_sequence {
 658     my ($self, $id) = @_;
 659     my $details = $self->_get_factor_details($id) || return;
 660     return $details->{sequence};
 661 }
 662 sub _get_factor_details {
 663     my ($self, $id) = @_;
 664     $id || return;
 665
 666     return $self->{factor_details}->{$id} if defined $self->{factor_details}->{$id};
 667
 668     my $data = $self->{factor}->{data}->{$id} || return;
 669     my @data = split(SEPARATOR, $data);
 670
 671     # accession = id name species sequence
 672
 673     my %details = (name => $data[1], species => $data[2], sequence => $data[3]);
 674     $self->{factor_details}->{$id} = \%details;
 675
 676     return \%details;
 677 }
 678
 679 =head2 get_reference_ids
 680
 681  Title   : get_reference_ids
 682  Usage   : my @ids = $obj->get_reference_ids(-key => $value);
 683  Function: Get all the reference ids that are associated with the supplied
 684            args.
 685  Returns : list of strings (ids)
 686  Args    : -key => value, where value is a string id, and key is one of:
 687            -pubmed -site -gene -matrix -factor
 688
 689 =cut
 690
 691 sub get_reference_ids {
 692     my $self = shift;
 693     return $self->_get_ids('reference', @_);
 694 }
 695
 696 # -id -name -species -site -factor -reference
 697 sub get_gene_ids {
 698     my $self = shift;
 699     return $self->_get_ids('gene', @_);
 700 }
 701
 702 =head2 get_site_ids
 703
 704  Title   : get_site_ids
 705  Usage   : my @ids = $obj->get_site_ids(-key => $value);
 706  Function: Get all the site ids that are associated with the supplied
 707            args.
 708  Returns : list of strings (ids)
 709  Args    : -key => value, where value is a string id, and key is one of:
 710            -id -species -gene -matrix -factor -reference
 711
 712 =cut
 713
 714 sub get_site_ids {
 715     my $self = shift;
 716     return $self->_get_ids('site', @_);
 717 }
 718
 719 =head2 get_matrix_ids
 720
 721  Title   : get_matrix_ids
 722  Usage   : my @ids = $obj->get_matrix_ids(-key => $value);
 723  Function: Get all the matrix ids that are associated with the supplied
 724            args.
 725  Returns : list of strings (ids)
 726  Args    : -key => value, where value is a string id, and key is one of:
 727            -id -name -site -factor -reference
 728
 729 =cut
 730
 731 sub get_matrix_ids {
 732     my $self = shift;
 733     return $self->_get_ids('matrix', @_);
 734 }
 735
 736 =head2 get_factor_ids
 737
 738  Title   : get_factor_ids
 739  Usage   : my @ids = $obj->get_factor_ids(-key => $value);
 740  Function: Get all the factor ids that are associated with the supplied
 741            args.
 742  Returns : list of strings (ids)
 743  Args    : -key => value, where value is a string id, and key is one of:
 744            -id -name -species -interactors -gene -matrix -site -reference
 745            NB: -gene only gets factor ids for genes that encode factors
 746
 747 =cut
 748
 749 sub get_factor_ids {
 750     my $self = shift;
 751     return $self->_get_ids('factor', @_);
 752 }
 753
 754 =head2 get_fragment_ids
 755
 756  Title   : get_fragment_ids
 757  Usage   : my @ids = $obj->get_fragment_ids(-key => $value);
 758  Function: Get all the fragment ids that are associated with the supplied
 759            args.
 760  Returns : list of strings (ids)
 761  Args    : -key => value, where value is a string id, and key is one of:
 762            -id -species -gene -factor -reference
 763
 764 =cut
 765
 766 sub get_fragment_ids {
 767     my $self = shift;
 768     return $self->_get_ids('fragment', @_);
 769 }
 770
 771 =head2 Helper methods
 772
 773 =cut
 774
 775 # internal method which does the indexing
 776 sub _build_index {
 777     my ($self, $dat_dir, $force) = @_;
 778
 779     # MLDBM would give us transparent complex data structures with DB_File,
 780     # allowing just one index file, but its yet another requirment and we
 781     # don't strictly need it
 782
 783     my $index_dir = $self->index_directory;
 784     my $gene_index = "$index_dir/gene.dat.index";
 785     my $reference_index = "$index_dir/reference.dat.index";
 786     my $matrix_index = "$index_dir/matrix.dat.index";
 787     my $factor_index = "$index_dir/factor.dat.index";
 788     my $fragment_index = "$index_dir/fragment.dat.index";
 789     my $site_index = "$index_dir/site.dat.index";
 790
 791     my $reference_dat = "$dat_dir/reference.dat";
 792     if (! -e $reference_index || $force) {
 793         open(REF, $reference_dat) || $self->throw("Cannot open reference file '$reference_dat' for reading");
 794
 795         my %references;
 796         unlink $reference_index;
 797         my $ref = tie(%references, 'DB_File', $reference_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
 798
 799         my %pubmed;
 800         my $reference_pubmed = $reference_index.'.pubmed';
 801         unlink $reference_pubmed;
 802         my $pub = tie(%pubmed, 'DB_File', $reference_pubmed, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
 803
 804         my %gene;
 805         my $reference_gene = $gene_index.'.reference';
 806         unlink $reference_gene;
 807         my $gene = tie(%gene, 'DB_File', $reference_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
 808
 809         my %site;
 810         my $reference_site = $site_index.'.reference';
 811         unlink $reference_site;
 812         my $site = tie(%site, 'DB_File', $reference_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
 813
 814         my %fragment;
 815         my $reference_fragment = $fragment_index.'.reference';
 816         unlink $reference_fragment;
 817         my $fragment = tie(%fragment, 'DB_File', $reference_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
 818
 819         my %factor;
 820         my $reference_factor = $factor_index.'.reference';
 821         unlink $reference_factor;
 822         my $factor = tie(%factor, 'DB_File', $reference_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
 823
 824         my %matrix;
 825         my $reference_matrix = $matrix_index.'.reference';
 826         unlink $reference_matrix;
 827         my $matrix = tie(%matrix, 'DB_File', $reference_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
 828
 829         # skip the first three header lines
 830         <REF>; <REF>; <REF>;
 831
 832         my @data;
 833         while (<REF>) {
 834             if (/^AC  (\S+)/) {
 835                 $data[0] = $1;
 836             }
 837             elsif (/^RX  PUBMED: (\d+)/) {
 838                 $data[1] = $1;
 839                 $pub->put("$1", $data[0]);
 840             }
 841             elsif (/^RA  (.+)\n$/) {
 842                 $data[2] = $1;
 843             }
 844             elsif (/^RT  (.+?)\.?\n$/) {
 845                 $data[3] = $1;
 846             }
 847             elsif (/^RL  (.+?)\.?\n$/) {
 848                 $data[4] = $1;
 849             }
 850             elsif (/^GE  TRANSFAC: (\w\d+)/) {
 851                 $gene->put($data[0], "$1");
 852             }
 853             elsif (/^BS  TRANSFAC: (\w\d+)/) {
 854                 $site->put($data[0], "$1");
 855             }
 856             elsif (/^FA  TRANSFAC: (\w\d+)/) {
 857                 $factor->put($data[0], "$1");
 858             }
 859             elsif (/^FR  TRANSFAC: (FR\d+)/) {
 860                 $fragment->put($data[0], "$1");
 861             }
 862             elsif (/^MX  TRANSFAC: (\w\d+)/) {
 863                 $matrix->put($data[0], "$1");
 864             }
 865             elsif (/^\/\//) {
 866                 # end of a record, store previous data and reset
 867
 868                 # accession = pubmed authors title location
 869                 $references{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || ''));
 870
 871                 @data = ();
 872             }
 873         }
 874         close(REF);
 875
 876         $ref = $pub = $gene = $site = $fragment = $factor = $matrix = undef;
 877         untie %references;
 878         untie %pubmed;
 879         untie %gene;
 880         untie %site;
 881         untie %fragment;
 882         untie %factor;
 883         untie %matrix;
 884     }
 885
 886     my $gene_dat = "$dat_dir/gene.dat";
 887     if (! -e $gene_index || $force) {
 888         open(GEN, $gene_dat) || $self->throw("Cannot open gene file '$gene_dat' for reading");
 889
 890         my %genes;
 891         unlink $gene_index;
 892         my $gene = tie(%genes, 'DB_File', $gene_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
 893
 894         my %id;
 895         my $gene_id = $gene_index.'.id';
 896         unlink $gene_id;
 897         my $id = tie(%id, 'DB_File', $gene_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
 898
 899         my %name;
 900         my $gene_name = $gene_index.'.name';
 901         unlink $gene_name;
 902         my $name = tie(%name, 'DB_File', $gene_name, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
 903
 904         my %species;
 905         my $gene_species = $gene_index.'.species';
 906         unlink $gene_species;
 907         my $species = tie(%species, 'DB_File', $gene_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
 908
 909         my %site;
 910         my $gene_site = $site_index.'.gene';
 911         unlink $gene_site;
 912         my $site = tie(%site, 'DB_File', $gene_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
 913
 914         my %factor;
 915         my $gene_factor = $factor_index.'.gene';
 916         unlink $gene_factor;
 917         my $factor = tie(%factor, 'DB_File', $gene_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
 918
 919         my %fragment;
 920         my $gene_fragment = $fragment_index.'.gene';
 921         unlink $gene_fragment;
 922         my $fragment = tie(%fragment, 'DB_File', $gene_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
 923
 924         my %reference;
 925         my $gene_reference = $reference_index.'.gene';
 926         unlink $gene_reference;
 927         my $reference = tie(%reference, 'DB_File', $gene_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
 928
 929         # skip the first three header lines
 930         <GEN>; <GEN>; <GEN>;
 931
 932         my @data;
 933         while (<GEN>) {
 934             if (/^AC  (\S+)/) {
 935                 $data[0] = $1;
 936             }
 937             elsif (/^ID  (\S+)/) {
 938                 $data[1] = $1;
 939                 $id->put("$1", $data[0]);
 940             }
 941             elsif (/^SD  (.+)$/) {
 942                 $data[2] = lc("$1");
 943                 $name->put(lc("$1"), $data[0]);
 944             }
 945             elsif (/^SY  (.+)\.$/) {
 946                 foreach (split('; ', lc("$1"))) {
 947                     $name->put($_, $data[0]);
 948                 }
 949             }
 950             elsif (/^DE  (.+)$/) {
 951                 $data[3] = $1;
 952             }
 953             elsif (/^OS  (.+)$/) {
 954                 my $raw_species = $1;
 955                 my $taxid = $self->_species_to_taxid($raw_species);
 956                 $data[4] = $taxid || $raw_species;
 957                 $species->put($data[4], $data[0]);
 958             }
 959             elsif (/^RN  .+?(RE\d+)/) {
 960                 $reference->put($data[0], "$1");
 961             }
 962             elsif (/^BS  .+?(R\d+)/) {
 963                 $site->put($data[0], "$1");
 964             }
 965             elsif (/^FA  (T\d+)/) {
 966                 $factor->put($data[0], "$1");
 967             }
 968             elsif (/^BR  (FR\d+)/) {
 969                 $fragment->put($data[0], "$1");
 970             }
 971             elsif (/^\/\//) {
 972                 # end of a record, store previous data and reset
 973
 974                 # accession = id name description species_tax_id_or_raw_string
 975                 $genes{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || ''));
 976
 977                 @data = ();
 978             }
 979         }
 980         close(GEN);
 981
 982         $gene = $id = $name = $species = $site = $factor = $reference = undef;
 983         untie %genes;
 984         untie %id;
 985         untie %name;
 986         untie %species;
 987         untie %site;
 988         untie %factor;
 989         untie %reference;
 990     }
 991
 992     my $site_dat = "$dat_dir/site.dat";
 993     if (! -e $site_index || $force) {
 994         open(SIT, $site_dat) || $self->throw("Cannot open site file '$site_dat' for reading");
 995
 996         my %sites;
 997         unlink $site_index;
 998         my $site = tie(%sites, 'DB_File', $site_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
 999
1000         my %id;
1001         my $site_id = $site_index.'.id';
1002         unlink $site_id;
1003         my $id = tie(%id, 'DB_File', $site_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1004
1005         my %species;
1006         my $site_species = $site_index.'.species';
1007         unlink $site_species;
1008         my $species = tie(%species, 'DB_File', $site_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_species': $!");
1009
1010         my %qualities;
1011         my $site_qualities = $site_index.'.qual';
1012         unlink $site_qualities;
1013         my $quality = tie(%qualities, 'DB_File', $site_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1014
1015         my %gene;
1016         my $site_gene = $gene_index.'.site';
1017         unlink $site_gene;
1018         my $gene = tie(%gene, 'DB_File', $site_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1019
1020         my %matrix;
1021         my $site_matrix = $matrix_index.'.site';
1022         unlink $site_matrix;
1023         my $matrix = tie(%matrix, 'DB_File', $site_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1024
1025         my %factor;
1026         my $site_factor = $factor_index.'.site';
1027         unlink $site_factor;
1028         my $factor = tie(%factor, 'DB_File', $site_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1029
1030         my %reference;
1031         my $site_reference = $reference_index.'.site';
1032         unlink $site_reference;
1033         my $reference = tie(%reference, 'DB_File', $site_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1034
1035         # skip the first three header lines
1036         <SIT>; <SIT>; <SIT>;
1037
1038         my @data;
1039         while (<SIT>) {
1040             if (/^AC  (\S+)/) {
1041                 $data[0] = $1;
1042             }
1043             elsif (/^ID  (\S+)/) {
1044                 $data[1] = $1;
1045                 $id->put("$1", $data[0]);
1046             }
1047             elsif (/^TY  (.+)$/) {
1048                 $data[8] = $1;
1049             }
1050             elsif (/^DE  .*Gene: (G\d+)/) {
1051                 $data[2] = $1;
1052                 $gene->put($data[0], "$1");
1053
1054                 # if it has no gene it is an artificial sequence, unless it
1055                 # has a species (OS line), in which case it is unassigned
1056                 # genomic; either way we won't be able to make a
1057                 # Bio::Map::PositionI later on, so such sites won't be
1058                 # on any MapI.
1059             }
1060             elsif (/^OS  (.+)$/) {
1061                 # Since not all sites in site.dat with a species have a gene,
1062                 # (small handful are unassigned 'genomic') can't delegate to
1063                 # gene.dat and must parse species here (effectively again)
1064                 my $raw_species = $1;
1065                 my $taxid = $self->_species_to_taxid($raw_species);
1066                 $data[7] = $taxid || $raw_species;
1067                 $species->put($data[7], $data[0]);
1068             }
1069             elsif (/^SQ  (.+)\.$/) {
1070                 $data[3] = $1;
1071                 # there can actually be more than one SQ line, seemingly with
1072                 # variations of the sequence (not a long sequence split over
1073                 # two lines); not sure what to do with data; currently we end
1074                 # up storing only the last variant.
1075             }
1076             elsif (/^S1  (.+)$/) {
1077                 $data[4] = $1;
1078                 # if S1 not present, means transcriptional start
1079             }
1080             elsif (/^SF  (.+)$/) {
1081                 $data[5] = $1;
1082             }
1083             elsif (/^ST  (.+)$/) {
1084                 $data[6] = $1;
1085             }
1086             elsif (/^RN  .+?(RE\d+)/) {
1087                 $reference->put($data[0], "$1");
1088             }
1089             elsif (/^MX  (M\d+)/) {
1090                 $matrix->put($data[0], "$1");
1091             }
1092             elsif (/^BF  (T\d+); .+?; Quality: (\d)/) {
1093                 $factor->put($data[0], "$1");
1094                 $qualities{$data[0].SEPARATOR.$1} = $2;
1095             }
1096             elsif (/^\/\//) {
1097                 # end of a record, store previous data and reset
1098
1099                 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string type
1100                 $sites{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || 'TSS', $data[5] || '', $data[6] || '', $data[7] || '', $data[8] || ''));
1101
1102                 @data = ();
1103             }
1104         }
1105         close(SIT);
1106
1107         $site = $id = $species = $quality = $gene = $matrix = $factor = $reference = undef;
1108         untie %sites;
1109         untie %id;
1110         untie %species;
1111         untie %qualities;
1112         untie %gene;
1113         untie %matrix;
1114         untie %factor;
1115         untie %reference;
1116     }
1117
1118     my $matrix_dat = "$dat_dir/matrix.dat";
1119     if (! -e $matrix_index || $force) {
1120         open(MAT, $matrix_dat) || $self->throw("Cannot open matrix file '$matrix_dat' for reading");
1121
1122         my %matrices;
1123         unlink $matrix_index;
1124         my $matrix = tie(%matrices, 'DB_File', $matrix_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1125
1126         my %id;
1127         my $matrix_id = $matrix_index.'.id';
1128         unlink $matrix_id;
1129         my $id = tie(%id, 'DB_File', $matrix_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1130
1131         my %name;
1132         my $matrix_name = $matrix_index.'.name';
1133         unlink $matrix_name;
1134         my $name = tie(%name, 'DB_File', $matrix_name, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1135
1136         my %site;
1137         my $matrix_site = $site_index.'.matrix';
1138         unlink $matrix_site;
1139         my $site = tie(%site, 'DB_File', $matrix_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1140
1141         my %factor;
1142         my $matrix_factor = $factor_index.'.matrix';
1143         unlink $matrix_factor;
1144         my $factor = tie(%factor, 'DB_File', $matrix_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1145
1146         my %reference;
1147         my $matrix_reference = $reference_index.'.matrix';
1148         unlink $matrix_reference;
1149         my $reference = tie(%reference, 'DB_File', $matrix_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1150
1151         # skip the first three header lines
1152         <MAT>; <MAT>; <MAT>;
1153
1154         my @data;
1155         my @matrix_data;
1156         my @site_data;
1157         while (<MAT>) {
1158             if (/^AC  (\S+)/) {
1159                 $data[0] = $1;
1160             }
1161             elsif (/^ID  (\S+)/) {
1162                 $data[1] = $1;
1163                 $id->put("$1", $data[0]);
1164             }
1165             elsif (/^NA  (.+)$/) {
1166                 $data[2] = $1;
1167                 $name->put("$1", $data[0]);
1168             }
1169             elsif (/^DE  (.+)$/) {
1170                 $data[3] = $1;
1171             }
1172             elsif (/^\d\d  \s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
1173                 # a, c, g, t counts/weights
1174                 push(@matrix_data, join("\t", ($1, $2, $3, $4)));
1175
1176                 # Work out the number of sites as the largest number of
1177                 # sites amongst all positions in the sequences. (The BA
1178                 # line isn't reliable for telling us the correct number of
1179                 # sites all the time)
1180                 my $num = $1 + $2 + $3 + $4;
1181                 $data[4] ||= 0;
1182                 if ($num > $data[4]) {
1183                     $data[4] = $num;
1184                 }
1185             }
1186             elsif (/^BS  ([\sa-zA-Z]+); (.+?); (-?\d+); \d+;.*; ([np])/) {
1187                 # sequence id start strand
1188                 push(@site_data, join('_', ($1, $2, $3, $4 eq 'p' ? 1 : -1)));
1189                 $site->put($data[0], $2);
1190             }
1191             elsif (/^BF  (T\d+)/) {
1192                 $factor->put($data[0], "$1");
1193             }
1194             elsif (/^RN  .+?(RE\d+)/) {
1195                 $reference->put($data[0], "$1");
1196             }
1197             elsif (/^\/\//) {
1198                 # end of a record, store previous data and reset
1199                 my $matrix_data = join(INTERNAL_SEPARATOR, @matrix_data) || '';
1200
1201                 # sites of a matrix are pre-aligned but padded with spaces on
1202                 # the left and no padding on the right; pad with -s both sides
1203                 my $longest_seq = 0;
1204                 foreach my $site_seq (@site_data) {
1205                     $site_seq =~ s/ /-/g;
1206                     my $length = length($site_seq);
1207                     if ($length > $longest_seq) {
1208                         $longest_seq = $length;
1209                     }
1210                 }
1211                 foreach my $site_seq (@site_data) {
1212                     my $length = length($site_seq);
1213                     if ($length < $longest_seq) {
1214                         $site_seq .= '-' x ($longest_seq - $length);
1215                     }
1216                 }
1217                 my $site_data = join(INTERNAL_SEPARATOR, @site_data) || '';
1218
1219                 # accession = id name description num_of_sites matrix_data site_data
1220                 $matrices{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4], $matrix_data, $site_data));
1221
1222                 @data = @matrix_data = @site_data = ();
1223             }
1224         }
1225         close(MAT);
1226
1227         $matrix = $id = $name = $site = $factor = $reference = undef;
1228         untie %matrices;
1229         untie %id;
1230         untie %name;
1231         untie %site;
1232         untie %factor;
1233         untie %reference;
1234     }
1235
1236     my $factor_dat = "$dat_dir/factor.dat";
1237     if (! -e $factor_index || $force) {
1238         open(FAC, $factor_dat) || $self->throw("Cannot open factor file '$factor_dat' for reading");
1239
1240         my %factors;
1241         unlink $factor_index;
1242         my $factor = tie(%factors, 'DB_File', $factor_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1243
1244         my %id;
1245         my $factor_id = $factor_index.'.id';
1246         unlink $factor_id;
1247         my $id = tie(%id, 'DB_File', $factor_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1248
1249         my %name;
1250         my $factor_name = $factor_index.'.name';
1251         unlink $factor_name;
1252         my $name = tie(%name, 'DB_File', $factor_name, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1253
1254         my %species;
1255         my $factor_species = $factor_index.'.species';
1256         unlink $factor_species;
1257         my $species = tie(%species, 'DB_File', $factor_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1258
1259         my %interactors;
1260         my $factor_interactors = $factor_index.'.interactors';
1261         unlink $factor_interactors;
1262         my $interact = tie(%interactors, 'DB_File', $factor_interactors, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1263
1264         my %gene;
1265         my $factor_gene = $gene_index.'.factor';
1266         unlink $factor_gene;
1267         my $gene = tie(%gene, 'DB_File', $factor_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1268
1269         my %matrix;
1270         my $factor_matrix = $matrix_index.'.factor';
1271         unlink $factor_matrix;
1272         my $matrix = tie(%matrix, 'DB_File', $factor_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1273
1274         my %site;
1275         my $factor_site = $site_index.'.factor';
1276         unlink $factor_site;
1277         my $site = tie(%site, 'DB_File', $factor_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1278
1279         my %fragment;
1280         my $factor_fragment = $fragment_index.'.factor';
1281         unlink $factor_fragment;
1282         my $fragment = tie(%fragment, 'DB_File', $factor_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1283
1284         my %reference;
1285         my $factor_reference = $reference_index.'.factor';
1286         unlink $factor_reference;
1287         my $reference = tie(%reference, 'DB_File', $factor_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1288
1289         # skip the first three header lines
1290         <FAC>; <FAC>; <FAC>;
1291
1292         my @data;
1293         my $sequence = '';
1294         while (<FAC>) {
1295             if (/^AC  (\S+)/) {
1296                 $data[0] = $1;
1297             }
1298             elsif (/^ID  (\S+)/) {
1299                 # IDs are always the same as AC? Is this needed?
1300                 $data[1] = $1;
1301                 $id->put("$1", $data[0]);
1302             }
1303             elsif (/^FA  (.+)$/) {
1304                 $data[2] = $1;
1305                 $name->put("$1", $data[0]);
1306             }
1307             elsif (/^OS  (.+)$/) {
1308                 # This is the species the actual factor came from, which may
1309                 # differ from the species of any sequences it is described as
1310                 # binding to. Not all factors that have a species have a gene,
1311                 # so can't delegate species to a gene lookup.
1312                 my $raw_species = $1;
1313                 my $taxid = $self->_species_to_taxid($raw_species);
1314                 $data[3] = $taxid || $raw_species;
1315                 $species->put($data[3], $data[0]);
1316             }
1317             elsif (/^GE  (G\d+)/) {
1318                 $gene->put($data[0], "$1");
1319             }
1320             elsif (/^SQ  (.+)$/) {
1321                 $sequence .= $1;
1322             }
1323             elsif (/^IN  (T\d+)/) {
1324                 $interact->put($data[0], "$1");
1325             }
1326             elsif (/^MX  (M\d+)/) {
1327                 $matrix->put($data[0], "$1");
1328             }
1329             elsif (/^BS  (R\d+)/) {
1330                 $site->put($data[0], "$1");
1331             }
1332             elsif (/^BR  (FR\d+)/) {
1333                 $fragment->put($data[0], "$1");
1334             }
1335             elsif (/^RN  .+?(RE\d+)/) {
1336                 $reference->put($data[0], "$1");
1337             }
1338             elsif (/^\/\//) {
1339                 # end of a record, store previous data and reset
1340
1341                 # accession = id name species sequence
1342                 $factors{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $sequence));
1343
1344                 @data = ();
1345                 $sequence = '';
1346             }
1347         }
1348         close(FAC);
1349
1350         $factor = $id = $name = $species = $interact = $gene = $matrix = $site = $fragment = $reference = undef;
1351         untie %factors;
1352         untie %id;
1353         untie %name;
1354         untie %species;
1355         untie %interactors;
1356         untie %gene;
1357         untie %matrix;
1358         untie %site;
1359         untie %fragment;
1360         untie %reference;
1361     }
1362
1363     my $fragment_dat = "$dat_dir/fragment.dat";
1364     if (! -e $fragment_index || $force) {
1365         if (open(FRA, $fragment_dat)) {
1366             my %fragments;
1367             unlink $fragment_index;
1368             my $fragment = tie(%fragments, 'DB_File', $fragment_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1369
1370             my %id;
1371             my $fragment_id = $fragment_index.'.id';
1372             unlink $fragment_id;
1373             my $id = tie(%id, 'DB_File', $fragment_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1374
1375             my %qualities;
1376             my $fragment_qualities = $fragment_index.'.qual';
1377             unlink $fragment_qualities;
1378             my $quality = tie(%qualities, 'DB_File', $fragment_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1379
1380             my %species;
1381             my $fragment_species = $fragment_index.'.species';
1382             unlink $fragment_species;
1383             my $species = tie(%species, 'DB_File', $fragment_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_species': $!");
1384
1385             my %gene;
1386             my $fragment_gene = $gene_index.'.fragment';
1387             unlink $fragment_gene;
1388             my $gene = tie(%gene, 'DB_File', $fragment_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1389
1390             my %factor;
1391             my $fragment_factor = $factor_index.'.fragment';
1392             unlink $fragment_factor;
1393             my $factor = tie(%factor, 'DB_File', $fragment_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1394
1395             my %reference;
1396             my $fragment_reference = $reference_index.'.fragment';
1397             unlink $fragment_reference;
1398             my $reference = tie(%reference, 'DB_File', $fragment_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1399
1400             # skip the first three header lines
1401             <FRA>; <FRA>; <FRA>;
1402
1403             my @data;
1404             while (<FRA>) {
1405                 if (/^AC  (\S+)/) {
1406                     $data[0] = $1;
1407                 }
1408                 elsif (/^ID  (\S+)/) {
1409                     # IDs are always the same as AC? Is this needed?
1410                     $data[1] = $1;
1411                     $id->put("$1", $data[0]);
1412                 }
1413                 elsif (/^DE  Gene: (G\d+)(?:.+Gene: (G\d+))?/) {
1414                     my ($gene1, $gene2) = ($1, $2);
1415                     $data[2] = $gene1;
1416                     $data[3] = $gene2; # could be undef
1417                     $gene->put($data[0], $gene1);
1418                     $gene->put($data[0], $gene2) if $gene2;
1419                 }
1420                 elsif (/^OS  (.+)$/) {
1421                     # As per the site.dat parsing
1422                     my $raw_species = $1;
1423                     my $taxid = $self->_species_to_taxid($raw_species);
1424                     $data[4] = $taxid || $raw_species;
1425                     $species->put($data[4], $data[0]);
1426                 }
1427                 elsif (/^SQ  [atcgn]*([ATCGN]+)[atcgn]*/) {
1428                     $data[5] .= $1;
1429                     # there can be (usually are) multiple SQ lines with a single
1430                     # long seq split over them. The 'real' sequence is in caps
1431                 }
1432                 elsif (/^SC  Build (\S+):$/) {
1433                     $data[6] = $1;
1434                     # maybe parse it out a little more? We have build,
1435                     # chromosomal coords and strand, eg.
1436                     # SC  Build HSA_May2004: Chr.2 43976692..43978487 (FORWARD).
1437                 }
1438                 elsif (/^RN  .+?(RE\d+)/) {
1439                     $reference->put($data[0], "$1");
1440                 }
1441                 elsif (/^BF  (T\d+); .+?; Quality: (\d)/) {
1442                     $factor->put($data[0], "$1");
1443                     $qualities{$data[0].SEPARATOR.$1} = $2;
1444                 }
1445                 elsif (/^\/\//) {
1446                     # end of a record, store previous data and reset
1447
1448                     # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
1449                     $fragments{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || '', $data[5] || '', $data[6] || ''));
1450
1451                     @data = ();
1452                 }
1453             }
1454             close(FRA);
1455
1456             $fragment = $id = $species = $quality = $gene = $factor = $reference = undef;
1457             untie %fragments;
1458             untie %id;
1459             untie %species;
1460             untie %qualities;
1461             untie %gene;
1462             untie %factor;
1463             untie %reference;
1464         }
1465         else {
1466             $self->warn("Cannot open fragment file '$fragment_dat' for reading, assuming you have an old version of Transfac Pro with no fragment.dat file.");
1467         }
1468     }
1469 }
1470
1471 # connect the internal db handle
1472 sub _db_connect {
1473     my $self = shift;
1474     return if $self->{'_initialized'};
1475
1476     my $index_dir = $self->index_directory;
1477     my $gene_index = "$index_dir/gene.dat.index";
1478     my $reference_index = "$index_dir/reference.dat.index";
1479     my $matrix_index = "$index_dir/matrix.dat.index";
1480     my $factor_index = "$index_dir/factor.dat.index";
1481     my $site_index = "$index_dir/site.dat.index";
1482     my $fragment_index = "$index_dir/fragment.dat.index";
1483
1484     foreach ($gene_index, $reference_index, $matrix_index, $factor_index, $site_index, $fragment_index) {
1485         if (! -e $_) {
1486             #$self->warn("Index files have not been created");
1487             #return 0;
1488         }
1489     }
1490
1491     # reference
1492     {
1493         $self->{reference}->{data} = {};
1494         tie (%{$self->{reference}->{data}}, 'DB_File', $reference_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
1495
1496         my $reference_pubmed = $reference_index.'.pubmed';
1497         $self->{reference}->{pubmed} = tie (%{$self->{reference}->{pubmed}}, 'DB_File', $reference_pubmed, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
1498
1499         my $reference_gene = $gene_index.'.reference';
1500         $self->{gene}->{reference} = tie (%{$self->{gene}->{reference}}, 'DB_File', $reference_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
1501
1502         my $reference_site = $site_index.'.reference';
1503         $self->{site}->{reference} = tie (%{$self->{site}->{reference}}, 'DB_File', $reference_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
1504
1505         my $reference_fragment = $fragment_index.'.reference';
1506         $self->{fragment}->{reference} = tie (%{$self->{fragment}->{reference}}, 'DB_File', $reference_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
1507
1508         my $reference_factor = $factor_index.'.reference';
1509         $self->{factor}->{reference} = tie (%{$self->{factor}->{reference}}, 'DB_File', $reference_factor, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
1510
1511         my $reference_matrix = $matrix_index.'.reference';
1512         $self->{matrix}->{reference} = tie (%{$self->{matrix}->{reference}}, 'DB_File', $reference_matrix, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
1513     }
1514
1515     # gene
1516     {
1517         $self->{gene}->{data} = {};
1518         tie (%{$self->{gene}->{data}}, 'DB_File', $gene_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
1519
1520         my $gene_id = $gene_index.'.id';
1521         $self->{gene}->{id} = tie(%{$self->{gene}->{id}}, 'DB_File', $gene_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
1522
1523         my $gene_name = $gene_index.'.name';
1524         $self->{gene}->{name} = tie(%{$self->{gene}->{name}}, 'DB_File', $gene_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
1525
1526         my $gene_species = $gene_index.'.species';
1527         $self->{gene}->{species} = tie(%{$self->{gene}->{species}}, 'DB_File', $gene_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
1528
1529         my $gene_site = $site_index.'.gene';
1530         $self->{site}->{gene} = tie(%{$self->{site}->{gene}}, 'DB_File', $gene_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
1531
1532         my $gene_fragment = $fragment_index.'.gene';
1533         $self->{fragment}->{gene} = tie(%{$self->{fragment}->{gene}}, 'DB_File', $gene_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
1534
1535         my $gene_factor = $factor_index.'.gene';
1536         $self->{factor}->{gene} = tie(%{$self->{factor}->{gene}}, 'DB_File', $gene_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
1537
1538         my $gene_reference = $reference_index.'.gene';
1539         $self->{reference}->{gene} = tie(%{$self->{reference}->{gene}}, 'DB_File', $gene_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
1540     }
1541
1542     # site
1543     {
1544         $self->{site}->{data} = {};
1545         tie (%{$self->{site}->{data}}, 'DB_File', $site_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1546
1547         my $site_id = $site_index.'.id';
1548         $self->{site}->{id} = tie(%{$self->{site}->{id}}, 'DB_File', $site_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1549
1550         my $site_species = $site_index.'.species';
1551         $self->{site}->{species} = tie(%{$self->{site}->{species}}, 'DB_File', $site_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $site_species': $!");
1552
1553         #*** quality not actually used by anything (yet)
1554         my $site_qualities = $site_index.'.qual';
1555         $self->{quality} = {};
1556         tie(%{$self->{quality}}, 'DB_File', $site_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1557
1558         my $site_gene = $gene_index.'.site';
1559         $self->{gene}->{site} = tie(%{$self->{gene}->{site}}, 'DB_File', $site_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1560
1561         my $site_matrix = $matrix_index.'.site';
1562         $self->{matrix}->{site} = tie(%{$self->{matrix}->{site}}, 'DB_File', $site_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1563
1564         my $site_factor = $factor_index.'.site';
1565         $self->{factor}->{site} = tie(%{$self->{factor}->{site}}, 'DB_File', $site_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1566
1567         my $site_reference = $reference_index.'.site';
1568         $self->{reference}->{site} = tie(%{$self->{reference}->{site}}, 'DB_File', $site_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1569     }
1570
1571     # fragment (may not be in older databases)
1572     if (-e $fragment_index) {
1573         $self->{fragment}->{data} = {};
1574         tie (%{$self->{fragment}->{data}}, 'DB_File', $fragment_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1575
1576         my $fragment_id = $fragment_index.'.id';
1577         $self->{fragment}->{id} = tie(%{$self->{fragment}->{id}}, 'DB_File', $fragment_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1578
1579         my $fragment_species = $fragment_index.'.species';
1580         $self->{fragment}->{species} = tie(%{$self->{fragment}->{species}}, 'DB_File', $fragment_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $fragment_species': $!");
1581
1582         #*** quality not actually used by anything (yet)
1583         my $fragment_qualities = $fragment_index.'.qual';
1584         $self->{fragment_quality} = {};
1585         tie(%{$self->{fragment_quality}}, 'DB_File', $fragment_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1586
1587         my $fragment_gene = $gene_index.'.fragment';
1588         $self->{gene}->{fragment} = tie(%{$self->{gene}->{fragment}}, 'DB_File', $fragment_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1589
1590         my $fragment_factor = $factor_index.'.fragment';
1591         $self->{factor}->{fragment} = tie(%{$self->{factor}->{fragment}}, 'DB_File', $fragment_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1592
1593         my $fragment_reference = $reference_index.'.fragment';
1594         $self->{reference}->{fragment} = tie(%{$self->{reference}->{fragment}}, 'DB_File', $fragment_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1595     }
1596     else {
1597         die "no fragment_index at '$fragment_index'\n";
1598     }
1599
1600     # matrix
1601     {
1602         $self->{matrix}->{data} = {};
1603         tie (%{$self->{matrix}->{data}}, 'DB_File', $matrix_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1604
1605         my $matrix_id = $matrix_index.'.id';
1606         $self->{matrix}->{id} = tie(%{$self->{matrix}->{id}}, 'DB_File', $matrix_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1607
1608         my $matrix_name = $matrix_index.'.name';
1609         $self->{matrix}->{name} = tie(%{$self->{matrix}->{name}}, 'DB_File', $matrix_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1610
1611         my $matrix_site = $site_index.'.matrix';
1612         $self->{site}->{matrix} = tie(%{$self->{site}->{matrix}}, 'DB_File', $matrix_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1613
1614         my $matrix_factor = $factor_index.'.matrix';
1615         $self->{factor}->{matrix} = tie(%{$self->{factor}->{matrix}}, 'DB_File', $matrix_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1616
1617         my $matrix_reference = $reference_index.'.matrix';
1618         $self->{reference}->{matrix} = tie(%{$self->{reference}->{matrix}}, 'DB_File', $matrix_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1619     }
1620
1621     # factor
1622     {
1623         $self->{factor}->{data} = {};
1624         tie (%{$self->{factor}->{data}}, 'DB_File', $factor_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1625
1626         my $factor_id = $factor_index.'.id';
1627         $self->{factor}->{id} = tie(%{$self->{factor}->{id}}, 'DB_File', $factor_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1628
1629         my $factor_name = $factor_index.'.name';
1630         $self->{factor}->{name} = tie(%{$self->{factor}->{name}}, 'DB_File', $factor_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1631
1632         my $factor_species = $factor_index.'.species';
1633         $self->{factor}->{species} = tie(%{$self->{factor}->{species}}, 'DB_File', $factor_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1634
1635         my $factor_interactors = $factor_index.'.interactors';
1636         $self->{factor}->{interactors} = tie(%{$self->{factor}->{interactors}}, 'DB_File', $factor_interactors, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1637
1638         my $factor_gene = $gene_index.'.factor';
1639         $self->{gene}->{factor} = tie(%{$self->{gene}->{factor}}, 'DB_File', $factor_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1640
1641         my $factor_matrix = $matrix_index.'.factor';
1642         $self->{matrix}->{factor} = tie(%{$self->{matrix}->{factor}}, 'DB_File', $factor_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1643
1644         my $factor_site = $site_index.'.factor';
1645         $self->{site}->{factor} = tie(%{$self->{site}->{factor}}, 'DB_File', $factor_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1646
1647         my $factor_fragment = $fragment_index.'.factor';
1648         $self->{fragment}->{factor} = tie(%{$self->{fragment}->{factor}}, 'DB_File', $factor_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1649
1650         my $factor_reference = $reference_index.'.factor';
1651         $self->{reference}->{factor} = tie(%{$self->{reference}->{factor}}, 'DB_File', $factor_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1652     }
1653
1654     $self->{'_initialized'}  = 1;
1655 }
1656
1657 =head2 index_directory
1658
1659  Title   : index_directory
1660  Funtion : Get/set the location that index files are stored. (this module
1661            will index the supplied database)
1662  Usage   : $obj->index_directory($newval)
1663  Returns : value of index_directory (a scalar)
1664  Args    : on set, new value (a scalar or undef, optional)
1665
1666 =cut
1667
1668 sub index_directory {
1669     my $self = shift;
1670     return $self->{'index_directory'} = shift if @_;
1671     return $self->{'index_directory'};
1672 }
1673
1674 # resolve a transfac species string into an ncbi taxid
1675 sub _species_to_taxid {
1676     my ($self, $raw_species) = @_;
1677     $raw_species or return;
1678
1679     my $species_string;
1680     my @split = split(', ', $raw_species);
1681     (@split > 1) ? ($species_string = $split[1]) : ($species_string = $split[0]);
1682
1683     my $ncbi_taxid;
1684     if ($species_string =~ /^[A-Z]\S+ \S+$/) {
1685         SWITCH: for ($species_string) {
1686             # some species don't classify so custom handling
1687             /^Darnel ryegrass/ && do { $ncbi_taxid = 34176; last; };
1688             /^Coix lacryma/ && do { $ncbi_taxid = 4505; last; };
1689             /^Rattus spec/ && do { $ncbi_taxid = 10116; last; };
1690             /^Mus spec/ && do { $ncbi_taxid = 10090; last; };
1691             /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1692             /^Cavia sp/ && do { $ncbi_taxid = 10141; last; };
1693             /^Marsh marigold/ && do { $ncbi_taxid = 3449; last; };
1694             /^Phalaenopsis sp/ && do { $ncbi_taxid = 36900; last; };
1695             /^Anthirrhinum majus/ && do { $ncbi_taxid = 4151; last; };
1696             /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1697             /^Lycopodium spec/ && do { $ncbi_taxid = 13840; last; };
1698             /^Autographa californica/ && do { $ncbi_taxid = 307456; last; };
1699             /^E26 AEV/ && do { $ncbi_taxid = 31920; last; };
1700             /^Pseudocentrotus miliaris/ && do { $ncbi_taxid = 7677; last; }; # the genus is 7677 but this species isn't there
1701             /^SL3-3 (?:retro)?virus/ && do { $ncbi_taxid = 53454; last; }; # 53454 is unclassified MLV-related, SL3-3 a variant of that?
1702             /^Petunia sp/ && do { $ncbi_taxid = 4104; last; };
1703         }
1704         if (! $ncbi_taxid && defined $self->{_tax_db}) {
1705             ($ncbi_taxid) = $self->{_tax_db}->get_taxonids($species_string);
1706         }
1707     }
1708     else {
1709         # some species lines are poorly formated so custom handling
1710         SWITCH: for ($raw_species) {
1711             # for speed, go by common first letters
1712             my $first_letter = substr($raw_species, 0, 1);
1713
1714             $first_letter eq 'A' && do {
1715                 /^Adiantum raddianum/ && do { $ncbi_taxid = 32168; last; };
1716                 /^Avian sarcoma virus \(strain 17\)/ && do { $ncbi_taxid = 11877; last; };
1717                 /^AMV/ && do { $ncbi_taxid = 11866; last; };
1718                 /^AEV/ && do { $ncbi_taxid = 11861; last; };
1719                 /^AS42|^Avian musculoaponeurotic/ && do { $ncbi_taxid = 11873; last; };
1720                 /^Avian myelocytomatosis/ && do { $ncbi_taxid = 11869; last; };
1721                 /^ASV 31/ && do { $ncbi_taxid = 35270; last; };
1722                 /^A-MuLV/ && do { $ncbi_taxid = 188539; last; };
1723                 /^Asparagus officinalis/ && do { $ncbi_taxid = 4686; last; };
1724                 /^Agrobacterium tumefaciens/ && do { $ncbi_taxid = 358; last; };
1725                 /^ALV/ && do { $ncbi_taxid = 11864; last; };
1726                 /^AAV/ && do { $ncbi_taxid = 272636; last; };
1727                 /^AKV MLV/ && do { $ncbi_taxid = 11791; last; };
1728                 last;
1729             };
1730
1731             $first_letter eq 'B' && do {
1732                 /^BPV-1/ && do { $ncbi_taxid = 10559; last; };
1733                 /^BKV/ && do { $ncbi_taxid = 10629; last; };
1734                 /^Bolivian squirrel monkey/ && do { $ncbi_taxid = 39432; last; };
1735                 last;
1736             };
1737
1738             $first_letter eq 'C' && do {
1739                 /^Cauliflower/ && do { $ncbi_taxid = 3715; last; };
1740                 /^Chamek/ && do { $ncbi_taxid = 118643; last; };
1741                 /^Candida albicans/ && do { $ncbi_taxid = 5476; last; };
1742                 /^CaMV/ && do { $ncbi_taxid = 10641; last; };
1743                 last;
1744             };
1745
1746             $first_letter eq 'E' && do {
1747                 /^Eucalyptus gunnii/ && do { $ncbi_taxid = 3933; last; };
1748                 /^EBV, Epstein-Barr virus/ && do { $ncbi_taxid = 10376; last; };
1749                 /^Eucalyptus globulus subsp. bicostata/ && do { $ncbi_taxid = 71272; last; };
1750                 /^Eucalyptus globulus subsp. globulus/ && do { $ncbi_taxid = 71271; last; };
1751                 last;
1752             };
1753
1754             $first_letter eq 'F' && do {
1755                 /^FBR MuLV/ && do { $ncbi_taxid = 11806; last; };
1756                 /^FBJ MuLV/ && do { $ncbi_taxid = 11805; last; };
1757                 /^FeLV|Feline leukemia/ && do { $ncbi_taxid = 11923; last; };
1758                 /^Flaveria trinervia/ && do { $ncbi_taxid = 4227; last; };
1759                 /^FSV/ && do { $ncbi_taxid = 11885; last; };
1760                 /^F-MuLV/ && do { $ncbi_taxid = 11795; last; };
1761                 last;
1762             };
1763
1764             $first_letter eq 'H' && do {
1765                 /^HSV-1/ && do { $ncbi_taxid = 10298; last; };
1766                 /^HTLV-I/ && do { $ncbi_taxid = 11908; last; };
1767                 /^HIV-1/ && do { $ncbi_taxid = 11676; last; };
1768                 /^HPV-16/ && do { $ncbi_taxid = 333760; last; };
1769                 /^HBV/ && do { $ncbi_taxid = 10407; last; };
1770                 /^HBI/ && do { $ncbi_taxid = 11867; last; };
1771                 /^HPV-8/ && do { $ncbi_taxid = 10579; last; };
1772                 /^HPV-11/ && do { $ncbi_taxid = 10580; last; };
1773                 /^HPV-18/ && do { $ncbi_taxid = 333761; last; };
1774                 /^HCMV/ && do { $ncbi_taxid = 10359; last; };
1775                 /^HSV/ && do { $ncbi_taxid = 126283; last; };
1776                 /^HSV-2/ && do { $ncbi_taxid = 10310; last; };
1777                 /^HCV/ && do { $ncbi_taxid = 11108; last; };
1778                 /^HIV-2/ && do { $ncbi_taxid = 11709; last; };
1779                 last;
1780             };
1781
1782             $first_letter eq 'M' && do {
1783                 /^MMTV/ && do { $ncbi_taxid = 11757; last; };
1784                 /^Mo-MuLV/ && do { $ncbi_taxid = 11801; last; };
1785                 /^MuLV/ && do { $ncbi_taxid = 11786; last; };
1786                 /^MSV/ && do { $ncbi_taxid = 11802; last; };
1787                 /^MC29/ && do { $ncbi_taxid = 11868; last; };
1788                 /^MVM/ && do { $ncbi_taxid = 10794; last; };
1789                 /^MH2E21/ && do { $ncbi_taxid = 11955; last; }; # 11955 is a species, presumably MH2E21 is the strain
1790                 last;
1791             };
1792
1793             $first_letter eq 'R' && do {
1794                 /^Raphanus sativus/ && do { $ncbi_taxid = 3726; last; };
1795                 /^REV-T/ && do { $ncbi_taxid = 11636; last; };
1796                 /^RAV-0/ && do { $ncbi_taxid = 11867; last; }; # should be rous-associated virus 0 variant
1797                 /^RSV/ && do { $ncbi_taxid = 11886; last; };
1798                 /^RadLV/ && do { $ncbi_taxid = 31689; last; };
1799                 /^RTBV/ && do { $ncbi_taxid = 10654; last; };
1800                 last;
1801             };
1802
1803             $first_letter eq 'S' && do {
1804                 /^SV40/ && do { $ncbi_taxid = 10633; last; };
1805                 /^Sesbania rostrata/ && do { $ncbi_taxid = 3895; last; };
1806                 /^SIV/ && do { $ncbi_taxid = 11723; last; };
1807                 /^Spinacia oleracea/ && do { $ncbi_taxid = 3562; last; };
1808                 /^SCMV/ && do { $ncbi_taxid = 10364; last; }; # supposed to be AGM isolate
1809                 last;
1810             };
1811
1812             # and lower case
1813             $first_letter eq 'a' && do {
1814                 /^adenovirus type 5/ && do { $ncbi_taxid = 28285; last; };
1815                 /^adenovirus type 2/ && do { $ncbi_taxid = 10515; last; };
1816                 /^adenovirus/ && do { $ncbi_taxid = 189831; last; }; # 189831 ('unclassified Adenoviridae') is the closest I can get, but this has no genus and is not a species
1817                 last;
1818             };
1819
1820             $first_letter eq 'b' && do {
1821                 /^bell pepper/ && do { $ncbi_taxid = 4072; last; };
1822                 /^baculovirus, Autographa californica/ && do { $ncbi_taxid = 46015; last; };
1823                 /^broccoli/ && do { $ncbi_taxid = 36774; last; };
1824                 /^barley/ && do { $ncbi_taxid = 112509; last; };
1825                 last;
1826             };
1827
1828             $first_letter eq 'c' && do {
1829                 /^clawed frog/ && do { $ncbi_taxid = 8355; last; };
1830                 /^chipmunk/ && do { $ncbi_taxid = 64680; last; };
1831                 /^common tree shrew/ && do { $ncbi_taxid = 37347; last; };
1832                 /^cat/ && do { $ncbi_taxid = 9685; last; };
1833                 last;
1834             };
1835
1836             # and misc
1837             /^NK24/ && do { $ncbi_taxid = 11955; last; };
1838             /^OK10/ && do { $ncbi_taxid = 11871; last; };
1839             /^Dendrobium grex/ && do { $ncbi_taxid = 84618; last; };
1840             /^KSHV/ && do { $ncbi_taxid = 37296; last; };
1841             /^Oncidium/ && do { $ncbi_taxid = 96474; last; };
1842             /^Japanese quail/ && do { $ncbi_taxid = 93934; last; };
1843             /^Nile tilapia/ && do { $ncbi_taxid = 8128; last; };
1844             /^GALV/ && do { $ncbi_taxid = 11840; last; };
1845             /^JCV/ && do { $ncbi_taxid = 10632; last; };
1846             /^LPV/ && do { $ncbi_taxid = 10574; last; };
1847             /^Py,/ && do { $ncbi_taxid = 36362; last; };
1848             /^DHBV/ && do { $ncbi_taxid = 12639; last; };
1849             /^VZV/ && do { $ncbi_taxid = 10335; last; };
1850             /^Vicia faba/ && do { $ncbi_taxid = 3906; last; };
1851
1852             /^hamster/ && do { $ncbi_taxid = 10029; last; };
1853             /^sea urchin/ && do { $ncbi_taxid = 7668; last; };
1854             /^fruit fly/ && do { $ncbi_taxid = 7227; last; };
1855             /^halibut/ && do { $ncbi_taxid = 8267; last; };
1856             /^vaccinia virus/ && do { $ncbi_taxid = 10245; last; };
1857             /^taxonomic class Mammalia/ && do { $ncbi_taxid = 40674; last; }; # not a species
1858             /^taxonomic class Vertebrata/ && do { $ncbi_taxid = 7742; last; }; # not a species
1859             /^dog/ && do { $ncbi_taxid = 9615; last; };
1860             /^parsley/ && do { $ncbi_taxid = 4043; last; };
1861             /^mouse, Mus domesticus Torino/ && do { $ncbi_taxid = 10092; last; }; # 10092 is domesticus subspecies, but not the Torino strain
1862             /^lemur, Eulemur fulvus collaris/ && do { $ncbi_taxid = 47178; last; };
1863             /^red sea bream/ && do { $ncbi_taxid = 143350; last; };
1864             /^zebra finch/ && do { $ncbi_taxid = 59729; last; };
1865             /^mung bean/ && do { $ncbi_taxid = 3916; last; };
1866             /^soybean/ && do { $ncbi_taxid = 3847; last; };
1867             /^oat/ && do { $ncbi_taxid = 4498; last; };
1868             /^pseudorabies virus/ && do { $ncbi_taxid = 10345; last; };
1869         }
1870     }
1871
1872     $self->warn("Didn't know what species '$raw_species' was, unable to classify") unless $ncbi_taxid;
1873     return $ncbi_taxid;
1874 }
1875
1876 1;