1 # $Id: transfac_pro.pm,v 1.15 2006/08/12 11:00:03 sendu Exp $
3 # BioPerl module for Bio::DB::TFBS::transfac_pro
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sendu Bala <bix@sendu.me.uk>
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::DB::TFBS::transfac_pro - An implementation of Bio::DB::TFBS
18 which uses local flat files for transfac pro
22 use Bio::DB::Taxonomy;
24 my $db = new Bio::DB::Taxonomy(-source => 'transfac_pro'
25 -dat_dir => $directory);
27 # we're interested in the gene P5
28 my ($gene_id) = $db->get_gene_ids(-name => 'P5'); # G000001
30 # we want all the transcription factors that bind to our gene
31 my @factor_ids = $db->get_factor_ids(-gene => $gene_id);
33 # get info about those TFs
34 foreach my $factor_id (@factor_ids) {
35 my $factor = $db->get_factor($factor_id);
36 my $name = $factor->universal_name;
37 # etc. - see Bio::Map::TranscriptionFactor, eg. find out where it binds
41 my $matrix = $db->get_matrix('M00001');
43 # get a binding site sequence
44 my $seq = $db->get_site('R00001');
48 This is an implementation which uses local flat files and the DB_File
49 module RECNO data structures to manage a local copy of the Transfac Pro TFBS
52 Required database files require a license which can be obtained via
53 http://www.biobase-international.com/pages/index.php?id=170
55 Within the linux installation tarball you will find a cgibin tar ball, and
56 inside that is a data directory containing the .dat files needed by this
57 module. Point to that data directory with -dat_dir
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to
65 the Bioperl mailing list. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 Please direct usage questions or support issues to the mailing list:
74 I<bioperl-l@bioperl.org>
76 rather than to the module maintainer directly. Many experienced and
77 reponsive experts will be able look at the problem and quickly
78 address it. Please include a thorough description of the problem
79 with code and data examples if at all possible.
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 of the bugs and their resolution. Bug reports can be submitted via
87 https://redmine.open-bio.org/projects/bioperl/
89 =head1 AUTHOR - Sendu Bala
95 Based on Bio::DB::Taxonomy::flatfile by Jason Stajich
99 The rest of the documentation details each of the object methods.
100 Internal methods are usually preceded with a _
104 # Let the code begin...
106 package Bio
::DB
::TFBS
::transfac_pro
;
108 use Bio
::Annotation
::Reference
;
109 use Bio
::Annotation
::SimpleValue
;
110 use Bio
::LocatableSeq
;
111 use Bio
::SimpleAlign
;
112 use Bio
::Matrix
::PSM
::SiteMatrix
;
114 use Bio
::Map
::GeneMap
;
115 use Bio
::Map
::TranscriptionFactor
;
116 use Bio
::Map
::Position
;
117 use Bio
::Map
::Relative
;
120 use constant SEPARATOR
=> ':!:';
121 use constant INTERNAL_SEPARATOR
=> '!:!';
123 $DB_BTREE->{'flags'} = R_DUP
; # allow duplicate values in DB_File BTREEs
125 use base
qw(Bio::DB::TFBS);
130 Usage : my $obj = new Bio::DB::TFBS::transfac_pro();
131 Function: Builds a new Bio::DB::TFBS::transfac_pro object
132 Returns : an instance of Bio::DB::TTFBS::transfac_pro
133 Args : -dat_dir => name of directory where Transfac Pro .dat files
134 (required to initially build indexes)
135 -tax_db => Bio::DB::Taxonomy object, used when initially building
136 indexes, gives better results for species information
138 -index_dir => name of directory where index files should be created
139 or already exist. (defaults to -dat_dir, required if
140 -dat_dir not supplied)
141 -force => 1 replace current indexes even if they exist
146 my ($class, @args) = @_;
148 my $self = $class->SUPER::new
(@args);
150 my ($dat_dir, $index_dir, $tax_db, $force) = $self->_rearrange([qw(DAT_DIR INDEX_DIR TAX_DB FORCE)], @args);
151 $self->throw("At least one of -dat_dir and -index_dir must be supplied") unless ($dat_dir || $index_dir);
153 $self->index_directory($index_dir || $dat_dir);
154 $self->{_tax_db
} = $tax_db if $tax_db;
157 $self->_build_index($dat_dir, $force);
164 =head2 Bio::DB::TFBS Interface implementation
169 my ($self, $dat, @args) = @_;
170 @args % 2 == 0 || $self->throw("Must provide key => value pairs");
171 my $hash = $self->{$dat} || $self->throw("Unknown .dat type '$dat'");
174 # get a subset corresponding to args
178 while (my ($type, $value) = each %args) {
180 $self->warn("Arguement '$type' has no value, ignored");
185 my $converter = $hash->{$type};
186 unless ($converter) {
187 $self->warn("Unknown search type '$type' for .dat type '$dat'");
191 my @ids = $converter->get_dup($value);
193 @ids = $converter->get_dup(lc($value));
197 # we can have multiple types given at once, find the ids that
198 # satisfy all criteria
200 my %final = map { $_ => 1 } @final;
201 @final = grep { $final{$_} } @ids;
213 my $db_file_hash = $self->{$dat}->{id
};
215 my ($key, $prev_key, $value) = ('_!_', '!_!');
218 $db_file_hash->seq($key, $value, R_NEXT
);
219 last if $prev_key eq $key;
220 push(@ids, $value); # confusing? when creating objects we store
221 # $value as accession and $key as id, but from
222 # this method we return $value as id given $id!
232 Title : get_reference
233 Usage : my $ref = $obj->get_reference($id);
234 Function: Get a literature reference.
235 Returns : Bio::Annotation::Reference
236 Args : string - a reference id ('RE...')
241 my ($self, $id) = @_;
243 my $data = $self->{reference
}->{data
}->{$id} || return;
244 my @data = split(SEPARATOR
, $data);
246 return Bio
::Annotation
::Reference
->new(-pubmed
=> $data[0],
247 -authors
=> $data[1],
249 -location
=> $data[3] );
255 Usage : my $map = $obj->get_genemap($id);
256 Function: Get a GeneMap for a gene.
257 Returns : Bio::Map::GeneMap
258 Args : string - a gene id ('G...'), and optionally int (number of bp
264 my ($self, $id, $upstream) = @_;
266 return $self->{got_map
}->{$id} if defined $self->{got_map
}->{$id};
268 my $data = $self->{gene
}->{data
}->{$id} || return;
269 my @data = split(SEPARATOR
, $data);
271 # accession = id name description species_tax_id_or_raw_string
272 my $taxon = $self->{_tax_db
} ?
$self->{_tax_db
}->get_taxon($data[3]) || $data[3] : $data[3];
273 my $map = Bio
::Map
::GeneMap
->get(-uid
=> $id,
276 -description
=> $data[2],
277 -upstream
=> $upstream);
278 $self->{got_map
}->{$id} = $map; # prevents infinite recurse when we call get_factor below
280 # spawn all the factors that belong on this gene map
281 # get_factor_ids(-gene => ...) only works for genes that encode factors;
282 # have to go via sites
283 foreach my $sid ($self->get_site_ids(-gene
=> $id)) {
284 foreach my $fid ($self->get_factor_ids(-site
=> $sid)) {
285 # it is quite deliberate that we deeply recurse to arrive at the
286 # correct answer, which involves pulling in most of the database
287 no warnings
"recursion";
288 $self->get_factor($fid);
298 Usage : my $seq = $obj->get_seq($id);
299 Function: Get the sequence of a site. The sequence will be annotated with the
300 the tags 'relative_start', 'relative_end', 'relative_type' and
303 Args : string - a site id ('R...')
308 my ($self, $id) = @_;
310 my $data = $self->{site
}->{data
}->{$id} || return;
311 my @data = split(SEPARATOR
, $data);
313 my $seq = Bio
::Seq
->new(-seq
=> $data[2],
314 -accession_number
=> $id,
315 -description
=> $data[6] ?
'Genomic sequence' : 'Consensus or artificial sequence',
318 -alphabet
=> $data[7] || 'dna',
319 -species
=> $data[6]);
321 my $annot = $seq->annotation;
322 my $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_start', -value
=> $data[4] || 1);
323 $annot->add_Annotation($sv);
324 $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_end', -value
=> $data[5] || ($data[4] || 1 + length($data[2]) - 1));
325 $annot->add_Annotation($sv);
326 $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_type', -value
=> $data[3] || 'artificial');
327 $annot->add_Annotation($sv);
328 $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_to', -value
=> $data[1]);
329 $annot->add_Annotation($sv);
337 Usage : my $seq = $obj->get_fragment($id);
338 Function: Get the sequence of a fragment.
340 Args : string - a site id ('FR...')
345 my ($self, $id) = @_;
347 my $data = $self->{fragment
}->{data
}->{$id} || return;
348 my @data = split(SEPARATOR
, $data);
350 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
351 return new Bio
::Seq
( -seq
=> $data[4],
352 -accession_number
=> $id,
353 -description
=> 'Between genes '.$data[1].' and '.$data[2],
354 -species
=> $data[3],
356 -alphabet
=> 'dna' );
362 Usage : my $matrix = $obj->get_matrix($id);
363 Function: Get a matrix that describes a binding site.
364 Returns : Bio::Matrix::PSM::SiteMatrix
365 Args : string - a matrix id ('M...'), optionally a sequence string from
366 which base frequencies will be calcualted for the matrix model
372 my ($self, $id, $seq) = @_;
376 my $data = $self->{matrix
}->{data
}->{$id} || return;
377 my @data = split(SEPARATOR
, $data);
378 $data[4] || $self->throw("Matrix data missing for $id");
381 foreach my $position (split(INTERNAL_SEPARATOR
, $data[4])) {
382 my ($a_count, $c_count, $g_count, $t_count) = split("\t", $position);
383 push(@
{$a}, $a_count);
384 push(@
{$c}, $c_count);
385 push(@
{$g}, $g_count);
386 push(@
{$t}, $t_count);
389 # our psms include a simple background model so we can use
390 # sequence_match_weight() if desired
391 my $a_freq = ($seq =~ tr/a//) / length($seq);
392 my $c_freq = ($seq =~ tr/c//) / length($seq);
393 my $g_freq = ($seq =~ tr/g//) / length($seq);
394 my $t_freq = ($seq =~ tr/t//) / length($seq);
396 my $psm = Bio
::Matrix
::PSM
::SiteMatrix
->new(-pA
=> $a,
401 -accession_number
=> $id,
403 -width
=> scalar(@
{$a}),
405 -model
=> { A
=> $a_freq, C
=> $c_freq, G
=> $g_freq, T
=> $t_freq } );
407 #*** used to make a Bio::Matrix::PSM::Psm and add references, but it
408 # didn't seem worth it. You can get references from the database by:
409 #foreach my $ref_id ($db->get_reference_ids(-matrix => $id)) {
410 # my $ref = $db->get_reference($ref_id);
419 Usage : my $aln = $obj->get_aln($id);
420 Function: Get the alignment that was used to generate a matrix. Each sequence
421 in the alignment will have an accession_number corresponding to the
422 Transfac site id, and id() based on that but unique within the
424 Returns : Bio::SimpleAlign
425 Args : string - a matrix id ('M...'), optionally true to, when a matrix
426 lists no sequences, search for sequences via the matrix's factors,
427 picking the sites that best match the matrix
431 my %VALID_STRAND = map {$_ => 1} qw(-1 0 1);
434 my ($self, $id, $via_factors) = @_;
436 my $data = $self->{matrix
}->{data
}->{$id} || $self->throw("matrix '$id' had no data in DB_File");
437 my @data = split(SEPARATOR
, $data);
439 if (! $data[5] && $via_factors) {
440 # This is a matrix with no site sequences given in matrix.dat.
441 # Find some matching site sequences via factors.
443 # First, check its factors for sites
446 foreach my $factor_id ($self->get_factor_ids(-matrix
=> $id)) {
447 $factor_ids{$factor_id} = 1;
448 foreach my $site_id ($self->get_site_ids(-factor
=> $factor_id)) {
449 next if defined $site_seqs{$site_id};
450 my $seq = $self->get_seq($site_id);
452 # skip sites that have no sequence, or have IUPAC symbols in
453 # their sequence (most probably the 'consensus' sequence itself
454 # that was used to make and exactly corresponds to the matrix)
455 my $seq_str = $seq->seq || next;
456 $seq_str =~ /[MRWSYKVHDB]/ and next;
458 $site_seqs{$site_id} = $seq;
461 my @seqs = values %site_seqs;
464 # pick the sub-seqs that match to the matrix with the best scores
465 my $matrix = $self->get_matrix($id);
466 my $desired_sequences = $matrix->sites;
467 return if @seqs < $desired_sequences;
469 my $desired_length = $matrix->width;
471 foreach my $seq (@seqs) {
472 my $for_str = $seq->seq;
473 next if length($for_str) < $desired_length;
474 my $rev_str = $seq->revcom->seq;
477 my $best_subseq = '';
479 my $best_subseq_caps = 0;
482 foreach my $seq_str ($for_str, $rev_str) {
483 for my $i (0..(length($seq_str) - $desired_length)) {
484 my $subseq = substr($seq_str, $i, $desired_length);
485 $subseq =~ s/[^ACGTacgt]//g; # can only score atcg
486 next unless length($subseq) == $desired_length; # short or 0-length seqs could get the highest scores!
487 my $score = $matrix->sequence_match_weight($subseq);
489 # caps represent the author-chosen bit of a site
490 # sequence so we would prefer to choose a subseq that
492 my $caps = $subseq =~ tr/ACGT//;
494 #*** (don't know why numeric == fails for comparing
495 # scores, when the string eq works)
496 if ($score > $best_score || ("$score" eq "$best_score" && $caps > $best_subseq_caps)) {
497 $best_score = $score;
498 $best_subseq_caps = $caps;
499 $best_subseq = $subseq;
501 $best_revcom = $revcom;
508 $best_seqs{$seq->accession_number} = [$best_subseq, $seq->accession_number, ($best_i + 1), $revcom ?
-1 : 1, $best_score];
511 my @sorted = sort { $best_seqs{$b}->[-1] <=> $best_seqs{$a}->[-1] } keys %best_seqs;
512 return if @sorted < $desired_sequences;
513 splice(@sorted, $desired_sequences);
514 my %wanted = map { $_ => 1 } @sorted;
517 foreach my $seq (@seqs) {
518 next unless exists $wanted{$seq->accession_number};
519 my @data = @
{$best_seqs{$seq->accession_number}};
521 push(@site_data, join('_', @data));
524 $data[5] = join(INTERNAL_SEPARATOR
, @site_data);
525 $self->{matrix
}->{data
}->{$id} = join(SEPARATOR
, @data);
530 my @blocks = split(INTERNAL_SEPARATOR
, $data[5]);
532 # append gap chars to all sequences to make them the same length
533 # (applies to sequences found via factors, presumably, since we already
534 # do this for matrix alignments in transfac_pro.pm)
537 my ($seq) = split('_', $_);
538 my $length = length($seq);
539 if ($length > $longest) {
543 foreach my $i (0..$#blocks) {
544 my $block = $blocks[$i];
545 my ($seq, $seq_id) = split('_', $block);
546 my $length = length($seq);
547 if ($length < $longest) {
549 $seq .= '-'x
($longest - $length);
550 $block =~ s/^${orig_seq}_/${seq}_/;
551 $blocks[$i] = $block;
555 # build the alignment
556 my $aln = Bio
::SimpleAlign
->new(-source
=> 'transfac_pro');
559 my ($seq, $seq_acc, $start, $strand) = split('_', $_);
561 $self->throw("Invalid strand $strand found in block $_")
562 unless exists $VALID_STRAND{$strand};
563 # we can get back multiple different subparts of the same site (sequence),
564 # so $seq_acc isn't unique across this loop. Can't use it as the seq id
565 # of the alignment (ids must be unique in SimpleAlign), so we
566 # uniquify the id and store the original id as the accession_number
568 $done_ids{$seq_acc}++;
569 if ($done_ids{$seq_acc} > 1) {
570 $seq_id = $seq_acc.'_'.$done_ids{$seq_acc};
576 my $gaps = $seq =~ tr/-//;
577 my $length = length($seq) - $gaps;
578 $self->throw("seq '$seq_id' for matrix '$id' had seq '$seq'") unless $length;
579 $aln->add_seq(Bio
::LocatableSeq
->new(-seq
=> $seq,
581 -accession_number
=> $seq_acc,
583 -end
=> $start + $length - 1,
584 -strand
=> $strand));
587 # could also store score? of?
595 Usage : my $factor = $obj->get_factor($id);
596 Function: Get the details of a transcription factor.
597 Returns : Bio::Map::TranscriptionFactor
598 Args : string - a factor id ('T...')
603 my ($self, $id) = @_;
605 return $self->{got_factor
}->{$id} if defined $self->{got_factor
}->{$id};
606 my $data = $self->{factor
}->{data
}->{$id} || return;
607 my @data = split(SEPARATOR
, $data);
609 # accession = id name species sequence
610 my $tf = Bio
::Map
::TranscriptionFactor
->get(-id
=> $id,
611 -universal_name
=> $data[1]);
612 #*** not sure what to do with species and sequence, since we don't want to
613 # confuse the idea that a TF is a general thing that could bind to any
614 # species... then again, you might want to model species-specific variants
615 # of a TF with different binding abilities...
616 #*** idea of having inclusion and exclusion species so you can prevent/
617 # ignore a tf that binds to the wrong species (a species that doesn't even
618 # have the tf), and associating sequence with each species/tf combo so you
619 # can see how diverged the tf is and make assumptions about site difference
622 # place it on all its genemaps
623 foreach my $sid ($self->get_site_ids(-factor
=> $id)) {
624 my $s_data = $self->{site
}->{data
}->{$sid} || next;
625 my @s_data = split(SEPARATOR
, $s_data);
627 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string
628 $s_data[1] || next; # site isn't relative to a gene, meaningless
629 $s_data[4] || next; # don't know where its supposed to be, can't model it
630 $s_data[5] ||= $s_data[4] + ($s_data[2] ?
length($s_data[2]) - 1 : 0);
632 # it is quite deliberate that we deeply recurse to arrive at the
633 # correct answer, which involves pulling in most of the database
634 no warnings
"recursion";
635 my $gene_map = $self->get_genemap($s_data[1]) || next;
636 return $self->{got_factor
}->{$id} if defined $self->{got_factor
}->{$id};
638 #*** not always relative to gene start...
639 # we need Bio::Map::Gene s to have some default tss and atg positions
640 # that we can be relative to
641 my $rel = Bio
::Map
::Relative
->new(-element
=> $gene_map->gene, -description
=> $s_data[3]);
642 Bio
::Map
::Position
->new(-map => $gene_map, -element
=> $tf, -start
=> $s_data[4], -end
=> $s_data[5], -relative
=> $rel);
645 $self->{got_factor
}->{$id} = $tf;
649 # since get_factor() is uncertain, just have direct access methods to factor
651 sub get_factor_name
{
652 my ($self, $id) = @_;
653 my $details = $self->_get_factor_details($id) || return;
654 return $details->{name
};
656 sub get_factor_species
{
657 my ($self, $id) = @_;
658 my $details = $self->_get_factor_details($id) || return;
659 return $details->{species
};
661 sub get_factor_sequence
{
662 my ($self, $id) = @_;
663 my $details = $self->_get_factor_details($id) || return;
664 return $details->{sequence
};
666 sub _get_factor_details
{
667 my ($self, $id) = @_;
670 return $self->{factor_details
}->{$id} if defined $self->{factor_details
}->{$id};
672 my $data = $self->{factor
}->{data
}->{$id} || return;
673 my @data = split(SEPARATOR
, $data);
675 # accession = id name species sequence
677 my %details = (name
=> $data[1], species
=> $data[2], sequence
=> $data[3]);
678 $self->{factor_details
}->{$id} = \
%details;
683 =head2 get_reference_ids
685 Title : get_reference_ids
686 Usage : my @ids = $obj->get_reference_ids(-key => $value);
687 Function: Get all the reference ids that are associated with the supplied
689 Returns : list of strings (ids)
690 Args : -key => value, where value is a string id, and key is one of:
691 -pubmed -site -gene -matrix -factor
695 sub get_reference_ids
{
697 return $self->_get_ids('reference', @_);
700 # -id -name -species -site -factor -reference
703 return $self->_get_ids('gene', @_);
709 Usage : my @ids = $obj->get_site_ids(-key => $value);
710 Function: Get all the site ids that are associated with the supplied
712 Returns : list of strings (ids)
713 Args : -key => value, where value is a string id, and key is one of:
714 -id -species -gene -matrix -factor -reference
720 return $self->_get_ids('site', @_);
723 =head2 get_matrix_ids
725 Title : get_matrix_ids
726 Usage : my @ids = $obj->get_matrix_ids(-key => $value);
727 Function: Get all the matrix ids that are associated with the supplied
729 Returns : list of strings (ids)
730 Args : -key => value, where value is a string id, and key is one of:
731 -id -name -site -factor -reference
737 return $self->_get_ids('matrix', @_);
740 =head2 get_factor_ids
742 Title : get_factor_ids
743 Usage : my @ids = $obj->get_factor_ids(-key => $value);
744 Function: Get all the factor ids that are associated with the supplied
746 Returns : list of strings (ids)
747 Args : -key => value, where value is a string id, and key is one of:
748 -id -name -species -interactors -gene -matrix -site -reference
749 NB: -gene only gets factor ids for genes that encode factors
755 return $self->_get_ids('factor', @_);
758 =head2 get_fragment_ids
760 Title : get_fragment_ids
761 Usage : my @ids = $obj->get_fragment_ids(-key => $value);
762 Function: Get all the fragment ids that are associated with the supplied
764 Returns : list of strings (ids)
765 Args : -key => value, where value is a string id, and key is one of:
766 -id -species -gene -factor -reference
770 sub get_fragment_ids
{
772 return $self->_get_ids('fragment', @_);
775 =head2 Helper methods
779 # internal method which does the indexing
781 my ($self, $dat_dir, $force) = @_;
783 # MLDBM would give us transparent complex data structures with DB_File,
784 # allowing just one index file, but its yet another requirment and we
785 # don't strictly need it
787 my $index_dir = $self->index_directory;
788 my $gene_index = "$index_dir/gene.dat.index";
789 my $reference_index = "$index_dir/reference.dat.index";
790 my $matrix_index = "$index_dir/matrix.dat.index";
791 my $factor_index = "$index_dir/factor.dat.index";
792 my $fragment_index = "$index_dir/fragment.dat.index";
793 my $site_index = "$index_dir/site.dat.index";
795 my $reference_dat = "$dat_dir/reference.dat";
796 if (! -e
$reference_index || $force) {
797 open my $REF, '<', $reference_dat or $self->throw("Could not read reference file '$reference_dat': $!");
800 unlink $reference_index;
801 my $ref = tie
(%references, 'DB_File', $reference_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
802 or $self->throw("CCould not open file '$reference_index': $!");
805 my $reference_pubmed = $reference_index.'.pubmed';
806 unlink $reference_pubmed;
807 my $pub = tie
(%pubmed, 'DB_File', $reference_pubmed, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
808 or $self->throw("Could not open file '$reference_pubmed': $!");
811 my $reference_gene = $gene_index.'.reference';
812 unlink $reference_gene;
813 my $gene = tie
(%gene, 'DB_File', $reference_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
814 or $self->throw("Could not open file '$reference_gene': $!");
817 my $reference_site = $site_index.'.reference';
818 unlink $reference_site;
819 my $site = tie
(%site, 'DB_File', $reference_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
820 or $self->throw("Could not open file '$reference_site': $!");
823 my $reference_fragment = $fragment_index.'.reference';
824 unlink $reference_fragment;
825 my $fragment = tie
(%fragment, 'DB_File', $reference_fragment, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
826 or $self->throw("Could not open file '$reference_fragment': $!");
829 my $reference_factor = $factor_index.'.reference';
830 unlink $reference_factor;
831 my $factor = tie
(%factor, 'DB_File', $reference_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
832 or $self->throw("Could not open file '$reference_factor': $!");
835 my $reference_matrix = $matrix_index.'.reference';
836 unlink $reference_matrix;
837 my $matrix = tie
(%matrix, 'DB_File', $reference_matrix, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
838 or $self->throw("Could not open file '$reference_matrix': $!");
840 # skip the first three header lines
841 <$REF>; <$REF>; <$REF>;
848 elsif (/^RX PUBMED: (\d+)/) {
850 $pub->put("$1", $data[0]);
852 elsif (/^RA (.+)\n$/) {
855 elsif (/^RT (.+?)\.?\n$/) {
858 elsif (/^RL (.+?)\.?\n$/) {
861 elsif (/^GE TRANSFAC: (\w\d+)/) {
862 $gene->put($data[0], "$1");
864 elsif (/^BS TRANSFAC: (\w\d+)/) {
865 $site->put($data[0], "$1");
867 elsif (/^FA TRANSFAC: (\w\d+)/) {
868 $factor->put($data[0], "$1");
870 elsif (/^FR TRANSFAC: (FR\d+)/) {
871 $fragment->put($data[0], "$1");
873 elsif (/^MX TRANSFAC: (\w\d+)/) {
874 $matrix->put($data[0], "$1");
877 # end of a record, store previous data and reset
879 # accession = pubmed authors title location
880 $references{$data[0]} = join(SEPARATOR
, ($data[1] || '',
890 $ref = $pub = $gene = $site = $fragment = $factor = $matrix = undef;
900 my $gene_dat = "$dat_dir/gene.dat";
901 if (! -e
$gene_index || $force) {
902 open my $GEN, '<', $gene_dat or $self->throw("Could not read gene file '$gene_dat': $!");
906 my $gene = tie
(%genes, 'DB_File', $gene_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
907 or $self->throw("Could not open file '$gene_index': $!");
910 my $gene_id = $gene_index.'.id';
912 my $id = tie
(%id, 'DB_File', $gene_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
913 or $self->throw("Could not open file '$gene_id': $!");
916 my $gene_name = $gene_index.'.name';
918 my $name = tie
(%name, 'DB_File', $gene_name, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
919 or $self->throw("Could not open file '$gene_name': $!");
922 my $gene_species = $gene_index.'.species';
923 unlink $gene_species;
924 my $species = tie
(%species, 'DB_File', $gene_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
925 or $self->throw("Could not open file '$gene_species': $!");
928 my $gene_site = $site_index.'.gene';
930 my $site = tie
(%site, 'DB_File', $gene_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
931 or $self->throw("Could not open file '$gene_site': $!");
934 my $gene_factor = $factor_index.'.gene';
936 my $factor = tie
(%factor, 'DB_File', $gene_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
937 or $self->throw("Could not open file '$gene_factor': $!");
940 my $gene_fragment = $fragment_index.'.gene';
941 unlink $gene_fragment;
942 my $fragment = tie
(%fragment, 'DB_File', $gene_fragment, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
943 or $self->throw("Could not open file '$gene_fragment': $!");
946 my $gene_reference = $reference_index.'.gene';
947 unlink $gene_reference;
948 my $reference = tie
(%reference, 'DB_File', $gene_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
949 or $self->throw("Could not open file '$gene_reference': $!");
951 # skip the first three header lines
952 <$GEN>; <$GEN>; <$GEN>;
959 elsif (/^ID (\S+)/) {
961 $id->put("$1", $data[0]);
963 elsif (/^SD (.+)$/) {
965 $name->put(lc("$1"), $data[0]);
967 elsif (/^SY (.+)\.$/) {
968 foreach (split('; ', lc("$1"))) {
969 $name->put($_, $data[0]);
972 elsif (/^DE (.+)$/) {
975 elsif (/^OS (.+)$/) {
976 my $raw_species = $1;
977 my $taxid = $self->_species_to_taxid($raw_species);
978 $data[4] = $taxid || $raw_species;
979 $species->put($data[4], $data[0]);
981 elsif (/^RN .+?(RE\d+)/) {
982 $reference->put($data[0], "$1");
984 elsif (/^BS .+?(R\d+)/) {
985 $site->put($data[0], "$1");
987 elsif (/^FA (T\d+)/) {
988 $factor->put($data[0], "$1");
990 elsif (/^BR (FR\d+)/) {
991 $fragment->put($data[0], "$1");
994 # end of a record, store previous data and reset
996 # accession = id name description species_tax_id_or_raw_string
997 $genes{$data[0]} = join(SEPARATOR
, ($data[1] || '',
1007 $gene = $id = $name = $species = $site = $factor = $reference = undef;
1017 my $site_dat = "$dat_dir/site.dat";
1018 if (! -e
$site_index || $force) {
1019 open my $SIT, '<', $site_dat or $self->throw("Could not read site file '$site_dat': $!");
1023 my $site = tie
(%sites, 'DB_File', $site_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
1024 or $self->throw("Could not open file '$site_index': $!");
1027 my $site_id = $site_index.'.id';
1029 my $id = tie
(%id, 'DB_File', $site_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1030 or $self->throw("Could not open file '$site_id': $!");
1033 my $site_species = $site_index.'.species';
1034 unlink $site_species;
1035 my $species = tie
(%species, 'DB_File', $site_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1036 or $self->throw("Could not open file '$site_species': $!");
1039 my $site_qualities = $site_index.'.qual';
1040 unlink $site_qualities;
1041 my $quality = tie
(%qualities, 'DB_File', $site_qualities, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
1042 or $self->throw("Could not open file '$site_qualities': $!");
1045 my $site_gene = $gene_index.'.site';
1047 my $gene = tie
(%gene, 'DB_File', $site_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1048 or $self->throw("Could not open file '$site_gene': $!");
1051 my $site_matrix = $matrix_index.'.site';
1052 unlink $site_matrix;
1053 my $matrix = tie
(%matrix, 'DB_File', $site_matrix, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1054 or $self->throw("Could not open file '$site_matrix': $!");
1057 my $site_factor = $factor_index.'.site';
1058 unlink $site_factor;
1059 my $factor = tie
(%factor, 'DB_File', $site_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1060 or $self->throw("Could not open file '$site_factor': $!");
1063 my $site_reference = $reference_index.'.site';
1064 unlink $site_reference;
1065 my $reference = tie
(%reference, 'DB_File', $site_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1066 or $self->throw("Could not open file '$site_reference': $!");
1068 # skip the first three header lines
1069 <$SIT>; <$SIT>; <$SIT>;
1076 elsif (/^ID (\S+)/) {
1078 $id->put("$1", $data[0]);
1080 elsif (/^TY (.+)$/) {
1083 elsif (/^DE .*Gene: (G\d+)/) {
1085 $gene->put($data[0], "$1");
1087 # if it has no gene it is an artificial sequence, unless it
1088 # has a species (OS line), in which case it is unassigned
1089 # genomic; either way we won't be able to make a
1090 # Bio::Map::PositionI later on, so such sites won't be
1093 elsif (/^OS (.+)$/) {
1094 # Since not all sites in site.dat with a species have a gene,
1095 # (small handful are unassigned 'genomic') can't delegate to
1096 # gene.dat and must parse species here (effectively again)
1097 my $raw_species = $1;
1098 my $taxid = $self->_species_to_taxid($raw_species);
1099 $data[7] = $taxid || $raw_species;
1100 $species->put($data[7], $data[0]);
1102 elsif (/^SQ (.+)\.$/) {
1104 # there can actually be more than one SQ line, seemingly with
1105 # variations of the sequence (not a long sequence split over
1106 # two lines); not sure what to do with data; currently we end
1107 # up storing only the last variant.
1109 elsif (/^S1 (.+)$/) {
1111 # if S1 not present, means transcriptional start
1113 elsif (/^SF (.+)$/) {
1116 elsif (/^ST (.+)$/) {
1119 elsif (/^RN .+?(RE\d+)/) {
1120 $reference->put($data[0], "$1");
1122 elsif (/^MX (M\d+)/) {
1123 $matrix->put($data[0], "$1");
1125 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1126 $factor->put($data[0], "$1");
1127 $qualities{$data[0].SEPARATOR
.$1} = $2;
1130 # end of a record, store previous data and reset
1132 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string type
1133 $sites{$data[0]} = join(SEPARATOR
, ($data[1] || '',
1147 $site = $id = $species = $quality = $gene = $matrix = $factor = $reference = undef;
1158 my $matrix_dat = "$dat_dir/matrix.dat";
1159 if (! -e
$matrix_index || $force) {
1160 open my $MAT, '<', $matrix_dat or $self->throw("Could not read matrix file '$matrix_dat': $!");
1163 unlink $matrix_index;
1164 my $matrix = tie
(%matrices, 'DB_File', $matrix_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
1165 or $self->throw("Could not open file '$matrix_index': $!");
1168 my $matrix_id = $matrix_index.'.id';
1170 my $id = tie
(%id, 'DB_File', $matrix_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1171 or $self->throw("Could not open file '$matrix_id': $!");
1174 my $matrix_name = $matrix_index.'.name';
1175 unlink $matrix_name;
1176 my $name = tie
(%name, 'DB_File', $matrix_name, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1177 or $self->throw("Could not open file '$matrix_name': $!");
1180 my $matrix_site = $site_index.'.matrix';
1181 unlink $matrix_site;
1182 my $site = tie
(%site, 'DB_File', $matrix_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1183 or $self->throw("Could not open file '$matrix_site': $!");
1186 my $matrix_factor = $factor_index.'.matrix';
1187 unlink $matrix_factor;
1188 my $factor = tie
(%factor, 'DB_File', $matrix_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1189 or $self->throw("Could not open file '$matrix_factor': $!");
1192 my $matrix_reference = $reference_index.'.matrix';
1193 unlink $matrix_reference;
1194 my $reference = tie
(%reference, 'DB_File', $matrix_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1195 or $self->throw("Could not open file '$matrix_reference': $!");
1197 # skip the first three header lines
1198 <$MAT>; <$MAT>; <$MAT>;
1207 elsif (/^ID (\S+)/) {
1209 $id->put("$1", $data[0]);
1211 elsif (/^NA (.+)$/) {
1213 $name->put("$1", $data[0]);
1215 elsif (/^DE (.+)$/) {
1218 elsif (/^\d\d \s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
1219 # a, c, g, t counts/weights
1220 push(@matrix_data, join("\t", ($1, $2, $3, $4)));
1222 # Work out the number of sites as the largest number of
1223 # sites amongst all positions in the sequences. (The BA
1224 # line isn't reliable for telling us the correct number of
1225 # sites all the time)
1226 my $num = $1 + $2 + $3 + $4;
1228 if ($num > $data[4]) {
1232 elsif (/^BS ([\sa-zA-Z]+); (.+?); (-?\d+); \d+;.*; ([np])/) {
1233 # sequence id start strand
1234 push(@site_data, join('_', ($1, $2, $3, $4 eq 'p' ?
1 : -1)));
1235 $site->put($data[0], $2);
1237 elsif (/^BF (T\d+)/) {
1238 $factor->put($data[0], "$1");
1240 elsif (/^RN .+?(RE\d+)/) {
1241 $reference->put($data[0], "$1");
1244 # end of a record, store previous data and reset
1245 my $matrix_data = join(INTERNAL_SEPARATOR
, @matrix_data) || '';
1247 # sites of a matrix are pre-aligned but padded with spaces on
1248 # the left and no padding on the right; pad with -s both sides
1249 my $longest_seq = 0;
1251 # For all the work, does anything meaningful actually get passed
1252 # on here? Commenting out fixes the latest crashes on trunk.
1255 #foreach my $site_seq (map {my ($seq) = split("_", $_ ,2); $seq;} @site_data) {
1256 # $site_seq =~ s/ /-/g;
1257 # my $length = length($site_seq);
1258 # if ($length > $longest_seq) {
1259 # $longest_seq = $length;
1262 #foreach my $site (@site_data) {
1263 # my ($site_seq) = split("_", $site ,2);
1264 # my $length = length($site_seq);
1265 # if ($length < $longest_seq) {
1266 # $site_seq .= '-' x ($longest_seq - $length);
1270 my $site_data = join(INTERNAL_SEPARATOR
, @site_data) || '';
1272 # accession = id name description num_of_sites matrix_data site_data
1273 $matrices{$data[0]} = join(SEPARATOR
, ($data[1] || '',
1280 @data = @matrix_data = @site_data = ();
1285 $matrix = $id = $name = $site = $factor = $reference = undef;
1294 my $factor_dat = "$dat_dir/factor.dat";
1295 if (! -e
$factor_index || $force) {
1296 open my $FAC, '<', $factor_dat or $self->throw("Could not read factor file '$factor_dat': $!");
1299 unlink $factor_index;
1300 my $factor = tie
(%factors, 'DB_File', $factor_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
1301 or $self->throw("Could not open file '$factor_index': $!");
1304 my $factor_id = $factor_index.'.id';
1306 my $id = tie
(%id, 'DB_File', $factor_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1307 or $self->throw("Could not open file '$factor_id': $!");
1310 my $factor_name = $factor_index.'.name';
1311 unlink $factor_name;
1312 my $name = tie
(%name, 'DB_File', $factor_name, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1313 or $self->throw("Could not open file '$factor_name': $!");
1316 my $factor_species = $factor_index.'.species';
1317 unlink $factor_species;
1318 my $species = tie
(%species, 'DB_File', $factor_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1319 or $self->throw("Could not open file '$factor_species': $!");
1322 my $factor_interactors = $factor_index.'.interactors';
1323 unlink $factor_interactors;
1324 my $interact = tie
(%interactors, 'DB_File', $factor_interactors, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1325 or $self->throw("Could not open file '$factor_interactors': $!");
1328 my $factor_gene = $gene_index.'.factor';
1329 unlink $factor_gene;
1330 my $gene = tie
(%gene, 'DB_File', $factor_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1331 or $self->throw("Could not open file '$factor_gene': $!");
1334 my $factor_matrix = $matrix_index.'.factor';
1335 unlink $factor_matrix;
1336 my $matrix = tie
(%matrix, 'DB_File', $factor_matrix, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1337 or $self->throw("Could not open file '$factor_matrix': $!");
1340 my $factor_site = $site_index.'.factor';
1341 unlink $factor_site;
1342 my $site = tie
(%site, 'DB_File', $factor_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1343 or $self->throw("Could not open file '$factor_site': $!");
1346 my $factor_fragment = $fragment_index.'.factor';
1347 unlink $factor_fragment;
1348 my $fragment = tie
(%fragment, 'DB_File', $factor_fragment, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1349 or $self->throw("Could not open file '$factor_fragment': $!");
1352 my $factor_reference = $reference_index.'.factor';
1353 unlink $factor_reference;
1354 my $reference = tie
(%reference, 'DB_File', $factor_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1355 or $self->throw("Could not open file '$factor_reference': $!");
1357 # skip the first three header lines
1358 <$FAC>; <$FAC>; <$FAC>;
1366 elsif (/^ID (\S+)/) {
1367 # IDs are always the same as AC? Is this needed?
1369 $id->put("$1", $data[0]);
1371 elsif (/^FA (.+)$/) {
1373 $name->put("$1", $data[0]);
1375 elsif (/^OS (.+)$/) {
1376 # This is the species the actual factor came from, which may
1377 # differ from the species of any sequences it is described as
1378 # binding to. Not all factors that have a species have a gene,
1379 # so can't delegate species to a gene lookup.
1380 my $raw_species = $1;
1381 my $taxid = $self->_species_to_taxid($raw_species);
1382 $data[3] = $taxid || $raw_species;
1383 $species->put($data[3], $data[0]);
1385 elsif (/^GE (G\d+)/) {
1386 $gene->put($data[0], "$1");
1388 elsif (/^SQ (.+)$/) {
1391 elsif (/^IN (T\d+)/) {
1392 $interact->put($data[0], "$1");
1394 elsif (/^MX (M\d+)/) {
1395 $matrix->put($data[0], "$1");
1397 elsif (/^BS (R\d+)/) {
1398 $site->put($data[0], "$1");
1400 elsif (/^BR (FR\d+)/) {
1401 $fragment->put($data[0], "$1");
1403 elsif (/^RN .+?(RE\d+)/) {
1404 $reference->put($data[0], "$1");
1407 # end of a record, store previous data and reset
1409 # accession = id name species sequence
1410 $factors{$data[0]} = join(SEPARATOR
, ($data[1] || '',
1421 $factor = $id = $name = $species = $interact = $gene = $matrix = $site = $fragment = $reference = undef;
1434 my $fragment_dat = "$dat_dir/fragment.dat";
1435 if (! -e
$fragment_index || $force) {
1436 if (open my $FRA, '<', $fragment_dat) {
1438 unlink $fragment_index;
1439 my $fragment = tie
(%fragments, 'DB_File', $fragment_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
1440 or $self->throw("Could not open file '$fragment_index': $!");
1443 my $fragment_id = $fragment_index.'.id';
1444 unlink $fragment_id;
1445 my $id = tie
(%id, 'DB_File', $fragment_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1446 or $self->throw("Could not open file '$fragment_id': $!");
1449 my $fragment_qualities = $fragment_index.'.qual';
1450 unlink $fragment_qualities;
1451 my $quality = tie
(%qualities, 'DB_File', $fragment_qualities, O_RDWR
|O_CREAT
, 0644, $DB_HASH)
1452 or $self->throw("Could not open file '$fragment_qualities': $!");
1455 my $fragment_species = $fragment_index.'.species';
1456 unlink $fragment_species;
1457 my $species = tie
(%species, 'DB_File', $fragment_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1458 or $self->throw("Could not open file '$fragment_species': $!");
1461 my $fragment_gene = $gene_index.'.fragment';
1462 unlink $fragment_gene;
1463 my $gene = tie
(%gene, 'DB_File', $fragment_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1464 or $self->throw("Could not open file '$fragment_gene': $!");
1467 my $fragment_factor = $factor_index.'.fragment';
1468 unlink $fragment_factor;
1469 my $factor = tie
(%factor, 'DB_File', $fragment_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1470 or $self->throw("Could not open file '$fragment_factor': $!");
1473 my $fragment_reference = $reference_index.'.fragment';
1474 unlink $fragment_reference;
1475 my $reference = tie
(%reference, 'DB_File', $fragment_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE)
1476 or $self->throw("Could not open file '$fragment_reference': $!");
1478 # skip the first three header lines
1479 <$FRA>; <$FRA>; <$FRA>;
1486 elsif (/^ID (\S+)/) {
1487 # IDs are always the same as AC? Is this needed?
1489 $id->put("$1", $data[0]);
1491 elsif (/^DE Gene: (G\d+)(?:.+Gene: (G\d+))?/) {
1492 my ($gene1, $gene2) = ($1, $2);
1494 $data[3] = $gene2; # could be undef
1495 $gene->put($data[0], $gene1);
1496 $gene->put($data[0], $gene2) if $gene2;
1498 elsif (/^OS (.+)$/) {
1499 # As per the site.dat parsing
1500 my $raw_species = $1;
1501 my $taxid = $self->_species_to_taxid($raw_species);
1502 $data[4] = $taxid || $raw_species;
1503 $species->put($data[4], $data[0]);
1505 elsif (/^SQ [atcgn]*([ATCGN]+)[atcgn]*/) {
1507 # there can be (usually are) multiple SQ lines with a single
1508 # long seq split over them. The 'real' sequence is in caps
1510 elsif (/^SC Build (\S+):$/) {
1512 # maybe parse it out a little more? We have build,
1513 # chromosomal coords and strand, eg.
1514 # SC Build HSA_May2004: Chr.2 43976692..43978487 (FORWARD).
1516 elsif (/^RN .+?(RE\d+)/) {
1517 $reference->put($data[0], "$1");
1519 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1520 $factor->put($data[0], "$1");
1521 $qualities{$data[0].SEPARATOR
.$1} = $2;
1524 # end of a record, store previous data and reset
1526 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
1527 $fragments{$data[0]} = join(SEPARATOR
, ($data[1] || '',
1539 $fragment = $id = $species = $quality = $gene = $factor = $reference = undef;
1549 $self->warn("Could not read fragment file '$fragment_dat', assuming you have an old version of Transfac Pro with no fragment.dat file");
1554 # connect the internal db handle
1557 return if $self->{'_initialized'};
1559 my $index_dir = $self->index_directory;
1560 my $gene_index = "$index_dir/gene.dat.index";
1561 my $reference_index = "$index_dir/reference.dat.index";
1562 my $matrix_index = "$index_dir/matrix.dat.index";
1563 my $factor_index = "$index_dir/factor.dat.index";
1564 my $site_index = "$index_dir/site.dat.index";
1565 my $fragment_index = "$index_dir/fragment.dat.index";
1567 foreach ($gene_index, $reference_index, $matrix_index, $factor_index, $site_index, $fragment_index) {
1569 #$self->warn("Index files have not been created");
1576 $self->{reference
}->{data
} = {};
1577 tie
(%{$self->{reference
}->{data
}}, 'DB_File', $reference_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
1579 my $reference_pubmed = $reference_index.'.pubmed';
1580 $self->{reference
}->{pubmed
} = tie
(%{$self->{reference
}->{pubmed
}}, 'DB_File', $reference_pubmed, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
1582 my $reference_gene = $gene_index.'.reference';
1583 $self->{gene
}->{reference
} = tie
(%{$self->{gene
}->{reference
}}, 'DB_File', $reference_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
1585 my $reference_site = $site_index.'.reference';
1586 $self->{site
}->{reference
} = tie
(%{$self->{site
}->{reference
}}, 'DB_File', $reference_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
1588 my $reference_fragment = $fragment_index.'.reference';
1589 $self->{fragment
}->{reference
} = tie
(%{$self->{fragment
}->{reference
}}, 'DB_File', $reference_fragment, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
1591 my $reference_factor = $factor_index.'.reference';
1592 $self->{factor
}->{reference
} = tie
(%{$self->{factor
}->{reference
}}, 'DB_File', $reference_factor, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
1594 my $reference_matrix = $matrix_index.'.reference';
1595 $self->{matrix
}->{reference
} = tie
(%{$self->{matrix
}->{reference
}}, 'DB_File', $reference_matrix, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
1600 $self->{gene
}->{data
} = {};
1601 tie
(%{$self->{gene
}->{data
}}, 'DB_File', $gene_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
1603 my $gene_id = $gene_index.'.id';
1604 $self->{gene
}->{id
} = tie
(%{$self->{gene
}->{id
}}, 'DB_File', $gene_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
1606 my $gene_name = $gene_index.'.name';
1607 $self->{gene
}->{name
} = tie
(%{$self->{gene
}->{name
}}, 'DB_File', $gene_name, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
1609 my $gene_species = $gene_index.'.species';
1610 $self->{gene
}->{species
} = tie
(%{$self->{gene
}->{species
}}, 'DB_File', $gene_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
1612 my $gene_site = $site_index.'.gene';
1613 $self->{site
}->{gene
} = tie
(%{$self->{site
}->{gene
}}, 'DB_File', $gene_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
1615 my $gene_fragment = $fragment_index.'.gene';
1616 $self->{fragment
}->{gene
} = tie
(%{$self->{fragment
}->{gene
}}, 'DB_File', $gene_fragment, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
1618 my $gene_factor = $factor_index.'.gene';
1619 $self->{factor
}->{gene
} = tie
(%{$self->{factor
}->{gene
}}, 'DB_File', $gene_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
1621 my $gene_reference = $reference_index.'.gene';
1622 $self->{reference
}->{gene
} = tie
(%{$self->{reference
}->{gene
}}, 'DB_File', $gene_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
1627 $self->{site
}->{data
} = {};
1628 tie
(%{$self->{site
}->{data
}}, 'DB_File', $site_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1630 my $site_id = $site_index.'.id';
1631 $self->{site
}->{id
} = tie
(%{$self->{site
}->{id
}}, 'DB_File', $site_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1633 my $site_species = $site_index.'.species';
1634 $self->{site
}->{species
} = tie
(%{$self->{site
}->{species
}}, 'DB_File', $site_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file $site_species': $!");
1636 #*** quality not actually used by anything (yet)
1637 my $site_qualities = $site_index.'.qual';
1638 $self->{quality
} = {};
1639 tie
(%{$self->{quality
}}, 'DB_File', $site_qualities, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1641 my $site_gene = $gene_index.'.site';
1642 $self->{gene
}->{site
} = tie
(%{$self->{gene
}->{site
}}, 'DB_File', $site_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1644 my $site_matrix = $matrix_index.'.site';
1645 $self->{matrix
}->{site
} = tie
(%{$self->{matrix
}->{site
}}, 'DB_File', $site_matrix, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1647 my $site_factor = $factor_index.'.site';
1648 $self->{factor
}->{site
} = tie
(%{$self->{factor
}->{site
}}, 'DB_File', $site_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1650 my $site_reference = $reference_index.'.site';
1651 $self->{reference
}->{site
} = tie
(%{$self->{reference
}->{site
}}, 'DB_File', $site_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1654 # fragment (may not be in older databases)
1655 if (-e
$fragment_index) {
1656 $self->{fragment
}->{data
} = {};
1657 tie
(%{$self->{fragment
}->{data
}}, 'DB_File', $fragment_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1659 my $fragment_id = $fragment_index.'.id';
1660 $self->{fragment
}->{id
} = tie
(%{$self->{fragment
}->{id
}}, 'DB_File', $fragment_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1662 my $fragment_species = $fragment_index.'.species';
1663 $self->{fragment
}->{species
} = tie
(%{$self->{fragment
}->{species
}}, 'DB_File', $fragment_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file $fragment_species': $!");
1665 #*** quality not actually used by anything (yet)
1666 my $fragment_qualities = $fragment_index.'.qual';
1667 $self->{fragment_quality
} = {};
1668 tie
(%{$self->{fragment_quality
}}, 'DB_File', $fragment_qualities, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1670 my $fragment_gene = $gene_index.'.fragment';
1671 $self->{gene
}->{fragment
} = tie
(%{$self->{gene
}->{fragment
}}, 'DB_File', $fragment_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1673 my $fragment_factor = $factor_index.'.fragment';
1674 $self->{factor
}->{fragment
} = tie
(%{$self->{factor
}->{fragment
}}, 'DB_File', $fragment_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1676 my $fragment_reference = $reference_index.'.fragment';
1677 $self->{reference
}->{fragment
} = tie
(%{$self->{reference
}->{fragment
}}, 'DB_File', $fragment_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1680 die "no fragment_index at '$fragment_index'\n";
1685 $self->{matrix
}->{data
} = {};
1686 tie
(%{$self->{matrix
}->{data
}}, 'DB_File', $matrix_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1688 my $matrix_id = $matrix_index.'.id';
1689 $self->{matrix
}->{id
} = tie
(%{$self->{matrix
}->{id
}}, 'DB_File', $matrix_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1691 my $matrix_name = $matrix_index.'.name';
1692 $self->{matrix
}->{name
} = tie
(%{$self->{matrix
}->{name
}}, 'DB_File', $matrix_name, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1694 my $matrix_site = $site_index.'.matrix';
1695 $self->{site
}->{matrix
} = tie
(%{$self->{site
}->{matrix
}}, 'DB_File', $matrix_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1697 my $matrix_factor = $factor_index.'.matrix';
1698 $self->{factor
}->{matrix
} = tie
(%{$self->{factor
}->{matrix
}}, 'DB_File', $matrix_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1700 my $matrix_reference = $reference_index.'.matrix';
1701 $self->{reference
}->{matrix
} = tie
(%{$self->{reference
}->{matrix
}}, 'DB_File', $matrix_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1706 $self->{factor
}->{data
} = {};
1707 tie
(%{$self->{factor
}->{data
}}, 'DB_File', $factor_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1709 my $factor_id = $factor_index.'.id';
1710 $self->{factor
}->{id
} = tie
(%{$self->{factor
}->{id
}}, 'DB_File', $factor_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1712 my $factor_name = $factor_index.'.name';
1713 $self->{factor
}->{name
} = tie
(%{$self->{factor
}->{name
}}, 'DB_File', $factor_name, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1715 my $factor_species = $factor_index.'.species';
1716 $self->{factor
}->{species
} = tie
(%{$self->{factor
}->{species
}}, 'DB_File', $factor_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1718 my $factor_interactors = $factor_index.'.interactors';
1719 $self->{factor
}->{interactors
} = tie
(%{$self->{factor
}->{interactors
}}, 'DB_File', $factor_interactors, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1721 my $factor_gene = $gene_index.'.factor';
1722 $self->{gene
}->{factor
} = tie
(%{$self->{gene
}->{factor
}}, 'DB_File', $factor_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1724 my $factor_matrix = $matrix_index.'.factor';
1725 $self->{matrix
}->{factor
} = tie
(%{$self->{matrix
}->{factor
}}, 'DB_File', $factor_matrix, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1727 my $factor_site = $site_index.'.factor';
1728 $self->{site
}->{factor
} = tie
(%{$self->{site
}->{factor
}}, 'DB_File', $factor_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1730 my $factor_fragment = $fragment_index.'.factor';
1731 $self->{fragment
}->{factor
} = tie
(%{$self->{fragment
}->{factor
}}, 'DB_File', $factor_fragment, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1733 my $factor_reference = $reference_index.'.factor';
1734 $self->{reference
}->{factor
} = tie
(%{$self->{reference
}->{factor
}}, 'DB_File', $factor_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1737 $self->{'_initialized'} = 1;
1740 =head2 index_directory
1742 Title : index_directory
1743 Funtion : Get/set the location that index files are stored. (this module
1744 will index the supplied database)
1745 Usage : $obj->index_directory($newval)
1746 Returns : value of index_directory (a scalar)
1747 Args : on set, new value (a scalar or undef, optional)
1751 sub index_directory
{
1753 return $self->{'index_directory'} = shift if @_;
1754 return $self->{'index_directory'};
1757 # resolve a transfac species string into an ncbi taxid
1758 sub _species_to_taxid
{
1759 my ($self, $raw_species) = @_;
1760 $raw_species or return;
1763 my @split = split(', ', $raw_species);
1764 (@split > 1) ?
($species_string = $split[1]) : ($species_string = $split[0]);
1767 if ($species_string =~ /^[A-Z]\S+ \S+$/) {
1768 SWITCH
: for ($species_string) {
1769 # some species don't classify so custom handling
1770 /^Darnel ryegrass/ && do { $ncbi_taxid = 34176; last; };
1771 /^Coix lacryma/ && do { $ncbi_taxid = 4505; last; };
1772 /^Rattus spec/ && do { $ncbi_taxid = 10116; last; };
1773 /^Mus spec/ && do { $ncbi_taxid = 10090; last; };
1774 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1775 /^Cavia sp/ && do { $ncbi_taxid = 10141; last; };
1776 /^Marsh marigold/ && do { $ncbi_taxid = 3449; last; };
1777 /^Phalaenopsis sp/ && do { $ncbi_taxid = 36900; last; };
1778 /^Anthirrhinum majus/ && do { $ncbi_taxid = 4151; last; };
1779 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1780 /^Lycopodium spec/ && do { $ncbi_taxid = 13840; last; };
1781 /^Autographa californica/ && do { $ncbi_taxid = 307456; last; };
1782 /^E26 AEV/ && do { $ncbi_taxid = 31920; last; };
1783 /^Pseudocentrotus miliaris/ && do { $ncbi_taxid = 7677; last; }; # the genus is 7677 but this species isn't there
1784 /^SL3-3 (?:retro)?virus/ && do { $ncbi_taxid = 53454; last; }; # 53454 is unclassified MLV-related, SL3-3 a variant of that?
1785 /^Petunia sp/ && do { $ncbi_taxid = 4104; last; };
1787 if (! $ncbi_taxid && defined $self->{_tax_db
}) {
1788 ($ncbi_taxid) = $self->{_tax_db
}->get_taxonids($species_string);
1792 # some species lines are poorly formated so custom handling
1793 SWITCH
: for ($raw_species) {
1794 # for speed, go by common first letters
1795 my $first_letter = substr($raw_species, 0, 1);
1797 $first_letter eq 'A' && do {
1798 /^Adiantum raddianum/ && do { $ncbi_taxid = 32168; last; };
1799 /^Avian sarcoma virus \(strain 17\)/ && do { $ncbi_taxid = 11877; last; };
1800 /^AMV/ && do { $ncbi_taxid = 11866; last; };
1801 /^AEV/ && do { $ncbi_taxid = 11861; last; };
1802 /^AS42|^Avian musculoaponeurotic/ && do { $ncbi_taxid = 11873; last; };
1803 /^Avian myelocytomatosis/ && do { $ncbi_taxid = 11869; last; };
1804 /^ASV 31/ && do { $ncbi_taxid = 35270; last; };
1805 /^A-MuLV/ && do { $ncbi_taxid = 188539; last; };
1806 /^Asparagus officinalis/ && do { $ncbi_taxid = 4686; last; };
1807 /^Agrobacterium tumefaciens/ && do { $ncbi_taxid = 358; last; };
1808 /^ALV/ && do { $ncbi_taxid = 11864; last; };
1809 /^AAV/ && do { $ncbi_taxid = 272636; last; };
1810 /^AKV MLV/ && do { $ncbi_taxid = 11791; last; };
1814 $first_letter eq 'B' && do {
1815 /^BPV-1/ && do { $ncbi_taxid = 10559; last; };
1816 /^BKV/ && do { $ncbi_taxid = 10629; last; };
1817 /^Bolivian squirrel monkey/ && do { $ncbi_taxid = 39432; last; };
1821 $first_letter eq 'C' && do {
1822 /^Cauliflower/ && do { $ncbi_taxid = 3715; last; };
1823 /^Chamek/ && do { $ncbi_taxid = 118643; last; };
1824 /^Candida albicans/ && do { $ncbi_taxid = 5476; last; };
1825 /^CaMV/ && do { $ncbi_taxid = 10641; last; };
1829 $first_letter eq 'E' && do {
1830 /^Eucalyptus gunnii/ && do { $ncbi_taxid = 3933; last; };
1831 /^EBV, Epstein-Barr virus/ && do { $ncbi_taxid = 10376; last; };
1832 /^Eucalyptus globulus subsp. bicostata/ && do { $ncbi_taxid = 71272; last; };
1833 /^Eucalyptus globulus subsp. globulus/ && do { $ncbi_taxid = 71271; last; };
1837 $first_letter eq 'F' && do {
1838 /^FBR MuLV/ && do { $ncbi_taxid = 11806; last; };
1839 /^FBJ MuLV/ && do { $ncbi_taxid = 11805; last; };
1840 /^FeLV|Feline leukemia/ && do { $ncbi_taxid = 11923; last; };
1841 /^Flaveria trinervia/ && do { $ncbi_taxid = 4227; last; };
1842 /^FSV/ && do { $ncbi_taxid = 11885; last; };
1843 /^F-MuLV/ && do { $ncbi_taxid = 11795; last; };
1847 $first_letter eq 'H' && do {
1848 /^HSV-1/ && do { $ncbi_taxid = 10298; last; };
1849 /^HTLV-I/ && do { $ncbi_taxid = 11908; last; };
1850 /^HIV-1/ && do { $ncbi_taxid = 11676; last; };
1851 /^HPV-16/ && do { $ncbi_taxid = 333760; last; };
1852 /^HBV/ && do { $ncbi_taxid = 10407; last; };
1853 /^HBI/ && do { $ncbi_taxid = 11867; last; };
1854 /^HPV-8/ && do { $ncbi_taxid = 10579; last; };
1855 /^HPV-11/ && do { $ncbi_taxid = 10580; last; };
1856 /^HPV-18/ && do { $ncbi_taxid = 333761; last; };
1857 /^HCMV/ && do { $ncbi_taxid = 10359; last; };
1858 /^HSV/ && do { $ncbi_taxid = 126283; last; };
1859 /^HSV-2/ && do { $ncbi_taxid = 10310; last; };
1860 /^HCV/ && do { $ncbi_taxid = 11108; last; };
1861 /^HIV-2/ && do { $ncbi_taxid = 11709; last; };
1865 $first_letter eq 'M' && do {
1866 /^MMTV/ && do { $ncbi_taxid = 11757; last; };
1867 /^Mo-MuLV/ && do { $ncbi_taxid = 11801; last; };
1868 /^MuLV/ && do { $ncbi_taxid = 11786; last; };
1869 /^MSV/ && do { $ncbi_taxid = 11802; last; };
1870 /^MC29/ && do { $ncbi_taxid = 11868; last; };
1871 /^MVM/ && do { $ncbi_taxid = 10794; last; };
1872 /^MH2E21/ && do { $ncbi_taxid = 11955; last; }; # 11955 is a species, presumably MH2E21 is the strain
1876 $first_letter eq 'R' && do {
1877 /^Raphanus sativus/ && do { $ncbi_taxid = 3726; last; };
1878 /^REV-T/ && do { $ncbi_taxid = 11636; last; };
1879 /^RAV-0/ && do { $ncbi_taxid = 11867; last; }; # should be rous-associated virus 0 variant
1880 /^RSV/ && do { $ncbi_taxid = 11886; last; };
1881 /^RadLV/ && do { $ncbi_taxid = 31689; last; };
1882 /^RTBV/ && do { $ncbi_taxid = 10654; last; };
1886 $first_letter eq 'S' && do {
1887 /^SV40/ && do { $ncbi_taxid = 10633; last; };
1888 /^Sesbania rostrata/ && do { $ncbi_taxid = 3895; last; };
1889 /^SIV/ && do { $ncbi_taxid = 11723; last; };
1890 /^Spinacia oleracea/ && do { $ncbi_taxid = 3562; last; };
1891 /^SCMV/ && do { $ncbi_taxid = 10364; last; }; # supposed to be AGM isolate
1896 $first_letter eq 'a' && do {
1897 /^adenovirus type 5/ && do { $ncbi_taxid = 28285; last; };
1898 /^adenovirus type 2/ && do { $ncbi_taxid = 10515; last; };
1899 /^adenovirus/ && do { $ncbi_taxid = 189831; last; }; # 189831 ('unclassified Adenoviridae') is the closest I can get, but this has no genus and is not a species
1903 $first_letter eq 'b' && do {
1904 /^bell pepper/ && do { $ncbi_taxid = 4072; last; };
1905 /^baculovirus, Autographa californica/ && do { $ncbi_taxid = 46015; last; };
1906 /^broccoli/ && do { $ncbi_taxid = 36774; last; };
1907 /^barley/ && do { $ncbi_taxid = 112509; last; };
1911 $first_letter eq 'c' && do {
1912 /^clawed frog/ && do { $ncbi_taxid = 8355; last; };
1913 /^chipmunk/ && do { $ncbi_taxid = 64680; last; };
1914 /^common tree shrew/ && do { $ncbi_taxid = 37347; last; };
1915 /^cat/ && do { $ncbi_taxid = 9685; last; };
1920 /^NK24/ && do { $ncbi_taxid = 11955; last; };
1921 /^OK10/ && do { $ncbi_taxid = 11871; last; };
1922 /^Dendrobium grex/ && do { $ncbi_taxid = 84618; last; };
1923 /^KSHV/ && do { $ncbi_taxid = 37296; last; };
1924 /^Oncidium/ && do { $ncbi_taxid = 96474; last; };
1925 /^Japanese quail/ && do { $ncbi_taxid = 93934; last; };
1926 /^Nile tilapia/ && do { $ncbi_taxid = 8128; last; };
1927 /^GALV/ && do { $ncbi_taxid = 11840; last; };
1928 /^JCV/ && do { $ncbi_taxid = 10632; last; };
1929 /^LPV/ && do { $ncbi_taxid = 10574; last; };
1930 /^Py,/ && do { $ncbi_taxid = 36362; last; };
1931 /^DHBV/ && do { $ncbi_taxid = 12639; last; };
1932 /^VZV/ && do { $ncbi_taxid = 10335; last; };
1933 /^Vicia faba/ && do { $ncbi_taxid = 3906; last; };
1935 /^hamster/ && do { $ncbi_taxid = 10029; last; };
1936 /^sea urchin/ && do { $ncbi_taxid = 7668; last; };
1937 /^fruit fly/ && do { $ncbi_taxid = 7227; last; };
1938 /^halibut/ && do { $ncbi_taxid = 8267; last; };
1939 /^vaccinia virus/ && do { $ncbi_taxid = 10245; last; };
1940 /^taxonomic class Mammalia/ && do { $ncbi_taxid = 40674; last; }; # not a species
1941 /^taxonomic class Vertebrata/ && do { $ncbi_taxid = 7742; last; }; # not a species
1942 /^dog/ && do { $ncbi_taxid = 9615; last; };
1943 /^parsley/ && do { $ncbi_taxid = 4043; last; };
1944 /^mouse, Mus domesticus Torino/ && do { $ncbi_taxid = 10092; last; }; # 10092 is domesticus subspecies, but not the Torino strain
1945 /^lemur, Eulemur fulvus collaris/ && do { $ncbi_taxid = 47178; last; };
1946 /^red sea bream/ && do { $ncbi_taxid = 143350; last; };
1947 /^zebra finch/ && do { $ncbi_taxid = 59729; last; };
1948 /^mung bean/ && do { $ncbi_taxid = 3916; last; };
1949 /^soybean/ && do { $ncbi_taxid = 3847; last; };
1950 /^oat/ && do { $ncbi_taxid = 4498; last; };
1951 /^pseudorabies virus/ && do { $ncbi_taxid = 10345; last; };
1955 $self->warn("Didn't know what species '$raw_species' was, unable to classify") unless $ncbi_taxid;
1961 # Destroy tied references to close filehandles
1962 # and allow proper temporary files deletion
1963 undef $self->{_tax_db
}->{'_nodes'};
1964 undef $self->{_tax_db
}->{'_id2name'};
1965 undef $self->{_tax_db
}->{'_name2id'};
1966 undef $self->{_tax_db
}->{'_parent2children'};
1967 undef $self->{_tax_db
}->{'_parentbtree'};