1 # $Id: transfac_pro.pm,v 1.15 2006/08/12 11:00:03 sendu Exp $
3 # BioPerl module for Bio::DB::TFBS::transfac_pro
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sendu Bala <bix@sendu.me.uk>
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::DB::TFBS::transfac_pro - An implementation of Bio::DB::TFBS
18 which uses local flat files for transfac pro
22 use Bio::DB::Taxonomy;
24 my $db = new Bio::DB::Taxonomy(-source => 'transfac_pro'
25 -dat_dir => $directory);
27 # we're interested in the gene P5
28 my ($gene_id) = $db->get_gene_ids(-name => 'P5'); # G000001
30 # we want all the transcription factors that bind to our gene
31 my @factor_ids = $db->get_factor_ids(-gene => $gene_id);
33 # get info about those TFs
34 foreach my $factor_id (@factor_ids) {
35 my $factor = $db->get_factor($factor_id);
36 my $name = $factor->universal_name;
37 # etc. - see Bio::Map::TranscriptionFactor, eg. find out where it binds
41 my $matrix = $db->get_matrix('M00001');
43 # get a binding site sequence
44 my $seq = $db->get_site('R00001');
48 This is an implementation which uses local flat files and the DB_File
49 module RECNO data structures to manage a local copy of the Transfac Pro TFBS
52 Required database files require a license which can be obtained via
53 http://www.biobase-international.com/pages/index.php?id=170
55 Within the linux installation tarball you will find a cgibin tar ball, and
56 inside that is a data directory containing the .dat files needed by this
57 module. Point to that data directory with -dat_dir
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to
65 the Bioperl mailing list. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 Please direct usage questions or support issues to the mailing list:
74 I<bioperl-l@bioperl.org>
76 rather than to the module maintainer directly. Many experienced and
77 reponsive experts will be able look at the problem and quickly
78 address it. Please include a thorough description of the problem
79 with code and data examples if at all possible.
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 of the bugs and their resolution. Bug reports can be submitted via
87 http://bugzilla.open-bio.org/
89 =head1 AUTHOR - Sendu Bala
95 Based on Bio::DB::Taxonomy::flatfile by Jason Stajich
99 The rest of the documentation details each of the object methods.
100 Internal methods are usually preceded with a _
104 # Let the code begin...
106 package Bio
::DB
::TFBS
::transfac_pro
;
108 use Bio
::Annotation
::Reference
;
109 use Bio
::Annotation
::SimpleValue
;
110 use Bio
::LocatableSeq
;
111 use Bio
::SimpleAlign
;
112 use Bio
::Matrix
::PSM
::SiteMatrix
;
114 use Bio
::Map
::GeneMap
;
115 use Bio
::Map
::TranscriptionFactor
;
116 use Bio
::Map
::Position
;
117 use Bio
::Map
::Relative
;
120 use constant SEPARATOR
=> ':!:';
121 use constant INTERNAL_SEPARATOR
=> '!:!';
123 $DB_BTREE->{'flags'} = R_DUP
; # allow duplicate values in DB_File BTREEs
125 use base
qw(Bio::DB::TFBS);
130 Usage : my $obj = new Bio::DB::TFBS::transfac_pro();
131 Function: Builds a new Bio::DB::TFBS::transfac_pro object
132 Returns : an instance of Bio::DB::TTFBS::transfac_pro
133 Args : -dat_dir => name of directory where Transfac Pro .dat files
134 (required to initially build indexes)
135 -tax_db => Bio::DB::Taxonomy object, used when initially building
136 indexes, gives better results for species information
138 -index_dir => name of directory where index files should be created
139 or already exist. (defaults to -dat_dir, required if
140 -dat_dir not supplied)
141 -force => 1 replace current indexes even if they exist
146 my ($class, @args) = @_;
148 my $self = $class->SUPER::new
(@args);
150 my ($dat_dir, $index_dir, $tax_db, $force) = $self->_rearrange([qw(DAT_DIR INDEX_DIR TAX_DB FORCE)], @args);
151 $self->throw("At least one of -dat_dir and -index_dir must be supplied") unless ($dat_dir || $index_dir);
153 $self->index_directory($index_dir || $dat_dir);
154 $self->{_tax_db
} = $tax_db if $tax_db;
157 $self->_build_index($dat_dir, $force);
164 =head2 Bio::DB::TFBS Interface implementation
169 my ($self, $dat, @args) = @_;
170 @args % 2 == 0 || $self->throw("Must provide key => value pairs");
171 my $hash = $self->{$dat} || $self->throw("Unknown .dat type '$dat'");
174 # get a subset corresponding to args
178 while (my ($type, $value) = each %args) {
180 $self->warn("Arguement '$type' has no value, ignored");
185 my $converter = $hash->{$type};
186 unless ($converter) {
187 $self->warn("Unknown search type '$type' for .dat type '$dat'");
191 my @ids = $converter->get_dup($value);
193 @ids = $converter->get_dup(lc($value));
197 # we can have multiple types given at once, find the ids that
198 # satisfy all criteria
200 my %final = map { $_ => 1 } @final;
201 @final = grep { $final{$_} } @ids;
213 my $db_file_hash = $self->{$dat}->{id
};
215 my ($key, $prev_key, $value) = ('_!_', '!_!');
218 $db_file_hash->seq($key, $value, R_NEXT
);
219 last if $prev_key eq $key;
220 push(@ids, $value); # confusing? when creating objects we store
221 # $value as accession and $key as id, but from
222 # this method we return $value as id given $id!
232 Title : get_reference
233 Usage : my $ref = $obj->get_reference($id);
234 Function: Get a literature reference.
235 Returns : Bio::Annotation::Reference
236 Args : string - a reference id ('RE...')
241 my ($self, $id) = @_;
243 my $data = $self->{reference
}->{data
}->{$id} || return;
244 my @data = split(SEPARATOR
, $data);
246 return Bio
::Annotation
::Reference
->new(-pubmed
=> $data[0],
247 -authors
=> $data[1],
249 -location
=> $data[3] );
255 Usage : my $map = $obj->get_genemap($id);
256 Function: Get a GeneMap for a gene.
257 Returns : Bio::Map::GeneMap
258 Args : string - a gene id ('G...'), and optionally int (number of bp
264 my ($self, $id, $upstream) = @_;
266 return $self->{got_map
}->{$id} if defined $self->{got_map
}->{$id};
268 my $data = $self->{gene
}->{data
}->{$id} || return;
269 my @data = split(SEPARATOR
, $data);
271 # accession = id name description species_tax_id_or_raw_string
272 my $taxon = $self->{_tax_db
} ?
$self->{_tax_db
}->get_taxon($data[3]) || $data[3] : $data[3];
273 my $map = Bio
::Map
::GeneMap
->get(-uid
=> $id,
276 -description
=> $data[2],
277 -upstream
=> $upstream);
278 $self->{got_map
}->{$id} = $map; # prevents infinite recurse when we call get_factor below
280 # spawn all the factors that belong on this gene map
281 # get_factor_ids(-gene => ...) only works for genes that encode factors;
282 # have to go via sites
283 foreach my $sid ($self->get_site_ids(-gene
=> $id)) {
284 foreach my $fid ($self->get_factor_ids(-site
=> $sid)) {
285 # it is quite deliberate that we deeply recurse to arrive at the
286 # correct answer, which involves pulling in most of the database
287 no warnings
"recursion";
288 $self->get_factor($fid);
298 Usage : my $seq = $obj->get_seq($id);
299 Function: Get the sequence of a site. The sequence will be annotated with the
300 the tags 'relative_start', 'relative_end', 'relative_type' and
303 Args : string - a site id ('R...')
308 my ($self, $id) = @_;
310 my $data = $self->{site
}->{data
}->{$id} || return;
311 my @data = split(SEPARATOR
, $data);
313 my $seq = Bio
::Seq
->new(-seq
=> $data[2],
314 -accession_number
=> $id,
315 -description
=> $data[6] ?
'Genomic sequence' : 'Consensus or artificial sequence',
318 -alphabet
=> $data[7] || 'dna',
319 -species
=> $data[6]);
321 my $annot = $seq->annotation;
322 my $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_start', -value
=> $data[4] || 1);
323 $annot->add_Annotation($sv);
324 $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_end', -value
=> $data[5] || ($data[4] || 1 + length($data[2]) - 1));
325 $annot->add_Annotation($sv);
326 $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_type', -value
=> $data[3] || 'artificial');
327 $annot->add_Annotation($sv);
328 $sv = Bio
::Annotation
::SimpleValue
->new(-tagname
=> 'relative_to', -value
=> $data[1]);
329 $annot->add_Annotation($sv);
337 Usage : my $seq = $obj->get_fragment($id);
338 Function: Get the sequence of a fragment.
340 Args : string - a site id ('FR...')
345 my ($self, $id) = @_;
347 my $data = $self->{fragment
}->{data
}->{$id} || return;
348 my @data = split(SEPARATOR
, $data);
350 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
351 return new Bio
::Seq
( -seq
=> $data[4],
352 -accession_number
=> $id,
353 -description
=> 'Between genes '.$data[1].' and '.$data[2],
354 -species
=> $data[3],
356 -alphabet
=> 'dna' );
362 Usage : my $matrix = $obj->get_matrix($id);
363 Function: Get a matrix that describes a binding site.
364 Returns : Bio::Matrix::PSM::SiteMatrix
365 Args : string - a matrix id ('M...'), optionally a sequence string from
366 which base frequencies will be calcualted for the matrix model
372 my ($self, $id, $seq) = @_;
376 my $data = $self->{matrix
}->{data
}->{$id} || return;
377 my @data = split(SEPARATOR
, $data);
378 $data[4] || $self->throw("Matrix data missing for $id");
381 foreach my $position (split(INTERNAL_SEPARATOR
, $data[4])) {
382 my ($a_count, $c_count, $g_count, $t_count) = split("\t", $position);
383 push(@
{$a}, $a_count);
384 push(@
{$c}, $c_count);
385 push(@
{$g}, $g_count);
386 push(@
{$t}, $t_count);
389 # our psms include a simple background model so we can use
390 # sequence_match_weight() if desired
391 my $a_freq = ($seq =~ tr/a//) / length($seq);
392 my $c_freq = ($seq =~ tr/c//) / length($seq);
393 my $g_freq = ($seq =~ tr/g//) / length($seq);
394 my $t_freq = ($seq =~ tr/t//) / length($seq);
396 my $psm = new Bio
::Matrix
::PSM
::SiteMatrix
( -pA
=> $a,
401 -accession_number
=> $id,
403 -width
=> scalar(@
{$a}),
405 -model
=> { A
=> $a_freq, C
=> $c_freq, G
=> $g_freq, T
=> $t_freq } );
407 #*** used to make a Bio::Matrix::PSM::Psm and add references, but it
408 # didn't seem worth it. You can get references from the database by:
409 #foreach my $ref_id ($db->get_reference_ids(-matrix => $id)) {
410 # my $ref = $db->get_reference($ref_id);
419 Usage : my $aln = $obj->get_aln($id);
420 Function: Get the alignment that was used to generate a matrix. Each sequence
421 in the alignment will have an accession_number corresponding to the
422 Transfac site id, and id() based on that but unique within the
424 Returns : Bio::SimpleAlign
425 Args : string - a matrix id ('M...'), optionally true to, when a matrix
426 lists no sequences, search for sequences via the matrix's factors,
427 picking the sites that best match the matrix
432 my ($self, $id, $via_factors) = @_;
434 my $data = $self->{matrix
}->{data
}->{$id} || $self->throw("matrix '$id' had no data in DB_File");
435 my @data = split(SEPARATOR
, $data);
437 if (! $data[5] && $via_factors) {
438 # This is a matrix with no site sequences given in matrix.dat.
439 # Find some matching site sequences via factors.
441 # First, check its factors for sites
444 foreach my $factor_id ($self->get_factor_ids(-matrix
=> $id)) {
445 $factor_ids{$factor_id} = 1;
446 foreach my $site_id ($self->get_site_ids(-factor
=> $factor_id)) {
447 next if defined $site_seqs{$site_id};
448 my $seq = $self->get_seq($site_id);
450 # skip sites that have no sequence, or have IUPAC symbols in
451 # their sequence (most probably the 'consensus' sequence itself
452 # that was used to make and exactly corresponds to the matrix)
453 my $seq_str = $seq->seq || next;
454 $seq_str =~ /[MRWSYKVHDB]/ and next;
456 $site_seqs{$site_id} = $seq;
459 my @seqs = values %site_seqs;
462 # pick the sub-seqs that match to the matrix with the best scores
463 my $matrix = $self->get_matrix($id);
464 my $desired_sequences = $matrix->sites;
465 return if @seqs < $desired_sequences;
467 my $desired_length = $matrix->width;
469 foreach my $seq (@seqs) {
470 my $for_str = $seq->seq;
471 next if length($for_str) < $desired_length;
472 my $rev_str = $seq->revcom->seq;
475 my $best_subseq = '';
477 my $best_subseq_caps = 0;
480 foreach my $seq_str ($for_str, $rev_str) {
481 for my $i (0..(length($seq_str) - $desired_length)) {
482 my $subseq = substr($seq_str, $i, $desired_length);
483 $subseq =~ s/[^ACGTacgt]//g; # can only score atcg
484 next unless length($subseq) == $desired_length; # short or 0-length seqs could get the highest scores!
485 my $score = $matrix->sequence_match_weight($subseq);
487 # caps represent the author-chosen bit of a site
488 # sequence so we would prefer to choose a subseq that
490 my $caps = $subseq =~ tr/ACGT//;
492 #*** (don't know why numeric == fails for comparing
493 # scores, when the string eq works)
494 if ($score > $best_score || ("$score" eq "$best_score" && $caps > $best_subseq_caps)) {
495 $best_score = $score;
496 $best_subseq_caps = $caps;
497 $best_subseq = $subseq;
499 $best_revcom = $revcom;
506 $best_seqs{$seq->accession_number} = [$best_subseq, $seq->accession_number, ($best_i + 1), $revcom ?
-1 : 1, $best_score];
509 my @sorted = sort { $best_seqs{$b}->[-1] <=> $best_seqs{$a}->[-1] } keys %best_seqs;
510 return if @sorted < $desired_sequences;
511 splice(@sorted, $desired_sequences);
512 my %wanted = map { $_ => 1 } @sorted;
515 foreach my $seq (@seqs) {
516 next unless exists $wanted{$seq->accession_number};
517 my @data = @
{$best_seqs{$seq->accession_number}};
519 push(@site_data, join('_', @data));
522 $data[5] = join(INTERNAL_SEPARATOR
, @site_data);
523 $self->{matrix
}->{data
}->{$id} = join(SEPARATOR
, @data);
528 my @blocks = split(INTERNAL_SEPARATOR
, $data[5]);
530 # append gap chars to all sequences to make them the same length
531 # (applies to sequences found via factors, presumably, since we already
532 # do this for matrix alignments in transfac_pro.pm)
535 my ($seq) = split('_', $_);
536 my $length = length($seq);
537 if ($length > $longest) {
541 foreach my $i (0..$#blocks) {
542 my $block = $blocks[$i];
543 my ($seq, $seq_id) = split('_', $block);
544 my $length = length($seq);
545 if ($length < $longest) {
547 $seq .= '-'x
($longest - $length);
548 $block =~ s/^${orig_seq}_/${seq}_/;
549 $blocks[$i] = $block;
553 # build the alignment
554 my $aln = Bio
::SimpleAlign
->new(-source
=> 'transfac_pro');
557 my ($seq, $seq_acc, $start, $strand) = split('_', $_);
559 # we can get back multiple different subparts of the same site (sequence),
560 # so $seq_acc isn't unique across this loop. Can't use it as the seq id
561 # of the alignment (ids must be unique in SimpleAlign), so we
562 # uniquify the id and store the original id as the accession_number
564 $done_ids{$seq_acc}++;
565 if ($done_ids{$seq_acc} > 1) {
566 $seq_id = $seq_acc.'_'.$done_ids{$seq_acc};
572 my $gaps = $seq =~ tr/-//;
573 my $length = length($seq) - $gaps;
574 $self->throw("seq '$seq_id' for matrix '$id' had seq '$seq'") unless $length;
575 $aln->add_seq(Bio
::LocatableSeq
->new(-seq
=> $seq,
577 -accession_number
=> $seq_acc,
579 -end
=> $start + $length - 1,
580 -strand
=> $strand));
583 # could also store score? of?
591 Usage : my $factor = $obj->get_factor($id);
592 Function: Get the details of a transcription factor.
593 Returns : Bio::Map::TranscriptionFactor
594 Args : string - a factor id ('T...')
599 my ($self, $id) = @_;
601 return $self->{got_factor
}->{$id} if defined $self->{got_factor
}->{$id};
602 my $data = $self->{factor
}->{data
}->{$id} || return;
603 my @data = split(SEPARATOR
, $data);
605 # accession = id name species sequence
606 my $tf = Bio
::Map
::TranscriptionFactor
->get(-id
=> $id,
607 -universal_name
=> $data[1]);
608 #*** not sure what to do with species and sequence, since we don't want to
609 # confuse the idea that a TF is a general thing that could bind to any
610 # species... then again, you might want to model species-specific variants
611 # of a TF with different binding abilities...
612 #*** idea of having inclusion and exclusion species so you can prevent/
613 # ignore a tf that binds to the wrong species (a species that doesn't even
614 # have the tf), and associating sequence with each species/tf combo so you
615 # can see how diverged the tf is and make assumptions about site difference
618 # place it on all its genemaps
619 foreach my $sid ($self->get_site_ids(-factor
=> $id)) {
620 my $s_data = $self->{site
}->{data
}->{$sid} || next;
621 my @s_data = split(SEPARATOR
, $s_data);
623 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string
624 $s_data[1] || next; # site isn't relative to a gene, meaningless
625 $s_data[4] || next; # don't know where its supposed to be, can't model it
626 $s_data[5] ||= $s_data[4] + ($s_data[2] ?
length($s_data[2]) - 1 : 0);
628 # it is quite deliberate that we deeply recurse to arrive at the
629 # correct answer, which involves pulling in most of the database
630 no warnings
"recursion";
631 my $gene_map = $self->get_genemap($s_data[1]) || next;
632 return $self->{got_factor
}->{$id} if defined $self->{got_factor
}->{$id};
634 #*** not always relative to gene start...
635 # we need Bio::Map::Gene s to have some default tss and atg positions
636 # that we can be relative to
637 my $rel = new Bio
::Map
::Relative
(-element
=> $gene_map->gene, -description
=> $s_data[3]);
638 Bio
::Map
::Position
->new(-map => $gene_map, -element
=> $tf, -start
=> $s_data[4], -end
=> $s_data[5], -relative
=> $rel);
641 $self->{got_factor
}->{$id} = $tf;
645 # since get_factor() is uncertain, just have direct access methods to factor
647 sub get_factor_name
{
648 my ($self, $id) = @_;
649 my $details = $self->_get_factor_details($id) || return;
650 return $details->{name
};
652 sub get_factor_species
{
653 my ($self, $id) = @_;
654 my $details = $self->_get_factor_details($id) || return;
655 return $details->{species
};
657 sub get_factor_sequence
{
658 my ($self, $id) = @_;
659 my $details = $self->_get_factor_details($id) || return;
660 return $details->{sequence
};
662 sub _get_factor_details
{
663 my ($self, $id) = @_;
666 return $self->{factor_details
}->{$id} if defined $self->{factor_details
}->{$id};
668 my $data = $self->{factor
}->{data
}->{$id} || return;
669 my @data = split(SEPARATOR
, $data);
671 # accession = id name species sequence
673 my %details = (name
=> $data[1], species
=> $data[2], sequence
=> $data[3]);
674 $self->{factor_details
}->{$id} = \
%details;
679 =head2 get_reference_ids
681 Title : get_reference_ids
682 Usage : my @ids = $obj->get_reference_ids(-key => $value);
683 Function: Get all the reference ids that are associated with the supplied
685 Returns : list of strings (ids)
686 Args : -key => value, where value is a string id, and key is one of:
687 -pubmed -site -gene -matrix -factor
691 sub get_reference_ids
{
693 return $self->_get_ids('reference', @_);
696 # -id -name -species -site -factor -reference
699 return $self->_get_ids('gene', @_);
705 Usage : my @ids = $obj->get_site_ids(-key => $value);
706 Function: Get all the site ids that are associated with the supplied
708 Returns : list of strings (ids)
709 Args : -key => value, where value is a string id, and key is one of:
710 -id -species -gene -matrix -factor -reference
716 return $self->_get_ids('site', @_);
719 =head2 get_matrix_ids
721 Title : get_matrix_ids
722 Usage : my @ids = $obj->get_matrix_ids(-key => $value);
723 Function: Get all the matrix ids that are associated with the supplied
725 Returns : list of strings (ids)
726 Args : -key => value, where value is a string id, and key is one of:
727 -id -name -site -factor -reference
733 return $self->_get_ids('matrix', @_);
736 =head2 get_factor_ids
738 Title : get_factor_ids
739 Usage : my @ids = $obj->get_factor_ids(-key => $value);
740 Function: Get all the factor ids that are associated with the supplied
742 Returns : list of strings (ids)
743 Args : -key => value, where value is a string id, and key is one of:
744 -id -name -species -interactors -gene -matrix -site -reference
745 NB: -gene only gets factor ids for genes that encode factors
751 return $self->_get_ids('factor', @_);
754 =head2 get_fragment_ids
756 Title : get_fragment_ids
757 Usage : my @ids = $obj->get_fragment_ids(-key => $value);
758 Function: Get all the fragment ids that are associated with the supplied
760 Returns : list of strings (ids)
761 Args : -key => value, where value is a string id, and key is one of:
762 -id -species -gene -factor -reference
766 sub get_fragment_ids
{
768 return $self->_get_ids('fragment', @_);
771 =head2 Helper methods
775 # internal method which does the indexing
777 my ($self, $dat_dir, $force) = @_;
779 # MLDBM would give us transparent complex data structures with DB_File,
780 # allowing just one index file, but its yet another requirment and we
781 # don't strictly need it
783 my $index_dir = $self->index_directory;
784 my $gene_index = "$index_dir/gene.dat.index";
785 my $reference_index = "$index_dir/reference.dat.index";
786 my $matrix_index = "$index_dir/matrix.dat.index";
787 my $factor_index = "$index_dir/factor.dat.index";
788 my $fragment_index = "$index_dir/fragment.dat.index";
789 my $site_index = "$index_dir/site.dat.index";
791 my $reference_dat = "$dat_dir/reference.dat";
792 if (! -e
$reference_index || $force) {
793 open(REF
, $reference_dat) || $self->throw("Cannot open reference file '$reference_dat' for reading");
796 unlink $reference_index;
797 my $ref = tie
(%references, 'DB_File', $reference_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
800 my $reference_pubmed = $reference_index.'.pubmed';
801 unlink $reference_pubmed;
802 my $pub = tie
(%pubmed, 'DB_File', $reference_pubmed, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
805 my $reference_gene = $gene_index.'.reference';
806 unlink $reference_gene;
807 my $gene = tie
(%gene, 'DB_File', $reference_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
810 my $reference_site = $site_index.'.reference';
811 unlink $reference_site;
812 my $site = tie
(%site, 'DB_File', $reference_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
815 my $reference_fragment = $fragment_index.'.reference';
816 unlink $reference_fragment;
817 my $fragment = tie
(%fragment, 'DB_File', $reference_fragment, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
820 my $reference_factor = $factor_index.'.reference';
821 unlink $reference_factor;
822 my $factor = tie
(%factor, 'DB_File', $reference_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
825 my $reference_matrix = $matrix_index.'.reference';
826 unlink $reference_matrix;
827 my $matrix = tie
(%matrix, 'DB_File', $reference_matrix, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
829 # skip the first three header lines
837 elsif (/^RX PUBMED: (\d+)/) {
839 $pub->put("$1", $data[0]);
841 elsif (/^RA (.+)\n$/) {
844 elsif (/^RT (.+?)\.?\n$/) {
847 elsif (/^RL (.+?)\.?\n$/) {
850 elsif (/^GE TRANSFAC: (\w\d+)/) {
851 $gene->put($data[0], "$1");
853 elsif (/^BS TRANSFAC: (\w\d+)/) {
854 $site->put($data[0], "$1");
856 elsif (/^FA TRANSFAC: (\w\d+)/) {
857 $factor->put($data[0], "$1");
859 elsif (/^FR TRANSFAC: (FR\d+)/) {
860 $fragment->put($data[0], "$1");
862 elsif (/^MX TRANSFAC: (\w\d+)/) {
863 $matrix->put($data[0], "$1");
866 # end of a record, store previous data and reset
868 # accession = pubmed authors title location
869 $references{$data[0]} = join(SEPARATOR
, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || ''));
876 $ref = $pub = $gene = $site = $fragment = $factor = $matrix = undef;
886 my $gene_dat = "$dat_dir/gene.dat";
887 if (! -e
$gene_index || $force) {
888 open(GEN
, $gene_dat) || $self->throw("Cannot open gene file '$gene_dat' for reading");
892 my $gene = tie
(%genes, 'DB_File', $gene_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
895 my $gene_id = $gene_index.'.id';
897 my $id = tie
(%id, 'DB_File', $gene_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
900 my $gene_name = $gene_index.'.name';
902 my $name = tie
(%name, 'DB_File', $gene_name, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
905 my $gene_species = $gene_index.'.species';
906 unlink $gene_species;
907 my $species = tie
(%species, 'DB_File', $gene_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
910 my $gene_site = $site_index.'.gene';
912 my $site = tie
(%site, 'DB_File', $gene_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
915 my $gene_factor = $factor_index.'.gene';
917 my $factor = tie
(%factor, 'DB_File', $gene_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
920 my $gene_fragment = $fragment_index.'.gene';
921 unlink $gene_fragment;
922 my $fragment = tie
(%fragment, 'DB_File', $gene_fragment, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
925 my $gene_reference = $reference_index.'.gene';
926 unlink $gene_reference;
927 my $reference = tie
(%reference, 'DB_File', $gene_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
929 # skip the first three header lines
937 elsif (/^ID (\S+)/) {
939 $id->put("$1", $data[0]);
941 elsif (/^SD (.+)$/) {
943 $name->put(lc("$1"), $data[0]);
945 elsif (/^SY (.+)\.$/) {
946 foreach (split('; ', lc("$1"))) {
947 $name->put($_, $data[0]);
950 elsif (/^DE (.+)$/) {
953 elsif (/^OS (.+)$/) {
954 my $raw_species = $1;
955 my $taxid = $self->_species_to_taxid($raw_species);
956 $data[4] = $taxid || $raw_species;
957 $species->put($data[4], $data[0]);
959 elsif (/^RN .+?(RE\d+)/) {
960 $reference->put($data[0], "$1");
962 elsif (/^BS .+?(R\d+)/) {
963 $site->put($data[0], "$1");
965 elsif (/^FA (T\d+)/) {
966 $factor->put($data[0], "$1");
968 elsif (/^BR (FR\d+)/) {
969 $fragment->put($data[0], "$1");
972 # end of a record, store previous data and reset
974 # accession = id name description species_tax_id_or_raw_string
975 $genes{$data[0]} = join(SEPARATOR
, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || ''));
982 $gene = $id = $name = $species = $site = $factor = $reference = undef;
992 my $site_dat = "$dat_dir/site.dat";
993 if (! -e
$site_index || $force) {
994 open(SIT
, $site_dat) || $self->throw("Cannot open site file '$site_dat' for reading");
998 my $site = tie
(%sites, 'DB_File', $site_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1001 my $site_id = $site_index.'.id';
1003 my $id = tie
(%id, 'DB_File', $site_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1006 my $site_species = $site_index.'.species';
1007 unlink $site_species;
1008 my $species = tie
(%species, 'DB_File', $site_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_species': $!");
1011 my $site_qualities = $site_index.'.qual';
1012 unlink $site_qualities;
1013 my $quality = tie
(%qualities, 'DB_File', $site_qualities, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1016 my $site_gene = $gene_index.'.site';
1018 my $gene = tie
(%gene, 'DB_File', $site_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1021 my $site_matrix = $matrix_index.'.site';
1022 unlink $site_matrix;
1023 my $matrix = tie
(%matrix, 'DB_File', $site_matrix, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1026 my $site_factor = $factor_index.'.site';
1027 unlink $site_factor;
1028 my $factor = tie
(%factor, 'DB_File', $site_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1031 my $site_reference = $reference_index.'.site';
1032 unlink $site_reference;
1033 my $reference = tie
(%reference, 'DB_File', $site_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1035 # skip the first three header lines
1036 <SIT
>; <SIT
>; <SIT
>;
1043 elsif (/^ID (\S+)/) {
1045 $id->put("$1", $data[0]);
1047 elsif (/^TY (.+)$/) {
1050 elsif (/^DE .*Gene: (G\d+)/) {
1052 $gene->put($data[0], "$1");
1054 # if it has no gene it is an artificial sequence, unless it
1055 # has a species (OS line), in which case it is unassigned
1056 # genomic; either way we won't be able to make a
1057 # Bio::Map::PositionI later on, so such sites won't be
1060 elsif (/^OS (.+)$/) {
1061 # Since not all sites in site.dat with a species have a gene,
1062 # (small handful are unassigned 'genomic') can't delegate to
1063 # gene.dat and must parse species here (effectively again)
1064 my $raw_species = $1;
1065 my $taxid = $self->_species_to_taxid($raw_species);
1066 $data[7] = $taxid || $raw_species;
1067 $species->put($data[7], $data[0]);
1069 elsif (/^SQ (.+)\.$/) {
1071 # there can actually be more than one SQ line, seemingly with
1072 # variations of the sequence (not a long sequence split over
1073 # two lines); not sure what to do with data; currently we end
1074 # up storing only the last variant.
1076 elsif (/^S1 (.+)$/) {
1078 # if S1 not present, means transcriptional start
1080 elsif (/^SF (.+)$/) {
1083 elsif (/^ST (.+)$/) {
1086 elsif (/^RN .+?(RE\d+)/) {
1087 $reference->put($data[0], "$1");
1089 elsif (/^MX (M\d+)/) {
1090 $matrix->put($data[0], "$1");
1092 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1093 $factor->put($data[0], "$1");
1094 $qualities{$data[0].SEPARATOR
.$1} = $2;
1097 # end of a record, store previous data and reset
1099 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string type
1100 $sites{$data[0]} = join(SEPARATOR
, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || 'TSS', $data[5] || '', $data[6] || '', $data[7] || '', $data[8] || ''));
1107 $site = $id = $species = $quality = $gene = $matrix = $factor = $reference = undef;
1118 my $matrix_dat = "$dat_dir/matrix.dat";
1119 if (! -e
$matrix_index || $force) {
1120 open(MAT
, $matrix_dat) || $self->throw("Cannot open matrix file '$matrix_dat' for reading");
1123 unlink $matrix_index;
1124 my $matrix = tie
(%matrices, 'DB_File', $matrix_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1127 my $matrix_id = $matrix_index.'.id';
1129 my $id = tie
(%id, 'DB_File', $matrix_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1132 my $matrix_name = $matrix_index.'.name';
1133 unlink $matrix_name;
1134 my $name = tie
(%name, 'DB_File', $matrix_name, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1137 my $matrix_site = $site_index.'.matrix';
1138 unlink $matrix_site;
1139 my $site = tie
(%site, 'DB_File', $matrix_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1142 my $matrix_factor = $factor_index.'.matrix';
1143 unlink $matrix_factor;
1144 my $factor = tie
(%factor, 'DB_File', $matrix_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1147 my $matrix_reference = $reference_index.'.matrix';
1148 unlink $matrix_reference;
1149 my $reference = tie
(%reference, 'DB_File', $matrix_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1151 # skip the first three header lines
1152 <MAT
>; <MAT
>; <MAT
>;
1161 elsif (/^ID (\S+)/) {
1163 $id->put("$1", $data[0]);
1165 elsif (/^NA (.+)$/) {
1167 $name->put("$1", $data[0]);
1169 elsif (/^DE (.+)$/) {
1172 elsif (/^\d\d \s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
1173 # a, c, g, t counts/weights
1174 push(@matrix_data, join("\t", ($1, $2, $3, $4)));
1176 # Work out the number of sites as the largest number of
1177 # sites amongst all positions in the sequences. (The BA
1178 # line isn't reliable for telling us the correct number of
1179 # sites all the time)
1180 my $num = $1 + $2 + $3 + $4;
1182 if ($num > $data[4]) {
1186 elsif (/^BS ([\sa-zA-Z]+); (.+?); (-?\d+); \d+;.*; ([np])/) {
1187 # sequence id start strand
1188 push(@site_data, join('_', ($1, $2, $3, $4 eq 'p' ?
1 : -1)));
1189 $site->put($data[0], $2);
1191 elsif (/^BF (T\d+)/) {
1192 $factor->put($data[0], "$1");
1194 elsif (/^RN .+?(RE\d+)/) {
1195 $reference->put($data[0], "$1");
1198 # end of a record, store previous data and reset
1199 my $matrix_data = join(INTERNAL_SEPARATOR
, @matrix_data) || '';
1201 # sites of a matrix are pre-aligned but padded with spaces on
1202 # the left and no padding on the right; pad with -s both sides
1203 my $longest_seq = 0;
1204 foreach my $site_seq (@site_data) {
1205 $site_seq =~ s/ /-/g;
1206 my $length = length($site_seq);
1207 if ($length > $longest_seq) {
1208 $longest_seq = $length;
1211 foreach my $site_seq (@site_data) {
1212 my $length = length($site_seq);
1213 if ($length < $longest_seq) {
1214 $site_seq .= '-' x
($longest_seq - $length);
1217 my $site_data = join(INTERNAL_SEPARATOR
, @site_data) || '';
1219 # accession = id name description num_of_sites matrix_data site_data
1220 $matrices{$data[0]} = join(SEPARATOR
, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4], $matrix_data, $site_data));
1222 @data = @matrix_data = @site_data = ();
1227 $matrix = $id = $name = $site = $factor = $reference = undef;
1236 my $factor_dat = "$dat_dir/factor.dat";
1237 if (! -e
$factor_index || $force) {
1238 open(FAC
, $factor_dat) || $self->throw("Cannot open factor file '$factor_dat' for reading");
1241 unlink $factor_index;
1242 my $factor = tie
(%factors, 'DB_File', $factor_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1245 my $factor_id = $factor_index.'.id';
1247 my $id = tie
(%id, 'DB_File', $factor_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1250 my $factor_name = $factor_index.'.name';
1251 unlink $factor_name;
1252 my $name = tie
(%name, 'DB_File', $factor_name, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1255 my $factor_species = $factor_index.'.species';
1256 unlink $factor_species;
1257 my $species = tie
(%species, 'DB_File', $factor_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1260 my $factor_interactors = $factor_index.'.interactors';
1261 unlink $factor_interactors;
1262 my $interact = tie
(%interactors, 'DB_File', $factor_interactors, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1265 my $factor_gene = $gene_index.'.factor';
1266 unlink $factor_gene;
1267 my $gene = tie
(%gene, 'DB_File', $factor_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1270 my $factor_matrix = $matrix_index.'.factor';
1271 unlink $factor_matrix;
1272 my $matrix = tie
(%matrix, 'DB_File', $factor_matrix, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1275 my $factor_site = $site_index.'.factor';
1276 unlink $factor_site;
1277 my $site = tie
(%site, 'DB_File', $factor_site, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1280 my $factor_fragment = $fragment_index.'.factor';
1281 unlink $factor_fragment;
1282 my $fragment = tie
(%fragment, 'DB_File', $factor_fragment, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1285 my $factor_reference = $reference_index.'.factor';
1286 unlink $factor_reference;
1287 my $reference = tie
(%reference, 'DB_File', $factor_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1289 # skip the first three header lines
1290 <FAC
>; <FAC
>; <FAC
>;
1298 elsif (/^ID (\S+)/) {
1299 # IDs are always the same as AC? Is this needed?
1301 $id->put("$1", $data[0]);
1303 elsif (/^FA (.+)$/) {
1305 $name->put("$1", $data[0]);
1307 elsif (/^OS (.+)$/) {
1308 # This is the species the actual factor came from, which may
1309 # differ from the species of any sequences it is described as
1310 # binding to. Not all factors that have a species have a gene,
1311 # so can't delegate species to a gene lookup.
1312 my $raw_species = $1;
1313 my $taxid = $self->_species_to_taxid($raw_species);
1314 $data[3] = $taxid || $raw_species;
1315 $species->put($data[3], $data[0]);
1317 elsif (/^GE (G\d+)/) {
1318 $gene->put($data[0], "$1");
1320 elsif (/^SQ (.+)$/) {
1323 elsif (/^IN (T\d+)/) {
1324 $interact->put($data[0], "$1");
1326 elsif (/^MX (M\d+)/) {
1327 $matrix->put($data[0], "$1");
1329 elsif (/^BS (R\d+)/) {
1330 $site->put($data[0], "$1");
1332 elsif (/^BR (FR\d+)/) {
1333 $fragment->put($data[0], "$1");
1335 elsif (/^RN .+?(RE\d+)/) {
1336 $reference->put($data[0], "$1");
1339 # end of a record, store previous data and reset
1341 # accession = id name species sequence
1342 $factors{$data[0]} = join(SEPARATOR
, ($data[1] || '', $data[2] || '', $data[3] || '', $sequence));
1350 $factor = $id = $name = $species = $interact = $gene = $matrix = $site = $fragment = $reference = undef;
1363 my $fragment_dat = "$dat_dir/fragment.dat";
1364 if (! -e
$fragment_index || $force) {
1365 if (open(FRA
, $fragment_dat)) {
1367 unlink $fragment_index;
1368 my $fragment = tie
(%fragments, 'DB_File', $fragment_index, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1371 my $fragment_id = $fragment_index.'.id';
1372 unlink $fragment_id;
1373 my $id = tie
(%id, 'DB_File', $fragment_id, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1376 my $fragment_qualities = $fragment_index.'.qual';
1377 unlink $fragment_qualities;
1378 my $quality = tie
(%qualities, 'DB_File', $fragment_qualities, O_RDWR
|O_CREAT
, 0644, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1381 my $fragment_species = $fragment_index.'.species';
1382 unlink $fragment_species;
1383 my $species = tie
(%species, 'DB_File', $fragment_species, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_species': $!");
1386 my $fragment_gene = $gene_index.'.fragment';
1387 unlink $fragment_gene;
1388 my $gene = tie
(%gene, 'DB_File', $fragment_gene, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1391 my $fragment_factor = $factor_index.'.fragment';
1392 unlink $fragment_factor;
1393 my $factor = tie
(%factor, 'DB_File', $fragment_factor, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1396 my $fragment_reference = $reference_index.'.fragment';
1397 unlink $fragment_reference;
1398 my $reference = tie
(%reference, 'DB_File', $fragment_reference, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1400 # skip the first three header lines
1401 <FRA
>; <FRA
>; <FRA
>;
1408 elsif (/^ID (\S+)/) {
1409 # IDs are always the same as AC? Is this needed?
1411 $id->put("$1", $data[0]);
1413 elsif (/^DE Gene: (G\d+)(?:.+Gene: (G\d+))?/) {
1414 my ($gene1, $gene2) = ($1, $2);
1416 $data[3] = $gene2; # could be undef
1417 $gene->put($data[0], $gene1);
1418 $gene->put($data[0], $gene2) if $gene2;
1420 elsif (/^OS (.+)$/) {
1421 # As per the site.dat parsing
1422 my $raw_species = $1;
1423 my $taxid = $self->_species_to_taxid($raw_species);
1424 $data[4] = $taxid || $raw_species;
1425 $species->put($data[4], $data[0]);
1427 elsif (/^SQ [atcgn]*([ATCGN]+)[atcgn]*/) {
1429 # there can be (usually are) multiple SQ lines with a single
1430 # long seq split over them. The 'real' sequence is in caps
1432 elsif (/^SC Build (\S+):$/) {
1434 # maybe parse it out a little more? We have build,
1435 # chromosomal coords and strand, eg.
1436 # SC Build HSA_May2004: Chr.2 43976692..43978487 (FORWARD).
1438 elsif (/^RN .+?(RE\d+)/) {
1439 $reference->put($data[0], "$1");
1441 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1442 $factor->put($data[0], "$1");
1443 $qualities{$data[0].SEPARATOR
.$1} = $2;
1446 # end of a record, store previous data and reset
1448 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
1449 $fragments{$data[0]} = join(SEPARATOR
, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || '', $data[5] || '', $data[6] || ''));
1456 $fragment = $id = $species = $quality = $gene = $factor = $reference = undef;
1466 $self->warn("Cannot open fragment file '$fragment_dat' for reading, assuming you have an old version of Transfac Pro with no fragment.dat file.");
1471 # connect the internal db handle
1474 return if $self->{'_initialized'};
1476 my $index_dir = $self->index_directory;
1477 my $gene_index = "$index_dir/gene.dat.index";
1478 my $reference_index = "$index_dir/reference.dat.index";
1479 my $matrix_index = "$index_dir/matrix.dat.index";
1480 my $factor_index = "$index_dir/factor.dat.index";
1481 my $site_index = "$index_dir/site.dat.index";
1482 my $fragment_index = "$index_dir/fragment.dat.index";
1484 foreach ($gene_index, $reference_index, $matrix_index, $factor_index, $site_index, $fragment_index) {
1486 #$self->warn("Index files have not been created");
1493 $self->{reference
}->{data
} = {};
1494 tie
(%{$self->{reference
}->{data
}}, 'DB_File', $reference_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
1496 my $reference_pubmed = $reference_index.'.pubmed';
1497 $self->{reference
}->{pubmed
} = tie
(%{$self->{reference
}->{pubmed
}}, 'DB_File', $reference_pubmed, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
1499 my $reference_gene = $gene_index.'.reference';
1500 $self->{gene
}->{reference
} = tie
(%{$self->{gene
}->{reference
}}, 'DB_File', $reference_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
1502 my $reference_site = $site_index.'.reference';
1503 $self->{site
}->{reference
} = tie
(%{$self->{site
}->{reference
}}, 'DB_File', $reference_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
1505 my $reference_fragment = $fragment_index.'.reference';
1506 $self->{fragment
}->{reference
} = tie
(%{$self->{fragment
}->{reference
}}, 'DB_File', $reference_fragment, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
1508 my $reference_factor = $factor_index.'.reference';
1509 $self->{factor
}->{reference
} = tie
(%{$self->{factor
}->{reference
}}, 'DB_File', $reference_factor, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
1511 my $reference_matrix = $matrix_index.'.reference';
1512 $self->{matrix
}->{reference
} = tie
(%{$self->{matrix
}->{reference
}}, 'DB_File', $reference_matrix, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
1517 $self->{gene
}->{data
} = {};
1518 tie
(%{$self->{gene
}->{data
}}, 'DB_File', $gene_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
1520 my $gene_id = $gene_index.'.id';
1521 $self->{gene
}->{id
} = tie
(%{$self->{gene
}->{id
}}, 'DB_File', $gene_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
1523 my $gene_name = $gene_index.'.name';
1524 $self->{gene
}->{name
} = tie
(%{$self->{gene
}->{name
}}, 'DB_File', $gene_name, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
1526 my $gene_species = $gene_index.'.species';
1527 $self->{gene
}->{species
} = tie
(%{$self->{gene
}->{species
}}, 'DB_File', $gene_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
1529 my $gene_site = $site_index.'.gene';
1530 $self->{site
}->{gene
} = tie
(%{$self->{site
}->{gene
}}, 'DB_File', $gene_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
1532 my $gene_fragment = $fragment_index.'.gene';
1533 $self->{fragment
}->{gene
} = tie
(%{$self->{fragment
}->{gene
}}, 'DB_File', $gene_fragment, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
1535 my $gene_factor = $factor_index.'.gene';
1536 $self->{factor
}->{gene
} = tie
(%{$self->{factor
}->{gene
}}, 'DB_File', $gene_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
1538 my $gene_reference = $reference_index.'.gene';
1539 $self->{reference
}->{gene
} = tie
(%{$self->{reference
}->{gene
}}, 'DB_File', $gene_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
1544 $self->{site
}->{data
} = {};
1545 tie
(%{$self->{site
}->{data
}}, 'DB_File', $site_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1547 my $site_id = $site_index.'.id';
1548 $self->{site
}->{id
} = tie
(%{$self->{site
}->{id
}}, 'DB_File', $site_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1550 my $site_species = $site_index.'.species';
1551 $self->{site
}->{species
} = tie
(%{$self->{site
}->{species
}}, 'DB_File', $site_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file $site_species': $!");
1553 #*** quality not actually used by anything (yet)
1554 my $site_qualities = $site_index.'.qual';
1555 $self->{quality
} = {};
1556 tie
(%{$self->{quality
}}, 'DB_File', $site_qualities, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1558 my $site_gene = $gene_index.'.site';
1559 $self->{gene
}->{site
} = tie
(%{$self->{gene
}->{site
}}, 'DB_File', $site_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1561 my $site_matrix = $matrix_index.'.site';
1562 $self->{matrix
}->{site
} = tie
(%{$self->{matrix
}->{site
}}, 'DB_File', $site_matrix, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1564 my $site_factor = $factor_index.'.site';
1565 $self->{factor
}->{site
} = tie
(%{$self->{factor
}->{site
}}, 'DB_File', $site_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1567 my $site_reference = $reference_index.'.site';
1568 $self->{reference
}->{site
} = tie
(%{$self->{reference
}->{site
}}, 'DB_File', $site_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1571 # fragment (may not be in older databases)
1572 if (-e
$fragment_index) {
1573 $self->{fragment
}->{data
} = {};
1574 tie
(%{$self->{fragment
}->{data
}}, 'DB_File', $fragment_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1576 my $fragment_id = $fragment_index.'.id';
1577 $self->{fragment
}->{id
} = tie
(%{$self->{fragment
}->{id
}}, 'DB_File', $fragment_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1579 my $fragment_species = $fragment_index.'.species';
1580 $self->{fragment
}->{species
} = tie
(%{$self->{fragment
}->{species
}}, 'DB_File', $fragment_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file $fragment_species': $!");
1582 #*** quality not actually used by anything (yet)
1583 my $fragment_qualities = $fragment_index.'.qual';
1584 $self->{fragment_quality
} = {};
1585 tie
(%{$self->{fragment_quality
}}, 'DB_File', $fragment_qualities, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1587 my $fragment_gene = $gene_index.'.fragment';
1588 $self->{gene
}->{fragment
} = tie
(%{$self->{gene
}->{fragment
}}, 'DB_File', $fragment_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1590 my $fragment_factor = $factor_index.'.fragment';
1591 $self->{factor
}->{fragment
} = tie
(%{$self->{factor
}->{fragment
}}, 'DB_File', $fragment_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1593 my $fragment_reference = $reference_index.'.fragment';
1594 $self->{reference
}->{fragment
} = tie
(%{$self->{reference
}->{fragment
}}, 'DB_File', $fragment_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1597 die "no fragment_index at '$fragment_index'\n";
1602 $self->{matrix
}->{data
} = {};
1603 tie
(%{$self->{matrix
}->{data
}}, 'DB_File', $matrix_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1605 my $matrix_id = $matrix_index.'.id';
1606 $self->{matrix
}->{id
} = tie
(%{$self->{matrix
}->{id
}}, 'DB_File', $matrix_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1608 my $matrix_name = $matrix_index.'.name';
1609 $self->{matrix
}->{name
} = tie
(%{$self->{matrix
}->{name
}}, 'DB_File', $matrix_name, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1611 my $matrix_site = $site_index.'.matrix';
1612 $self->{site
}->{matrix
} = tie
(%{$self->{site
}->{matrix
}}, 'DB_File', $matrix_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1614 my $matrix_factor = $factor_index.'.matrix';
1615 $self->{factor
}->{matrix
} = tie
(%{$self->{factor
}->{matrix
}}, 'DB_File', $matrix_factor, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1617 my $matrix_reference = $reference_index.'.matrix';
1618 $self->{reference
}->{matrix
} = tie
(%{$self->{reference
}->{matrix
}}, 'DB_File', $matrix_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1623 $self->{factor
}->{data
} = {};
1624 tie
(%{$self->{factor
}->{data
}}, 'DB_File', $factor_index, O_RDWR
, undef, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1626 my $factor_id = $factor_index.'.id';
1627 $self->{factor
}->{id
} = tie
(%{$self->{factor
}->{id
}}, 'DB_File', $factor_id, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1629 my $factor_name = $factor_index.'.name';
1630 $self->{factor
}->{name
} = tie
(%{$self->{factor
}->{name
}}, 'DB_File', $factor_name, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1632 my $factor_species = $factor_index.'.species';
1633 $self->{factor
}->{species
} = tie
(%{$self->{factor
}->{species
}}, 'DB_File', $factor_species, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1635 my $factor_interactors = $factor_index.'.interactors';
1636 $self->{factor
}->{interactors
} = tie
(%{$self->{factor
}->{interactors
}}, 'DB_File', $factor_interactors, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1638 my $factor_gene = $gene_index.'.factor';
1639 $self->{gene
}->{factor
} = tie
(%{$self->{gene
}->{factor
}}, 'DB_File', $factor_gene, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1641 my $factor_matrix = $matrix_index.'.factor';
1642 $self->{matrix
}->{factor
} = tie
(%{$self->{matrix
}->{factor
}}, 'DB_File', $factor_matrix, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1644 my $factor_site = $site_index.'.factor';
1645 $self->{site
}->{factor
} = tie
(%{$self->{site
}->{factor
}}, 'DB_File', $factor_site, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1647 my $factor_fragment = $fragment_index.'.factor';
1648 $self->{fragment
}->{factor
} = tie
(%{$self->{fragment
}->{factor
}}, 'DB_File', $factor_fragment, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1650 my $factor_reference = $reference_index.'.factor';
1651 $self->{reference
}->{factor
} = tie
(%{$self->{reference
}->{factor
}}, 'DB_File', $factor_reference, O_RDWR
, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1654 $self->{'_initialized'} = 1;
1657 =head2 index_directory
1659 Title : index_directory
1660 Funtion : Get/set the location that index files are stored. (this module
1661 will index the supplied database)
1662 Usage : $obj->index_directory($newval)
1663 Returns : value of index_directory (a scalar)
1664 Args : on set, new value (a scalar or undef, optional)
1668 sub index_directory
{
1670 return $self->{'index_directory'} = shift if @_;
1671 return $self->{'index_directory'};
1674 # resolve a transfac species string into an ncbi taxid
1675 sub _species_to_taxid
{
1676 my ($self, $raw_species) = @_;
1677 $raw_species or return;
1680 my @split = split(', ', $raw_species);
1681 (@split > 1) ?
($species_string = $split[1]) : ($species_string = $split[0]);
1684 if ($species_string =~ /^[A-Z]\S+ \S+$/) {
1685 SWITCH
: for ($species_string) {
1686 # some species don't classify so custom handling
1687 /^Darnel ryegrass/ && do { $ncbi_taxid = 34176; last; };
1688 /^Coix lacryma/ && do { $ncbi_taxid = 4505; last; };
1689 /^Rattus spec/ && do { $ncbi_taxid = 10116; last; };
1690 /^Mus spec/ && do { $ncbi_taxid = 10090; last; };
1691 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1692 /^Cavia sp/ && do { $ncbi_taxid = 10141; last; };
1693 /^Marsh marigold/ && do { $ncbi_taxid = 3449; last; };
1694 /^Phalaenopsis sp/ && do { $ncbi_taxid = 36900; last; };
1695 /^Anthirrhinum majus/ && do { $ncbi_taxid = 4151; last; };
1696 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1697 /^Lycopodium spec/ && do { $ncbi_taxid = 13840; last; };
1698 /^Autographa californica/ && do { $ncbi_taxid = 307456; last; };
1699 /^E26 AEV/ && do { $ncbi_taxid = 31920; last; };
1700 /^Pseudocentrotus miliaris/ && do { $ncbi_taxid = 7677; last; }; # the genus is 7677 but this species isn't there
1701 /^SL3-3 (?:retro)?virus/ && do { $ncbi_taxid = 53454; last; }; # 53454 is unclassified MLV-related, SL3-3 a variant of that?
1702 /^Petunia sp/ && do { $ncbi_taxid = 4104; last; };
1704 if (! $ncbi_taxid && defined $self->{_tax_db
}) {
1705 ($ncbi_taxid) = $self->{_tax_db
}->get_taxonids($species_string);
1709 # some species lines are poorly formated so custom handling
1710 SWITCH
: for ($raw_species) {
1711 # for speed, go by common first letters
1712 my $first_letter = substr($raw_species, 0, 1);
1714 $first_letter eq 'A' && do {
1715 /^Adiantum raddianum/ && do { $ncbi_taxid = 32168; last; };
1716 /^Avian sarcoma virus \(strain 17\)/ && do { $ncbi_taxid = 11877; last; };
1717 /^AMV/ && do { $ncbi_taxid = 11866; last; };
1718 /^AEV/ && do { $ncbi_taxid = 11861; last; };
1719 /^AS42|^Avian musculoaponeurotic/ && do { $ncbi_taxid = 11873; last; };
1720 /^Avian myelocytomatosis/ && do { $ncbi_taxid = 11869; last; };
1721 /^ASV 31/ && do { $ncbi_taxid = 35270; last; };
1722 /^A-MuLV/ && do { $ncbi_taxid = 188539; last; };
1723 /^Asparagus officinalis/ && do { $ncbi_taxid = 4686; last; };
1724 /^Agrobacterium tumefaciens/ && do { $ncbi_taxid = 358; last; };
1725 /^ALV/ && do { $ncbi_taxid = 11864; last; };
1726 /^AAV/ && do { $ncbi_taxid = 272636; last; };
1727 /^AKV MLV/ && do { $ncbi_taxid = 11791; last; };
1731 $first_letter eq 'B' && do {
1732 /^BPV-1/ && do { $ncbi_taxid = 10559; last; };
1733 /^BKV/ && do { $ncbi_taxid = 10629; last; };
1734 /^Bolivian squirrel monkey/ && do { $ncbi_taxid = 39432; last; };
1738 $first_letter eq 'C' && do {
1739 /^Cauliflower/ && do { $ncbi_taxid = 3715; last; };
1740 /^Chamek/ && do { $ncbi_taxid = 118643; last; };
1741 /^Candida albicans/ && do { $ncbi_taxid = 5476; last; };
1742 /^CaMV/ && do { $ncbi_taxid = 10641; last; };
1746 $first_letter eq 'E' && do {
1747 /^Eucalyptus gunnii/ && do { $ncbi_taxid = 3933; last; };
1748 /^EBV, Epstein-Barr virus/ && do { $ncbi_taxid = 10376; last; };
1749 /^Eucalyptus globulus subsp. bicostata/ && do { $ncbi_taxid = 71272; last; };
1750 /^Eucalyptus globulus subsp. globulus/ && do { $ncbi_taxid = 71271; last; };
1754 $first_letter eq 'F' && do {
1755 /^FBR MuLV/ && do { $ncbi_taxid = 11806; last; };
1756 /^FBJ MuLV/ && do { $ncbi_taxid = 11805; last; };
1757 /^FeLV|Feline leukemia/ && do { $ncbi_taxid = 11923; last; };
1758 /^Flaveria trinervia/ && do { $ncbi_taxid = 4227; last; };
1759 /^FSV/ && do { $ncbi_taxid = 11885; last; };
1760 /^F-MuLV/ && do { $ncbi_taxid = 11795; last; };
1764 $first_letter eq 'H' && do {
1765 /^HSV-1/ && do { $ncbi_taxid = 10298; last; };
1766 /^HTLV-I/ && do { $ncbi_taxid = 11908; last; };
1767 /^HIV-1/ && do { $ncbi_taxid = 11676; last; };
1768 /^HPV-16/ && do { $ncbi_taxid = 333760; last; };
1769 /^HBV/ && do { $ncbi_taxid = 10407; last; };
1770 /^HBI/ && do { $ncbi_taxid = 11867; last; };
1771 /^HPV-8/ && do { $ncbi_taxid = 10579; last; };
1772 /^HPV-11/ && do { $ncbi_taxid = 10580; last; };
1773 /^HPV-18/ && do { $ncbi_taxid = 333761; last; };
1774 /^HCMV/ && do { $ncbi_taxid = 10359; last; };
1775 /^HSV/ && do { $ncbi_taxid = 126283; last; };
1776 /^HSV-2/ && do { $ncbi_taxid = 10310; last; };
1777 /^HCV/ && do { $ncbi_taxid = 11108; last; };
1778 /^HIV-2/ && do { $ncbi_taxid = 11709; last; };
1782 $first_letter eq 'M' && do {
1783 /^MMTV/ && do { $ncbi_taxid = 11757; last; };
1784 /^Mo-MuLV/ && do { $ncbi_taxid = 11801; last; };
1785 /^MuLV/ && do { $ncbi_taxid = 11786; last; };
1786 /^MSV/ && do { $ncbi_taxid = 11802; last; };
1787 /^MC29/ && do { $ncbi_taxid = 11868; last; };
1788 /^MVM/ && do { $ncbi_taxid = 10794; last; };
1789 /^MH2E21/ && do { $ncbi_taxid = 11955; last; }; # 11955 is a species, presumably MH2E21 is the strain
1793 $first_letter eq 'R' && do {
1794 /^Raphanus sativus/ && do { $ncbi_taxid = 3726; last; };
1795 /^REV-T/ && do { $ncbi_taxid = 11636; last; };
1796 /^RAV-0/ && do { $ncbi_taxid = 11867; last; }; # should be rous-associated virus 0 variant
1797 /^RSV/ && do { $ncbi_taxid = 11886; last; };
1798 /^RadLV/ && do { $ncbi_taxid = 31689; last; };
1799 /^RTBV/ && do { $ncbi_taxid = 10654; last; };
1803 $first_letter eq 'S' && do {
1804 /^SV40/ && do { $ncbi_taxid = 10633; last; };
1805 /^Sesbania rostrata/ && do { $ncbi_taxid = 3895; last; };
1806 /^SIV/ && do { $ncbi_taxid = 11723; last; };
1807 /^Spinacia oleracea/ && do { $ncbi_taxid = 3562; last; };
1808 /^SCMV/ && do { $ncbi_taxid = 10364; last; }; # supposed to be AGM isolate
1813 $first_letter eq 'a' && do {
1814 /^adenovirus type 5/ && do { $ncbi_taxid = 28285; last; };
1815 /^adenovirus type 2/ && do { $ncbi_taxid = 10515; last; };
1816 /^adenovirus/ && do { $ncbi_taxid = 189831; last; }; # 189831 ('unclassified Adenoviridae') is the closest I can get, but this has no genus and is not a species
1820 $first_letter eq 'b' && do {
1821 /^bell pepper/ && do { $ncbi_taxid = 4072; last; };
1822 /^baculovirus, Autographa californica/ && do { $ncbi_taxid = 46015; last; };
1823 /^broccoli/ && do { $ncbi_taxid = 36774; last; };
1824 /^barley/ && do { $ncbi_taxid = 112509; last; };
1828 $first_letter eq 'c' && do {
1829 /^clawed frog/ && do { $ncbi_taxid = 8355; last; };
1830 /^chipmunk/ && do { $ncbi_taxid = 64680; last; };
1831 /^common tree shrew/ && do { $ncbi_taxid = 37347; last; };
1832 /^cat/ && do { $ncbi_taxid = 9685; last; };
1837 /^NK24/ && do { $ncbi_taxid = 11955; last; };
1838 /^OK10/ && do { $ncbi_taxid = 11871; last; };
1839 /^Dendrobium grex/ && do { $ncbi_taxid = 84618; last; };
1840 /^KSHV/ && do { $ncbi_taxid = 37296; last; };
1841 /^Oncidium/ && do { $ncbi_taxid = 96474; last; };
1842 /^Japanese quail/ && do { $ncbi_taxid = 93934; last; };
1843 /^Nile tilapia/ && do { $ncbi_taxid = 8128; last; };
1844 /^GALV/ && do { $ncbi_taxid = 11840; last; };
1845 /^JCV/ && do { $ncbi_taxid = 10632; last; };
1846 /^LPV/ && do { $ncbi_taxid = 10574; last; };
1847 /^Py,/ && do { $ncbi_taxid = 36362; last; };
1848 /^DHBV/ && do { $ncbi_taxid = 12639; last; };
1849 /^VZV/ && do { $ncbi_taxid = 10335; last; };
1850 /^Vicia faba/ && do { $ncbi_taxid = 3906; last; };
1852 /^hamster/ && do { $ncbi_taxid = 10029; last; };
1853 /^sea urchin/ && do { $ncbi_taxid = 7668; last; };
1854 /^fruit fly/ && do { $ncbi_taxid = 7227; last; };
1855 /^halibut/ && do { $ncbi_taxid = 8267; last; };
1856 /^vaccinia virus/ && do { $ncbi_taxid = 10245; last; };
1857 /^taxonomic class Mammalia/ && do { $ncbi_taxid = 40674; last; }; # not a species
1858 /^taxonomic class Vertebrata/ && do { $ncbi_taxid = 7742; last; }; # not a species
1859 /^dog/ && do { $ncbi_taxid = 9615; last; };
1860 /^parsley/ && do { $ncbi_taxid = 4043; last; };
1861 /^mouse, Mus domesticus Torino/ && do { $ncbi_taxid = 10092; last; }; # 10092 is domesticus subspecies, but not the Torino strain
1862 /^lemur, Eulemur fulvus collaris/ && do { $ncbi_taxid = 47178; last; };
1863 /^red sea bream/ && do { $ncbi_taxid = 143350; last; };
1864 /^zebra finch/ && do { $ncbi_taxid = 59729; last; };
1865 /^mung bean/ && do { $ncbi_taxid = 3916; last; };
1866 /^soybean/ && do { $ncbi_taxid = 3847; last; };
1867 /^oat/ && do { $ncbi_taxid = 4498; last; };
1868 /^pseudorabies virus/ && do { $ncbi_taxid = 10345; last; };
1872 $self->warn("Didn't know what species '$raw_species' was, unable to classify") unless $ncbi_taxid;