tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / TFBS / transfac_pro.pm
blob085b20a397ef688a46def7fdf2364f24e3a2eb29
1 # $Id: transfac_pro.pm,v 1.15 2006/08/12 11:00:03 sendu Exp $
3 # BioPerl module for Bio::DB::TFBS::transfac_pro
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sendu Bala <bix@sendu.me.uk>
9 # Copyright Sendu Bala
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::DB::TFBS::transfac_pro - An implementation of Bio::DB::TFBS
18 which uses local flat files for transfac pro
20 =head1 SYNOPSIS
22 use Bio::DB::Taxonomy;
24 my $db = new Bio::DB::Taxonomy(-source => 'transfac_pro'
25 -dat_dir => $directory);
27 # we're interested in the gene P5
28 my ($gene_id) = $db->get_gene_ids(-name => 'P5'); # G000001
30 # we want all the transcription factors that bind to our gene
31 my @factor_ids = $db->get_factor_ids(-gene => $gene_id);
33 # get info about those TFs
34 foreach my $factor_id (@factor_ids) {
35 my $factor = $db->get_factor($factor_id);
36 my $name = $factor->universal_name;
37 # etc. - see Bio::Map::TranscriptionFactor, eg. find out where it binds
40 # get a matrix
41 my $matrix = $db->get_matrix('M00001');
43 # get a binding site sequence
44 my $seq = $db->get_site('R00001');
46 =head1 DESCRIPTION
48 This is an implementation which uses local flat files and the DB_File
49 module RECNO data structures to manage a local copy of the Transfac Pro TFBS
50 database.
52 Required database files require a license which can be obtained via
53 http://www.biobase-international.com/pages/index.php?id=170
55 Within the linux installation tarball you will find a cgibin tar ball, and
56 inside that is a data directory containing the .dat files needed by this
57 module. Point to that data directory with -dat_dir
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to
65 the Bioperl mailing list. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
70 =head2 Support
72 Please direct usage questions or support issues to the mailing list:
74 I<bioperl-l@bioperl.org>
76 rather than to the module maintainer directly. Many experienced and
77 reponsive experts will be able look at the problem and quickly
78 address it. Please include a thorough description of the problem
79 with code and data examples if at all possible.
81 =head2 Reporting Bugs
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 of the bugs and their resolution. Bug reports can be submitted via
85 the web:
87 http://bugzilla.open-bio.org/
89 =head1 AUTHOR - Sendu Bala
91 Email bix@sendu.me.uk
93 =head1 CONTRIBUTORS
95 Based on Bio::DB::Taxonomy::flatfile by Jason Stajich
97 =head1 APPENDIX
99 The rest of the documentation details each of the object methods.
100 Internal methods are usually preceded with a _
102 =cut
104 # Let the code begin...
106 package Bio::DB::TFBS::transfac_pro;
107 use strict;
108 use Bio::Annotation::Reference;
109 use Bio::Annotation::SimpleValue;
110 use Bio::LocatableSeq;
111 use Bio::SimpleAlign;
112 use Bio::Matrix::PSM::SiteMatrix;
113 use Bio::AlignIO;
114 use Bio::Map::GeneMap;
115 use Bio::Map::TranscriptionFactor;
116 use Bio::Map::Position;
117 use Bio::Map::Relative;
118 use DB_File;
120 use constant SEPARATOR => ':!:';
121 use constant INTERNAL_SEPARATOR => '!:!';
123 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
125 use base qw(Bio::DB::TFBS);
127 =head2 new
129 Title : new
130 Usage : my $obj = new Bio::DB::TFBS::transfac_pro();
131 Function: Builds a new Bio::DB::TFBS::transfac_pro object
132 Returns : an instance of Bio::DB::TTFBS::transfac_pro
133 Args : -dat_dir => name of directory where Transfac Pro .dat files
134 (required to initially build indexes)
135 -tax_db => Bio::DB::Taxonomy object, used when initially building
136 indexes, gives better results for species information
137 but not required.
138 -index_dir => name of directory where index files should be created
139 or already exist. (defaults to -dat_dir, required if
140 -dat_dir not supplied)
141 -force => 1 replace current indexes even if they exist
143 =cut
145 sub new {
146 my ($class, @args) = @_;
148 my $self = $class->SUPER::new(@args);
150 my ($dat_dir, $index_dir, $tax_db, $force) = $self->_rearrange([qw(DAT_DIR INDEX_DIR TAX_DB FORCE)], @args);
151 $self->throw("At least one of -dat_dir and -index_dir must be supplied") unless ($dat_dir || $index_dir);
153 $self->index_directory($index_dir || $dat_dir);
154 $self->{_tax_db} = $tax_db if $tax_db;
156 if ($dat_dir) {
157 $self->_build_index($dat_dir, $force);
160 $self->_db_connect;
161 return $self;
164 =head2 Bio::DB::TFBS Interface implementation
166 =cut
168 sub _get_ids {
169 my ($self, $dat, @args) = @_;
170 @args % 2 == 0 || $self->throw("Must provide key => value pairs");
171 my $hash = $self->{$dat} || $self->throw("Unknown .dat type '$dat'");
173 if (@args) {
174 # get a subset corresponding to args
175 my @final;
176 my %args = @args;
177 my $multiple = 0;
178 while (my ($type, $value) = each %args) {
179 unless ($value) {
180 $self->warn("Arguement '$type' has no value, ignored");
181 next;
183 $type =~ s/-//;
184 $type = lc($type);
185 my $converter = $hash->{$type};
186 unless ($converter) {
187 $self->warn("Unknown search type '$type' for .dat type '$dat'");
188 next;
191 my @ids = $converter->get_dup($value);
192 unless (@ids) {
193 @ids = $converter->get_dup(lc($value));
196 if ($multiple) {
197 # we can have multiple types given at once, find the ids that
198 # satisfy all criteria
199 @final || return;
200 my %final = map { $_ => 1 } @final;
201 @final = grep { $final{$_} } @ids;
203 else {
204 @final = @ids;
205 $multiple++;
209 return @final;
211 else {
212 # get them all
213 my $db_file_hash = $self->{$dat}->{id};
215 my ($key, $prev_key, $value) = ('_!_', '!_!');
216 my @ids;
217 while (1) {
218 $db_file_hash->seq($key, $value, R_NEXT);
219 last if $prev_key eq $key;
220 push(@ids, $value); # confusing? when creating objects we store
221 # $value as accession and $key as id, but from
222 # this method we return $value as id given $id!
223 $prev_key = $key;
226 return @ids;
230 =head2 get_reference
232 Title : get_reference
233 Usage : my $ref = $obj->get_reference($id);
234 Function: Get a literature reference.
235 Returns : Bio::Annotation::Reference
236 Args : string - a reference id ('RE...')
238 =cut
240 sub get_reference {
241 my ($self, $id) = @_;
242 $id || return;
243 my $data = $self->{reference}->{data}->{$id} || return;
244 my @data = split(SEPARATOR, $data);
246 return Bio::Annotation::Reference->new(-pubmed => $data[0],
247 -authors => $data[1],
248 -title => $data[2],
249 -location => $data[3] );
252 =head2 get_genemap
254 Title : get_genemap
255 Usage : my $map = $obj->get_genemap($id);
256 Function: Get a GeneMap for a gene.
257 Returns : Bio::Map::GeneMap
258 Args : string - a gene id ('G...'), and optionally int (number of bp
259 upstream)
261 =cut
263 sub get_genemap {
264 my ($self, $id, $upstream) = @_;
265 $id || return;
266 return $self->{got_map}->{$id} if defined $self->{got_map}->{$id};
267 $upstream ||= 1000;
268 my $data = $self->{gene}->{data}->{$id} || return;
269 my @data = split(SEPARATOR, $data);
271 # accession = id name description species_tax_id_or_raw_string
272 my $taxon = $self->{_tax_db} ? $self->{_tax_db}->get_taxon($data[3]) || $data[3] : $data[3];
273 my $map = Bio::Map::GeneMap->get(-uid => $id,
274 -gene => $data[1],
275 -species => $taxon,
276 -description => $data[2],
277 -upstream => $upstream);
278 $self->{got_map}->{$id} = $map; # prevents infinite recurse when we call get_factor below
280 # spawn all the factors that belong on this gene map
281 # get_factor_ids(-gene => ...) only works for genes that encode factors;
282 # have to go via sites
283 foreach my $sid ($self->get_site_ids(-gene => $id)) {
284 foreach my $fid ($self->get_factor_ids(-site => $sid)) {
285 # it is quite deliberate that we deeply recurse to arrive at the
286 # correct answer, which involves pulling in most of the database
287 no warnings "recursion";
288 $self->get_factor($fid);
292 return $map;
295 =head2 get_seq
297 Title : get_seq
298 Usage : my $seq = $obj->get_seq($id);
299 Function: Get the sequence of a site. The sequence will be annotated with the
300 the tags 'relative_start', 'relative_end', 'relative_type' and
301 'relative_to'.
302 Returns : Bio::Seq
303 Args : string - a site id ('R...')
305 =cut
307 sub get_seq {
308 my ($self, $id) = @_;
309 $id || return;
310 my $data = $self->{site}->{data}->{$id} || return;
311 my @data = split(SEPARATOR, $data);
313 my $seq = Bio::Seq->new(-seq => $data[2],
314 -accession_number => $id,
315 -description => $data[6] ? 'Genomic sequence' : 'Consensus or artificial sequence',
316 -id => $data[0],
317 -strand => 1,
318 -alphabet => $data[7] || 'dna',
319 -species => $data[6]);
321 my $annot = $seq->annotation;
322 my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_start', -value => $data[4] || 1);
323 $annot->add_Annotation($sv);
324 $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_end', -value => $data[5] || ($data[4] || 1 + length($data[2]) - 1));
325 $annot->add_Annotation($sv);
326 $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_type', -value => $data[3] || 'artificial');
327 $annot->add_Annotation($sv);
328 $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_to', -value => $data[1]);
329 $annot->add_Annotation($sv);
331 return $seq;
334 =head2 get_fragment
336 Title : get_fragment
337 Usage : my $seq = $obj->get_fragment($id);
338 Function: Get the sequence of a fragment.
339 Returns : Bio::Seq
340 Args : string - a site id ('FR...')
342 =cut
344 sub get_fragment {
345 my ($self, $id) = @_;
346 $id || return;
347 my $data = $self->{fragment}->{data}->{$id} || return;
348 my @data = split(SEPARATOR, $data);
350 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
351 return new Bio::Seq( -seq => $data[4],
352 -accession_number => $id,
353 -description => 'Between genes '.$data[1].' and '.$data[2],
354 -species => $data[3],
355 -id => $data[0],
356 -alphabet => 'dna' );
359 =head2 get_matrix
361 Title : get_matrix
362 Usage : my $matrix = $obj->get_matrix($id);
363 Function: Get a matrix that describes a binding site.
364 Returns : Bio::Matrix::PSM::SiteMatrix
365 Args : string - a matrix id ('M...'), optionally a sequence string from
366 which base frequencies will be calcualted for the matrix model
367 (default 0.25 each)
369 =cut
371 sub get_matrix {
372 my ($self, $id, $seq) = @_;
373 $id || return;
374 $seq ||= 'atgc';
375 $seq = lc($seq);
376 my $data = $self->{matrix}->{data}->{$id} || return;
377 my @data = split(SEPARATOR, $data);
378 $data[4] || $self->throw("Matrix data missing for $id");
380 my ($a, $c, $g, $t);
381 foreach my $position (split(INTERNAL_SEPARATOR, $data[4])) {
382 my ($a_count, $c_count, $g_count, $t_count) = split("\t", $position);
383 push(@{$a}, $a_count);
384 push(@{$c}, $c_count);
385 push(@{$g}, $g_count);
386 push(@{$t}, $t_count);
389 # our psms include a simple background model so we can use
390 # sequence_match_weight() if desired
391 my $a_freq = ($seq =~ tr/a//) / length($seq);
392 my $c_freq = ($seq =~ tr/c//) / length($seq);
393 my $g_freq = ($seq =~ tr/g//) / length($seq);
394 my $t_freq = ($seq =~ tr/t//) / length($seq);
396 my $psm = new Bio::Matrix::PSM::SiteMatrix( -pA => $a,
397 -pC => $c,
398 -pG => $g,
399 -pT => $t,
400 -id => $data[0],
401 -accession_number => $id,
402 -sites => $data[3],
403 -width => scalar(@{$a}),
404 -correction => 1,
405 -model => { A => $a_freq, C => $c_freq, G => $g_freq, T => $t_freq } );
407 #*** used to make a Bio::Matrix::PSM::Psm and add references, but it
408 # didn't seem worth it. You can get references from the database by:
409 #foreach my $ref_id ($db->get_reference_ids(-matrix => $id)) {
410 # my $ref = $db->get_reference($ref_id);
413 return $psm;
416 =head2 get_aln
418 Title : get_aln
419 Usage : my $aln = $obj->get_aln($id);
420 Function: Get the alignment that was used to generate a matrix. Each sequence
421 in the alignment will have an accession_number corresponding to the
422 Transfac site id, and id() based on that but unique within the
423 alignment.
424 Returns : Bio::SimpleAlign
425 Args : string - a matrix id ('M...'), optionally true to, when a matrix
426 lists no sequences, search for sequences via the matrix's factors,
427 picking the sites that best match the matrix
429 =cut
431 sub get_aln {
432 my ($self, $id, $via_factors) = @_;
433 $id || return;
434 my $data = $self->{matrix}->{data}->{$id} || $self->throw("matrix '$id' had no data in DB_File");
435 my @data = split(SEPARATOR, $data);
437 if (! $data[5] && $via_factors) {
438 # This is a matrix with no site sequences given in matrix.dat.
439 # Find some matching site sequences via factors.
441 # First, check its factors for sites
442 my %site_seqs;
443 my %factor_ids;
444 foreach my $factor_id ($self->get_factor_ids(-matrix => $id)) {
445 $factor_ids{$factor_id} = 1;
446 foreach my $site_id ($self->get_site_ids(-factor => $factor_id)) {
447 next if defined $site_seqs{$site_id};
448 my $seq = $self->get_seq($site_id);
450 # skip sites that have no sequence, or have IUPAC symbols in
451 # their sequence (most probably the 'consensus' sequence itself
452 # that was used to make and exactly corresponds to the matrix)
453 my $seq_str = $seq->seq || next;
454 $seq_str =~ /[MRWSYKVHDB]/ and next;
456 $site_seqs{$site_id} = $seq;
459 my @seqs = values %site_seqs;
461 if (@seqs > 1) {
462 # pick the sub-seqs that match to the matrix with the best scores
463 my $matrix = $self->get_matrix($id);
464 my $desired_sequences = $matrix->sites;
465 return if @seqs < $desired_sequences;
467 my $desired_length = $matrix->width;
468 my %best_seqs;
469 foreach my $seq (@seqs) {
470 my $for_str = $seq->seq;
471 next if length($for_str) < $desired_length;
472 my $rev_str = $seq->revcom->seq;
474 my $best_score = 0;
475 my $best_subseq = '';
476 my $best_i = 0;
477 my $best_subseq_caps = 0;
478 my $best_revcom;
479 my $revcom = 0;
480 foreach my $seq_str ($for_str, $rev_str) {
481 for my $i (0..(length($seq_str) - $desired_length)) {
482 my $subseq = substr($seq_str, $i, $desired_length);
483 $subseq =~ s/[^ACGTacgt]//g; # can only score atcg
484 next unless length($subseq) == $desired_length; # short or 0-length seqs could get the highest scores!
485 my $score = $matrix->sequence_match_weight($subseq);
487 # caps represent the author-chosen bit of a site
488 # sequence so we would prefer to choose a subseq that
489 # contains it
490 my $caps = $subseq =~ tr/ACGT//;
492 #*** (don't know why numeric == fails for comparing
493 # scores, when the string eq works)
494 if ($score > $best_score || ("$score" eq "$best_score" && $caps > $best_subseq_caps)) {
495 $best_score = $score;
496 $best_subseq_caps = $caps;
497 $best_subseq = $subseq;
498 $best_i = $i;
499 $best_revcom = $revcom;
502 $revcom++;
505 if ($best_score) {
506 $best_seqs{$seq->accession_number} = [$best_subseq, $seq->accession_number, ($best_i + 1), $revcom ? -1 : 1, $best_score];
509 my @sorted = sort { $best_seqs{$b}->[-1] <=> $best_seqs{$a}->[-1] } keys %best_seqs;
510 return if @sorted < $desired_sequences;
511 splice(@sorted, $desired_sequences);
512 my %wanted = map { $_ => 1 } @sorted;
514 my @site_data;
515 foreach my $seq (@seqs) {
516 next unless exists $wanted{$seq->accession_number};
517 my @data = @{$best_seqs{$seq->accession_number}};
518 pop(@data);
519 push(@site_data, join('_', @data));
522 $data[5] = join(INTERNAL_SEPARATOR, @site_data);
523 $self->{matrix}->{data}->{$id} = join(SEPARATOR, @data);
526 $data[5] || return;
528 my @blocks = split(INTERNAL_SEPARATOR, $data[5]);
530 # append gap chars to all sequences to make them the same length
531 # (applies to sequences found via factors, presumably, since we already
532 # do this for matrix alignments in transfac_pro.pm)
533 my $longest = 0;
534 foreach (@blocks) {
535 my ($seq) = split('_', $_);
536 my $length = length($seq);
537 if ($length > $longest) {
538 $longest = $length;
541 foreach my $i (0..$#blocks) {
542 my $block = $blocks[$i];
543 my ($seq, $seq_id) = split('_', $block);
544 my $length = length($seq);
545 if ($length < $longest) {
546 my $orig_seq = $seq;
547 $seq .= '-'x($longest - $length);
548 $block =~ s/^${orig_seq}_/${seq}_/;
549 $blocks[$i] = $block;
553 # build the alignment
554 my $aln = Bio::SimpleAlign->new(-source => 'transfac_pro');
555 my %done_ids;
556 foreach (@blocks) {
557 my ($seq, $seq_acc, $start, $strand) = split('_', $_);
559 # we can get back multiple different subparts of the same site (sequence),
560 # so $seq_acc isn't unique across this loop. Can't use it as the seq id
561 # of the alignment (ids must be unique in SimpleAlign), so we
562 # uniquify the id and store the original id as the accession_number
563 my $seq_id;
564 $done_ids{$seq_acc}++;
565 if ($done_ids{$seq_acc} > 1) {
566 $seq_id = $seq_acc.'_'.$done_ids{$seq_acc};
568 else {
569 $seq_id = $seq_acc;
572 my $gaps = $seq =~ tr/-//;
573 my $length = length($seq) - $gaps;
574 $self->throw("seq '$seq_id' for matrix '$id' had seq '$seq'") unless $length;
575 $aln->add_seq(Bio::LocatableSeq->new(-seq => $seq,
576 -id => $seq_id,
577 -accession_number => $seq_acc,
578 -start => $start,
579 -end => $start + $length - 1,
580 -strand => $strand));
582 $aln->id($id);
583 # could also store score? of?
585 return $aln;
588 =head2 get_factor
590 Title : get_factor
591 Usage : my $factor = $obj->get_factor($id);
592 Function: Get the details of a transcription factor.
593 Returns : Bio::Map::TranscriptionFactor
594 Args : string - a factor id ('T...')
596 =cut
598 sub get_factor {
599 my ($self, $id) = @_;
600 $id || return;
601 return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
602 my $data = $self->{factor}->{data}->{$id} || return;
603 my @data = split(SEPARATOR, $data);
605 # accession = id name species sequence
606 my $tf = Bio::Map::TranscriptionFactor->get(-id => $id,
607 -universal_name => $data[1]);
608 #*** not sure what to do with species and sequence, since we don't want to
609 # confuse the idea that a TF is a general thing that could bind to any
610 # species... then again, you might want to model species-specific variants
611 # of a TF with different binding abilities...
612 #*** idea of having inclusion and exclusion species so you can prevent/
613 # ignore a tf that binds to the wrong species (a species that doesn't even
614 # have the tf), and associating sequence with each species/tf combo so you
615 # can see how diverged the tf is and make assumptions about site difference
616 # allowance
618 # place it on all its genemaps
619 foreach my $sid ($self->get_site_ids(-factor => $id)) {
620 my $s_data = $self->{site}->{data}->{$sid} || next;
621 my @s_data = split(SEPARATOR, $s_data);
623 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string
624 $s_data[1] || next; # site isn't relative to a gene, meaningless
625 $s_data[4] || next; # don't know where its supposed to be, can't model it
626 $s_data[5] ||= $s_data[4] + ($s_data[2] ? length($s_data[2]) - 1 : 0);
628 # it is quite deliberate that we deeply recurse to arrive at the
629 # correct answer, which involves pulling in most of the database
630 no warnings "recursion";
631 my $gene_map = $self->get_genemap($s_data[1]) || next;
632 return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
634 #*** not always relative to gene start...
635 # we need Bio::Map::Gene s to have some default tss and atg positions
636 # that we can be relative to
637 my $rel = new Bio::Map::Relative(-element => $gene_map->gene, -description => $s_data[3]);
638 Bio::Map::Position->new(-map => $gene_map, -element => $tf, -start => $s_data[4], -end => $s_data[5], -relative => $rel);
641 $self->{got_factor}->{$id} = $tf;
642 return $tf;
645 # since get_factor() is uncertain, just have direct access methods to factor
646 # information
647 sub get_factor_name {
648 my ($self, $id) = @_;
649 my $details = $self->_get_factor_details($id) || return;
650 return $details->{name};
652 sub get_factor_species {
653 my ($self, $id) = @_;
654 my $details = $self->_get_factor_details($id) || return;
655 return $details->{species};
657 sub get_factor_sequence {
658 my ($self, $id) = @_;
659 my $details = $self->_get_factor_details($id) || return;
660 return $details->{sequence};
662 sub _get_factor_details {
663 my ($self, $id) = @_;
664 $id || return;
666 return $self->{factor_details}->{$id} if defined $self->{factor_details}->{$id};
668 my $data = $self->{factor}->{data}->{$id} || return;
669 my @data = split(SEPARATOR, $data);
671 # accession = id name species sequence
673 my %details = (name => $data[1], species => $data[2], sequence => $data[3]);
674 $self->{factor_details}->{$id} = \%details;
676 return \%details;
679 =head2 get_reference_ids
681 Title : get_reference_ids
682 Usage : my @ids = $obj->get_reference_ids(-key => $value);
683 Function: Get all the reference ids that are associated with the supplied
684 args.
685 Returns : list of strings (ids)
686 Args : -key => value, where value is a string id, and key is one of:
687 -pubmed -site -gene -matrix -factor
689 =cut
691 sub get_reference_ids {
692 my $self = shift;
693 return $self->_get_ids('reference', @_);
696 # -id -name -species -site -factor -reference
697 sub get_gene_ids {
698 my $self = shift;
699 return $self->_get_ids('gene', @_);
702 =head2 get_site_ids
704 Title : get_site_ids
705 Usage : my @ids = $obj->get_site_ids(-key => $value);
706 Function: Get all the site ids that are associated with the supplied
707 args.
708 Returns : list of strings (ids)
709 Args : -key => value, where value is a string id, and key is one of:
710 -id -species -gene -matrix -factor -reference
712 =cut
714 sub get_site_ids {
715 my $self = shift;
716 return $self->_get_ids('site', @_);
719 =head2 get_matrix_ids
721 Title : get_matrix_ids
722 Usage : my @ids = $obj->get_matrix_ids(-key => $value);
723 Function: Get all the matrix ids that are associated with the supplied
724 args.
725 Returns : list of strings (ids)
726 Args : -key => value, where value is a string id, and key is one of:
727 -id -name -site -factor -reference
729 =cut
731 sub get_matrix_ids {
732 my $self = shift;
733 return $self->_get_ids('matrix', @_);
736 =head2 get_factor_ids
738 Title : get_factor_ids
739 Usage : my @ids = $obj->get_factor_ids(-key => $value);
740 Function: Get all the factor ids that are associated with the supplied
741 args.
742 Returns : list of strings (ids)
743 Args : -key => value, where value is a string id, and key is one of:
744 -id -name -species -interactors -gene -matrix -site -reference
745 NB: -gene only gets factor ids for genes that encode factors
747 =cut
749 sub get_factor_ids {
750 my $self = shift;
751 return $self->_get_ids('factor', @_);
754 =head2 get_fragment_ids
756 Title : get_fragment_ids
757 Usage : my @ids = $obj->get_fragment_ids(-key => $value);
758 Function: Get all the fragment ids that are associated with the supplied
759 args.
760 Returns : list of strings (ids)
761 Args : -key => value, where value is a string id, and key is one of:
762 -id -species -gene -factor -reference
764 =cut
766 sub get_fragment_ids {
767 my $self = shift;
768 return $self->_get_ids('fragment', @_);
771 =head2 Helper methods
773 =cut
775 # internal method which does the indexing
776 sub _build_index {
777 my ($self, $dat_dir, $force) = @_;
779 # MLDBM would give us transparent complex data structures with DB_File,
780 # allowing just one index file, but its yet another requirment and we
781 # don't strictly need it
783 my $index_dir = $self->index_directory;
784 my $gene_index = "$index_dir/gene.dat.index";
785 my $reference_index = "$index_dir/reference.dat.index";
786 my $matrix_index = "$index_dir/matrix.dat.index";
787 my $factor_index = "$index_dir/factor.dat.index";
788 my $fragment_index = "$index_dir/fragment.dat.index";
789 my $site_index = "$index_dir/site.dat.index";
791 my $reference_dat = "$dat_dir/reference.dat";
792 if (! -e $reference_index || $force) {
793 open(REF, $reference_dat) || $self->throw("Cannot open reference file '$reference_dat' for reading");
795 my %references;
796 unlink $reference_index;
797 my $ref = tie(%references, 'DB_File', $reference_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
799 my %pubmed;
800 my $reference_pubmed = $reference_index.'.pubmed';
801 unlink $reference_pubmed;
802 my $pub = tie(%pubmed, 'DB_File', $reference_pubmed, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
804 my %gene;
805 my $reference_gene = $gene_index.'.reference';
806 unlink $reference_gene;
807 my $gene = tie(%gene, 'DB_File', $reference_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
809 my %site;
810 my $reference_site = $site_index.'.reference';
811 unlink $reference_site;
812 my $site = tie(%site, 'DB_File', $reference_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
814 my %fragment;
815 my $reference_fragment = $fragment_index.'.reference';
816 unlink $reference_fragment;
817 my $fragment = tie(%fragment, 'DB_File', $reference_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
819 my %factor;
820 my $reference_factor = $factor_index.'.reference';
821 unlink $reference_factor;
822 my $factor = tie(%factor, 'DB_File', $reference_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
824 my %matrix;
825 my $reference_matrix = $matrix_index.'.reference';
826 unlink $reference_matrix;
827 my $matrix = tie(%matrix, 'DB_File', $reference_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
829 # skip the first three header lines
830 <REF>; <REF>; <REF>;
832 my @data;
833 while (<REF>) {
834 if (/^AC (\S+)/) {
835 $data[0] = $1;
837 elsif (/^RX PUBMED: (\d+)/) {
838 $data[1] = $1;
839 $pub->put("$1", $data[0]);
841 elsif (/^RA (.+)\n$/) {
842 $data[2] = $1;
844 elsif (/^RT (.+?)\.?\n$/) {
845 $data[3] = $1;
847 elsif (/^RL (.+?)\.?\n$/) {
848 $data[4] = $1;
850 elsif (/^GE TRANSFAC: (\w\d+)/) {
851 $gene->put($data[0], "$1");
853 elsif (/^BS TRANSFAC: (\w\d+)/) {
854 $site->put($data[0], "$1");
856 elsif (/^FA TRANSFAC: (\w\d+)/) {
857 $factor->put($data[0], "$1");
859 elsif (/^FR TRANSFAC: (FR\d+)/) {
860 $fragment->put($data[0], "$1");
862 elsif (/^MX TRANSFAC: (\w\d+)/) {
863 $matrix->put($data[0], "$1");
865 elsif (/^\/\//) {
866 # end of a record, store previous data and reset
868 # accession = pubmed authors title location
869 $references{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || ''));
871 @data = ();
874 close(REF);
876 $ref = $pub = $gene = $site = $fragment = $factor = $matrix = undef;
877 untie %references;
878 untie %pubmed;
879 untie %gene;
880 untie %site;
881 untie %fragment;
882 untie %factor;
883 untie %matrix;
886 my $gene_dat = "$dat_dir/gene.dat";
887 if (! -e $gene_index || $force) {
888 open(GEN, $gene_dat) || $self->throw("Cannot open gene file '$gene_dat' for reading");
890 my %genes;
891 unlink $gene_index;
892 my $gene = tie(%genes, 'DB_File', $gene_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
894 my %id;
895 my $gene_id = $gene_index.'.id';
896 unlink $gene_id;
897 my $id = tie(%id, 'DB_File', $gene_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
899 my %name;
900 my $gene_name = $gene_index.'.name';
901 unlink $gene_name;
902 my $name = tie(%name, 'DB_File', $gene_name, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
904 my %species;
905 my $gene_species = $gene_index.'.species';
906 unlink $gene_species;
907 my $species = tie(%species, 'DB_File', $gene_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
909 my %site;
910 my $gene_site = $site_index.'.gene';
911 unlink $gene_site;
912 my $site = tie(%site, 'DB_File', $gene_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
914 my %factor;
915 my $gene_factor = $factor_index.'.gene';
916 unlink $gene_factor;
917 my $factor = tie(%factor, 'DB_File', $gene_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
919 my %fragment;
920 my $gene_fragment = $fragment_index.'.gene';
921 unlink $gene_fragment;
922 my $fragment = tie(%fragment, 'DB_File', $gene_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
924 my %reference;
925 my $gene_reference = $reference_index.'.gene';
926 unlink $gene_reference;
927 my $reference = tie(%reference, 'DB_File', $gene_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
929 # skip the first three header lines
930 <GEN>; <GEN>; <GEN>;
932 my @data;
933 while (<GEN>) {
934 if (/^AC (\S+)/) {
935 $data[0] = $1;
937 elsif (/^ID (\S+)/) {
938 $data[1] = $1;
939 $id->put("$1", $data[0]);
941 elsif (/^SD (.+)$/) {
942 $data[2] = lc("$1");
943 $name->put(lc("$1"), $data[0]);
945 elsif (/^SY (.+)\.$/) {
946 foreach (split('; ', lc("$1"))) {
947 $name->put($_, $data[0]);
950 elsif (/^DE (.+)$/) {
951 $data[3] = $1;
953 elsif (/^OS (.+)$/) {
954 my $raw_species = $1;
955 my $taxid = $self->_species_to_taxid($raw_species);
956 $data[4] = $taxid || $raw_species;
957 $species->put($data[4], $data[0]);
959 elsif (/^RN .+?(RE\d+)/) {
960 $reference->put($data[0], "$1");
962 elsif (/^BS .+?(R\d+)/) {
963 $site->put($data[0], "$1");
965 elsif (/^FA (T\d+)/) {
966 $factor->put($data[0], "$1");
968 elsif (/^BR (FR\d+)/) {
969 $fragment->put($data[0], "$1");
971 elsif (/^\/\//) {
972 # end of a record, store previous data and reset
974 # accession = id name description species_tax_id_or_raw_string
975 $genes{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || ''));
977 @data = ();
980 close(GEN);
982 $gene = $id = $name = $species = $site = $factor = $reference = undef;
983 untie %genes;
984 untie %id;
985 untie %name;
986 untie %species;
987 untie %site;
988 untie %factor;
989 untie %reference;
992 my $site_dat = "$dat_dir/site.dat";
993 if (! -e $site_index || $force) {
994 open(SIT, $site_dat) || $self->throw("Cannot open site file '$site_dat' for reading");
996 my %sites;
997 unlink $site_index;
998 my $site = tie(%sites, 'DB_File', $site_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1000 my %id;
1001 my $site_id = $site_index.'.id';
1002 unlink $site_id;
1003 my $id = tie(%id, 'DB_File', $site_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1005 my %species;
1006 my $site_species = $site_index.'.species';
1007 unlink $site_species;
1008 my $species = tie(%species, 'DB_File', $site_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_species': $!");
1010 my %qualities;
1011 my $site_qualities = $site_index.'.qual';
1012 unlink $site_qualities;
1013 my $quality = tie(%qualities, 'DB_File', $site_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1015 my %gene;
1016 my $site_gene = $gene_index.'.site';
1017 unlink $site_gene;
1018 my $gene = tie(%gene, 'DB_File', $site_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1020 my %matrix;
1021 my $site_matrix = $matrix_index.'.site';
1022 unlink $site_matrix;
1023 my $matrix = tie(%matrix, 'DB_File', $site_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1025 my %factor;
1026 my $site_factor = $factor_index.'.site';
1027 unlink $site_factor;
1028 my $factor = tie(%factor, 'DB_File', $site_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1030 my %reference;
1031 my $site_reference = $reference_index.'.site';
1032 unlink $site_reference;
1033 my $reference = tie(%reference, 'DB_File', $site_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1035 # skip the first three header lines
1036 <SIT>; <SIT>; <SIT>;
1038 my @data;
1039 while (<SIT>) {
1040 if (/^AC (\S+)/) {
1041 $data[0] = $1;
1043 elsif (/^ID (\S+)/) {
1044 $data[1] = $1;
1045 $id->put("$1", $data[0]);
1047 elsif (/^TY (.+)$/) {
1048 $data[8] = $1;
1050 elsif (/^DE .*Gene: (G\d+)/) {
1051 $data[2] = $1;
1052 $gene->put($data[0], "$1");
1054 # if it has no gene it is an artificial sequence, unless it
1055 # has a species (OS line), in which case it is unassigned
1056 # genomic; either way we won't be able to make a
1057 # Bio::Map::PositionI later on, so such sites won't be
1058 # on any MapI.
1060 elsif (/^OS (.+)$/) {
1061 # Since not all sites in site.dat with a species have a gene,
1062 # (small handful are unassigned 'genomic') can't delegate to
1063 # gene.dat and must parse species here (effectively again)
1064 my $raw_species = $1;
1065 my $taxid = $self->_species_to_taxid($raw_species);
1066 $data[7] = $taxid || $raw_species;
1067 $species->put($data[7], $data[0]);
1069 elsif (/^SQ (.+)\.$/) {
1070 $data[3] = $1;
1071 # there can actually be more than one SQ line, seemingly with
1072 # variations of the sequence (not a long sequence split over
1073 # two lines); not sure what to do with data; currently we end
1074 # up storing only the last variant.
1076 elsif (/^S1 (.+)$/) {
1077 $data[4] = $1;
1078 # if S1 not present, means transcriptional start
1080 elsif (/^SF (.+)$/) {
1081 $data[5] = $1;
1083 elsif (/^ST (.+)$/) {
1084 $data[6] = $1;
1086 elsif (/^RN .+?(RE\d+)/) {
1087 $reference->put($data[0], "$1");
1089 elsif (/^MX (M\d+)/) {
1090 $matrix->put($data[0], "$1");
1092 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1093 $factor->put($data[0], "$1");
1094 $qualities{$data[0].SEPARATOR.$1} = $2;
1096 elsif (/^\/\//) {
1097 # end of a record, store previous data and reset
1099 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string type
1100 $sites{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || 'TSS', $data[5] || '', $data[6] || '', $data[7] || '', $data[8] || ''));
1102 @data = ();
1105 close(SIT);
1107 $site = $id = $species = $quality = $gene = $matrix = $factor = $reference = undef;
1108 untie %sites;
1109 untie %id;
1110 untie %species;
1111 untie %qualities;
1112 untie %gene;
1113 untie %matrix;
1114 untie %factor;
1115 untie %reference;
1118 my $matrix_dat = "$dat_dir/matrix.dat";
1119 if (! -e $matrix_index || $force) {
1120 open(MAT, $matrix_dat) || $self->throw("Cannot open matrix file '$matrix_dat' for reading");
1122 my %matrices;
1123 unlink $matrix_index;
1124 my $matrix = tie(%matrices, 'DB_File', $matrix_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1126 my %id;
1127 my $matrix_id = $matrix_index.'.id';
1128 unlink $matrix_id;
1129 my $id = tie(%id, 'DB_File', $matrix_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1131 my %name;
1132 my $matrix_name = $matrix_index.'.name';
1133 unlink $matrix_name;
1134 my $name = tie(%name, 'DB_File', $matrix_name, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1136 my %site;
1137 my $matrix_site = $site_index.'.matrix';
1138 unlink $matrix_site;
1139 my $site = tie(%site, 'DB_File', $matrix_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1141 my %factor;
1142 my $matrix_factor = $factor_index.'.matrix';
1143 unlink $matrix_factor;
1144 my $factor = tie(%factor, 'DB_File', $matrix_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1146 my %reference;
1147 my $matrix_reference = $reference_index.'.matrix';
1148 unlink $matrix_reference;
1149 my $reference = tie(%reference, 'DB_File', $matrix_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1151 # skip the first three header lines
1152 <MAT>; <MAT>; <MAT>;
1154 my @data;
1155 my @matrix_data;
1156 my @site_data;
1157 while (<MAT>) {
1158 if (/^AC (\S+)/) {
1159 $data[0] = $1;
1161 elsif (/^ID (\S+)/) {
1162 $data[1] = $1;
1163 $id->put("$1", $data[0]);
1165 elsif (/^NA (.+)$/) {
1166 $data[2] = $1;
1167 $name->put("$1", $data[0]);
1169 elsif (/^DE (.+)$/) {
1170 $data[3] = $1;
1172 elsif (/^\d\d \s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
1173 # a, c, g, t counts/weights
1174 push(@matrix_data, join("\t", ($1, $2, $3, $4)));
1176 # Work out the number of sites as the largest number of
1177 # sites amongst all positions in the sequences. (The BA
1178 # line isn't reliable for telling us the correct number of
1179 # sites all the time)
1180 my $num = $1 + $2 + $3 + $4;
1181 $data[4] ||= 0;
1182 if ($num > $data[4]) {
1183 $data[4] = $num;
1186 elsif (/^BS ([\sa-zA-Z]+); (.+?); (-?\d+); \d+;.*; ([np])/) {
1187 # sequence id start strand
1188 push(@site_data, join('_', ($1, $2, $3, $4 eq 'p' ? 1 : -1)));
1189 $site->put($data[0], $2);
1191 elsif (/^BF (T\d+)/) {
1192 $factor->put($data[0], "$1");
1194 elsif (/^RN .+?(RE\d+)/) {
1195 $reference->put($data[0], "$1");
1197 elsif (/^\/\//) {
1198 # end of a record, store previous data and reset
1199 my $matrix_data = join(INTERNAL_SEPARATOR, @matrix_data) || '';
1201 # sites of a matrix are pre-aligned but padded with spaces on
1202 # the left and no padding on the right; pad with -s both sides
1203 my $longest_seq = 0;
1204 foreach my $site_seq (@site_data) {
1205 $site_seq =~ s/ /-/g;
1206 my $length = length($site_seq);
1207 if ($length > $longest_seq) {
1208 $longest_seq = $length;
1211 foreach my $site_seq (@site_data) {
1212 my $length = length($site_seq);
1213 if ($length < $longest_seq) {
1214 $site_seq .= '-' x ($longest_seq - $length);
1217 my $site_data = join(INTERNAL_SEPARATOR, @site_data) || '';
1219 # accession = id name description num_of_sites matrix_data site_data
1220 $matrices{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4], $matrix_data, $site_data));
1222 @data = @matrix_data = @site_data = ();
1225 close(MAT);
1227 $matrix = $id = $name = $site = $factor = $reference = undef;
1228 untie %matrices;
1229 untie %id;
1230 untie %name;
1231 untie %site;
1232 untie %factor;
1233 untie %reference;
1236 my $factor_dat = "$dat_dir/factor.dat";
1237 if (! -e $factor_index || $force) {
1238 open(FAC, $factor_dat) || $self->throw("Cannot open factor file '$factor_dat' for reading");
1240 my %factors;
1241 unlink $factor_index;
1242 my $factor = tie(%factors, 'DB_File', $factor_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1244 my %id;
1245 my $factor_id = $factor_index.'.id';
1246 unlink $factor_id;
1247 my $id = tie(%id, 'DB_File', $factor_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1249 my %name;
1250 my $factor_name = $factor_index.'.name';
1251 unlink $factor_name;
1252 my $name = tie(%name, 'DB_File', $factor_name, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1254 my %species;
1255 my $factor_species = $factor_index.'.species';
1256 unlink $factor_species;
1257 my $species = tie(%species, 'DB_File', $factor_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1259 my %interactors;
1260 my $factor_interactors = $factor_index.'.interactors';
1261 unlink $factor_interactors;
1262 my $interact = tie(%interactors, 'DB_File', $factor_interactors, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1264 my %gene;
1265 my $factor_gene = $gene_index.'.factor';
1266 unlink $factor_gene;
1267 my $gene = tie(%gene, 'DB_File', $factor_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1269 my %matrix;
1270 my $factor_matrix = $matrix_index.'.factor';
1271 unlink $factor_matrix;
1272 my $matrix = tie(%matrix, 'DB_File', $factor_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1274 my %site;
1275 my $factor_site = $site_index.'.factor';
1276 unlink $factor_site;
1277 my $site = tie(%site, 'DB_File', $factor_site, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1279 my %fragment;
1280 my $factor_fragment = $fragment_index.'.factor';
1281 unlink $factor_fragment;
1282 my $fragment = tie(%fragment, 'DB_File', $factor_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1284 my %reference;
1285 my $factor_reference = $reference_index.'.factor';
1286 unlink $factor_reference;
1287 my $reference = tie(%reference, 'DB_File', $factor_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1289 # skip the first three header lines
1290 <FAC>; <FAC>; <FAC>;
1292 my @data;
1293 my $sequence = '';
1294 while (<FAC>) {
1295 if (/^AC (\S+)/) {
1296 $data[0] = $1;
1298 elsif (/^ID (\S+)/) {
1299 # IDs are always the same as AC? Is this needed?
1300 $data[1] = $1;
1301 $id->put("$1", $data[0]);
1303 elsif (/^FA (.+)$/) {
1304 $data[2] = $1;
1305 $name->put("$1", $data[0]);
1307 elsif (/^OS (.+)$/) {
1308 # This is the species the actual factor came from, which may
1309 # differ from the species of any sequences it is described as
1310 # binding to. Not all factors that have a species have a gene,
1311 # so can't delegate species to a gene lookup.
1312 my $raw_species = $1;
1313 my $taxid = $self->_species_to_taxid($raw_species);
1314 $data[3] = $taxid || $raw_species;
1315 $species->put($data[3], $data[0]);
1317 elsif (/^GE (G\d+)/) {
1318 $gene->put($data[0], "$1");
1320 elsif (/^SQ (.+)$/) {
1321 $sequence .= $1;
1323 elsif (/^IN (T\d+)/) {
1324 $interact->put($data[0], "$1");
1326 elsif (/^MX (M\d+)/) {
1327 $matrix->put($data[0], "$1");
1329 elsif (/^BS (R\d+)/) {
1330 $site->put($data[0], "$1");
1332 elsif (/^BR (FR\d+)/) {
1333 $fragment->put($data[0], "$1");
1335 elsif (/^RN .+?(RE\d+)/) {
1336 $reference->put($data[0], "$1");
1338 elsif (/^\/\//) {
1339 # end of a record, store previous data and reset
1341 # accession = id name species sequence
1342 $factors{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $sequence));
1344 @data = ();
1345 $sequence = '';
1348 close(FAC);
1350 $factor = $id = $name = $species = $interact = $gene = $matrix = $site = $fragment = $reference = undef;
1351 untie %factors;
1352 untie %id;
1353 untie %name;
1354 untie %species;
1355 untie %interactors;
1356 untie %gene;
1357 untie %matrix;
1358 untie %site;
1359 untie %fragment;
1360 untie %reference;
1363 my $fragment_dat = "$dat_dir/fragment.dat";
1364 if (! -e $fragment_index || $force) {
1365 if (open(FRA, $fragment_dat)) {
1366 my %fragments;
1367 unlink $fragment_index;
1368 my $fragment = tie(%fragments, 'DB_File', $fragment_index, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1370 my %id;
1371 my $fragment_id = $fragment_index.'.id';
1372 unlink $fragment_id;
1373 my $id = tie(%id, 'DB_File', $fragment_id, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1375 my %qualities;
1376 my $fragment_qualities = $fragment_index.'.qual';
1377 unlink $fragment_qualities;
1378 my $quality = tie(%qualities, 'DB_File', $fragment_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1380 my %species;
1381 my $fragment_species = $fragment_index.'.species';
1382 unlink $fragment_species;
1383 my $species = tie(%species, 'DB_File', $fragment_species, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_species': $!");
1385 my %gene;
1386 my $fragment_gene = $gene_index.'.fragment';
1387 unlink $fragment_gene;
1388 my $gene = tie(%gene, 'DB_File', $fragment_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1390 my %factor;
1391 my $fragment_factor = $factor_index.'.fragment';
1392 unlink $fragment_factor;
1393 my $factor = tie(%factor, 'DB_File', $fragment_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1395 my %reference;
1396 my $fragment_reference = $reference_index.'.fragment';
1397 unlink $fragment_reference;
1398 my $reference = tie(%reference, 'DB_File', $fragment_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1400 # skip the first three header lines
1401 <FRA>; <FRA>; <FRA>;
1403 my @data;
1404 while (<FRA>) {
1405 if (/^AC (\S+)/) {
1406 $data[0] = $1;
1408 elsif (/^ID (\S+)/) {
1409 # IDs are always the same as AC? Is this needed?
1410 $data[1] = $1;
1411 $id->put("$1", $data[0]);
1413 elsif (/^DE Gene: (G\d+)(?:.+Gene: (G\d+))?/) {
1414 my ($gene1, $gene2) = ($1, $2);
1415 $data[2] = $gene1;
1416 $data[3] = $gene2; # could be undef
1417 $gene->put($data[0], $gene1);
1418 $gene->put($data[0], $gene2) if $gene2;
1420 elsif (/^OS (.+)$/) {
1421 # As per the site.dat parsing
1422 my $raw_species = $1;
1423 my $taxid = $self->_species_to_taxid($raw_species);
1424 $data[4] = $taxid || $raw_species;
1425 $species->put($data[4], $data[0]);
1427 elsif (/^SQ [atcgn]*([ATCGN]+)[atcgn]*/) {
1428 $data[5] .= $1;
1429 # there can be (usually are) multiple SQ lines with a single
1430 # long seq split over them. The 'real' sequence is in caps
1432 elsif (/^SC Build (\S+):$/) {
1433 $data[6] = $1;
1434 # maybe parse it out a little more? We have build,
1435 # chromosomal coords and strand, eg.
1436 # SC Build HSA_May2004: Chr.2 43976692..43978487 (FORWARD).
1438 elsif (/^RN .+?(RE\d+)/) {
1439 $reference->put($data[0], "$1");
1441 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1442 $factor->put($data[0], "$1");
1443 $qualities{$data[0].SEPARATOR.$1} = $2;
1445 elsif (/^\/\//) {
1446 # end of a record, store previous data and reset
1448 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
1449 $fragments{$data[0]} = join(SEPARATOR, ($data[1] || '', $data[2] || '', $data[3] || '', $data[4] || '', $data[5] || '', $data[6] || ''));
1451 @data = ();
1454 close(FRA);
1456 $fragment = $id = $species = $quality = $gene = $factor = $reference = undef;
1457 untie %fragments;
1458 untie %id;
1459 untie %species;
1460 untie %qualities;
1461 untie %gene;
1462 untie %factor;
1463 untie %reference;
1465 else {
1466 $self->warn("Cannot open fragment file '$fragment_dat' for reading, assuming you have an old version of Transfac Pro with no fragment.dat file.");
1471 # connect the internal db handle
1472 sub _db_connect {
1473 my $self = shift;
1474 return if $self->{'_initialized'};
1476 my $index_dir = $self->index_directory;
1477 my $gene_index = "$index_dir/gene.dat.index";
1478 my $reference_index = "$index_dir/reference.dat.index";
1479 my $matrix_index = "$index_dir/matrix.dat.index";
1480 my $factor_index = "$index_dir/factor.dat.index";
1481 my $site_index = "$index_dir/site.dat.index";
1482 my $fragment_index = "$index_dir/fragment.dat.index";
1484 foreach ($gene_index, $reference_index, $matrix_index, $factor_index, $site_index, $fragment_index) {
1485 if (! -e $_) {
1486 #$self->warn("Index files have not been created");
1487 #return 0;
1491 # reference
1493 $self->{reference}->{data} = {};
1494 tie (%{$self->{reference}->{data}}, 'DB_File', $reference_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
1496 my $reference_pubmed = $reference_index.'.pubmed';
1497 $self->{reference}->{pubmed} = tie (%{$self->{reference}->{pubmed}}, 'DB_File', $reference_pubmed, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
1499 my $reference_gene = $gene_index.'.reference';
1500 $self->{gene}->{reference} = tie (%{$self->{gene}->{reference}}, 'DB_File', $reference_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
1502 my $reference_site = $site_index.'.reference';
1503 $self->{site}->{reference} = tie (%{$self->{site}->{reference}}, 'DB_File', $reference_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
1505 my $reference_fragment = $fragment_index.'.reference';
1506 $self->{fragment}->{reference} = tie (%{$self->{fragment}->{reference}}, 'DB_File', $reference_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
1508 my $reference_factor = $factor_index.'.reference';
1509 $self->{factor}->{reference} = tie (%{$self->{factor}->{reference}}, 'DB_File', $reference_factor, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
1511 my $reference_matrix = $matrix_index.'.reference';
1512 $self->{matrix}->{reference} = tie (%{$self->{matrix}->{reference}}, 'DB_File', $reference_matrix, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
1515 # gene
1517 $self->{gene}->{data} = {};
1518 tie (%{$self->{gene}->{data}}, 'DB_File', $gene_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
1520 my $gene_id = $gene_index.'.id';
1521 $self->{gene}->{id} = tie(%{$self->{gene}->{id}}, 'DB_File', $gene_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
1523 my $gene_name = $gene_index.'.name';
1524 $self->{gene}->{name} = tie(%{$self->{gene}->{name}}, 'DB_File', $gene_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
1526 my $gene_species = $gene_index.'.species';
1527 $self->{gene}->{species} = tie(%{$self->{gene}->{species}}, 'DB_File', $gene_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
1529 my $gene_site = $site_index.'.gene';
1530 $self->{site}->{gene} = tie(%{$self->{site}->{gene}}, 'DB_File', $gene_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
1532 my $gene_fragment = $fragment_index.'.gene';
1533 $self->{fragment}->{gene} = tie(%{$self->{fragment}->{gene}}, 'DB_File', $gene_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
1535 my $gene_factor = $factor_index.'.gene';
1536 $self->{factor}->{gene} = tie(%{$self->{factor}->{gene}}, 'DB_File', $gene_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
1538 my $gene_reference = $reference_index.'.gene';
1539 $self->{reference}->{gene} = tie(%{$self->{reference}->{gene}}, 'DB_File', $gene_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
1542 # site
1544 $self->{site}->{data} = {};
1545 tie (%{$self->{site}->{data}}, 'DB_File', $site_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1547 my $site_id = $site_index.'.id';
1548 $self->{site}->{id} = tie(%{$self->{site}->{id}}, 'DB_File', $site_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1550 my $site_species = $site_index.'.species';
1551 $self->{site}->{species} = tie(%{$self->{site}->{species}}, 'DB_File', $site_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $site_species': $!");
1553 #*** quality not actually used by anything (yet)
1554 my $site_qualities = $site_index.'.qual';
1555 $self->{quality} = {};
1556 tie(%{$self->{quality}}, 'DB_File', $site_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1558 my $site_gene = $gene_index.'.site';
1559 $self->{gene}->{site} = tie(%{$self->{gene}->{site}}, 'DB_File', $site_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1561 my $site_matrix = $matrix_index.'.site';
1562 $self->{matrix}->{site} = tie(%{$self->{matrix}->{site}}, 'DB_File', $site_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1564 my $site_factor = $factor_index.'.site';
1565 $self->{factor}->{site} = tie(%{$self->{factor}->{site}}, 'DB_File', $site_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1567 my $site_reference = $reference_index.'.site';
1568 $self->{reference}->{site} = tie(%{$self->{reference}->{site}}, 'DB_File', $site_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1571 # fragment (may not be in older databases)
1572 if (-e $fragment_index) {
1573 $self->{fragment}->{data} = {};
1574 tie (%{$self->{fragment}->{data}}, 'DB_File', $fragment_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1576 my $fragment_id = $fragment_index.'.id';
1577 $self->{fragment}->{id} = tie(%{$self->{fragment}->{id}}, 'DB_File', $fragment_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1579 my $fragment_species = $fragment_index.'.species';
1580 $self->{fragment}->{species} = tie(%{$self->{fragment}->{species}}, 'DB_File', $fragment_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $fragment_species': $!");
1582 #*** quality not actually used by anything (yet)
1583 my $fragment_qualities = $fragment_index.'.qual';
1584 $self->{fragment_quality} = {};
1585 tie(%{$self->{fragment_quality}}, 'DB_File', $fragment_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1587 my $fragment_gene = $gene_index.'.fragment';
1588 $self->{gene}->{fragment} = tie(%{$self->{gene}->{fragment}}, 'DB_File', $fragment_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1590 my $fragment_factor = $factor_index.'.fragment';
1591 $self->{factor}->{fragment} = tie(%{$self->{factor}->{fragment}}, 'DB_File', $fragment_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1593 my $fragment_reference = $reference_index.'.fragment';
1594 $self->{reference}->{fragment} = tie(%{$self->{reference}->{fragment}}, 'DB_File', $fragment_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1596 else {
1597 die "no fragment_index at '$fragment_index'\n";
1600 # matrix
1602 $self->{matrix}->{data} = {};
1603 tie (%{$self->{matrix}->{data}}, 'DB_File', $matrix_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1605 my $matrix_id = $matrix_index.'.id';
1606 $self->{matrix}->{id} = tie(%{$self->{matrix}->{id}}, 'DB_File', $matrix_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1608 my $matrix_name = $matrix_index.'.name';
1609 $self->{matrix}->{name} = tie(%{$self->{matrix}->{name}}, 'DB_File', $matrix_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1611 my $matrix_site = $site_index.'.matrix';
1612 $self->{site}->{matrix} = tie(%{$self->{site}->{matrix}}, 'DB_File', $matrix_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1614 my $matrix_factor = $factor_index.'.matrix';
1615 $self->{factor}->{matrix} = tie(%{$self->{factor}->{matrix}}, 'DB_File', $matrix_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1617 my $matrix_reference = $reference_index.'.matrix';
1618 $self->{reference}->{matrix} = tie(%{$self->{reference}->{matrix}}, 'DB_File', $matrix_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1621 # factor
1623 $self->{factor}->{data} = {};
1624 tie (%{$self->{factor}->{data}}, 'DB_File', $factor_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1626 my $factor_id = $factor_index.'.id';
1627 $self->{factor}->{id} = tie(%{$self->{factor}->{id}}, 'DB_File', $factor_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1629 my $factor_name = $factor_index.'.name';
1630 $self->{factor}->{name} = tie(%{$self->{factor}->{name}}, 'DB_File', $factor_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1632 my $factor_species = $factor_index.'.species';
1633 $self->{factor}->{species} = tie(%{$self->{factor}->{species}}, 'DB_File', $factor_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1635 my $factor_interactors = $factor_index.'.interactors';
1636 $self->{factor}->{interactors} = tie(%{$self->{factor}->{interactors}}, 'DB_File', $factor_interactors, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1638 my $factor_gene = $gene_index.'.factor';
1639 $self->{gene}->{factor} = tie(%{$self->{gene}->{factor}}, 'DB_File', $factor_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1641 my $factor_matrix = $matrix_index.'.factor';
1642 $self->{matrix}->{factor} = tie(%{$self->{matrix}->{factor}}, 'DB_File', $factor_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1644 my $factor_site = $site_index.'.factor';
1645 $self->{site}->{factor} = tie(%{$self->{site}->{factor}}, 'DB_File', $factor_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1647 my $factor_fragment = $fragment_index.'.factor';
1648 $self->{fragment}->{factor} = tie(%{$self->{fragment}->{factor}}, 'DB_File', $factor_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1650 my $factor_reference = $reference_index.'.factor';
1651 $self->{reference}->{factor} = tie(%{$self->{reference}->{factor}}, 'DB_File', $factor_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1654 $self->{'_initialized'} = 1;
1657 =head2 index_directory
1659 Title : index_directory
1660 Funtion : Get/set the location that index files are stored. (this module
1661 will index the supplied database)
1662 Usage : $obj->index_directory($newval)
1663 Returns : value of index_directory (a scalar)
1664 Args : on set, new value (a scalar or undef, optional)
1666 =cut
1668 sub index_directory {
1669 my $self = shift;
1670 return $self->{'index_directory'} = shift if @_;
1671 return $self->{'index_directory'};
1674 # resolve a transfac species string into an ncbi taxid
1675 sub _species_to_taxid {
1676 my ($self, $raw_species) = @_;
1677 $raw_species or return;
1679 my $species_string;
1680 my @split = split(', ', $raw_species);
1681 (@split > 1) ? ($species_string = $split[1]) : ($species_string = $split[0]);
1683 my $ncbi_taxid;
1684 if ($species_string =~ /^[A-Z]\S+ \S+$/) {
1685 SWITCH: for ($species_string) {
1686 # some species don't classify so custom handling
1687 /^Darnel ryegrass/ && do { $ncbi_taxid = 34176; last; };
1688 /^Coix lacryma/ && do { $ncbi_taxid = 4505; last; };
1689 /^Rattus spec/ && do { $ncbi_taxid = 10116; last; };
1690 /^Mus spec/ && do { $ncbi_taxid = 10090; last; };
1691 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1692 /^Cavia sp/ && do { $ncbi_taxid = 10141; last; };
1693 /^Marsh marigold/ && do { $ncbi_taxid = 3449; last; };
1694 /^Phalaenopsis sp/ && do { $ncbi_taxid = 36900; last; };
1695 /^Anthirrhinum majus/ && do { $ncbi_taxid = 4151; last; };
1696 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1697 /^Lycopodium spec/ && do { $ncbi_taxid = 13840; last; };
1698 /^Autographa californica/ && do { $ncbi_taxid = 307456; last; };
1699 /^E26 AEV/ && do { $ncbi_taxid = 31920; last; };
1700 /^Pseudocentrotus miliaris/ && do { $ncbi_taxid = 7677; last; }; # the genus is 7677 but this species isn't there
1701 /^SL3-3 (?:retro)?virus/ && do { $ncbi_taxid = 53454; last; }; # 53454 is unclassified MLV-related, SL3-3 a variant of that?
1702 /^Petunia sp/ && do { $ncbi_taxid = 4104; last; };
1704 if (! $ncbi_taxid && defined $self->{_tax_db}) {
1705 ($ncbi_taxid) = $self->{_tax_db}->get_taxonids($species_string);
1708 else {
1709 # some species lines are poorly formated so custom handling
1710 SWITCH: for ($raw_species) {
1711 # for speed, go by common first letters
1712 my $first_letter = substr($raw_species, 0, 1);
1714 $first_letter eq 'A' && do {
1715 /^Adiantum raddianum/ && do { $ncbi_taxid = 32168; last; };
1716 /^Avian sarcoma virus \(strain 17\)/ && do { $ncbi_taxid = 11877; last; };
1717 /^AMV/ && do { $ncbi_taxid = 11866; last; };
1718 /^AEV/ && do { $ncbi_taxid = 11861; last; };
1719 /^AS42|^Avian musculoaponeurotic/ && do { $ncbi_taxid = 11873; last; };
1720 /^Avian myelocytomatosis/ && do { $ncbi_taxid = 11869; last; };
1721 /^ASV 31/ && do { $ncbi_taxid = 35270; last; };
1722 /^A-MuLV/ && do { $ncbi_taxid = 188539; last; };
1723 /^Asparagus officinalis/ && do { $ncbi_taxid = 4686; last; };
1724 /^Agrobacterium tumefaciens/ && do { $ncbi_taxid = 358; last; };
1725 /^ALV/ && do { $ncbi_taxid = 11864; last; };
1726 /^AAV/ && do { $ncbi_taxid = 272636; last; };
1727 /^AKV MLV/ && do { $ncbi_taxid = 11791; last; };
1728 last;
1731 $first_letter eq 'B' && do {
1732 /^BPV-1/ && do { $ncbi_taxid = 10559; last; };
1733 /^BKV/ && do { $ncbi_taxid = 10629; last; };
1734 /^Bolivian squirrel monkey/ && do { $ncbi_taxid = 39432; last; };
1735 last;
1738 $first_letter eq 'C' && do {
1739 /^Cauliflower/ && do { $ncbi_taxid = 3715; last; };
1740 /^Chamek/ && do { $ncbi_taxid = 118643; last; };
1741 /^Candida albicans/ && do { $ncbi_taxid = 5476; last; };
1742 /^CaMV/ && do { $ncbi_taxid = 10641; last; };
1743 last;
1746 $first_letter eq 'E' && do {
1747 /^Eucalyptus gunnii/ && do { $ncbi_taxid = 3933; last; };
1748 /^EBV, Epstein-Barr virus/ && do { $ncbi_taxid = 10376; last; };
1749 /^Eucalyptus globulus subsp. bicostata/ && do { $ncbi_taxid = 71272; last; };
1750 /^Eucalyptus globulus subsp. globulus/ && do { $ncbi_taxid = 71271; last; };
1751 last;
1754 $first_letter eq 'F' && do {
1755 /^FBR MuLV/ && do { $ncbi_taxid = 11806; last; };
1756 /^FBJ MuLV/ && do { $ncbi_taxid = 11805; last; };
1757 /^FeLV|Feline leukemia/ && do { $ncbi_taxid = 11923; last; };
1758 /^Flaveria trinervia/ && do { $ncbi_taxid = 4227; last; };
1759 /^FSV/ && do { $ncbi_taxid = 11885; last; };
1760 /^F-MuLV/ && do { $ncbi_taxid = 11795; last; };
1761 last;
1764 $first_letter eq 'H' && do {
1765 /^HSV-1/ && do { $ncbi_taxid = 10298; last; };
1766 /^HTLV-I/ && do { $ncbi_taxid = 11908; last; };
1767 /^HIV-1/ && do { $ncbi_taxid = 11676; last; };
1768 /^HPV-16/ && do { $ncbi_taxid = 333760; last; };
1769 /^HBV/ && do { $ncbi_taxid = 10407; last; };
1770 /^HBI/ && do { $ncbi_taxid = 11867; last; };
1771 /^HPV-8/ && do { $ncbi_taxid = 10579; last; };
1772 /^HPV-11/ && do { $ncbi_taxid = 10580; last; };
1773 /^HPV-18/ && do { $ncbi_taxid = 333761; last; };
1774 /^HCMV/ && do { $ncbi_taxid = 10359; last; };
1775 /^HSV/ && do { $ncbi_taxid = 126283; last; };
1776 /^HSV-2/ && do { $ncbi_taxid = 10310; last; };
1777 /^HCV/ && do { $ncbi_taxid = 11108; last; };
1778 /^HIV-2/ && do { $ncbi_taxid = 11709; last; };
1779 last;
1782 $first_letter eq 'M' && do {
1783 /^MMTV/ && do { $ncbi_taxid = 11757; last; };
1784 /^Mo-MuLV/ && do { $ncbi_taxid = 11801; last; };
1785 /^MuLV/ && do { $ncbi_taxid = 11786; last; };
1786 /^MSV/ && do { $ncbi_taxid = 11802; last; };
1787 /^MC29/ && do { $ncbi_taxid = 11868; last; };
1788 /^MVM/ && do { $ncbi_taxid = 10794; last; };
1789 /^MH2E21/ && do { $ncbi_taxid = 11955; last; }; # 11955 is a species, presumably MH2E21 is the strain
1790 last;
1793 $first_letter eq 'R' && do {
1794 /^Raphanus sativus/ && do { $ncbi_taxid = 3726; last; };
1795 /^REV-T/ && do { $ncbi_taxid = 11636; last; };
1796 /^RAV-0/ && do { $ncbi_taxid = 11867; last; }; # should be rous-associated virus 0 variant
1797 /^RSV/ && do { $ncbi_taxid = 11886; last; };
1798 /^RadLV/ && do { $ncbi_taxid = 31689; last; };
1799 /^RTBV/ && do { $ncbi_taxid = 10654; last; };
1800 last;
1803 $first_letter eq 'S' && do {
1804 /^SV40/ && do { $ncbi_taxid = 10633; last; };
1805 /^Sesbania rostrata/ && do { $ncbi_taxid = 3895; last; };
1806 /^SIV/ && do { $ncbi_taxid = 11723; last; };
1807 /^Spinacia oleracea/ && do { $ncbi_taxid = 3562; last; };
1808 /^SCMV/ && do { $ncbi_taxid = 10364; last; }; # supposed to be AGM isolate
1809 last;
1812 # and lower case
1813 $first_letter eq 'a' && do {
1814 /^adenovirus type 5/ && do { $ncbi_taxid = 28285; last; };
1815 /^adenovirus type 2/ && do { $ncbi_taxid = 10515; last; };
1816 /^adenovirus/ && do { $ncbi_taxid = 189831; last; }; # 189831 ('unclassified Adenoviridae') is the closest I can get, but this has no genus and is not a species
1817 last;
1820 $first_letter eq 'b' && do {
1821 /^bell pepper/ && do { $ncbi_taxid = 4072; last; };
1822 /^baculovirus, Autographa californica/ && do { $ncbi_taxid = 46015; last; };
1823 /^broccoli/ && do { $ncbi_taxid = 36774; last; };
1824 /^barley/ && do { $ncbi_taxid = 112509; last; };
1825 last;
1828 $first_letter eq 'c' && do {
1829 /^clawed frog/ && do { $ncbi_taxid = 8355; last; };
1830 /^chipmunk/ && do { $ncbi_taxid = 64680; last; };
1831 /^common tree shrew/ && do { $ncbi_taxid = 37347; last; };
1832 /^cat/ && do { $ncbi_taxid = 9685; last; };
1833 last;
1836 # and misc
1837 /^NK24/ && do { $ncbi_taxid = 11955; last; };
1838 /^OK10/ && do { $ncbi_taxid = 11871; last; };
1839 /^Dendrobium grex/ && do { $ncbi_taxid = 84618; last; };
1840 /^KSHV/ && do { $ncbi_taxid = 37296; last; };
1841 /^Oncidium/ && do { $ncbi_taxid = 96474; last; };
1842 /^Japanese quail/ && do { $ncbi_taxid = 93934; last; };
1843 /^Nile tilapia/ && do { $ncbi_taxid = 8128; last; };
1844 /^GALV/ && do { $ncbi_taxid = 11840; last; };
1845 /^JCV/ && do { $ncbi_taxid = 10632; last; };
1846 /^LPV/ && do { $ncbi_taxid = 10574; last; };
1847 /^Py,/ && do { $ncbi_taxid = 36362; last; };
1848 /^DHBV/ && do { $ncbi_taxid = 12639; last; };
1849 /^VZV/ && do { $ncbi_taxid = 10335; last; };
1850 /^Vicia faba/ && do { $ncbi_taxid = 3906; last; };
1852 /^hamster/ && do { $ncbi_taxid = 10029; last; };
1853 /^sea urchin/ && do { $ncbi_taxid = 7668; last; };
1854 /^fruit fly/ && do { $ncbi_taxid = 7227; last; };
1855 /^halibut/ && do { $ncbi_taxid = 8267; last; };
1856 /^vaccinia virus/ && do { $ncbi_taxid = 10245; last; };
1857 /^taxonomic class Mammalia/ && do { $ncbi_taxid = 40674; last; }; # not a species
1858 /^taxonomic class Vertebrata/ && do { $ncbi_taxid = 7742; last; }; # not a species
1859 /^dog/ && do { $ncbi_taxid = 9615; last; };
1860 /^parsley/ && do { $ncbi_taxid = 4043; last; };
1861 /^mouse, Mus domesticus Torino/ && do { $ncbi_taxid = 10092; last; }; # 10092 is domesticus subspecies, but not the Torino strain
1862 /^lemur, Eulemur fulvus collaris/ && do { $ncbi_taxid = 47178; last; };
1863 /^red sea bream/ && do { $ncbi_taxid = 143350; last; };
1864 /^zebra finch/ && do { $ncbi_taxid = 59729; last; };
1865 /^mung bean/ && do { $ncbi_taxid = 3916; last; };
1866 /^soybean/ && do { $ncbi_taxid = 3847; last; };
1867 /^oat/ && do { $ncbi_taxid = 4498; last; };
1868 /^pseudorabies virus/ && do { $ncbi_taxid = 10345; last; };
1872 $self->warn("Didn't know what species '$raw_species' was, unable to classify") unless $ncbi_taxid;
1873 return $ncbi_taxid;