Massive check of file open lines. Changed bareword filehandles
[bioperl-live.git] / Bio / DB / TFBS / transfac_pro.pm
blob2b7ded80eb0f7c5a72d5e62ade75852341bab1b9
1 # $Id: transfac_pro.pm,v 1.15 2006/08/12 11:00:03 sendu Exp $
3 # BioPerl module for Bio::DB::TFBS::transfac_pro
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sendu Bala <bix@sendu.me.uk>
9 # Copyright Sendu Bala
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::DB::TFBS::transfac_pro - An implementation of Bio::DB::TFBS
18 which uses local flat files for transfac pro
20 =head1 SYNOPSIS
22 use Bio::DB::Taxonomy;
24 my $db = new Bio::DB::Taxonomy(-source => 'transfac_pro'
25 -dat_dir => $directory);
27 # we're interested in the gene P5
28 my ($gene_id) = $db->get_gene_ids(-name => 'P5'); # G000001
30 # we want all the transcription factors that bind to our gene
31 my @factor_ids = $db->get_factor_ids(-gene => $gene_id);
33 # get info about those TFs
34 foreach my $factor_id (@factor_ids) {
35 my $factor = $db->get_factor($factor_id);
36 my $name = $factor->universal_name;
37 # etc. - see Bio::Map::TranscriptionFactor, eg. find out where it binds
40 # get a matrix
41 my $matrix = $db->get_matrix('M00001');
43 # get a binding site sequence
44 my $seq = $db->get_site('R00001');
46 =head1 DESCRIPTION
48 This is an implementation which uses local flat files and the DB_File
49 module RECNO data structures to manage a local copy of the Transfac Pro TFBS
50 database.
52 Required database files require a license which can be obtained via
53 http://www.biobase-international.com/pages/index.php?id=170
55 Within the linux installation tarball you will find a cgibin tar ball, and
56 inside that is a data directory containing the .dat files needed by this
57 module. Point to that data directory with -dat_dir
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to
65 the Bioperl mailing list. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
70 =head2 Support
72 Please direct usage questions or support issues to the mailing list:
74 I<bioperl-l@bioperl.org>
76 rather than to the module maintainer directly. Many experienced and
77 reponsive experts will be able look at the problem and quickly
78 address it. Please include a thorough description of the problem
79 with code and data examples if at all possible.
81 =head2 Reporting Bugs
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 of the bugs and their resolution. Bug reports can be submitted via
85 the web:
87 https://redmine.open-bio.org/projects/bioperl/
89 =head1 AUTHOR - Sendu Bala
91 Email bix@sendu.me.uk
93 =head1 CONTRIBUTORS
95 Based on Bio::DB::Taxonomy::flatfile by Jason Stajich
97 =head1 APPENDIX
99 The rest of the documentation details each of the object methods.
100 Internal methods are usually preceded with a _
102 =cut
104 # Let the code begin...
106 package Bio::DB::TFBS::transfac_pro;
107 use strict;
108 use Bio::Annotation::Reference;
109 use Bio::Annotation::SimpleValue;
110 use Bio::LocatableSeq;
111 use Bio::SimpleAlign;
112 use Bio::Matrix::PSM::SiteMatrix;
113 use Bio::AlignIO;
114 use Bio::Map::GeneMap;
115 use Bio::Map::TranscriptionFactor;
116 use Bio::Map::Position;
117 use Bio::Map::Relative;
118 use DB_File;
120 use constant SEPARATOR => ':!:';
121 use constant INTERNAL_SEPARATOR => '!:!';
123 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
125 use base qw(Bio::DB::TFBS);
127 =head2 new
129 Title : new
130 Usage : my $obj = new Bio::DB::TFBS::transfac_pro();
131 Function: Builds a new Bio::DB::TFBS::transfac_pro object
132 Returns : an instance of Bio::DB::TTFBS::transfac_pro
133 Args : -dat_dir => name of directory where Transfac Pro .dat files
134 (required to initially build indexes)
135 -tax_db => Bio::DB::Taxonomy object, used when initially building
136 indexes, gives better results for species information
137 but not required.
138 -index_dir => name of directory where index files should be created
139 or already exist. (defaults to -dat_dir, required if
140 -dat_dir not supplied)
141 -force => 1 replace current indexes even if they exist
143 =cut
145 sub new {
146 my ($class, @args) = @_;
148 my $self = $class->SUPER::new(@args);
150 my ($dat_dir, $index_dir, $tax_db, $force) = $self->_rearrange([qw(DAT_DIR INDEX_DIR TAX_DB FORCE)], @args);
151 $self->throw("At least one of -dat_dir and -index_dir must be supplied") unless ($dat_dir || $index_dir);
153 $self->index_directory($index_dir || $dat_dir);
154 $self->{_tax_db} = $tax_db if $tax_db;
156 if ($dat_dir) {
157 $self->_build_index($dat_dir, $force);
160 $self->_db_connect;
161 return $self;
164 =head2 Bio::DB::TFBS Interface implementation
166 =cut
168 sub _get_ids {
169 my ($self, $dat, @args) = @_;
170 @args % 2 == 0 || $self->throw("Must provide key => value pairs");
171 my $hash = $self->{$dat} || $self->throw("Unknown .dat type '$dat'");
173 if (@args) {
174 # get a subset corresponding to args
175 my @final;
176 my %args = @args;
177 my $multiple = 0;
178 while (my ($type, $value) = each %args) {
179 unless ($value) {
180 $self->warn("Arguement '$type' has no value, ignored");
181 next;
183 $type =~ s/-//;
184 $type = lc($type);
185 my $converter = $hash->{$type};
186 unless ($converter) {
187 $self->warn("Unknown search type '$type' for .dat type '$dat'");
188 next;
191 my @ids = $converter->get_dup($value);
192 unless (@ids) {
193 @ids = $converter->get_dup(lc($value));
196 if ($multiple) {
197 # we can have multiple types given at once, find the ids that
198 # satisfy all criteria
199 @final || return;
200 my %final = map { $_ => 1 } @final;
201 @final = grep { $final{$_} } @ids;
203 else {
204 @final = @ids;
205 $multiple++;
209 return @final;
211 else {
212 # get them all
213 my $db_file_hash = $self->{$dat}->{id};
215 my ($key, $prev_key, $value) = ('_!_', '!_!');
216 my @ids;
217 while (1) {
218 $db_file_hash->seq($key, $value, R_NEXT);
219 last if $prev_key eq $key;
220 push(@ids, $value); # confusing? when creating objects we store
221 # $value as accession and $key as id, but from
222 # this method we return $value as id given $id!
223 $prev_key = $key;
226 return @ids;
230 =head2 get_reference
232 Title : get_reference
233 Usage : my $ref = $obj->get_reference($id);
234 Function: Get a literature reference.
235 Returns : Bio::Annotation::Reference
236 Args : string - a reference id ('RE...')
238 =cut
240 sub get_reference {
241 my ($self, $id) = @_;
242 $id || return;
243 my $data = $self->{reference}->{data}->{$id} || return;
244 my @data = split(SEPARATOR, $data);
246 return Bio::Annotation::Reference->new(-pubmed => $data[0],
247 -authors => $data[1],
248 -title => $data[2],
249 -location => $data[3] );
252 =head2 get_genemap
254 Title : get_genemap
255 Usage : my $map = $obj->get_genemap($id);
256 Function: Get a GeneMap for a gene.
257 Returns : Bio::Map::GeneMap
258 Args : string - a gene id ('G...'), and optionally int (number of bp
259 upstream)
261 =cut
263 sub get_genemap {
264 my ($self, $id, $upstream) = @_;
265 $id || return;
266 return $self->{got_map}->{$id} if defined $self->{got_map}->{$id};
267 $upstream ||= 1000;
268 my $data = $self->{gene}->{data}->{$id} || return;
269 my @data = split(SEPARATOR, $data);
271 # accession = id name description species_tax_id_or_raw_string
272 my $taxon = $self->{_tax_db} ? $self->{_tax_db}->get_taxon($data[3]) || $data[3] : $data[3];
273 my $map = Bio::Map::GeneMap->get(-uid => $id,
274 -gene => $data[1],
275 -species => $taxon,
276 -description => $data[2],
277 -upstream => $upstream);
278 $self->{got_map}->{$id} = $map; # prevents infinite recurse when we call get_factor below
280 # spawn all the factors that belong on this gene map
281 # get_factor_ids(-gene => ...) only works for genes that encode factors;
282 # have to go via sites
283 foreach my $sid ($self->get_site_ids(-gene => $id)) {
284 foreach my $fid ($self->get_factor_ids(-site => $sid)) {
285 # it is quite deliberate that we deeply recurse to arrive at the
286 # correct answer, which involves pulling in most of the database
287 no warnings "recursion";
288 $self->get_factor($fid);
292 return $map;
295 =head2 get_seq
297 Title : get_seq
298 Usage : my $seq = $obj->get_seq($id);
299 Function: Get the sequence of a site. The sequence will be annotated with the
300 the tags 'relative_start', 'relative_end', 'relative_type' and
301 'relative_to'.
302 Returns : Bio::Seq
303 Args : string - a site id ('R...')
305 =cut
307 sub get_seq {
308 my ($self, $id) = @_;
309 $id || return;
310 my $data = $self->{site}->{data}->{$id} || return;
311 my @data = split(SEPARATOR, $data);
313 my $seq = Bio::Seq->new(-seq => $data[2],
314 -accession_number => $id,
315 -description => $data[6] ? 'Genomic sequence' : 'Consensus or artificial sequence',
316 -id => $data[0],
317 -strand => 1,
318 -alphabet => $data[7] || 'dna',
319 -species => $data[6]);
321 my $annot = $seq->annotation;
322 my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_start', -value => $data[4] || 1);
323 $annot->add_Annotation($sv);
324 $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_end', -value => $data[5] || ($data[4] || 1 + length($data[2]) - 1));
325 $annot->add_Annotation($sv);
326 $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_type', -value => $data[3] || 'artificial');
327 $annot->add_Annotation($sv);
328 $sv = Bio::Annotation::SimpleValue->new(-tagname => 'relative_to', -value => $data[1]);
329 $annot->add_Annotation($sv);
331 return $seq;
334 =head2 get_fragment
336 Title : get_fragment
337 Usage : my $seq = $obj->get_fragment($id);
338 Function: Get the sequence of a fragment.
339 Returns : Bio::Seq
340 Args : string - a site id ('FR...')
342 =cut
344 sub get_fragment {
345 my ($self, $id) = @_;
346 $id || return;
347 my $data = $self->{fragment}->{data}->{$id} || return;
348 my @data = split(SEPARATOR, $data);
350 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
351 return new Bio::Seq( -seq => $data[4],
352 -accession_number => $id,
353 -description => 'Between genes '.$data[1].' and '.$data[2],
354 -species => $data[3],
355 -id => $data[0],
356 -alphabet => 'dna' );
359 =head2 get_matrix
361 Title : get_matrix
362 Usage : my $matrix = $obj->get_matrix($id);
363 Function: Get a matrix that describes a binding site.
364 Returns : Bio::Matrix::PSM::SiteMatrix
365 Args : string - a matrix id ('M...'), optionally a sequence string from
366 which base frequencies will be calcualted for the matrix model
367 (default 0.25 each)
369 =cut
371 sub get_matrix {
372 my ($self, $id, $seq) = @_;
373 $id || return;
374 $seq ||= 'atgc';
375 $seq = lc($seq);
376 my $data = $self->{matrix}->{data}->{$id} || return;
377 my @data = split(SEPARATOR, $data);
378 $data[4] || $self->throw("Matrix data missing for $id");
380 my ($a, $c, $g, $t);
381 foreach my $position (split(INTERNAL_SEPARATOR, $data[4])) {
382 my ($a_count, $c_count, $g_count, $t_count) = split("\t", $position);
383 push(@{$a}, $a_count);
384 push(@{$c}, $c_count);
385 push(@{$g}, $g_count);
386 push(@{$t}, $t_count);
389 # our psms include a simple background model so we can use
390 # sequence_match_weight() if desired
391 my $a_freq = ($seq =~ tr/a//) / length($seq);
392 my $c_freq = ($seq =~ tr/c//) / length($seq);
393 my $g_freq = ($seq =~ tr/g//) / length($seq);
394 my $t_freq = ($seq =~ tr/t//) / length($seq);
396 my $psm = Bio::Matrix::PSM::SiteMatrix->new(-pA => $a,
397 -pC => $c,
398 -pG => $g,
399 -pT => $t,
400 -id => $data[0],
401 -accession_number => $id,
402 -sites => $data[3],
403 -width => scalar(@{$a}),
404 -correction => 1,
405 -model => { A => $a_freq, C => $c_freq, G => $g_freq, T => $t_freq } );
407 #*** used to make a Bio::Matrix::PSM::Psm and add references, but it
408 # didn't seem worth it. You can get references from the database by:
409 #foreach my $ref_id ($db->get_reference_ids(-matrix => $id)) {
410 # my $ref = $db->get_reference($ref_id);
413 return $psm;
416 =head2 get_aln
418 Title : get_aln
419 Usage : my $aln = $obj->get_aln($id);
420 Function: Get the alignment that was used to generate a matrix. Each sequence
421 in the alignment will have an accession_number corresponding to the
422 Transfac site id, and id() based on that but unique within the
423 alignment.
424 Returns : Bio::SimpleAlign
425 Args : string - a matrix id ('M...'), optionally true to, when a matrix
426 lists no sequences, search for sequences via the matrix's factors,
427 picking the sites that best match the matrix
429 =cut
431 my %VALID_STRAND = map {$_ => 1} qw(-1 0 1);
433 sub get_aln {
434 my ($self, $id, $via_factors) = @_;
435 $id || return;
436 my $data = $self->{matrix}->{data}->{$id} || $self->throw("matrix '$id' had no data in DB_File");
437 my @data = split(SEPARATOR, $data);
439 if (! $data[5] && $via_factors) {
440 # This is a matrix with no site sequences given in matrix.dat.
441 # Find some matching site sequences via factors.
443 # First, check its factors for sites
444 my %site_seqs;
445 my %factor_ids;
446 foreach my $factor_id ($self->get_factor_ids(-matrix => $id)) {
447 $factor_ids{$factor_id} = 1;
448 foreach my $site_id ($self->get_site_ids(-factor => $factor_id)) {
449 next if defined $site_seqs{$site_id};
450 my $seq = $self->get_seq($site_id);
452 # skip sites that have no sequence, or have IUPAC symbols in
453 # their sequence (most probably the 'consensus' sequence itself
454 # that was used to make and exactly corresponds to the matrix)
455 my $seq_str = $seq->seq || next;
456 $seq_str =~ /[MRWSYKVHDB]/ and next;
458 $site_seqs{$site_id} = $seq;
461 my @seqs = values %site_seqs;
463 if (@seqs > 1) {
464 # pick the sub-seqs that match to the matrix with the best scores
465 my $matrix = $self->get_matrix($id);
466 my $desired_sequences = $matrix->sites;
467 return if @seqs < $desired_sequences;
469 my $desired_length = $matrix->width;
470 my %best_seqs;
471 foreach my $seq (@seqs) {
472 my $for_str = $seq->seq;
473 next if length($for_str) < $desired_length;
474 my $rev_str = $seq->revcom->seq;
476 my $best_score = 0;
477 my $best_subseq = '';
478 my $best_i = 0;
479 my $best_subseq_caps = 0;
480 my $best_revcom;
481 my $revcom = 0;
482 foreach my $seq_str ($for_str, $rev_str) {
483 for my $i (0..(length($seq_str) - $desired_length)) {
484 my $subseq = substr($seq_str, $i, $desired_length);
485 $subseq =~ s/[^ACGTacgt]//g; # can only score atcg
486 next unless length($subseq) == $desired_length; # short or 0-length seqs could get the highest scores!
487 my $score = $matrix->sequence_match_weight($subseq);
489 # caps represent the author-chosen bit of a site
490 # sequence so we would prefer to choose a subseq that
491 # contains it
492 my $caps = $subseq =~ tr/ACGT//;
494 #*** (don't know why numeric == fails for comparing
495 # scores, when the string eq works)
496 if ($score > $best_score || ("$score" eq "$best_score" && $caps > $best_subseq_caps)) {
497 $best_score = $score;
498 $best_subseq_caps = $caps;
499 $best_subseq = $subseq;
500 $best_i = $i;
501 $best_revcom = $revcom;
504 $revcom++;
507 if ($best_score) {
508 $best_seqs{$seq->accession_number} = [$best_subseq, $seq->accession_number, ($best_i + 1), $revcom ? -1 : 1, $best_score];
511 my @sorted = sort { $best_seqs{$b}->[-1] <=> $best_seqs{$a}->[-1] } keys %best_seqs;
512 return if @sorted < $desired_sequences;
513 splice(@sorted, $desired_sequences);
514 my %wanted = map { $_ => 1 } @sorted;
516 my @site_data;
517 foreach my $seq (@seqs) {
518 next unless exists $wanted{$seq->accession_number};
519 my @data = @{$best_seqs{$seq->accession_number}};
520 pop(@data);
521 push(@site_data, join('_', @data));
524 $data[5] = join(INTERNAL_SEPARATOR, @site_data);
525 $self->{matrix}->{data}->{$id} = join(SEPARATOR, @data);
528 $data[5] || return;
530 my @blocks = split(INTERNAL_SEPARATOR, $data[5]);
532 # append gap chars to all sequences to make them the same length
533 # (applies to sequences found via factors, presumably, since we already
534 # do this for matrix alignments in transfac_pro.pm)
535 my $longest = 0;
536 foreach (@blocks) {
537 my ($seq) = split('_', $_);
538 my $length = length($seq);
539 if ($length > $longest) {
540 $longest = $length;
543 foreach my $i (0..$#blocks) {
544 my $block = $blocks[$i];
545 my ($seq, $seq_id) = split('_', $block);
546 my $length = length($seq);
547 if ($length < $longest) {
548 my $orig_seq = $seq;
549 $seq .= '-'x($longest - $length);
550 $block =~ s/^${orig_seq}_/${seq}_/;
551 $blocks[$i] = $block;
555 # build the alignment
556 my $aln = Bio::SimpleAlign->new(-source => 'transfac_pro');
557 my %done_ids;
558 foreach (@blocks) {
559 my ($seq, $seq_acc, $start, $strand) = split('_', $_);
561 $self->throw("Invalid strand $strand found in block $_")
562 unless exists $VALID_STRAND{$strand};
563 # we can get back multiple different subparts of the same site (sequence),
564 # so $seq_acc isn't unique across this loop. Can't use it as the seq id
565 # of the alignment (ids must be unique in SimpleAlign), so we
566 # uniquify the id and store the original id as the accession_number
567 my $seq_id;
568 $done_ids{$seq_acc}++;
569 if ($done_ids{$seq_acc} > 1) {
570 $seq_id = $seq_acc.'_'.$done_ids{$seq_acc};
572 else {
573 $seq_id = $seq_acc;
576 my $gaps = $seq =~ tr/-//;
577 my $length = length($seq) - $gaps;
578 $self->throw("seq '$seq_id' for matrix '$id' had seq '$seq'") unless $length;
579 $aln->add_seq(Bio::LocatableSeq->new(-seq => $seq,
580 -id => $seq_id,
581 -accession_number => $seq_acc,
582 -start => $start,
583 -end => $start + $length - 1,
584 -strand => $strand));
586 $aln->id($id);
587 # could also store score? of?
589 return $aln;
592 =head2 get_factor
594 Title : get_factor
595 Usage : my $factor = $obj->get_factor($id);
596 Function: Get the details of a transcription factor.
597 Returns : Bio::Map::TranscriptionFactor
598 Args : string - a factor id ('T...')
600 =cut
602 sub get_factor {
603 my ($self, $id) = @_;
604 $id || return;
605 return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
606 my $data = $self->{factor}->{data}->{$id} || return;
607 my @data = split(SEPARATOR, $data);
609 # accession = id name species sequence
610 my $tf = Bio::Map::TranscriptionFactor->get(-id => $id,
611 -universal_name => $data[1]);
612 #*** not sure what to do with species and sequence, since we don't want to
613 # confuse the idea that a TF is a general thing that could bind to any
614 # species... then again, you might want to model species-specific variants
615 # of a TF with different binding abilities...
616 #*** idea of having inclusion and exclusion species so you can prevent/
617 # ignore a tf that binds to the wrong species (a species that doesn't even
618 # have the tf), and associating sequence with each species/tf combo so you
619 # can see how diverged the tf is and make assumptions about site difference
620 # allowance
622 # place it on all its genemaps
623 foreach my $sid ($self->get_site_ids(-factor => $id)) {
624 my $s_data = $self->{site}->{data}->{$sid} || next;
625 my @s_data = split(SEPARATOR, $s_data);
627 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string
628 $s_data[1] || next; # site isn't relative to a gene, meaningless
629 $s_data[4] || next; # don't know where its supposed to be, can't model it
630 $s_data[5] ||= $s_data[4] + ($s_data[2] ? length($s_data[2]) - 1 : 0);
632 # it is quite deliberate that we deeply recurse to arrive at the
633 # correct answer, which involves pulling in most of the database
634 no warnings "recursion";
635 my $gene_map = $self->get_genemap($s_data[1]) || next;
636 return $self->{got_factor}->{$id} if defined $self->{got_factor}->{$id};
638 #*** not always relative to gene start...
639 # we need Bio::Map::Gene s to have some default tss and atg positions
640 # that we can be relative to
641 my $rel = Bio::Map::Relative->new(-element => $gene_map->gene, -description => $s_data[3]);
642 Bio::Map::Position->new(-map => $gene_map, -element => $tf, -start => $s_data[4], -end => $s_data[5], -relative => $rel);
645 $self->{got_factor}->{$id} = $tf;
646 return $tf;
649 # since get_factor() is uncertain, just have direct access methods to factor
650 # information
651 sub get_factor_name {
652 my ($self, $id) = @_;
653 my $details = $self->_get_factor_details($id) || return;
654 return $details->{name};
656 sub get_factor_species {
657 my ($self, $id) = @_;
658 my $details = $self->_get_factor_details($id) || return;
659 return $details->{species};
661 sub get_factor_sequence {
662 my ($self, $id) = @_;
663 my $details = $self->_get_factor_details($id) || return;
664 return $details->{sequence};
666 sub _get_factor_details {
667 my ($self, $id) = @_;
668 $id || return;
670 return $self->{factor_details}->{$id} if defined $self->{factor_details}->{$id};
672 my $data = $self->{factor}->{data}->{$id} || return;
673 my @data = split(SEPARATOR, $data);
675 # accession = id name species sequence
677 my %details = (name => $data[1], species => $data[2], sequence => $data[3]);
678 $self->{factor_details}->{$id} = \%details;
680 return \%details;
683 =head2 get_reference_ids
685 Title : get_reference_ids
686 Usage : my @ids = $obj->get_reference_ids(-key => $value);
687 Function: Get all the reference ids that are associated with the supplied
688 args.
689 Returns : list of strings (ids)
690 Args : -key => value, where value is a string id, and key is one of:
691 -pubmed -site -gene -matrix -factor
693 =cut
695 sub get_reference_ids {
696 my $self = shift;
697 return $self->_get_ids('reference', @_);
700 # -id -name -species -site -factor -reference
701 sub get_gene_ids {
702 my $self = shift;
703 return $self->_get_ids('gene', @_);
706 =head2 get_site_ids
708 Title : get_site_ids
709 Usage : my @ids = $obj->get_site_ids(-key => $value);
710 Function: Get all the site ids that are associated with the supplied
711 args.
712 Returns : list of strings (ids)
713 Args : -key => value, where value is a string id, and key is one of:
714 -id -species -gene -matrix -factor -reference
716 =cut
718 sub get_site_ids {
719 my $self = shift;
720 return $self->_get_ids('site', @_);
723 =head2 get_matrix_ids
725 Title : get_matrix_ids
726 Usage : my @ids = $obj->get_matrix_ids(-key => $value);
727 Function: Get all the matrix ids that are associated with the supplied
728 args.
729 Returns : list of strings (ids)
730 Args : -key => value, where value is a string id, and key is one of:
731 -id -name -site -factor -reference
733 =cut
735 sub get_matrix_ids {
736 my $self = shift;
737 return $self->_get_ids('matrix', @_);
740 =head2 get_factor_ids
742 Title : get_factor_ids
743 Usage : my @ids = $obj->get_factor_ids(-key => $value);
744 Function: Get all the factor ids that are associated with the supplied
745 args.
746 Returns : list of strings (ids)
747 Args : -key => value, where value is a string id, and key is one of:
748 -id -name -species -interactors -gene -matrix -site -reference
749 NB: -gene only gets factor ids for genes that encode factors
751 =cut
753 sub get_factor_ids {
754 my $self = shift;
755 return $self->_get_ids('factor', @_);
758 =head2 get_fragment_ids
760 Title : get_fragment_ids
761 Usage : my @ids = $obj->get_fragment_ids(-key => $value);
762 Function: Get all the fragment ids that are associated with the supplied
763 args.
764 Returns : list of strings (ids)
765 Args : -key => value, where value is a string id, and key is one of:
766 -id -species -gene -factor -reference
768 =cut
770 sub get_fragment_ids {
771 my $self = shift;
772 return $self->_get_ids('fragment', @_);
775 =head2 Helper methods
777 =cut
779 # internal method which does the indexing
780 sub _build_index {
781 my ($self, $dat_dir, $force) = @_;
783 # MLDBM would give us transparent complex data structures with DB_File,
784 # allowing just one index file, but its yet another requirment and we
785 # don't strictly need it
787 my $index_dir = $self->index_directory;
788 my $gene_index = "$index_dir/gene.dat.index";
789 my $reference_index = "$index_dir/reference.dat.index";
790 my $matrix_index = "$index_dir/matrix.dat.index";
791 my $factor_index = "$index_dir/factor.dat.index";
792 my $fragment_index = "$index_dir/fragment.dat.index";
793 my $site_index = "$index_dir/site.dat.index";
795 my $reference_dat = "$dat_dir/reference.dat";
796 if (! -e $reference_index || $force) {
797 open my $REF, '<', $reference_dat or $self->throw("Could not read reference file '$reference_dat': $!");
799 my %references;
800 unlink $reference_index;
801 my $ref = tie(%references, 'DB_File', $reference_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
802 or $self->throw("CCould not open file '$reference_index': $!");
804 my %pubmed;
805 my $reference_pubmed = $reference_index.'.pubmed';
806 unlink $reference_pubmed;
807 my $pub = tie(%pubmed, 'DB_File', $reference_pubmed, O_RDWR|O_CREAT, 0644, $DB_BTREE)
808 or $self->throw("Could not open file '$reference_pubmed': $!");
810 my %gene;
811 my $reference_gene = $gene_index.'.reference';
812 unlink $reference_gene;
813 my $gene = tie(%gene, 'DB_File', $reference_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
814 or $self->throw("Could not open file '$reference_gene': $!");
816 my %site;
817 my $reference_site = $site_index.'.reference';
818 unlink $reference_site;
819 my $site = tie(%site, 'DB_File', $reference_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
820 or $self->throw("Could not open file '$reference_site': $!");
822 my %fragment;
823 my $reference_fragment = $fragment_index.'.reference';
824 unlink $reference_fragment;
825 my $fragment = tie(%fragment, 'DB_File', $reference_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE)
826 or $self->throw("Could not open file '$reference_fragment': $!");
828 my %factor;
829 my $reference_factor = $factor_index.'.reference';
830 unlink $reference_factor;
831 my $factor = tie(%factor, 'DB_File', $reference_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
832 or $self->throw("Could not open file '$reference_factor': $!");
834 my %matrix;
835 my $reference_matrix = $matrix_index.'.reference';
836 unlink $reference_matrix;
837 my $matrix = tie(%matrix, 'DB_File', $reference_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE)
838 or $self->throw("Could not open file '$reference_matrix': $!");
840 # skip the first three header lines
841 <$REF>; <$REF>; <$REF>;
843 my @data;
844 while (<$REF>) {
845 if (/^AC (\S+)/) {
846 $data[0] = $1;
848 elsif (/^RX PUBMED: (\d+)/) {
849 $data[1] = $1;
850 $pub->put("$1", $data[0]);
852 elsif (/^RA (.+)\n$/) {
853 $data[2] = $1;
855 elsif (/^RT (.+?)\.?\n$/) {
856 $data[3] = $1;
858 elsif (/^RL (.+?)\.?\n$/) {
859 $data[4] = $1;
861 elsif (/^GE TRANSFAC: (\w\d+)/) {
862 $gene->put($data[0], "$1");
864 elsif (/^BS TRANSFAC: (\w\d+)/) {
865 $site->put($data[0], "$1");
867 elsif (/^FA TRANSFAC: (\w\d+)/) {
868 $factor->put($data[0], "$1");
870 elsif (/^FR TRANSFAC: (FR\d+)/) {
871 $fragment->put($data[0], "$1");
873 elsif (/^MX TRANSFAC: (\w\d+)/) {
874 $matrix->put($data[0], "$1");
876 elsif (/^\/\//) {
877 # end of a record, store previous data and reset
879 # accession = pubmed authors title location
880 $references{$data[0]} = join(SEPARATOR, ($data[1] || '',
881 $data[2] || '',
882 $data[3] || '',
883 $data[4] || ''));
885 @data = ();
888 close $REF;
890 $ref = $pub = $gene = $site = $fragment = $factor = $matrix = undef;
891 untie %references;
892 untie %pubmed;
893 untie %gene;
894 untie %site;
895 untie %fragment;
896 untie %factor;
897 untie %matrix;
900 my $gene_dat = "$dat_dir/gene.dat";
901 if (! -e $gene_index || $force) {
902 open my $GEN, '<', $gene_dat or $self->throw("Could not read gene file '$gene_dat': $!");
904 my %genes;
905 unlink $gene_index;
906 my $gene = tie(%genes, 'DB_File', $gene_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
907 or $self->throw("Could not open file '$gene_index': $!");
909 my %id;
910 my $gene_id = $gene_index.'.id';
911 unlink $gene_id;
912 my $id = tie(%id, 'DB_File', $gene_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
913 or $self->throw("Could not open file '$gene_id': $!");
915 my %name;
916 my $gene_name = $gene_index.'.name';
917 unlink $gene_name;
918 my $name = tie(%name, 'DB_File', $gene_name, O_RDWR|O_CREAT, 0644, $DB_BTREE)
919 or $self->throw("Could not open file '$gene_name': $!");
921 my %species;
922 my $gene_species = $gene_index.'.species';
923 unlink $gene_species;
924 my $species = tie(%species, 'DB_File', $gene_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
925 or $self->throw("Could not open file '$gene_species': $!");
927 my %site;
928 my $gene_site = $site_index.'.gene';
929 unlink $gene_site;
930 my $site = tie(%site, 'DB_File', $gene_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
931 or $self->throw("Could not open file '$gene_site': $!");
933 my %factor;
934 my $gene_factor = $factor_index.'.gene';
935 unlink $gene_factor;
936 my $factor = tie(%factor, 'DB_File', $gene_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
937 or $self->throw("Could not open file '$gene_factor': $!");
939 my %fragment;
940 my $gene_fragment = $fragment_index.'.gene';
941 unlink $gene_fragment;
942 my $fragment = tie(%fragment, 'DB_File', $gene_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE)
943 or $self->throw("Could not open file '$gene_fragment': $!");
945 my %reference;
946 my $gene_reference = $reference_index.'.gene';
947 unlink $gene_reference;
948 my $reference = tie(%reference, 'DB_File', $gene_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
949 or $self->throw("Could not open file '$gene_reference': $!");
951 # skip the first three header lines
952 <$GEN>; <$GEN>; <$GEN>;
954 my @data;
955 while (<$GEN>) {
956 if (/^AC (\S+)/) {
957 $data[0] = $1;
959 elsif (/^ID (\S+)/) {
960 $data[1] = $1;
961 $id->put("$1", $data[0]);
963 elsif (/^SD (.+)$/) {
964 $data[2] = lc("$1");
965 $name->put(lc("$1"), $data[0]);
967 elsif (/^SY (.+)\.$/) {
968 foreach (split('; ', lc("$1"))) {
969 $name->put($_, $data[0]);
972 elsif (/^DE (.+)$/) {
973 $data[3] = $1;
975 elsif (/^OS (.+)$/) {
976 my $raw_species = $1;
977 my $taxid = $self->_species_to_taxid($raw_species);
978 $data[4] = $taxid || $raw_species;
979 $species->put($data[4], $data[0]);
981 elsif (/^RN .+?(RE\d+)/) {
982 $reference->put($data[0], "$1");
984 elsif (/^BS .+?(R\d+)/) {
985 $site->put($data[0], "$1");
987 elsif (/^FA (T\d+)/) {
988 $factor->put($data[0], "$1");
990 elsif (/^BR (FR\d+)/) {
991 $fragment->put($data[0], "$1");
993 elsif (/^\/\//) {
994 # end of a record, store previous data and reset
996 # accession = id name description species_tax_id_or_raw_string
997 $genes{$data[0]} = join(SEPARATOR, ($data[1] || '',
998 $data[2] || '',
999 $data[3] || '',
1000 $data[4] || ''));
1002 @data = ();
1005 close $GEN;
1007 $gene = $id = $name = $species = $site = $factor = $reference = undef;
1008 untie %genes;
1009 untie %id;
1010 untie %name;
1011 untie %species;
1012 untie %site;
1013 untie %factor;
1014 untie %reference;
1017 my $site_dat = "$dat_dir/site.dat";
1018 if (! -e $site_index || $force) {
1019 open my $SIT, '<', $site_dat or $self->throw("Could not read site file '$site_dat': $!");
1021 my %sites;
1022 unlink $site_index;
1023 my $site = tie(%sites, 'DB_File', $site_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1024 or $self->throw("Could not open file '$site_index': $!");
1026 my %id;
1027 my $site_id = $site_index.'.id';
1028 unlink $site_id;
1029 my $id = tie(%id, 'DB_File', $site_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1030 or $self->throw("Could not open file '$site_id': $!");
1032 my %species;
1033 my $site_species = $site_index.'.species';
1034 unlink $site_species;
1035 my $species = tie(%species, 'DB_File', $site_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1036 or $self->throw("Could not open file '$site_species': $!");
1038 my %qualities;
1039 my $site_qualities = $site_index.'.qual';
1040 unlink $site_qualities;
1041 my $quality = tie(%qualities, 'DB_File', $site_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH)
1042 or $self->throw("Could not open file '$site_qualities': $!");
1044 my %gene;
1045 my $site_gene = $gene_index.'.site';
1046 unlink $site_gene;
1047 my $gene = tie(%gene, 'DB_File', $site_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1048 or $self->throw("Could not open file '$site_gene': $!");
1050 my %matrix;
1051 my $site_matrix = $matrix_index.'.site';
1052 unlink $site_matrix;
1053 my $matrix = tie(%matrix, 'DB_File', $site_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1054 or $self->throw("Could not open file '$site_matrix': $!");
1056 my %factor;
1057 my $site_factor = $factor_index.'.site';
1058 unlink $site_factor;
1059 my $factor = tie(%factor, 'DB_File', $site_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1060 or $self->throw("Could not open file '$site_factor': $!");
1062 my %reference;
1063 my $site_reference = $reference_index.'.site';
1064 unlink $site_reference;
1065 my $reference = tie(%reference, 'DB_File', $site_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1066 or $self->throw("Could not open file '$site_reference': $!");
1068 # skip the first three header lines
1069 <$SIT>; <$SIT>; <$SIT>;
1071 my @data;
1072 while (<$SIT>) {
1073 if (/^AC (\S+)/) {
1074 $data[0] = $1;
1076 elsif (/^ID (\S+)/) {
1077 $data[1] = $1;
1078 $id->put("$1", $data[0]);
1080 elsif (/^TY (.+)$/) {
1081 $data[8] = $1;
1083 elsif (/^DE .*Gene: (G\d+)/) {
1084 $data[2] = $1;
1085 $gene->put($data[0], "$1");
1087 # if it has no gene it is an artificial sequence, unless it
1088 # has a species (OS line), in which case it is unassigned
1089 # genomic; either way we won't be able to make a
1090 # Bio::Map::PositionI later on, so such sites won't be
1091 # on any MapI.
1093 elsif (/^OS (.+)$/) {
1094 # Since not all sites in site.dat with a species have a gene,
1095 # (small handful are unassigned 'genomic') can't delegate to
1096 # gene.dat and must parse species here (effectively again)
1097 my $raw_species = $1;
1098 my $taxid = $self->_species_to_taxid($raw_species);
1099 $data[7] = $taxid || $raw_species;
1100 $species->put($data[7], $data[0]);
1102 elsif (/^SQ (.+)\.$/) {
1103 $data[3] = $1;
1104 # there can actually be more than one SQ line, seemingly with
1105 # variations of the sequence (not a long sequence split over
1106 # two lines); not sure what to do with data; currently we end
1107 # up storing only the last variant.
1109 elsif (/^S1 (.+)$/) {
1110 $data[4] = $1;
1111 # if S1 not present, means transcriptional start
1113 elsif (/^SF (.+)$/) {
1114 $data[5] = $1;
1116 elsif (/^ST (.+)$/) {
1117 $data[6] = $1;
1119 elsif (/^RN .+?(RE\d+)/) {
1120 $reference->put($data[0], "$1");
1122 elsif (/^MX (M\d+)/) {
1123 $matrix->put($data[0], "$1");
1125 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1126 $factor->put($data[0], "$1");
1127 $qualities{$data[0].SEPARATOR.$1} = $2;
1129 elsif (/^\/\//) {
1130 # end of a record, store previous data and reset
1132 # accession = id gene_id sequence relative_to first_position last_position species_tax_id_or_raw_string type
1133 $sites{$data[0]} = join(SEPARATOR, ($data[1] || '',
1134 $data[2] || '',
1135 $data[3] || '',
1136 $data[4] || 'TSS',
1137 $data[5] || '',
1138 $data[6] || '',
1139 $data[7] || '',
1140 $data[8] || ''));
1142 @data = ();
1145 close $SIT;
1147 $site = $id = $species = $quality = $gene = $matrix = $factor = $reference = undef;
1148 untie %sites;
1149 untie %id;
1150 untie %species;
1151 untie %qualities;
1152 untie %gene;
1153 untie %matrix;
1154 untie %factor;
1155 untie %reference;
1158 my $matrix_dat = "$dat_dir/matrix.dat";
1159 if (! -e $matrix_index || $force) {
1160 open my $MAT, '<', $matrix_dat or $self->throw("Could not read matrix file '$matrix_dat': $!");
1162 my %matrices;
1163 unlink $matrix_index;
1164 my $matrix = tie(%matrices, 'DB_File', $matrix_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1165 or $self->throw("Could not open file '$matrix_index': $!");
1167 my %id;
1168 my $matrix_id = $matrix_index.'.id';
1169 unlink $matrix_id;
1170 my $id = tie(%id, 'DB_File', $matrix_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1171 or $self->throw("Could not open file '$matrix_id': $!");
1173 my %name;
1174 my $matrix_name = $matrix_index.'.name';
1175 unlink $matrix_name;
1176 my $name = tie(%name, 'DB_File', $matrix_name, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1177 or $self->throw("Could not open file '$matrix_name': $!");
1179 my %site;
1180 my $matrix_site = $site_index.'.matrix';
1181 unlink $matrix_site;
1182 my $site = tie(%site, 'DB_File', $matrix_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1183 or $self->throw("Could not open file '$matrix_site': $!");
1185 my %factor;
1186 my $matrix_factor = $factor_index.'.matrix';
1187 unlink $matrix_factor;
1188 my $factor = tie(%factor, 'DB_File', $matrix_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1189 or $self->throw("Could not open file '$matrix_factor': $!");
1191 my %reference;
1192 my $matrix_reference = $reference_index.'.matrix';
1193 unlink $matrix_reference;
1194 my $reference = tie(%reference, 'DB_File', $matrix_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1195 or $self->throw("Could not open file '$matrix_reference': $!");
1197 # skip the first three header lines
1198 <$MAT>; <$MAT>; <$MAT>;
1200 my @data;
1201 my @matrix_data;
1202 my @site_data;
1203 while (<$MAT>) {
1204 if (/^AC (\S+)/) {
1205 $data[0] = $1;
1207 elsif (/^ID (\S+)/) {
1208 $data[1] = $1;
1209 $id->put("$1", $data[0]);
1211 elsif (/^NA (.+)$/) {
1212 $data[2] = $1;
1213 $name->put("$1", $data[0]);
1215 elsif (/^DE (.+)$/) {
1216 $data[3] = $1;
1218 elsif (/^\d\d \s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
1219 # a, c, g, t counts/weights
1220 push(@matrix_data, join("\t", ($1, $2, $3, $4)));
1222 # Work out the number of sites as the largest number of
1223 # sites amongst all positions in the sequences. (The BA
1224 # line isn't reliable for telling us the correct number of
1225 # sites all the time)
1226 my $num = $1 + $2 + $3 + $4;
1227 $data[4] ||= 0;
1228 if ($num > $data[4]) {
1229 $data[4] = $num;
1232 elsif (/^BS ([\sa-zA-Z]+); (.+?); (-?\d+); \d+;.*; ([np])/) {
1233 # sequence id start strand
1234 push(@site_data, join('_', ($1, $2, $3, $4 eq 'p' ? 1 : -1)));
1235 $site->put($data[0], $2);
1237 elsif (/^BF (T\d+)/) {
1238 $factor->put($data[0], "$1");
1240 elsif (/^RN .+?(RE\d+)/) {
1241 $reference->put($data[0], "$1");
1243 elsif (/^\/\//) {
1244 # end of a record, store previous data and reset
1245 my $matrix_data = join(INTERNAL_SEPARATOR, @matrix_data) || '';
1247 # sites of a matrix are pre-aligned but padded with spaces on
1248 # the left and no padding on the right; pad with -s both sides
1249 my $longest_seq = 0;
1251 # For all the work, does anything meaningful actually get passed
1252 # on here? Commenting out fixes the latest crashes on trunk.
1253 # 5-10-10 cjfields
1255 #foreach my $site_seq (map {my ($seq) = split("_", $_ ,2); $seq;} @site_data) {
1256 # $site_seq =~ s/ /-/g;
1257 # my $length = length($site_seq);
1258 # if ($length > $longest_seq) {
1259 # $longest_seq = $length;
1262 #foreach my $site (@site_data) {
1263 # my ($site_seq) = split("_", $site ,2);
1264 # my $length = length($site_seq);
1265 # if ($length < $longest_seq) {
1266 # $site_seq .= '-' x ($longest_seq - $length);
1270 my $site_data = join(INTERNAL_SEPARATOR, @site_data) || '';
1272 # accession = id name description num_of_sites matrix_data site_data
1273 $matrices{$data[0]} = join(SEPARATOR, ($data[1] || '',
1274 $data[2] || '',
1275 $data[3] || '',
1276 $data[4],
1277 $matrix_data,
1278 $site_data));
1280 @data = @matrix_data = @site_data = ();
1283 close $MAT;
1285 $matrix = $id = $name = $site = $factor = $reference = undef;
1286 untie %matrices;
1287 untie %id;
1288 untie %name;
1289 untie %site;
1290 untie %factor;
1291 untie %reference;
1294 my $factor_dat = "$dat_dir/factor.dat";
1295 if (! -e $factor_index || $force) {
1296 open my $FAC, '<', $factor_dat or $self->throw("Could not read factor file '$factor_dat': $!");
1298 my %factors;
1299 unlink $factor_index;
1300 my $factor = tie(%factors, 'DB_File', $factor_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1301 or $self->throw("Could not open file '$factor_index': $!");
1303 my %id;
1304 my $factor_id = $factor_index.'.id';
1305 unlink $factor_id;
1306 my $id = tie(%id, 'DB_File', $factor_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1307 or $self->throw("Could not open file '$factor_id': $!");
1309 my %name;
1310 my $factor_name = $factor_index.'.name';
1311 unlink $factor_name;
1312 my $name = tie(%name, 'DB_File', $factor_name, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1313 or $self->throw("Could not open file '$factor_name': $!");
1315 my %species;
1316 my $factor_species = $factor_index.'.species';
1317 unlink $factor_species;
1318 my $species = tie(%species, 'DB_File', $factor_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1319 or $self->throw("Could not open file '$factor_species': $!");
1321 my %interactors;
1322 my $factor_interactors = $factor_index.'.interactors';
1323 unlink $factor_interactors;
1324 my $interact = tie(%interactors, 'DB_File', $factor_interactors, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1325 or $self->throw("Could not open file '$factor_interactors': $!");
1327 my %gene;
1328 my $factor_gene = $gene_index.'.factor';
1329 unlink $factor_gene;
1330 my $gene = tie(%gene, 'DB_File', $factor_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1331 or $self->throw("Could not open file '$factor_gene': $!");
1333 my %matrix;
1334 my $factor_matrix = $matrix_index.'.factor';
1335 unlink $factor_matrix;
1336 my $matrix = tie(%matrix, 'DB_File', $factor_matrix, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1337 or $self->throw("Could not open file '$factor_matrix': $!");
1339 my %site;
1340 my $factor_site = $site_index.'.factor';
1341 unlink $factor_site;
1342 my $site = tie(%site, 'DB_File', $factor_site, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1343 or $self->throw("Could not open file '$factor_site': $!");
1345 my %fragment;
1346 my $factor_fragment = $fragment_index.'.factor';
1347 unlink $factor_fragment;
1348 my $fragment = tie(%fragment, 'DB_File', $factor_fragment, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1349 or $self->throw("Could not open file '$factor_fragment': $!");
1351 my %reference;
1352 my $factor_reference = $reference_index.'.factor';
1353 unlink $factor_reference;
1354 my $reference = tie(%reference, 'DB_File', $factor_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1355 or $self->throw("Could not open file '$factor_reference': $!");
1357 # skip the first three header lines
1358 <$FAC>; <$FAC>; <$FAC>;
1360 my @data;
1361 my $sequence = '';
1362 while (<$FAC>) {
1363 if (/^AC (\S+)/) {
1364 $data[0] = $1;
1366 elsif (/^ID (\S+)/) {
1367 # IDs are always the same as AC? Is this needed?
1368 $data[1] = $1;
1369 $id->put("$1", $data[0]);
1371 elsif (/^FA (.+)$/) {
1372 $data[2] = $1;
1373 $name->put("$1", $data[0]);
1375 elsif (/^OS (.+)$/) {
1376 # This is the species the actual factor came from, which may
1377 # differ from the species of any sequences it is described as
1378 # binding to. Not all factors that have a species have a gene,
1379 # so can't delegate species to a gene lookup.
1380 my $raw_species = $1;
1381 my $taxid = $self->_species_to_taxid($raw_species);
1382 $data[3] = $taxid || $raw_species;
1383 $species->put($data[3], $data[0]);
1385 elsif (/^GE (G\d+)/) {
1386 $gene->put($data[0], "$1");
1388 elsif (/^SQ (.+)$/) {
1389 $sequence .= $1;
1391 elsif (/^IN (T\d+)/) {
1392 $interact->put($data[0], "$1");
1394 elsif (/^MX (M\d+)/) {
1395 $matrix->put($data[0], "$1");
1397 elsif (/^BS (R\d+)/) {
1398 $site->put($data[0], "$1");
1400 elsif (/^BR (FR\d+)/) {
1401 $fragment->put($data[0], "$1");
1403 elsif (/^RN .+?(RE\d+)/) {
1404 $reference->put($data[0], "$1");
1406 elsif (/^\/\//) {
1407 # end of a record, store previous data and reset
1409 # accession = id name species sequence
1410 $factors{$data[0]} = join(SEPARATOR, ($data[1] || '',
1411 $data[2] || '',
1412 $data[3] || '',
1413 $sequence));
1415 @data = ();
1416 $sequence = '';
1419 close $FAC;
1421 $factor = $id = $name = $species = $interact = $gene = $matrix = $site = $fragment = $reference = undef;
1422 untie %factors;
1423 untie %id;
1424 untie %name;
1425 untie %species;
1426 untie %interactors;
1427 untie %gene;
1428 untie %matrix;
1429 untie %site;
1430 untie %fragment;
1431 untie %reference;
1434 my $fragment_dat = "$dat_dir/fragment.dat";
1435 if (! -e $fragment_index || $force) {
1436 if (open my $FRA, '<', $fragment_dat) {
1437 my %fragments;
1438 unlink $fragment_index;
1439 my $fragment = tie(%fragments, 'DB_File', $fragment_index, O_RDWR|O_CREAT, 0644, $DB_HASH)
1440 or $self->throw("Could not open file '$fragment_index': $!");
1442 my %id;
1443 my $fragment_id = $fragment_index.'.id';
1444 unlink $fragment_id;
1445 my $id = tie(%id, 'DB_File', $fragment_id, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1446 or $self->throw("Could not open file '$fragment_id': $!");
1448 my %qualities;
1449 my $fragment_qualities = $fragment_index.'.qual';
1450 unlink $fragment_qualities;
1451 my $quality = tie(%qualities, 'DB_File', $fragment_qualities, O_RDWR|O_CREAT, 0644, $DB_HASH)
1452 or $self->throw("Could not open file '$fragment_qualities': $!");
1454 my %species;
1455 my $fragment_species = $fragment_index.'.species';
1456 unlink $fragment_species;
1457 my $species = tie(%species, 'DB_File', $fragment_species, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1458 or $self->throw("Could not open file '$fragment_species': $!");
1460 my %gene;
1461 my $fragment_gene = $gene_index.'.fragment';
1462 unlink $fragment_gene;
1463 my $gene = tie(%gene, 'DB_File', $fragment_gene, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1464 or $self->throw("Could not open file '$fragment_gene': $!");
1466 my %factor;
1467 my $fragment_factor = $factor_index.'.fragment';
1468 unlink $fragment_factor;
1469 my $factor = tie(%factor, 'DB_File', $fragment_factor, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1470 or $self->throw("Could not open file '$fragment_factor': $!");
1472 my %reference;
1473 my $fragment_reference = $reference_index.'.fragment';
1474 unlink $fragment_reference;
1475 my $reference = tie(%reference, 'DB_File', $fragment_reference, O_RDWR|O_CREAT, 0644, $DB_BTREE)
1476 or $self->throw("Could not open file '$fragment_reference': $!");
1478 # skip the first three header lines
1479 <$FRA>; <$FRA>; <$FRA>;
1481 my @data;
1482 while (<$FRA>) {
1483 if (/^AC (\S+)/) {
1484 $data[0] = $1;
1486 elsif (/^ID (\S+)/) {
1487 # IDs are always the same as AC? Is this needed?
1488 $data[1] = $1;
1489 $id->put("$1", $data[0]);
1491 elsif (/^DE Gene: (G\d+)(?:.+Gene: (G\d+))?/) {
1492 my ($gene1, $gene2) = ($1, $2);
1493 $data[2] = $gene1;
1494 $data[3] = $gene2; # could be undef
1495 $gene->put($data[0], $gene1);
1496 $gene->put($data[0], $gene2) if $gene2;
1498 elsif (/^OS (.+)$/) {
1499 # As per the site.dat parsing
1500 my $raw_species = $1;
1501 my $taxid = $self->_species_to_taxid($raw_species);
1502 $data[4] = $taxid || $raw_species;
1503 $species->put($data[4], $data[0]);
1505 elsif (/^SQ [atcgn]*([ATCGN]+)[atcgn]*/) {
1506 $data[5] .= $1;
1507 # there can be (usually are) multiple SQ lines with a single
1508 # long seq split over them. The 'real' sequence is in caps
1510 elsif (/^SC Build (\S+):$/) {
1511 $data[6] = $1;
1512 # maybe parse it out a little more? We have build,
1513 # chromosomal coords and strand, eg.
1514 # SC Build HSA_May2004: Chr.2 43976692..43978487 (FORWARD).
1516 elsif (/^RN .+?(RE\d+)/) {
1517 $reference->put($data[0], "$1");
1519 elsif (/^BF (T\d+); .+?; Quality: (\d)/) {
1520 $factor->put($data[0], "$1");
1521 $qualities{$data[0].SEPARATOR.$1} = $2;
1523 elsif (/^\/\//) {
1524 # end of a record, store previous data and reset
1526 # accession = id gene_id1 gene_id2 species_tax_id_or_raw_string sequence source
1527 $fragments{$data[0]} = join(SEPARATOR, ($data[1] || '',
1528 $data[2] || '',
1529 $data[3] || '',
1530 $data[4] || '',
1531 $data[5] || '',
1532 $data[6] || ''));
1534 @data = ();
1537 close $FRA;
1539 $fragment = $id = $species = $quality = $gene = $factor = $reference = undef;
1540 untie %fragments;
1541 untie %id;
1542 untie %species;
1543 untie %qualities;
1544 untie %gene;
1545 untie %factor;
1546 untie %reference;
1548 else {
1549 $self->warn("Could not read fragment file '$fragment_dat', assuming you have an old version of Transfac Pro with no fragment.dat file");
1554 # connect the internal db handle
1555 sub _db_connect {
1556 my $self = shift;
1557 return if $self->{'_initialized'};
1559 my $index_dir = $self->index_directory;
1560 my $gene_index = "$index_dir/gene.dat.index";
1561 my $reference_index = "$index_dir/reference.dat.index";
1562 my $matrix_index = "$index_dir/matrix.dat.index";
1563 my $factor_index = "$index_dir/factor.dat.index";
1564 my $site_index = "$index_dir/site.dat.index";
1565 my $fragment_index = "$index_dir/fragment.dat.index";
1567 foreach ($gene_index, $reference_index, $matrix_index, $factor_index, $site_index, $fragment_index) {
1568 if (! -e $_) {
1569 #$self->warn("Index files have not been created");
1570 #return 0;
1574 # reference
1576 $self->{reference}->{data} = {};
1577 tie (%{$self->{reference}->{data}}, 'DB_File', $reference_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$reference_index': $!");
1579 my $reference_pubmed = $reference_index.'.pubmed';
1580 $self->{reference}->{pubmed} = tie (%{$self->{reference}->{pubmed}}, 'DB_File', $reference_pubmed, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_pubmed': $!");
1582 my $reference_gene = $gene_index.'.reference';
1583 $self->{gene}->{reference} = tie (%{$self->{gene}->{reference}}, 'DB_File', $reference_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_gene': $!");
1585 my $reference_site = $site_index.'.reference';
1586 $self->{site}->{reference} = tie (%{$self->{site}->{reference}}, 'DB_File', $reference_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_site': $!");
1588 my $reference_fragment = $fragment_index.'.reference';
1589 $self->{fragment}->{reference} = tie (%{$self->{fragment}->{reference}}, 'DB_File', $reference_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$reference_fragment': $!");
1591 my $reference_factor = $factor_index.'.reference';
1592 $self->{factor}->{reference} = tie (%{$self->{factor}->{reference}}, 'DB_File', $reference_factor, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_factor': $!");
1594 my $reference_matrix = $matrix_index.'.reference';
1595 $self->{matrix}->{reference} = tie (%{$self->{matrix}->{reference}}, 'DB_File', $reference_matrix, undef, 0644, $DB_BTREE) || $self->throw("Cannot open file '$reference_matrix': $!");
1598 # gene
1600 $self->{gene}->{data} = {};
1601 tie (%{$self->{gene}->{data}}, 'DB_File', $gene_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$gene_index': $!");
1603 my $gene_id = $gene_index.'.id';
1604 $self->{gene}->{id} = tie(%{$self->{gene}->{id}}, 'DB_File', $gene_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_id': $!");
1606 my $gene_name = $gene_index.'.name';
1607 $self->{gene}->{name} = tie(%{$self->{gene}->{name}}, 'DB_File', $gene_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_name': $!");
1609 my $gene_species = $gene_index.'.species';
1610 $self->{gene}->{species} = tie(%{$self->{gene}->{species}}, 'DB_File', $gene_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_species': $!");
1612 my $gene_site = $site_index.'.gene';
1613 $self->{site}->{gene} = tie(%{$self->{site}->{gene}}, 'DB_File', $gene_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_site': $!");
1615 my $gene_fragment = $fragment_index.'.gene';
1616 $self->{fragment}->{gene} = tie(%{$self->{fragment}->{gene}}, 'DB_File', $gene_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_fragment': $!");
1618 my $gene_factor = $factor_index.'.gene';
1619 $self->{factor}->{gene} = tie(%{$self->{factor}->{gene}}, 'DB_File', $gene_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_factor': $!");
1621 my $gene_reference = $reference_index.'.gene';
1622 $self->{reference}->{gene} = tie(%{$self->{reference}->{gene}}, 'DB_File', $gene_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$gene_reference': $!");
1625 # site
1627 $self->{site}->{data} = {};
1628 tie (%{$self->{site}->{data}}, 'DB_File', $site_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_index': $!");
1630 my $site_id = $site_index.'.id';
1631 $self->{site}->{id} = tie(%{$self->{site}->{id}}, 'DB_File', $site_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_id': $!");
1633 my $site_species = $site_index.'.species';
1634 $self->{site}->{species} = tie(%{$self->{site}->{species}}, 'DB_File', $site_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $site_species': $!");
1636 #*** quality not actually used by anything (yet)
1637 my $site_qualities = $site_index.'.qual';
1638 $self->{quality} = {};
1639 tie(%{$self->{quality}}, 'DB_File', $site_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$site_qualities': $!");
1641 my $site_gene = $gene_index.'.site';
1642 $self->{gene}->{site} = tie(%{$self->{gene}->{site}}, 'DB_File', $site_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_gene': $!");
1644 my $site_matrix = $matrix_index.'.site';
1645 $self->{matrix}->{site} = tie(%{$self->{matrix}->{site}}, 'DB_File', $site_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_matrix': $!");
1647 my $site_factor = $factor_index.'.site';
1648 $self->{factor}->{site} = tie(%{$self->{factor}->{site}}, 'DB_File', $site_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_factor': $!");
1650 my $site_reference = $reference_index.'.site';
1651 $self->{reference}->{site} = tie(%{$self->{reference}->{site}}, 'DB_File', $site_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$site_reference': $!");
1654 # fragment (may not be in older databases)
1655 if (-e $fragment_index) {
1656 $self->{fragment}->{data} = {};
1657 tie (%{$self->{fragment}->{data}}, 'DB_File', $fragment_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_index': $!");
1659 my $fragment_id = $fragment_index.'.id';
1660 $self->{fragment}->{id} = tie(%{$self->{fragment}->{id}}, 'DB_File', $fragment_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_id': $!");
1662 my $fragment_species = $fragment_index.'.species';
1663 $self->{fragment}->{species} = tie(%{$self->{fragment}->{species}}, 'DB_File', $fragment_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file $fragment_species': $!");
1665 #*** quality not actually used by anything (yet)
1666 my $fragment_qualities = $fragment_index.'.qual';
1667 $self->{fragment_quality} = {};
1668 tie(%{$self->{fragment_quality}}, 'DB_File', $fragment_qualities, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$fragment_qualities': $!");
1670 my $fragment_gene = $gene_index.'.fragment';
1671 $self->{gene}->{fragment} = tie(%{$self->{gene}->{fragment}}, 'DB_File', $fragment_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_gene': $!");
1673 my $fragment_factor = $factor_index.'.fragment';
1674 $self->{factor}->{fragment} = tie(%{$self->{factor}->{fragment}}, 'DB_File', $fragment_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_factor': $!");
1676 my $fragment_reference = $reference_index.'.fragment';
1677 $self->{reference}->{fragment} = tie(%{$self->{reference}->{fragment}}, 'DB_File', $fragment_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$fragment_reference': $!");
1679 else {
1680 die "no fragment_index at '$fragment_index'\n";
1683 # matrix
1685 $self->{matrix}->{data} = {};
1686 tie (%{$self->{matrix}->{data}}, 'DB_File', $matrix_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$matrix_index': $!");
1688 my $matrix_id = $matrix_index.'.id';
1689 $self->{matrix}->{id} = tie(%{$self->{matrix}->{id}}, 'DB_File', $matrix_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_id': $!");
1691 my $matrix_name = $matrix_index.'.name';
1692 $self->{matrix}->{name} = tie(%{$self->{matrix}->{name}}, 'DB_File', $matrix_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_name': $!");
1694 my $matrix_site = $site_index.'.matrix';
1695 $self->{site}->{matrix} = tie(%{$self->{site}->{matrix}}, 'DB_File', $matrix_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_site': $!");
1697 my $matrix_factor = $factor_index.'.matrix';
1698 $self->{factor}->{matrix} = tie(%{$self->{factor}->{matrix}}, 'DB_File', $matrix_factor, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_factor': $!");
1700 my $matrix_reference = $reference_index.'.matrix';
1701 $self->{reference}->{matrix} = tie(%{$self->{reference}->{matrix}}, 'DB_File', $matrix_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$matrix_reference': $!");
1704 # factor
1706 $self->{factor}->{data} = {};
1707 tie (%{$self->{factor}->{data}}, 'DB_File', $factor_index, O_RDWR, undef, $DB_HASH) || $self->throw("Cannot open file '$factor_index': $!");
1709 my $factor_id = $factor_index.'.id';
1710 $self->{factor}->{id} = tie(%{$self->{factor}->{id}}, 'DB_File', $factor_id, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file 'factor_id': $!");
1712 my $factor_name = $factor_index.'.name';
1713 $self->{factor}->{name} = tie(%{$self->{factor}->{name}}, 'DB_File', $factor_name, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_name': $!");
1715 my $factor_species = $factor_index.'.species';
1716 $self->{factor}->{species} = tie(%{$self->{factor}->{species}}, 'DB_File', $factor_species, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_species': $!");
1718 my $factor_interactors = $factor_index.'.interactors';
1719 $self->{factor}->{interactors} = tie(%{$self->{factor}->{interactors}}, 'DB_File', $factor_interactors, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_interactors': $!");
1721 my $factor_gene = $gene_index.'.factor';
1722 $self->{gene}->{factor} = tie(%{$self->{gene}->{factor}}, 'DB_File', $factor_gene, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_gene': $!");
1724 my $factor_matrix = $matrix_index.'.factor';
1725 $self->{matrix}->{factor} = tie(%{$self->{matrix}->{factor}}, 'DB_File', $factor_matrix, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_matrix': $!");
1727 my $factor_site = $site_index.'.factor';
1728 $self->{site}->{factor} = tie(%{$self->{site}->{factor}}, 'DB_File', $factor_site, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_site': $!");
1730 my $factor_fragment = $fragment_index.'.factor';
1731 $self->{fragment}->{factor} = tie(%{$self->{fragment}->{factor}}, 'DB_File', $factor_fragment, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_fragment': $!");
1733 my $factor_reference = $reference_index.'.factor';
1734 $self->{reference}->{factor} = tie(%{$self->{reference}->{factor}}, 'DB_File', $factor_reference, O_RDWR, undef, $DB_BTREE) || $self->throw("Cannot open file '$factor_reference': $!");
1737 $self->{'_initialized'} = 1;
1740 =head2 index_directory
1742 Title : index_directory
1743 Funtion : Get/set the location that index files are stored. (this module
1744 will index the supplied database)
1745 Usage : $obj->index_directory($newval)
1746 Returns : value of index_directory (a scalar)
1747 Args : on set, new value (a scalar or undef, optional)
1749 =cut
1751 sub index_directory {
1752 my $self = shift;
1753 return $self->{'index_directory'} = shift if @_;
1754 return $self->{'index_directory'};
1757 # resolve a transfac species string into an ncbi taxid
1758 sub _species_to_taxid {
1759 my ($self, $raw_species) = @_;
1760 $raw_species or return;
1762 my $species_string;
1763 my @split = split(', ', $raw_species);
1764 (@split > 1) ? ($species_string = $split[1]) : ($species_string = $split[0]);
1766 my $ncbi_taxid;
1767 if ($species_string =~ /^[A-Z]\S+ \S+$/) {
1768 SWITCH: for ($species_string) {
1769 # some species don't classify so custom handling
1770 /^Darnel ryegrass/ && do { $ncbi_taxid = 34176; last; };
1771 /^Coix lacryma/ && do { $ncbi_taxid = 4505; last; };
1772 /^Rattus spec/ && do { $ncbi_taxid = 10116; last; };
1773 /^Mus spec/ && do { $ncbi_taxid = 10090; last; };
1774 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1775 /^Cavia sp/ && do { $ncbi_taxid = 10141; last; };
1776 /^Marsh marigold/ && do { $ncbi_taxid = 3449; last; };
1777 /^Phalaenopsis sp/ && do { $ncbi_taxid = 36900; last; };
1778 /^Anthirrhinum majus/ && do { $ncbi_taxid = 4151; last; };
1779 /^Equus spec/ && do { $ncbi_taxid = 9796; last; };
1780 /^Lycopodium spec/ && do { $ncbi_taxid = 13840; last; };
1781 /^Autographa californica/ && do { $ncbi_taxid = 307456; last; };
1782 /^E26 AEV/ && do { $ncbi_taxid = 31920; last; };
1783 /^Pseudocentrotus miliaris/ && do { $ncbi_taxid = 7677; last; }; # the genus is 7677 but this species isn't there
1784 /^SL3-3 (?:retro)?virus/ && do { $ncbi_taxid = 53454; last; }; # 53454 is unclassified MLV-related, SL3-3 a variant of that?
1785 /^Petunia sp/ && do { $ncbi_taxid = 4104; last; };
1787 if (! $ncbi_taxid && defined $self->{_tax_db}) {
1788 ($ncbi_taxid) = $self->{_tax_db}->get_taxonids($species_string);
1791 else {
1792 # some species lines are poorly formated so custom handling
1793 SWITCH: for ($raw_species) {
1794 # for speed, go by common first letters
1795 my $first_letter = substr($raw_species, 0, 1);
1797 $first_letter eq 'A' && do {
1798 /^Adiantum raddianum/ && do { $ncbi_taxid = 32168; last; };
1799 /^Avian sarcoma virus \(strain 17\)/ && do { $ncbi_taxid = 11877; last; };
1800 /^AMV/ && do { $ncbi_taxid = 11866; last; };
1801 /^AEV/ && do { $ncbi_taxid = 11861; last; };
1802 /^AS42|^Avian musculoaponeurotic/ && do { $ncbi_taxid = 11873; last; };
1803 /^Avian myelocytomatosis/ && do { $ncbi_taxid = 11869; last; };
1804 /^ASV 31/ && do { $ncbi_taxid = 35270; last; };
1805 /^A-MuLV/ && do { $ncbi_taxid = 188539; last; };
1806 /^Asparagus officinalis/ && do { $ncbi_taxid = 4686; last; };
1807 /^Agrobacterium tumefaciens/ && do { $ncbi_taxid = 358; last; };
1808 /^ALV/ && do { $ncbi_taxid = 11864; last; };
1809 /^AAV/ && do { $ncbi_taxid = 272636; last; };
1810 /^AKV MLV/ && do { $ncbi_taxid = 11791; last; };
1811 last;
1814 $first_letter eq 'B' && do {
1815 /^BPV-1/ && do { $ncbi_taxid = 10559; last; };
1816 /^BKV/ && do { $ncbi_taxid = 10629; last; };
1817 /^Bolivian squirrel monkey/ && do { $ncbi_taxid = 39432; last; };
1818 last;
1821 $first_letter eq 'C' && do {
1822 /^Cauliflower/ && do { $ncbi_taxid = 3715; last; };
1823 /^Chamek/ && do { $ncbi_taxid = 118643; last; };
1824 /^Candida albicans/ && do { $ncbi_taxid = 5476; last; };
1825 /^CaMV/ && do { $ncbi_taxid = 10641; last; };
1826 last;
1829 $first_letter eq 'E' && do {
1830 /^Eucalyptus gunnii/ && do { $ncbi_taxid = 3933; last; };
1831 /^EBV, Epstein-Barr virus/ && do { $ncbi_taxid = 10376; last; };
1832 /^Eucalyptus globulus subsp. bicostata/ && do { $ncbi_taxid = 71272; last; };
1833 /^Eucalyptus globulus subsp. globulus/ && do { $ncbi_taxid = 71271; last; };
1834 last;
1837 $first_letter eq 'F' && do {
1838 /^FBR MuLV/ && do { $ncbi_taxid = 11806; last; };
1839 /^FBJ MuLV/ && do { $ncbi_taxid = 11805; last; };
1840 /^FeLV|Feline leukemia/ && do { $ncbi_taxid = 11923; last; };
1841 /^Flaveria trinervia/ && do { $ncbi_taxid = 4227; last; };
1842 /^FSV/ && do { $ncbi_taxid = 11885; last; };
1843 /^F-MuLV/ && do { $ncbi_taxid = 11795; last; };
1844 last;
1847 $first_letter eq 'H' && do {
1848 /^HSV-1/ && do { $ncbi_taxid = 10298; last; };
1849 /^HTLV-I/ && do { $ncbi_taxid = 11908; last; };
1850 /^HIV-1/ && do { $ncbi_taxid = 11676; last; };
1851 /^HPV-16/ && do { $ncbi_taxid = 333760; last; };
1852 /^HBV/ && do { $ncbi_taxid = 10407; last; };
1853 /^HBI/ && do { $ncbi_taxid = 11867; last; };
1854 /^HPV-8/ && do { $ncbi_taxid = 10579; last; };
1855 /^HPV-11/ && do { $ncbi_taxid = 10580; last; };
1856 /^HPV-18/ && do { $ncbi_taxid = 333761; last; };
1857 /^HCMV/ && do { $ncbi_taxid = 10359; last; };
1858 /^HSV/ && do { $ncbi_taxid = 126283; last; };
1859 /^HSV-2/ && do { $ncbi_taxid = 10310; last; };
1860 /^HCV/ && do { $ncbi_taxid = 11108; last; };
1861 /^HIV-2/ && do { $ncbi_taxid = 11709; last; };
1862 last;
1865 $first_letter eq 'M' && do {
1866 /^MMTV/ && do { $ncbi_taxid = 11757; last; };
1867 /^Mo-MuLV/ && do { $ncbi_taxid = 11801; last; };
1868 /^MuLV/ && do { $ncbi_taxid = 11786; last; };
1869 /^MSV/ && do { $ncbi_taxid = 11802; last; };
1870 /^MC29/ && do { $ncbi_taxid = 11868; last; };
1871 /^MVM/ && do { $ncbi_taxid = 10794; last; };
1872 /^MH2E21/ && do { $ncbi_taxid = 11955; last; }; # 11955 is a species, presumably MH2E21 is the strain
1873 last;
1876 $first_letter eq 'R' && do {
1877 /^Raphanus sativus/ && do { $ncbi_taxid = 3726; last; };
1878 /^REV-T/ && do { $ncbi_taxid = 11636; last; };
1879 /^RAV-0/ && do { $ncbi_taxid = 11867; last; }; # should be rous-associated virus 0 variant
1880 /^RSV/ && do { $ncbi_taxid = 11886; last; };
1881 /^RadLV/ && do { $ncbi_taxid = 31689; last; };
1882 /^RTBV/ && do { $ncbi_taxid = 10654; last; };
1883 last;
1886 $first_letter eq 'S' && do {
1887 /^SV40/ && do { $ncbi_taxid = 10633; last; };
1888 /^Sesbania rostrata/ && do { $ncbi_taxid = 3895; last; };
1889 /^SIV/ && do { $ncbi_taxid = 11723; last; };
1890 /^Spinacia oleracea/ && do { $ncbi_taxid = 3562; last; };
1891 /^SCMV/ && do { $ncbi_taxid = 10364; last; }; # supposed to be AGM isolate
1892 last;
1895 # and lower case
1896 $first_letter eq 'a' && do {
1897 /^adenovirus type 5/ && do { $ncbi_taxid = 28285; last; };
1898 /^adenovirus type 2/ && do { $ncbi_taxid = 10515; last; };
1899 /^adenovirus/ && do { $ncbi_taxid = 189831; last; }; # 189831 ('unclassified Adenoviridae') is the closest I can get, but this has no genus and is not a species
1900 last;
1903 $first_letter eq 'b' && do {
1904 /^bell pepper/ && do { $ncbi_taxid = 4072; last; };
1905 /^baculovirus, Autographa californica/ && do { $ncbi_taxid = 46015; last; };
1906 /^broccoli/ && do { $ncbi_taxid = 36774; last; };
1907 /^barley/ && do { $ncbi_taxid = 112509; last; };
1908 last;
1911 $first_letter eq 'c' && do {
1912 /^clawed frog/ && do { $ncbi_taxid = 8355; last; };
1913 /^chipmunk/ && do { $ncbi_taxid = 64680; last; };
1914 /^common tree shrew/ && do { $ncbi_taxid = 37347; last; };
1915 /^cat/ && do { $ncbi_taxid = 9685; last; };
1916 last;
1919 # and misc
1920 /^NK24/ && do { $ncbi_taxid = 11955; last; };
1921 /^OK10/ && do { $ncbi_taxid = 11871; last; };
1922 /^Dendrobium grex/ && do { $ncbi_taxid = 84618; last; };
1923 /^KSHV/ && do { $ncbi_taxid = 37296; last; };
1924 /^Oncidium/ && do { $ncbi_taxid = 96474; last; };
1925 /^Japanese quail/ && do { $ncbi_taxid = 93934; last; };
1926 /^Nile tilapia/ && do { $ncbi_taxid = 8128; last; };
1927 /^GALV/ && do { $ncbi_taxid = 11840; last; };
1928 /^JCV/ && do { $ncbi_taxid = 10632; last; };
1929 /^LPV/ && do { $ncbi_taxid = 10574; last; };
1930 /^Py,/ && do { $ncbi_taxid = 36362; last; };
1931 /^DHBV/ && do { $ncbi_taxid = 12639; last; };
1932 /^VZV/ && do { $ncbi_taxid = 10335; last; };
1933 /^Vicia faba/ && do { $ncbi_taxid = 3906; last; };
1935 /^hamster/ && do { $ncbi_taxid = 10029; last; };
1936 /^sea urchin/ && do { $ncbi_taxid = 7668; last; };
1937 /^fruit fly/ && do { $ncbi_taxid = 7227; last; };
1938 /^halibut/ && do { $ncbi_taxid = 8267; last; };
1939 /^vaccinia virus/ && do { $ncbi_taxid = 10245; last; };
1940 /^taxonomic class Mammalia/ && do { $ncbi_taxid = 40674; last; }; # not a species
1941 /^taxonomic class Vertebrata/ && do { $ncbi_taxid = 7742; last; }; # not a species
1942 /^dog/ && do { $ncbi_taxid = 9615; last; };
1943 /^parsley/ && do { $ncbi_taxid = 4043; last; };
1944 /^mouse, Mus domesticus Torino/ && do { $ncbi_taxid = 10092; last; }; # 10092 is domesticus subspecies, but not the Torino strain
1945 /^lemur, Eulemur fulvus collaris/ && do { $ncbi_taxid = 47178; last; };
1946 /^red sea bream/ && do { $ncbi_taxid = 143350; last; };
1947 /^zebra finch/ && do { $ncbi_taxid = 59729; last; };
1948 /^mung bean/ && do { $ncbi_taxid = 3916; last; };
1949 /^soybean/ && do { $ncbi_taxid = 3847; last; };
1950 /^oat/ && do { $ncbi_taxid = 4498; last; };
1951 /^pseudorabies virus/ && do { $ncbi_taxid = 10345; last; };
1955 $self->warn("Didn't know what species '$raw_species' was, unable to classify") unless $ncbi_taxid;
1956 return $ncbi_taxid;
1959 sub DESTROY {
1960 my $self = shift;
1961 # Destroy tied references to close filehandles
1962 # and allow proper temporary files deletion
1963 undef $self->{_tax_db}->{'_nodes'};
1964 undef $self->{_tax_db}->{'_id2name'};
1965 undef $self->{_tax_db}->{'_name2id'};
1966 undef $self->{_tax_db}->{'_parent2children'};
1967 undef $self->{_tax_db}->{'_parentbtree'};