3 # This script will report the names of the tissues which were seen
4 # in a BLAST/FASTA report against an EST (or cDNA possibly) library.
6 # this script assumes you have a directory which you have downloaded
7 # gbestXX.seq.gz from ncbi genbank release. This will run faster if
8 # they are uncompressed already, but if will uncompress the files
9 # on demand. Be sure that there is sufficient space and the uid
10 # has write permission on the files and in that directory if you
11 # plan to run this script on compressed files.
13 # Alternatively you can use this with the -r option and it will
14 # use the remote sequence databases either genbank or embl to
15 # retrieve the specific EST (so one can attempt to guess the tissue type)
17 # cmd line options are
18 # -i/--index=indexname
19 # -d/--dir=dir where gbest data files are located
20 # -b/--blast=filename blast filename which compared against an EST db
21 # -f/--format=(blast|blastxml|fasta) - type of search output either
22 # from BLAST or FASTA suites
23 # -c/--cache=filename cache for accession number to tissue
24 # -p/pvalue=pvalue pvalue to limit search to
25 # -r/--remote=[GenBank|EMBL] use remote db for searching
33 use Bio
::Index
::GenBank
;
36 my $GZIP = '/usr/bin/gzip';
37 my $GUNZIP = '/usr/bin/gunzip';
39 my $dir = '/home/data/libs/gbest'; # local dir for gbest files
40 my $index = 'dbest_tissue.idx'; # local index filename
41 my $cache; # filename to create cache of accession->tissue
42 my $VERBOSE = 0;# verbosity option
43 my $blastfile; # blastfile to parse
44 my $pvalue; # Max P-Value allowed when parsing blastfile
45 my $remote; # flag for remote database
46 my $db; # generic database handle
47 my %accessions; # cache results
50 &GetOptions
( 'd|dir:s' => \
$dir,
51 'i|index:s' => \
$index,
52 'v|verbose' => \
$VERBOSE,
53 'b|blast:s' => \
$blastfile,
54 'f|format:s' => \
$format,
55 'c|cache:s' => \
$cache,
56 'p|pvalue:s' => \
$pvalue,
57 'r|remote:s'=> \
$remote);
59 if( $cache && -w
$cache ) {
60 print "creating cache file\n";
61 tie
%accessions, "DB_File", $cache, O_RDWR
|O_CREAT
,0660, $DB_HASH;
65 opendir(GBEST
, $dir) or die("cannot open $dir");
67 my $indexfile = new Bio
::Index
::GenBank
(-filename
=> $index,
68 -write_flag
=> 'WRITE');
69 foreach my $file ( readdir(GBEST
) ) {
70 # print "file is $file\n";
71 next unless ( $file =~ /(gbest\d+\.seq)(.gz)?$/ );
75 $indexfile->make_index("$dir/$1");
79 $db = new Bio
::Index
::GenBank
(-filename
=> $index);
82 if( $remote =~ /(ncbi)|(genbank)/i ) {
84 $db = new Bio
::DB
::GenBank
;
85 } elsif( $remote =~ /embl/i ) {
86 $db = new Bio
::DB
::EMBL
;
88 die("remote must be either 'NCBI' or 'EMBL'");
90 # would need to add code to set proxy info for those who need it
93 if(! $blastfile || ! -r
$blastfile ) {
94 die("Must specify a valid blastfile");
97 my $parser = new Bio
::SearchIO
(-format
=> $format,
100 my %tissues_seen = ();
101 my ($result,$hit,$hsp);
102 while( my $result = $parser->next_result ) {
103 HIT
: while( my $hit = $result->next_hit ) {
104 if( defined $pvalue ) {
105 while( my $hsp = $hit->next_hsp ) {
106 if( $hsp->evalue > $pvalue ) {
107 print "skipping ", $hit->name, " because of low evalue \n";
108 # skip this Subject if it contains a pvalue of > $pvalue
113 my ($id) = split(/\s+/, $hit->name);
115 my @ids = split(/\|/, $id);
117 my ($tissuetype) = get_tissue
($id);
118 if( defined $tissuetype ) {
119 push @
{$tissues_seen{$tissuetype}}, $hit->name;
121 print STDERR
"could not find tissue for $id\n" if( $VERBOSE);
124 print "tissues seen for: ", $result->query_name, "\n";
126 foreach my $tissue ( sort keys %tissues_seen ) {
127 print "* $tissue\n-----------\n\t",
128 join("\n\t",@
{$tissues_seen{$tissue}}), "\n\n";
132 # cleanup -- avoid segfault here
140 if( $tissue = $accessions{$id} ) {
144 my $seq = $db->get_Seq_by_acc($id);
145 return unless( $seq );
147 foreach my $feature ( $seq->all_SeqFeatures ) {
148 if( $feature->primary_tag eq 'source' ) {
149 foreach my $tag ( sort { $b cmp $a }
150 $feature->all_tags ) {
151 if( $tag =~ /tissue/i ||
153 $tag =~ /clone_lib/i ) ){
154 ($tissue) = $feature->each_tag_value($tag);
155 $accessions{$seq->display_id} = $tissue;