Merge pull request #111 from adsj/master
[bioperl-live.git] / examples / db / est_tissue_query.pl
blob536b2d2c8efb6918a2812ea2bf81188f4a5797d7
1 #!/usr/bin/perl
3 # This script will report the names of the tissues which were seen
4 # in a BLAST/FASTA report against an EST (or cDNA possibly) library.
6 # this script assumes you have a directory which you have downloaded
7 # gbestXX.seq.gz from ncbi genbank release. This will run faster if
8 # they are uncompressed already, but if will uncompress the files
9 # on demand. Be sure that there is sufficient space and the uid
10 # has write permission on the files and in that directory if you
11 # plan to run this script on compressed files.
13 # Alternatively you can use this with the -r option and it will
14 # use the remote sequence databases either genbank or embl to
15 # retrieve the specific EST (so one can attempt to guess the tissue type)
17 # cmd line options are
18 # -i/--index=indexname
19 # -d/--dir=dir where gbest data files are located
20 # -b/--blast=filename blast filename which compared against an EST db
21 # -f/--format=(blast|blastxml|fasta) - type of search output either
22 # from BLAST or FASTA suites
23 # -c/--cache=filename cache for accession number to tissue
24 # -p/pvalue=pvalue pvalue to limit search to
25 # -r/--remote=[GenBank|EMBL] use remote db for searching
27 use strict;
28 use DB_File;
29 use Bio::SeqIO;
30 use Bio::SearchIO;
31 use Bio::DB::EMBL;
32 use Bio::DB::GenBank;
33 use Bio::Index::GenBank;
35 use Getopt::Long;
36 my $GZIP = '/usr/bin/gzip';
37 my $GUNZIP = '/usr/bin/gunzip';
39 my $dir = '/home/data/libs/gbest'; # local dir for gbest files
40 my $index = 'dbest_tissue.idx'; # local index filename
41 my $cache; # filename to create cache of accession->tissue
42 my $VERBOSE = 0;# verbosity option
43 my $blastfile; # blastfile to parse
44 my $pvalue; # Max P-Value allowed when parsing blastfile
45 my $remote; # flag for remote database
46 my $db; # generic database handle
47 my %accessions; # cache results
48 my $format = 'blast';
50 &GetOptions( 'd|dir:s' => \$dir,
51 'i|index:s' => \$index,
52 'v|verbose' => \$VERBOSE,
53 'b|blast:s' => \$blastfile,
54 'f|format:s' => \$format,
55 'c|cache:s' => \$cache,
56 'p|pvalue:s' => \$pvalue,
57 'r|remote:s'=> \$remote);
59 if( $cache && -w $cache ) {
60 print "creating cache file\n";
61 tie %accessions, "DB_File", $cache, O_RDWR|O_CREAT,0660, $DB_HASH;
64 if( ! $remote ) {
65 opendir(GBEST, $dir) or die("cannot open $dir");
67 my $indexfile = new Bio::Index::GenBank(-filename => $index,
68 -write_flag => 'WRITE');
69 foreach my $file ( readdir(GBEST) ) {
70 # print "file is $file\n";
71 next unless ( $file =~ /(gbest\d+\.seq)(.gz)?$/ );
72 if( $2 ) {
73 `$GUNZIP $dir/$file`;
75 $indexfile->make_index("$dir/$1");
78 $indexfile = undef;
79 $db = new Bio::Index::GenBank(-filename => $index);
81 } else {
82 if( $remote =~ /(ncbi)|(genbank)/i ) {
84 $db = new Bio::DB::GenBank;
85 } elsif( $remote =~ /embl/i ) {
86 $db = new Bio::DB::EMBL;
87 } else {
88 die("remote must be either 'NCBI' or 'EMBL'");
90 # would need to add code to set proxy info for those who need it
93 if(! $blastfile || ! -r $blastfile ) {
94 die("Must specify a valid blastfile");
97 my $parser = new Bio::SearchIO(-format => $format,
98 -file => $blastfile);
100 my %tissues_seen = ();
101 my ($result,$hit,$hsp);
102 while( my $result = $parser->next_result ) {
103 HIT: while( my $hit = $result->next_hit ) {
104 if( defined $pvalue ) {
105 while( my $hsp = $hit->next_hsp ) {
106 if( $hsp->evalue > $pvalue ) {
107 print "skipping ", $hit->name, " because of low evalue \n";
108 # skip this Subject if it contains a pvalue of > $pvalue
109 next HIT;
113 my ($id) = split(/\s+/, $hit->name);
114 # get the last value
115 my @ids = split(/\|/, $id);
116 $id = pop @ids;
117 my ($tissuetype) = get_tissue($id);
118 if( defined $tissuetype ) {
119 push @{$tissues_seen{$tissuetype}}, $hit->name;
120 } else {
121 print STDERR "could not find tissue for $id\n" if( $VERBOSE);
124 print "tissues seen for: ", $result->query_name, "\n";
126 foreach my $tissue ( sort keys %tissues_seen ) {
127 print "* $tissue\n-----------\n\t",
128 join("\n\t",@{$tissues_seen{$tissue}}), "\n\n";
132 # cleanup -- avoid segfault here
133 $db = undef;
135 # subroutines
137 sub get_tissue {
138 my ($id) = @_;
139 my $tissue;
140 if( $tissue = $accessions{$id} ) {
141 return $tissue;
144 my $seq = $db->get_Seq_by_acc($id);
145 return unless( $seq );
147 foreach my $feature ( $seq->all_SeqFeatures ) {
148 if( $feature->primary_tag eq 'source' ) {
149 foreach my $tag ( sort { $b cmp $a }
150 $feature->all_tags ) {
151 if( $tag =~ /tissue/i ||
152 ( ! $tissue &&
153 $tag =~ /clone_lib/i ) ){
154 ($tissue) = $feature->each_tag_value($tag);
155 $accessions{$seq->display_id} = $tissue;
156 return $tissue;
161 return;