Travis CI: Add Test::Most dependency
[bioperl-live.git] / scripts / utilities / bp_download_query_genbank.pl
blob3b51d74a29e88d1423b1e3f0a683d16df5b0aad8
1 #!/usr/bin/perl
4 =head1 NAME
6 bp_download_query_genbank - script to query Genbank and retrieve records
8 =head1 USAGE
10 bp_download_query_genbank --query "Neurospora[ORGN]" --db nucest -o Ncrassa_ESTs.fa --format fasta
12 bp_download_query_genbank --queryfile 'filewithquery' --db nucest -o Ncrassa_ESTs.fa --format fasta
14 =head2 Other options
16 Provide ONE of:
18 -q --query query string OR
19 --queryfile file containing the query string OR
20 --gi --gis --gifile file with list of GIs to download
22 Database type:
24 -d --db database (nucleotide [default], nucest, or protein)
26 -o --out --outfile output file (results are displayed on screen otherwise)
27 -f --format sequence file output format (fasta by default)
28 -v --verbose debugging output
30 =head2 Query options
32 --maxids maximum number of IDs to retrieve in a set (100 at a time by default)
33 --reldate relative date to retrieve from (in days)
34 --maxdate maximum date for a record
35 --mindate minimum date for a record
36 --datetype edat or mdat (entered or modified)
38 =head1 AUTHOR Jason Stajich
40 Jason Stajich, jason-AT-bioperl.org
42 =cut
44 use strict;
45 use warnings;
46 use Bio::DB::GenBank;
47 use Bio::DB::GenPept;
48 use Bio::DB::Query::GenBank;
49 use Bio::SeqIO;
50 use Getopt::Long;
# Command-line state.  %options collects the arguments that are later passed
# to Bio::DB::Query::GenBank->new(), which is why its keys carry the leading
# dash of the constructor's parameter names.
my ($queryfile,$outfile,$format,$debug,%options);
my $gifile;

# Defaults; each can be overridden from the command line.
$format = 'fasta';
$options{'-maxids'} = '100';
$options{'-db'} = 'nucleotide'; # can be nucleotide, nucest, protein

# GetOptions returns false when it meets an unknown or malformed option.
# Stop in that case instead of silently running with the defaults.
GetOptions(
    'h|help' => sub {
        # exec only returns if perldoc could not be started; report that
        # rather than exiting quietly with a success status.
        exec('perldoc', $0) or do {
            warn "Could not run perldoc: $!\n";
            exit(1);
        };
    },
    'v|verbose'       => \$debug,
    'f|format:s'      => \$format,
    'queryfile:s'     => \$queryfile,
    'o|out|outfile:s' => \$outfile,
    'gi|gifile|gis:s' => \$gifile,
    # DB::Query options
    'd|db:s'          => \$options{'-db'},
    'mindate:s'       => \$options{'-mindate'},
    'maxdate:s'       => \$options{'-maxdate'},
    'reldate:s'       => \$options{'-reldate'},
    'datetype:s'      => \$options{'-datetype'}, # edat or mdat
    'maxids:i'        => \$options{'-maxids'},
    'q|query:s'       => \$options{'-query'},
) or die "Invalid command line; run '$0 --help' for usage\n";
# Sequence writer: goes to $outfile when one was given, otherwise
# Bio::SeqIO is constructed without a -file argument (write to STDOUT).
my @seqio_args = (-format => $format);
push @seqio_args, (-file => ">$outfile") if $outfile;
my $out = Bio::SeqIO->new(@seqio_args);
# Pick the database handle class: GenPept for the 'protein' database,
# GenBank for everything else.
my $db_class = $options{'-db'} eq 'protein'
             ? 'Bio::DB::GenPept'
             : 'Bio::DB::GenBank';
my $dbh = $db_class->new(-verbose => $debug);
# Build the query from whichever source was supplied -- a GI file, a --query
# string, or a query file, checked in that order -- then fetch the matching
# records through $dbh and write each one to $out.
my $query;

if( $gifile ) {
    # --maxids doubles as the batch size here.  splice() with a length of
    # zero removes nothing (and a negative length may remove nothing), so
    # the batching loop below would never terminate; reject such values.
    my $batch_size = $options{'-maxids'};
    die "--maxids must be a positive integer when downloading by GI\n"
        unless defined $batch_size && $batch_size > 0;

    # The file holds whitespace-separated IDs, any number per line.
    my @ids;
    open my $fh, '<', $gifile or die "Could not read file '$gifile': $!\n";
    while( my $line = <$fh> ) {
        push @ids, split ' ', $line;
    }
    close $fh;

    # Each query is built from at most $batch_size IDs.
    while( @ids ) {
        my @mini_ids = splice(@ids, 0, $batch_size);
        $query = Bio::DB::Query::GenBank->new(%options,
                                              -verbose => $debug,
                                              -ids     => \@mini_ids,
                                             );
        _write_query_results($dbh, $query, $out);
    }
    exit;
}

if( !$options{'-query'} ) {
    die("no query string or gifile\n") unless $queryfile;

    # Read the query from the file.  Lines are trimmed (this also drops a
    # trailing CR from DOS-style files) and joined with a single space so
    # that terms on consecutive lines are not glued together.
    my @query_parts;
    open my $fh, '<', $queryfile or die "Could not read file '$queryfile': $!\n";
    while( my $line = <$fh> ) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        push @query_parts, $line if length $line;
    }
    close $fh;

    die "Query file '$queryfile' contains no query text\n" unless @query_parts;
    $options{'-query'} = join ' ', @query_parts;
}

$query = Bio::DB::Query::GenBank->new(%options,-verbose => $debug);
_write_query_results($dbh, $query, $out);

# Run $db_query against $db and write every returned sequence to $seq_out.
#   $db       - database handle providing get_Stream_by_query
#   $db_query - query object to execute
#   $seq_out  - sequence writer providing write_seq
sub _write_query_results {
    my ($db, $db_query, $seq_out) = @_;

    my $stream = $db->get_Stream_by_query($db_query);
    while( my $seq = $stream->next_seq ) {
        $seq_out->write_seq($seq);
    }
    return;
}