1 # -*-Perl-*- Test Harness script for Bioperl
2 # $Id: EUtilities.t 15112 2008-12-08 18:12:38Z sendu $
# Running total for the test plan. It starts at the number of tests that sit
# outside the per-eutil sections and is increased by each section's count below.
11 $NUMTESTS = 4; # base number of tests (those not in blocks)
13 # The eutils tests are set up to run in sections for easier test maintenance
14 # and for keeping track of problematic tests. The hash below lists the
15 # sections; each entry carries its number of tests and a coderef.
17 # these now run very simple tests for connectivity and data sampling
18 # main tests now with the parser
21 'efetch' => {'tests' => 5,
23 'epost' => {'tests' => 11,
25 'esummary' => {'tests' => 254,
27 'esearch' => {'tests' => 13,
29 'einfo' => {'tests' => 10,
31 'elink1' => {'tests' => 8,
33 'egquery' => {'tests' => 4,
# Add every section's declared test count to the plan total.
36 $NUMTESTS += $EUTILS{$_}->{'tests'} for (keys %EUTILS);
# Debug diagnostics are switched on through the BIOPERLDEBUG environment variable.
37 $DEBUG = $ENV{'BIOPERLDEBUG'} || 0;
38 # this seems to work for perl 5.6 and perl 5.8
42 test_begin(-tests => $NUMTESTS,
43 -requires_modules => [qw(XML::Simple LWP::UserAgent)],
# NOTE(review): these four use_ok() calls are presumably the 4 "base" tests
# counted in $NUMTESTS above - confirm if tests are added or removed here.
47 use_ok('Bio::DB::EUtilities');
48 use_ok('LWP::UserAgent');
49 use_ok('Bio::Tools::EUtilities');
50 use_ok('Bio::Tools::EUtilities::EUtilParameters');
# NOTE(review): test_email() is defined outside this view; presumably it
# supplies the contact address used for the remote requests - confirm.
53 my $email = test_email();
55 diag("Using $email for tests") if $DEBUG;
57 # NOTE : Bio::DB::EUtilities is just a specialized pipeline to get any
58 # data available via NCBI's Entrez interface, with a few convenience methods
59 # to get UIDs and other additional information. All data returned
60 # using EFetch is raw (not Bioperl objects) and is meant to be piped into
61 # other Bioperl modules at a later point for further processing
64 my @acc = qw(MUSIGHBA1 P18584 CH402638);
# Plain sort() compares as strings, so the order here is
# 1621261, 20807972, 68536103, 730439, 89318838. The epost section compares
# against exactly this order, and the efetch section relies on $ids[1]
# being 20807972.
67 my @ids = sort qw(1621261 89318838 68536103 20807972 730439);
# Query term used by the esearch section (checked again via get_term()).
70 my $term = 'dihydroorotase AND human';
# Shared across all sections: the current agent and its last HTTP response.
72 my ($eutil, $response);
# NOTE(review): %dbs and %links are not referenced in the visible part of the
# file; their remaining entries are outside this view.
74 my %dbs = (taxonomy => 1,
77 my %links = (protein_taxonomy => 1,
78 protein_nucleotide => 1,
79 protein_nucleotide_wgs => 1,
81 protein_pubmed_refseq => 1
84 # this loops through the required tests, only running what is in %EUTILS
# keys() returns the sections in no particular order, so the sections must not
# depend on one another's side effects.
85 for my $test (keys %EUTILS) {
86 $EUTILS{$test}->{'sub'}->();
# EFetch section (5 tests per %EUTILS): one agent check, then two requests
# with two assertions each. The constructor arguments are outside this view.
93 $eutil = Bio::DB::EUtilities->new(
100 isa_ok($eutil, 'Bio::DB::GenericWebAgent');
# A failed request dies inside eval; the four assertions still pending in this
# section are then skipped instead of failing.
101 eval {$response = $eutil->get_Response; };
102 skip("EFetch HTTP error: $@", 4) if $@;
103 isa_ok($response, 'HTTP::Response');
104 my $content = $response->content;
105 like($content, qr(PYRR \[Mycobacterium tuberculosis H37Rv\]),
106 'EFetch: Fasta format');
108 # reuse the EUtilities webagent
# $ids[1] is 20807972 (the IDs were sorted as strings); the GenBank record is
# expected to start with the NP_623143 locus line.
109 $eutil->parameter_base->id([$ids[1]]);
110 $eutil->parameter_base->rettype('gb');
111 eval {$response = $eutil->get_Response; };
# Only the two assertions below remain at this point.
112 skip("EFetch HTTP error: $@", 2) if $@;
113 isa_ok($response, 'HTTP::Response');
114 $content = $response->content;
115 like($content, qr(^LOCUS\s+NP_623143),'EFetch: GenBank format');
119 # EPost->EFetch with History
# EPost section (11 tests per %EUTILS): post the ID list, check the parsed
# reply and its History, then fetch the posted records back through that
# History. The constructor arguments are outside this view.
123 $eutil = Bio::DB::EUtilities->new(
130 isa_ok($eutil, 'Bio::DB::GenericWebAgent');
131 eval {$response = $eutil->get_Response; };
# Ten assertions remain after the isa_ok() above.
132 skip("EPost HTTP error: $@", 10) if $@;
133 isa_ok($response, 'HTTP::Response');
134 # Any parameters are passed in to the parser, so these should be set.
135 # Databases and IDs always default back to the submitted ones unless
136 # the data being retrieved are IDs or contain new IDs (esearch, elink)
138 is($eutil->get_database, 'protein', '$epost->get_database()');
# The expected string is @ids in its string-sorted order.
139 is(join(',',$eutil->get_ids), '1621261,20807972,68536103,730439,89318838', '$epost->get_ids()');
141 # these are the submitted IDs
142 is($eutil->get_count, 5, '$epost->get_count()');
144 # these are not set using epost
145 is($eutil->get_term, undef, '$epost->get_term()');
147 my $history = $eutil->next_History;
148 is($history->eutil, 'epost', 'History->eutil()');
149 isa_ok($history, 'Bio::Tools::EUtilities::HistoryI');
151 # check the actual History
152 my ($webenv, $key) = $history->history;
153 like($webenv, qr{^\S{25}}, '$epost WebEnv');
154 like($key, qr{^\d+}, '$epost query key');
156 # can we fetch the sequences?
157 $eutil->set_parameters(
159 -history => $history,
162 # look for fasta headers
164 eval{ $r = $eutil->get_Response->content;};
# Report the underlying error like the other skips in this section do.
165 skip("EPost HTTP error: $@", 1) if $@;
# grep in scalar context counts the lines that look like FASTA headers;
# one header is expected per posted ID.
166 $t = grep m{^>.*$}, split("\n", $r);
167 is($t, 5, 'EPost to EFetch');
# ESummary section (254 tests per %EUTILS).
# %docsum is the expected-data fixture: UID => { item name => [type, value] }.
# The assertions visible below compare only the type element ([0]).
# NOTE(review): whether the value element ([1]) is checked depends on lines
# outside this view - confirm.
174 my %docsum = (1621261=> { 'Caption' => ['String','CAB02640'],
175 'Title' => ['String','PROBABLE PYRIMIDINE OPERON REGULATORY PROTEIN PYRR '.
176 '[Mycobacterium tuberculosis H37Rv]'],
177 'Extra' => ['String','gi|1621261|emb|CAB02640.1|[1621261]'],
178 'Gi' => ['Integer','1621261'],
179 'CreateDate' => ['String','2003/11/21'],
180 'UpdateDate' => ['String','2005/04/17'],
181 'Flags' => ['Integer',''],
182 'TaxId' => ['Integer','83332'],
183 'Length' => ['Integer','193'],
184 'Status' => ['String','live'],
185 'ReplacedBy' => ['String',''],
186 'Comment' => ['String',''], },
187 20807972 => {'Caption' => ['String','NP_623143'],
188 'Title' => ['String','pyrimidine regulatory protein PyrR '.
189 '[Thermoanaerobacter tengcongensis MB4]'],
190 'Extra' => ['String','gi|20807972|ref|NP_623143.1|[20807972]'],
191 'Gi' => ['Integer','20807972'],
192 'CreateDate' => ['String','2002/05/09'],
193 'UpdateDate' => ['String','2005/12/03'],
194 'Flags' => ['Integer','512'],
195 'TaxId' => ['Integer','273068'],
196 'Length' => ['Integer','178'],
197 'Status' => ['String','live'],
198 'ReplacedBy' => ['String',''],
199 'Comment' => ['String',''], },
200 68536103 => {'Caption' => ['String','YP_250808'],
201 'Title' => ['String','putative pyrimidine operon regulatory protein '.
202 '[Corynebacterium jeikeium K411]'],
203 'Extra' => ['String','gi|68536103|ref|YP_250808.1|[68536103]'],
204 'Gi' => ['Integer','68536103'],
205 'CreateDate' => ['String','2005/07/04'],
206 'UpdateDate' => ['String','2006/03/30'],
207 'Flags' => ['Integer','512'],
208 'TaxId' => ['Integer','306537'],
209 'Length' => ['Integer','195'],
210 'Status' => ['String','live'],
211 'ReplacedBy' => ['String',''],
212 'Comment' => ['String',''], },
213 730439 => {'Caption' => ['String','P41007'],
214 'Title' => ['String','PyrR bifunctional protein '.
215 '[Includes: Pyrimidine operon regulatory protein; '.
216 'Uracil phosphoribosyltransferase (UPRTase)]'],
217 'Extra' => ['String','gi|730439|sp|P41007|PYRR_BACCL[730439]'],
218 'Gi' => ['Integer','730439'],
219 'CreateDate' => ['String','1995/02/01'],
220 'UpdateDate' => ['String','2006/07/25'],
221 'Flags' => ['Integer',''],
222 'TaxId' => ['Integer','1394'],
223 'Length' => ['Integer','179'],
224 'Status' => ['String','live'],
225 'ReplacedBy' => ['String',''],
226 'Comment' => ['String',''] },
227 89318838 => { 'Caption' => ['String','EAS10332'],
228 'Title' => ['String','Phosphoribosyltransferase '.
229 '[Mycobacterium gilvum PYR-GCK]'],
230 'Extra' => ['String','gi|89318838|gb|EAS10332.1|[89318838]'],
231 'Gi' => ['Integer','89318838'],
232 'CreateDate' => ['String','2006/03/09'],
233 'UpdateDate' => ['String','2006/03/09'],
234 'Flags' => ['Integer',''],
235 'TaxId' => ['Integer','350054'],
236 'Length' => ['Integer','193'],
237 'Status' => ['String','live'],
238 'ReplacedBy' => ['String',''],
239 'Comment' => ['String',''] } );
241 $eutil = Bio::DB::EUtilities->new(
242 -eutil => 'esummary',
247 isa_ok($eutil, 'Bio::DB::GenericWebAgent');
249 eval {$response = $eutil->get_Response; };
# Everything after the isa_ok() above is skipped when the request fails.
250 skip("ESummary HTTP error:$@", 253) if $@;
251 isa_ok($response, 'HTTP::Response');
# One document summary is expected per UID in %docsum.
253 my @docs = $eutil->get_DocSums();
254 is(scalar(@docs), 5, '$esum->get_DocSums()');
# Walk the summaries one at a time and check each against the fixture.
257 while (my $ds = $eutil->next_DocSum) {
258 isa_ok($ds, 'Bio::Tools::EUtilities::Summary::DocSum');
260 my $id = $ds->get_id();
261 ok(exists($docsum{$id}), '$docsum->get_id()');
# Copy of the expected items for this UID.
263 my %items = %{ $docsum{$id} };
265 # iterate using item names
267 for my $name ($ds->get_all_names()) {
# List assignment keeps only the first item returned for this name.
269 my ($it) = $ds->get_Items_by_name($name);
270 ok(exists $items{$name},'DocSum Name exists');
271 is($it->get_name, $name, 'get_name(),DocSum Name');
# The type is checked twice: looked up via the DocSum and read from the Item.
272 is($ds->get_type_by_name($name), $items{$name}->[0],
273 'get_type_by_name() from DocSum');
274 is($it->get_type, $items{$name}->[0], 'get_type() from Item');
281 # ESearch, ESearch History
# ESearch section (13 tests per %EUTILS): a plain search, then a search whose
# History is used to fetch sequences. Constructor arguments are outside this
# view.
285 $eutil = Bio::DB::EUtilities->new(
293 isa_ok($eutil, 'Bio::DB::GenericWebAgent');
294 eval {$response = $eutil->get_Response; };
# Twelve assertions remain in the section after the isa_ok() above.
295 skip("ESearch HTTP error:$@", 12) if $@;
296 isa_ok($response, 'HTTP::Response');
298 # can't really check for specific IDs but can check total IDs returned
299 my @esearch_ids = $eutil->get_ids;
300 is(scalar(@esearch_ids), 100, '$esearch->get_ids()');
# The total hit count changes over time, so only a lower bound is checked.
302 cmp_ok($eutil->get_count, '>', 117, '$esearch->get_count()');
305 $eutil = Bio::DB::EUtilities->new(
314 eval {$response = $eutil->get_Response; };
# Nine assertions remain from here on.
315 skip("ESearch HTTP error:$@", 9) if $@;
316 is($eutil->eutil, 'esearch', 'eutil()');
317 is($eutil->get_database, 'protein', 'get_database()');
318 cmp_ok($eutil->get_count, '>', 117, 'get_count()');
319 is($eutil->get_term, $term, 'get_term()');
# is() imposes scalar context on its arguments, so this compares the number
# of IDs returned, not the IDs themselves.
320 is($eutil->get_ids, 100, 'History->get_ids()');
322 my $history = $eutil->next_History;
323 isa_ok($history, 'Bio::Tools::EUtilities::HistoryI');
325 # check the actual data
326 my ($webenv, $key) = $history->history;
327 like($webenv, qr{^\S{15}}, 'WebEnv');
328 like($key, qr{^\d+}, 'query key');
330 # can we fetch the sequences?
331 $eutil->set_parameters(
333 -history => $history,
337 # look for fasta headers
339 eval{ $r = $eutil->get_Response->content;};
# This is the ESearch section: name it correctly (the message and the label
# previously said "EPost") and report the underlying error.
340 skip("ESearch HTTP error:$@", 1) if $@;
# grep in scalar context counts the lines that look like FASTA headers.
# NOTE(review): the expected count of 5 presumably follows from a retmax set
# in the set_parameters() arguments outside this view - confirm.
341 $t = grep m{^>.*$}, split("\n", $r);
342 is($t, 5, 'ESearch to EFetch');
# EInfo section (10 tests per %EUTILS): details for a single database, then
# the list of all databases. Constructor arguments are outside this view.
350 $eutil = Bio::DB::EUtilities->new(
355 isa_ok($eutil, 'Bio::DB::GenericWebAgent');
356 eval {$response = $eutil->get_Response; };
# One test has already run at this point and nine remain in the section
# (eight below plus the database-list check), so nine are skipped. Skipping
# 10 would report 11 tests for a section planned as 10.
# NOTE(review): assumes both requests share one SKIP block - confirm.
357 skip("EInfo HTTP error:$@", 9) if $@;
358 isa_ok($response, 'HTTP::Response');
359 like($response->content, qr(<eInfoResult>), 'EInfo response');
360 is(($eutil->get_database)[0], 'protein', '$einfo->get_database()');
# Expect a "YYYY/MM/DD HH:MM" style timestamp.
361 like($eutil->get_last_update, qr(\d{4}\/\d{2}\/\d{2}\s\d{2}:\d{2}),
362 '$einfo->get_last_update()');
# Record, link and field counts grow over time, so only lower bounds are
# asserted.
363 cmp_ok($eutil->get_record_count, '>', 9200000, '$einfo->get_record_count()');
364 is($eutil->get_description, 'Protein sequence record', '$einfo->get_description()');
365 my @links = $eutil->get_LinkInfo;
366 my @fields = $eutil->get_FieldInfo;
367 cmp_ok(scalar(@links), '>',30, '$einfo->get_LinkInfo()');
368 cmp_ok(scalar(@fields), '>',24, '$einfo->get_FieldInfo()');
370 # all databases (list)
371 $eutil = Bio::DB::EUtilities->new(
376 eval {$response = $eutil->get_Response; };
377 skip("EInfo HTTP error:$@", 1) if $@;
379 my @db = sort qw(pubmed protein nucleotide nuccore nucgss nucest structure
380 genome books cancerchromosomes cdd domains gene genomeprj gensat
381 geo gds homologene journals mesh ncbisearch nlmcatalog omia omim
382 pmc popset probe pcassay pccompound pcsubstance snp taxonomy toolkit
385 my @einfo_dbs = sort $eutil->get_databases;
# Only the number of databases is compared: the server must report at least
# as many as are listed above.
386 cmp_ok(scalar(@einfo_dbs), '>=', scalar(@db), 'All EInfo databases');
391 # ELink - normal (single ID array) - single db - ElinkData tests
# ELink section 1 (8 tests per %EUTILS): link protein records to another
# database and inspect the single LinkSet that comes back. The remaining
# constructor arguments are outside this view.
395 $eutil = Bio::DB::EUtilities->new(
398 -dbfrom => 'protein',
403 isa_ok($eutil, 'Bio::DB::GenericWebAgent');
404 eval {$response = $eutil->get_Response; };
# Seven assertions remain after the isa_ok() above.
405 skip("ELink HTTP error:$@", 7) if $@;
406 isa_ok($response, 'HTTP::Response');
407 like($response->content, qr(<eLinkResult>), 'ELink response');
408 # Data is too volatile to test; commenting for now...
409 #my @ids2 = qw(350054 306537 273068 83332 1394);
# cmp_ok() imposes scalar context, so this checks the number of linked IDs.
# NOTE(review): this assertion has no description.
410 cmp_ok($eutil->get_ids, '>=', 4);
411 #is_deeply([sort $eutil->get_ids], [sort @ids2],'$elink->get_ids()');
# Likewise a count: exactly one LinkSet is expected.
414 is($eutil->get_LinkSets, 1, '$elink->get_LinkSets()');
415 my $linkobj = $eutil->next_LinkSet;
416 isa_ok($linkobj, 'Bio::Tools::EUtilities::Link::LinkSet');
# The source database must match the -dbfrom given to the constructor.
417 is($linkobj->get_dbfrom, 'protein', '$linkdata->get_dbfrom()');
418 #is_deeply([sort $linkobj->elink_queryids],
419 # [sort @ids], '$linkdata->elink_queryids()');
420 my $db = $linkobj->get_dbto;
421 is($db, 'taxonomy', '$linkdata->get_dbto()');
422 #is_deeply([sort $linkobj->get_LinkIds_by_db($db)],
423 # [sort @ids2], '$linkdata->get_LinkIds_by_db($db)');
# ELink section 2: same checks as the first ELink section, but starting from
# genome project IDs.
# NOTE(review): no 'elink2' key is visible in %EUTILS, so this section is
# presumably never run - confirm.
428 my @genome_ids = qw(30807 33011 12997 16707 45843 31129 31141 31131 31133 32203 31135);
430 $eutil = Bio::DB::EUtilities->new(
433 -dbfrom => 'genomeprj',
438 eval {$response = $eutil->get_Response; };
# All seven assertions of this section follow the request.
439 skip("ELink HTTP error:$@", 7) if $@;
440 isa_ok($response, 'HTTP::Response');
441 like($response->content, qr(<eLinkResult>), 'ELink response');
442 # Data is too volatile to test; commenting for now...
443 #my @ids2 = qw(350054 306537 273068 83332 1394);
# cmp_ok() imposes scalar context, so this checks the number of linked IDs.
444 cmp_ok($eutil->get_ids, '>=', 4);
445 #is_deeply([sort $eutil->get_ids], [sort @ids2],'$elink->get_ids()');
448 is($eutil->get_LinkSets, 1, '$elink->get_LinkSets()');
449 my $linkobj = $eutil->next_LinkSet;
450 isa_ok($linkobj, 'Bio::Tools::EUtilities::Link::LinkSet');
# The source database must match the -dbfrom given above. The previous
# expectation of 'protein' was carried over from the first ELink section.
451 is($linkobj->get_dbfrom, 'genomeprj', '$linkdata->get_dbfrom()');
452 #is_deeply([sort $linkobj->elink_queryids],
453 # [sort @ids], '$linkdata->elink_queryids()');
454 my $db = $linkobj->get_dbto;
# NOTE(review): the -db argument is outside this view; verify that 'taxonomy'
# is really the target database of this request.
455 is($db, 'taxonomy', '$linkdata->get_dbto()');
456 #is_deeply([sort $linkobj->get_LinkIds_by_db($db)],
457 # [sort @ids2], '$linkdata->get_LinkIds_by_db($db)');
# EGQuery section (4 tests per %EUTILS): run a global query and check that
# results come back for a reasonable number of databases. Constructor
# arguments are outside this view.
463 $eutil = Bio::DB::EUtilities->new(
469 isa_ok($eutil, 'Bio::DB::GenericWebAgent');
470 eval {$response = $eutil->get_Response; };
# Three assertions remain after the isa_ok() above.
471 skip("EGQuery HTTP error:$@", 3) if $@;
472 isa_ok($response, 'HTTP::Response');
473 like($response->content, qr(<eGQueryResult>), 'EGQuery response');
474 my @gq = $eutil->get_GlobalQueries;
# The number of Entrez databases varies, so only a lower bound is asserted.
475 cmp_ok(scalar(@gq), '>=', 30, 'get_GlobalQueries')