From 1fb6f73fe4f68dc333ddc18ede0b7d718dbcdc01 Mon Sep 17 00:00:00 2001 From: cjfields Date: Tue, 16 Dec 2008 20:17:58 +0000 Subject: [PATCH] EUtilities remote tests back online (simple tests only) svn path=/bioperl-live/trunk/; revision=15189 --- t/RemoteDB/EUtilities.t | 430 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 t/RemoteDB/EUtilities.t diff --git a/t/RemoteDB/EUtilities.t b/t/RemoteDB/EUtilities.t new file mode 100644 index 000000000..158ae95fe --- /dev/null +++ b/t/RemoteDB/EUtilities.t @@ -0,0 +1,430 @@ +# -*-Perl-*- Test Harness script for Bioperl +# $Id: EUtilities.t 15112 2008-12-08 18:12:38Z sendu $ +# + +use strict; +our $NUMTESTS; +our $DEBUG; +our %EUTILS; + +BEGIN { + $NUMTESTS = 4; # base number of tests (those not in blocks) + + # I have set up eutils tests to run in sections for easier test maintenance + # and keeping track of problematic tests. The below hash is the list of + # tests, with test number and coderef. + + # these now run very simple tests for connectivity and data sampling + # main tests now with the parser + + %EUTILS = ( + 'efetch' => {'tests' => 5, + 'sub' => \&efetch}, + 'epost' => {'tests' => 11, + 'sub' => \&epost}, + 'esummary' => {'tests' => 254, + 'sub' => \&esummary}, + 'esearch' => {'tests' => 13, + 'sub' => \&esearch}, + 'einfo' => {'tests' => 10, + 'sub' => \&einfo}, + 'elink1' => {'tests' => 8, + 'sub' => \&elink1}, + 'egquery' => {'tests' => 4, + 'sub' => \&egquery}, + ); + $NUMTESTS += $EUTILS{$_}->{'tests'} for (keys %EUTILS); + $DEBUG = $ENV{'BIOPERLDEBUG'} || 0; + # this seems to work for perl 5.6 and perl 5.8 + + use Bio::Root::Test; + + test_begin(-tests => $NUMTESTS, + -requires_modules => [qw(XML::Simple LWP::UserAgent)], + -requires_networking => 1, + ); + + use_ok('Bio::DB::EUtilities'); + use_ok('LWP::UserAgent'); + use_ok('Bio::Tools::EUtilities'); + use_ok('Bio::Tools::EUtilities::EUtilParameters'); +} + +# NOTE : Bio::DB::EUtilities is just a specialized pipeline to get any +# data available via NCBI's Entrez interface, with a few convenience methods +# to get UIDs and other additional information. All data returned +# using EFetch is raw (not Bioperl objects) and is meant to be piped into +# other Bioperl modules at a later point for further processing + +# protein acc +my @acc = qw(MUSIGHBA1 P18584 CH402638); + +# protein GI +my @ids = sort qw(1621261 89318838 68536103 20807972 730439); + +# test search term +my $term = 'dihydroorotase AND human'; + +my ($eutil, $response); + +my %dbs = (taxonomy => 1, + nucleotide => 1, + pubmed => 1); +my %links = (protein_taxonomy => 1, + protein_nucleotide => 1, + protein_nucleotide_wgs => 1, + protein_pubmed => 1, + protein_pubmed_refseq => 1 + ); + +# this loops through the required tests, only running what is in %EUTILS +for my $test (keys %EUTILS) { + $EUTILS{$test}->{'sub'}->(); +} + +# Simple EFetch + +sub efetch { + SKIP: { + $eutil = Bio::DB::EUtilities->new( + -db => 'protein', + -id => [$ids[0]], + -rettype => 'fasta' + ); + + isa_ok($eutil, 'Bio::DB::GenericWebAgent'); + eval {$response = $eutil->get_Response; }; + skip("EFetch HTTP error: $@", 4) if $@; + isa_ok($response, 'HTTP::Response'); + my $content = $response->content; + like($content, qr(PYRR \[Mycobacterium tuberculosis H37Rv\]), + 'EFetch: Fasta format'); + + # reuse the EUtilities webagent + $eutil->parameter_base->id([$ids[1]]); + $eutil->parameter_base->rettype('gb'); + eval {$response = $eutil->get_Response; }; + skip("EFetch HTTP error: $@", 2) if $@; + isa_ok($response, 'HTTP::Response'); + $content = $response->content; + like($content, qr(^LOCUS\s+NP_623143),'EFetch: GenBank format'); + } +} + +# EPost->EFetch with History + +sub epost { + SKIP: { + $eutil = Bio::DB::EUtilities->new( + -eutil => 'epost', + -db => 'protein', + -id => \@ids, + ); + + isa_ok($eutil, 'Bio::DB::GenericWebAgent'); + eval {$response = $eutil->get_Response; }; + skip("EPost HTTP error: $@", 10) if $@; + isa_ok($response, 'HTTP::Response'); + # Any parameters are passed in to the parser, so these should be set. + # Databases and IDs always default back to the submitted ones unless + # the data being retrieved are IDs or contain new IDs (esearch, elink) + + is($eutil->get_database, 'protein', '$epost->get_database()'); + is(join(',',$eutil->get_ids), '1621261,20807972,68536103,730439,89318838', '$epost->get_ids()'); + + # these are not set using epost + is($eutil->get_count, undef, '$epost->get_count()'); + is($eutil->get_term, undef, '$epost->get_term()'); + + my $history = $eutil->next_History; + is($history->eutil, 'epost', 'History->eutil()'); + isa_ok($history, 'Bio::Tools::EUtilities::HistoryI'); + + # check the actual History + my ($webenv, $key) = $history->history; + like($webenv, qr{^\S{50}}, '$epost WebEnv'); + like($key, qr{^\d+}, '$epost query key'); + + # can we fetch the sequences? + $eutil->set_parameters( + -eutil => 'efetch', + -history => $history, + -rettype => 'fasta' + ); + # look for fasta headers + my ($r, $t); + eval{ $r = $eutil->get_Response->content;}; + skip("EPost HTTP error", 1) if $@; + $t = grep m{^>.*$}, split("\n", $r); + is($t, 5, 'EPost to EFetch'); + } +} + +# ESummary + +sub esummary { + my %docsum = (1621261=> { 'Caption' => ['String','CAB02640'], + 'Title' => ['String','PROBABLE PYRIMIDINE OPERON REGULATORY PROTEIN PYRR '. + '[Mycobacterium tuberculosis H37Rv]'], + 'Extra' => ['String','gi|1621261|emb|CAB02640.1|[1621261]'], + 'Gi' => ['Integer','1621261'], + 'CreateDate' => ['String','2003/11/21'], + 'UpdateDate' => ['String','2005/04/17'], + 'Flags' => ['Integer',''], + 'TaxId' => ['Integer','83332'], + 'Length' => ['Integer','193'], + 'Status' => ['String','live'], + 'ReplacedBy' => ['String',''], + 'Comment' => ['String',''], }, + 20807972 => {'Caption' => ['String','NP_623143'], + 'Title' => ['String','pyrimidine regulatory protein PyrR '. + '[Thermoanaerobacter tengcongensis MB4]'], + 'Extra' => ['String','gi|20807972|ref|NP_623143.1|[20807972]'], + 'Gi' => ['Integer','20807972'], + 'CreateDate' => ['String','2002/05/09'], + 'UpdateDate' => ['String','2005/12/03'], + 'Flags' => ['Integer','512'], + 'TaxId' => ['Integer','273068'], + 'Length' => ['Integer','178'], + 'Status' => ['String','live'], + 'ReplacedBy' => ['String',''], + 'Comment' => ['String',''], }, + 68536103 => {'Caption' => ['String','YP_250808'], + 'Title' => ['String','putative pyrimidine operon regulatory protein '. + '[Corynebacterium jeikeium K411]'], + 'Extra' => ['String','gi|68536103|ref|YP_250808.1|[68536103]'], + 'Gi' => ['Integer','68536103'], + 'CreateDate' => ['String','2005/07/04'], + 'UpdateDate' => ['String','2006/03/30'], + 'Flags' => ['Integer','512'], + 'TaxId' => ['Integer','306537'], + 'Length' => ['Integer','195'], + 'Status' => ['String','live'], + 'ReplacedBy' => ['String',''], + 'Comment' => ['String',''], }, + 730439 => {'Caption' => ['String','P41007'], + 'Title' => ['String','PyrR bifunctional protein '. + '[Includes: Pyrimidine operon regulatory protein; '. + 'Uracil phosphoribosyltransferase (UPRTase)]'], + 'Extra' => ['String','gi|730439|sp|P41007|PYRR_BACCL[730439]'], + 'Gi' => ['Integer','730439'], + 'CreateDate' => ['String','1995/02/01'], + 'UpdateDate' => ['String','2006/07/25'], + 'Flags' => ['Integer',''], + 'TaxId' => ['Integer','1394'], + 'Length' => ['Integer','179'], + 'Status' => ['String','live'], + 'ReplacedBy' => ['String',''], + 'Comment' => ['String',''] }, + 89318838 => { 'Caption' => ['String','EAS10332'], + 'Title' => ['String','Phosphoribosyltransferase '. + '[Mycobacterium gilvum PYR-GCK]'], + 'Extra' => ['String','gi|89318838|gb|EAS10332.1|[89318838]'], + 'Gi' => ['Integer','89318838'], + 'CreateDate' => ['String','2006/03/09'], + 'UpdateDate' => ['String','2006/03/09'], + 'Flags' => ['Integer',''], + 'TaxId' => ['Integer','350054'], + 'Length' => ['Integer','193'], + 'Status' => ['String','live'], + 'ReplacedBy' => ['String',''], + 'Comment' => ['String',''] } ); + SKIP: { + $eutil = Bio::DB::EUtilities->new( + -eutil => 'esummary', + -db => 'protein', + -id => \@ids, + ); + isa_ok($eutil, 'Bio::DB::GenericWebAgent'); + + eval {$response = $eutil->get_Response; }; + skip("ESummary HTTP error:$@", 253) if $@; + isa_ok($response, 'HTTP::Response'); + + my @docs = $eutil->get_DocSums(); + is(scalar(@docs), 5, '$esum->get_DocSums()'); + + my $ct = 0; + while (my $ds = $eutil->next_DocSum) { + isa_ok($ds, 'Bio::Tools::EUtilities::Summary::DocSum'); + + my $id = $ds->get_id(); + ok(exists($docsum{$id}), '$docsum->get_id()'); + + my %items = %{ $docsum{$id} }; + + # iterate using item names + + for my $name ($ds->get_all_names()) { + $ct++; + my ($it) = $ds->get_Items_by_name($name); + ok(exists $items{$name},'DocSum Name exists'); + is($it->get_name, $name, 'get_name(),DocSum Name'); + is($ds->get_type_by_name($name), $items{$name}->[0], + 'get_type_by_name() from DocSum'); + is($it->get_type, $items{$name}->[0], 'get_type() from Item'); + } + } + is($ct, 60); + } +} + +# ESearch, ESearch History + +sub esearch { + SKIP: { + $eutil = Bio::DB::EUtilities->new( + -eutil => 'esearch', + -db => 'protein', + -term => $term, + -retmax => 100 + ); + + isa_ok($eutil, 'Bio::DB::GenericWebAgent'); + eval {$response = $eutil->get_Response; }; + skip("ESearch HTTP error:$@", 12) if $@; + isa_ok($response, 'HTTP::Response'); + + # can't really check for specific ID's but can check total ID's returned + my @esearch_ids = $eutil->get_ids; + is(scalar(@esearch_ids), 100, '$esearch->get_ids()'); + + cmp_ok($eutil->get_count, '>', 117, '$esearch->get_count()'); + + # usehistory + $eutil = Bio::DB::EUtilities->new( + -eutil => 'esearch', + -db => 'protein', + -usehistory => 'y', + -term => $term, + -retmax => 100 + ); + + eval {$response = $eutil->get_Response; }; + skip("ESearch HTTP error:$@", 9) if $@; + is($eutil->eutil, 'esearch', 'eutil()'); + is($eutil->get_database, 'protein', 'get_database()'); + cmp_ok($eutil->get_count, '>', 117, 'get_count()'); + is($eutil->get_term, $term, 'get_term()'); + is($eutil->get_ids, 100, 'History->get_ids()'); + + my $history = $eutil->next_History; + isa_ok($history, 'Bio::Tools::EUtilities::HistoryI'); + + # check the actual data + my ($webenv, $key) = $history->history; + like($webenv, qr{^\S{50}}, 'WebEnv'); + like($key, qr{^\d+}, 'query key'); + + # can we fetch the sequences? + $eutil->set_parameters( + -eutil => 'efetch', + -history => $history, + -rettype => 'fasta', + -retmax => 5 + ); + # look for fasta headers + my ($r, $t); + eval{ $r = $eutil->get_Response->content;}; + skip("EPost HTTP error", 1) if $@; + $t = grep m{^>.*$}, split("\n", $r); + is($t, 5, 'EPost to EFetch'); + } +} + +# EInfo + +sub einfo { + SKIP: { + $eutil = Bio::DB::EUtilities->new( + -eutil => 'einfo', + -db => 'protein', + ); + isa_ok($eutil, 'Bio::DB::GenericWebAgent'); + eval {$response = $eutil->get_Response; }; + skip("EInfo HTTP error:$@", 10) if $@; + isa_ok($response, 'HTTP::Response'); + like($response->content, qr(), 'EInfo response'); + is(($eutil->get_database)[0], 'protein', '$einfo->get_database()'); + like($eutil->get_last_update, qr(\d{4}\/\d{2}\/\d{2}\s\d{2}:\d{2}), + '$einfo->get_last_update()'); + cmp_ok($eutil->get_record_count, '>', 9200000, '$einfo->get_record_count()'); + is($eutil->get_description, 'Protein sequence record', '$einfo->get_description()'); + my @links = $eutil->get_LinkInfo; + my @fields = $eutil->get_FieldInfo; + cmp_ok(scalar(@links), '>',30, '$einfo->get_LinkInfo()'); + cmp_ok(scalar(@fields), '>',24, '$einfo->get_FieldInfo()'); + + # all databases (list) + $eutil = Bio::DB::EUtilities->new( + -eutil => 'einfo', + ); + + eval {$response = $eutil->get_Response; }; + skip("EInfo HTTP error:$@", 1) if $@; + + my @db = sort qw(pubmed protein nucleotide nuccore nucgss nucest structure + genome books cancerchromosomes cdd domains gene genomeprj gensat + geo gds homologene journals mesh ncbisearch nlmcatalog omia omim + pmc popset probe pcassay pccompound pcsubstance snp taxonomy toolkit + unigene unists); + + my @einfo_dbs = sort $eutil->get_databases; + cmp_ok(scalar(@einfo_dbs), '>=', scalar(@db), 'All EInfo databases'); + } +} + + +# ELink - normal (single ID array) - single db - ElinkData tests + +sub elink1 { + SKIP: { + $eutil = Bio::DB::EUtilities->new( + -eutil => 'elink', + -db => 'taxonomy', + -dbfrom => 'protein', + -id => \@ids, + ); + + isa_ok($eutil, 'Bio::DB::GenericWebAgent'); + eval {$response = $eutil->get_Response; }; + skip("ELink HTTP error:$@", 7) if $@; + isa_ok($response, 'HTTP::Response'); + like($response->content, qr(), 'ELink response'); + # Data is too volatile to test; commenting for now... + #my @ids2 = qw(350054 306537 273068 83332 1394); + cmp_ok($eutil->get_ids, '>=', 4); + #is_deeply([sort $eutil->get_ids], [sort @ids2],'$elink->get_ids()'); + + # Linkset tests + is($eutil->get_LinkSets, 1, '$elink->get_LinkSets()'); + my $linkobj = $eutil->next_LinkSet; + isa_ok($linkobj, 'Bio::Tools::EUtilities::Link::LinkSet'); + is($linkobj->get_dbfrom, 'protein', '$linkdata->get_dbfrom()'); + #is_deeply([sort $linkobj->elink_queryids], + # [sort @ids], '$linkdata->elink_queryids()'); + my $db = $linkobj->get_dbto; + is($db, 'taxonomy', '$linkdata->get_dbto()'); + #is_deeply([sort $linkobj->get_LinkIds_by_db($db)], + # [sort @ids2], '$linkdata->get_LinkIds_by_db($db)'); + } +} + +sub egquery { + SKIP: { + $eutil = Bio::DB::EUtilities->new( + -eutil => 'egquery', + -term => $term, + ); + + isa_ok($eutil, 'Bio::DB::GenericWebAgent'); + eval {$response = $eutil->get_Response; }; + skip("EGQuery HTTP error:$@", 3) if $@; + isa_ok($response, 'HTTP::Response'); + like($response->content, qr(), 'EGQuery response'); + my @gq = $eutil->get_GlobalQueries; + cmp_ok(scalar(@gq), '>=', 30, 'get_GlobalQueries') + } +} + +1; -- 2.11.4.GIT