From 8d33320ff2192d6f9661a8ccce5786a5339a238f Mon Sep 17 00:00:00 2001 From: birney Date: Tue, 6 Apr 1999 16:14:24 +0000 Subject: [PATCH] Beefed up SimpleAlign tests for Pfam format Added swisspfam indexer and test suite svn path=/bioperl-live/trunk/; revision=939 --- Bio/Index/SwissPfam.pm | 276 +++++++++++++++++++++++++++++++++++++++++++++++++ t/SimpleAlign.t | 31 +++++- t/swisspfam.data | 12 +++ t/test.pfam | 38 +++++++ 4 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 Bio/Index/SwissPfam.pm create mode 100644 t/swisspfam.data create mode 100755 t/test.pfam diff --git a/Bio/Index/SwissPfam.pm b/Bio/Index/SwissPfam.pm new file mode 100644 index 000000000..c693c6554 --- /dev/null +++ b/Bio/Index/SwissPfam.pm @@ -0,0 +1,276 @@ + +# +# BioPerl module for Bio::Index::SwissPfam +# +# Cared for by Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::SwissPfam - Interface for indexing swisspfam files + +=head1 SYNOPSIS + + use Bio::Index::SwissPfam; + + my $Index_File_Name = shift; + my $inx = Bio::Index::SwissPfam->new($Index_File_Name, 'WRITE'); + $inx->make_index(@ARGV); + + use Bio::Index::SwissPfam; + + my $Index_File_Name = shift; + my $inx = Bio::Index::SwissPfam->new($Index_File_Name); + + foreach my $id (@ARGV) { + my $seq = $inx->get_stream($id); # Returns stream + while( <$seq> ) { + /^>/ && last; + print; + } + + +=head1 DESCRIPTION + +Inherits functions for managing dbm files from Bio::Index::Abstract.pm, +and provides the basic functionality for indexing SwissPfam files. +Only retrieves FileStreams at the moment. Once we have something better, +will use that. Heavily snaffled from James Gilbert's +Fasta system. + +=head1 FEED_BACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. 
+Your participation is much appreciated.
+
+  vsns-bcd-perl@lists.uni-bielefeld.de          - General discussion
+  vsns-bcd-perl-guts@lists.uni-bielefeld.de     - Technically-oriented discussion
+  http://bio.perl.org/MailList.html             - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track of
+the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bio.perl.org/bioperl-bugs/
+
+=head1 AUTHOR - Ewan Birney
+
+Email - birney@sanger.ac.uk
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::Index::SwissPfam;
+
+use vars qw($VERSION @ISA @EXPORT_OK);
+use strict;
+
+use Bio::Index::Abstract;
+use Bio::Seq;
+
+@ISA = qw(Bio::Index::Abstract Exporter);
+@EXPORT_OK = qw();
+
+sub _type_stamp {
+    return '__SWISSPFAM__'; # What kind of index are we?
+}
+
+sub _version {
+    return 0.1;
+}
+$VERSION = _version();
+
+
+
+=head2 _initialize
+
+  Title   : _initialize
+  Usage   : $index->_initialize( $index_file, $write_flag )
+  Function: Calls $index->SUPER::_initialize() with the same arguments.
+            No SwissPfam-specific setup is performed here.
+  Example :
+  Returns : Whatever SUPER::_initialize returns
+  Args    : Index file name, write flag
+
+=cut
+
+sub _initialize {
+    my($self, $index_file, $write_flag) = @_;
+
+    $self->SUPER::_initialize($index_file, $write_flag);
+}
+
+
+=head2 _index_file
+
+  Title   : _index_file
+  Usage   : $index->_index_file( $file_name, $i )
+  Function: Specialist function to index SwissPfam format files.
+            Is provided with a filename and an integer
+            by make_index in its SUPER class.
+            A record starts at a line of the form
+            ">ID |=====| ACCESSION ..." and runs up to the byte before
+            the next such line (or to end of file).  Every record is
+            stored under its ID and, when different, under its
+            accession as well.
+  Example :
+  Returns : 1
+  Args    : File name, index-number of the file being indexed
+  Throws  : If the file cannot be opened for reading
+
+=cut
+
+sub _index_file {
+    my( $self,
+        $file, # File name
+        $i     # Index-number of file being indexed
+        ) = @_;
+
+    my( $begin,      # Offset from start of file of the start
+                     # of the last found record.
+        $end,        # Offset from start of file of the end
+                     # of the last found record.
+        $id,         # ID of last found record.
+        $acc,        # accession of last record. Also put into the index
+        $nid, $nacc, # new ids for the record just found
+        );
+
+    $begin = 0;
+    $end   = 0;
+
+    # Three-argument open with a lexical handle: the file name can never be
+    # mistaken for an open mode, and no package-wide handle is clobbered.
+    open my $sp, '<', $file
+        or $self->throw("Can't open file for read : $file : $!");
+
+    # Main indexing loop
+    while (<$sp>) {
+        if (/^>(\S+)\s+\|=*\|\s+(\S+)/) {
+            $nid  = $1;
+            $nacc = $2;
+
+            # tell() is already past the header line just read, so step
+            # back over it to get the offset at which this record starts.
+            my $new_begin = tell($sp) - length( $_ );
+            $end = $new_begin - 1;
+
+            # A new header closes the previous record (if there was one).
+            if( defined $id ) {
+                $self->add_record($id, $i, $begin, $end);
+                if( $acc ne $id ) {
+                    $self->add_record($acc, $i, $begin, $end);
+                }
+            }
+
+            $begin = $new_begin;
+
+            $id  = $nid;
+            $acc = $nacc;
+        }
+    }
+
+    # Don't forget to add the last record. It is indexed under its
+    # accession too, exactly like every record closed inside the loop.
+    $end = tell($sp);
+    if( defined $id ) {
+        $self->add_record($id, $i, $begin, $end);
+        if( $acc ne $id ) {
+            $self->add_record($acc, $i, $begin, $end);
+        }
+    }
+
+    close $sp;
+    return 1;
+}
+
+
+=head2 fetch
+
+  Title   : fetch
+  Usage   : $index->fetch( $id )
+  Function: Returns a Bio::Seq object from the index.
+            NOTE: the parser looks for ID/DE/SQ line tags and a '//'
+            terminator.  The '>'-delimited records that _index_file
+            indexes (see t/swisspfam.data) carry none of these, so for
+            such records this method throws; reading the record through
+            get_stream, as in the SYNOPSIS, is the usable route.
+  Example : $seq = $index->fetch( 'dJ67B12' )
+  Returns : Bio::Seq object
+  Args    : ID
+  Throws  : If the ID is not in the index, or if no sequence lines
+            could be read for the record
+
+=cut
+
+sub fetch {
+    my( $self, $id ) = @_;
+    my $desc;
+    my $db = $self->db();
+    if (my $rec = $db->{ $id }) {
+        my( @record );
+
+        my ($file, $begin, $end) = $self->unpack_record( $rec );
+
+        # Get the (possibly cached) filehandle
+        my $fh = $self->_file_handle( $file );
+
+        # move to start
+        seek($fh, $begin, 0);
+
+        # Get id and description from the header lines, then stop at the
+        # SQ line.  The scan is confined to this record's byte range so a
+        # record without an SQ line cannot run on into its neighbours.
+        my $seen_sq = 0;
+        while (<$fh>) {
+            #print STDERR "Got $_";
+            if (/^SQ\s/) { $seen_sq = 1; last; }
+            /^ID\s+(\S+)/     && do { $id = $1; };
+            /^DE\s+(.*?)\s+$/ && do { $desc .= $1; };
+            # accession numbers???
+            last if tell($fh) > $end;
+        }
+
+        # Sequence lines follow the SQ line, up to '//' or the record end.
+        if ($seen_sq) {
+            while (<$fh>) {
+                /^\/\// && last;
+                #print STDERR "Got $_";
+                s/[\W0-9]//g;       # keep letters only (drops digits, blanks)
+                push(@record, $_);
+                last if tell($fh) > $end;
+            }
+        }
+
+        $self->throw("Can't fetch sequence for record : $id")
+            unless @record;
+
+        # Return a shiny Bio::Seq object
+        return Bio::Seq->new( -ID   => $id,
+                              -DESC => $desc,
+                              -SEQ  => uc(join('', @record)) );
+    } else {
+        $self->throw("Unable to find a record for $id in SwissPfam flat file index");
+    }
+}
+
+=head2 get_Seq_by_id
+
+  Title   : get_Seq_by_id
+  Usage   : $seq = $db->get_Seq_by_id()
+  Function: retrieves a sequence object, identically to
+            ->fetch, but here behaving as a Bio::DB::BioSeqI
+  Returns : new Bio::Seq object
+  Args    : string represents the id
+
+
+=cut
+
+sub get_Seq_by_id{
+    my ($self,$id) = @_;
+
+    return $self->fetch($id);
+}
+
+=head2 get_Seq_by_acc
+
+  Title   : get_Seq_by_acc
+  Usage   : $seq = $db->get_Seq_by_acc()
+  Function: retrieves a sequence object, identically to
+            ->fetch, but here behaving as a Bio::DB::BioSeqI
+  Returns : new Bio::Seq object
+  Args    : string represents the accession number
+
+
+=cut
+
+sub get_Seq_by_acc {
+    my ($self,$id) = @_;
+
+    return $self->fetch($id);
+}
+
+
+1;
diff --git a/t/SimpleAlign.t b/t/SimpleAlign.t
index 9f5b396bf..b41fc62fe 100644
--- a/t/SimpleAlign.t
+++ b/t/SimpleAlign.t
@@ -18,7 +18,7 @@
 ## We start with some black magic to print on failure.
-BEGIN { $| = 1; print "1..3\n"; } +BEGIN { $| = 1; print "1..6\n"; } END {print "not ok 1\n" unless $loaded;} use lib '../'; @@ -51,5 +51,34 @@ close(OUT); print "ok 3\n"; +$aln = Bio::SimpleAlign->new(); +open(FH,"t/test.pfam"); +$aln->read_Pfam(\*FH); +close(FH); + +if( $aln ) { + print "ok 4\n"; +} else { + print "not ok 4\n"; +} + +open(OUT,">t/out.pfam"); +$aln->write_Pfam(\*OUT); +close(OUT); +print "ok 5\n"; + + +$aln = Bio::SimpleAlign->new(); +open(IN,"t/out.pfam"); +$aln->read_Pfam(\*IN); +close(IN); + +if( $aln ) { + print "ok 6\n"; +} else { + print "not ok 6\n"; +} + + diff --git a/t/swisspfam.data b/t/swisspfam.data new file mode 100644 index 000000000..07be87719 --- /dev/null +++ b/t/swisspfam.data @@ -0,0 +1,12 @@ +>ROA1_MOUSE |==============================================| P49312 319 a.a. +Pfam-B_14464 1 --------------------- (3) PD14464 +178-319 +rrm 2 ---------- ---------- (1058) PF00076 RNA recognition motif. (aka RRM, RBD, or RNP domain) 15-85 106-176 +>ROA1_RAT |==============================================| P04256 319 a.a. +Pfam-B_14464 1 --------------------- (3) PD14464 +178-319 +rrm 2 ---------- ---------- (1058) PF00076 RNA recognition motif. (aka RRM, RBD, or RNP domain) 15-85 106-176 +>ROA1_SCHAM |=================================================| P21522 342 a.a. +rrm 2 ---------- ---------- (1058) PF00076 RNA recognition motif. (aka RRM, RBD, or RNP domain) 19-89 110-180 +>ROA1_XENLA |==============================================| P17130 365 a.a. +rrm 2 --------- --------- (1058) PF00076 RNA recognition motif. 
(aka RRM, RBD, or RNP domain) 16-86 107-177 diff --git a/t/test.pfam b/t/test.pfam new file mode 100755 index 000000000..0ebd153ed --- /dev/null +++ b/t/test.pfam @@ -0,0 +1,38 @@ +TASM_BFDV/6-67 RLTELLCLPV.......TATAADIKTAYRRTALKYHPDKGGD.................EEKMKELNTLMEEFRETEGLRADETLE +TASM_SV40/12-75 QLMDLLGLERS.....AWGNIPLMRKAYLKKCKEFHPDKGGD.................EEKMKKMNTLYKKMEDGVKYAHQPDFG +TASM_POVLY/12-75 ELMDLLQITRA.....AWGNLSMMKKAYKNVSKLYHPDKGGD.................SAKMQRLNELFQRVQVTLMEIRSQCGS +TASM_POVMA/12-75 RLLELLKLPRQ.....LWGDFGRMQQAYKQQSLLLHPDKGGS.................HALMQELNSLWGTFKTEVYNLRMNLGG +TAMI_POVHA/12-75 ALISLLDLEPQ.....YWGDYGRMQKCYKKKCLQLHPDKGGN.................EELMQQLNTLWTKLKDGLYRVRLLLGP +TASM_POVBO/10-71 ELRGLLGTPD.......IGNADTLKKAFLKACKVHHPDKGGN.................EEAMKRLLYLYNKAKIAASATTSQVWY +DNJ1_HUMAN/4-68 DYYQTLGLAR.......GASDEEIKRAYRRQALRYHPDKNKE..............PGAEEKFKEIAEAYDVLSDPRKREIFDRYG +DNAJ_HAEDU/5-70 DYYEVLGLQK.......GATEKDIKRAYKRLAAKYHPDKNQG.............SKDSEEKFKQITEAYEILTDDQKRAAYDQYG +DNJ2_ALLPO/13-74 KYYEVLGVSK.......NATPEDLKKAYRKAAIKNHPDKGGD.................PEKFKEIGQAYEVLNDPEKREIYDQYG +PSI_SCHPO/6-68 KLYDCLEVRP.......EASEAELKKAYRKLALKYHPDKNPN................GEKKFKEISLAYEVLSDPQRRKLYDQYG +XDJ1_YEAST/9-77 RLYDVLGVTR.......DATVQEIKTAYRKLALKHHPDKYVDQD..........SKEVNEIKFKEITAAYEILSDPEKKSHYDLYG +DNAJ_BORBU/4-69 DYYEILGLSK.......GASKDEIKKAYRKIAIKYHPDRNQG.............NEEAASIFKEATQAYEILIDDNKKAKYDRFG +CSP_RAT/15-80 SLYHVLGLDK.......NATSDDIKKSYRKLALKYHPDKNPD.............NPEAADKFKEINNAHAILTDATKRNIYDKYG +HLJ1_YEAST/21-85 EFYEILKVDR.......KATDSEIKKAYRKLAIKLHPDKNSH..............PKAGEAFKVINRAFEVLSNEEKRSIYDRIG +CAJ1_YEAST/6-71 EYYDILGIKP.......EATPTEIKKAYRRKAMETHPDKHPD.............DPDAQAKFQAVGEAYQVLSDPGLRSKYDQFG +YIS4_YEAST/6-71 EYYDLLGVST.......TASSIEIKKAYRKKSIQEHPDKNPN.............DPTATERFQAISEAYQVLGDDDLRAKYDKYG +YNW7_YEAST/4-70 CYYELLGVET.......HASDLELKKAYRKKALQYHPDKNPDN............VEEATQKFAVIRAAYEVLSDPQERAWYDSHK +DNAJ_ERYRH/6-70 
DFYEILGVSK.......SATDAEIKKAYRQLAKKYHPDINKE..............DGAEAKFKEVQEAYEVLSDSQKRANYDQFG +YLW5_CAEEL/531-595 DYYKTLGVDK.......KSDAKAIKKAYFQLAKKYHPDVNKT..............KEAQTKFQEISEAYEVLSDDTKRQEYDAYG +CBPA_ECOLI/5-69 DYYAIMGVKP.......TDDLKTIKTAYRRLARKYHPDVSKE..............PDAEARFKEVAEAWEVLSDEQRRAEYDQMW +DNAJ_CAUCR/3-68 DYYEILGVTR.......TIDEAGLKSRVRKLAMEHHPDRNGG.............CENAAGRFKEINEAYSVLSDSQKRAAYDRFG +DNJM_MYCGE/7-71 DYYEVLGITP.......DADQSEIKKAFRKLAKKYHPDRNNA..............PDAAKIFAEINEANDVLSNPKKRANYDKYG +DNAJ_SYNP7/6-71 DYYALLGIPQ.......SADQAAIKAAFRKLARQCHPDLNPG.............DRQAEERFKQISEAYEILSDPDRRAEYQRFS +DNAJ_STRCO/10-75 DYYKVLGVPK.......DATEAEIKKAYRKLARENHPDANKG.............NVKAEERFKEISEANDILGDPKKRKEYDEAR +YJH3_YEAST/585-655 DYYKILGVSP.......SASSKEIRKAYLNLTKKYHPDKIKANHN........DKQESIHETMSQINEAYETLSDDDKRKEYDLSR +DNJL_MYCGE/2-64 NLYDLLELPT.......TASIKEIKIAYKRLAKRYHPDVNKL................GSQTFVEINNAYSILSDPNQKEKYDSML +YFL1_YEAST/44-108 NFYKFLKLPKL.....QNSSTKEITKNLRKLSKKYHPDKNPK................YRKLYERLNLATQILSNSSNRKIYDYYL +YGM8_YEAST/79-151 NLYDVLELPTPLDVHTIYDDLPQIKRKYRTLALKYHPDKHPD.............NPSIIHKFHLLSTATNILTNADVRPHYDRWL +YD1J_SCHPO/32-110 TPYEILELPR.......TCTANDIKRKYIELVKKHHPDKMKNASQLAPTESPPEINKHNEEYFRLLLAANALLSDKRRREEYDRFG +YJQ2_YEAST/13-77 TYYSILGLTS.......NATSSEVHKSYLKLARLLHPDKTKS..............DKSEELFKAVVHAHSILTDEDQKLRYDRDL +NPL1_YEAST/125-196 DPYEILGIST.......SASDRDIKSAYRKLSVKFHPDKLAKGLT.......PDEKSVMEETYVQITKAYESLTDELVRQNYLKYG +YJ67_YEAST/8-76 THYEILRIPS.......DATQDEIKKAYRNRLLNTHPDKLSKSI..........HDTVSNVTINKIQDAYKILSNIKTRREYDRLI +ZUO1_YEAST/97-168 DLYAAMGLSKLR....FRATESQIIKAHRKQVVKYHPDKQSAAG..........GSLDQDGFFKIIQKAFETLTDSNKRAQYDSCD +ZRF1_MOUSE/88-161 DHYAVLGLGHVR....YTATQRQIKAAHKAMVLKHHPDKRKAAGE........PIKEGDNDYFTCITKAYEMLSDPVKRRAFNSVD +RESA_PLAFF/523-587 LYYDILGVGV.......NADMNEITERYFKLAENYYPYQRSG..............STVFHNFRKVNEAYQVLGDIDKKRWYNKYG +YQ07_CAEEL/562-626 DAYSVFGLRS.......DCSDDDIKRNYKRLAALVSPDKCTI..............DAADQVYELVDVAFSAIGYKDSRSEYTLEN 
+YFHE_ECOLI/2-74 DYFTLFGLPAR.....YQLDTQALSLRFQDLQRQYHPDKFASGSQ........AEQLAAVQQSATINQAWQTLRHPLMRAEYLLSL +YGB8_YEAST/13-82 TFYELFPKTFPKKLPIWTIDQSRLRKEYRQLQAQHHPDMAQQ................GSEQSSTLNQAYHTLKDPLRRSQYMLKL -- 2.11.4.GIT