From 823590fd90f1367b9f3bf046bad05ccf95b2bebf Mon Sep 17 00:00:00 2001 From: cjfields Date: Mon, 6 Nov 2006 16:29:21 +0000 Subject: [PATCH] Add experimental stockholm write_aln() support svn path=/bioperl-live/trunk/; revision=10817 --- Bio/AlignIO/stockholm.pm | 440 +++++++++++++++++++++++++++++++++++++++-------- Bio/SimpleAlign.pm | 40 +++++ t/AlignIO.t | 45 ++++- 3 files changed, 448 insertions(+), 77 deletions(-) diff --git a/Bio/AlignIO/stockholm.pm b/Bio/AlignIO/stockholm.pm index da4cc48b6..b9368a673 100644 --- a/Bio/AlignIO/stockholm.pm +++ b/Bio/AlignIO/stockholm.pm @@ -2,7 +2,7 @@ # # BioPerl module for Bio::AlignIO::stockholm -# based on the Bio::SeqIO::stockholm module +# Based on the Bio::SeqIO::stockholm module # by Ewan Birney # and Lincoln Stein # @@ -37,6 +37,29 @@ Bio::AlignIO::stockholm - stockholm sequence input/output stream This object can transform L objects to and from stockholm flat file databases. +Note that many Stockholm sequence files contain additional sequence-based +and alignment-based annotation + + GF Lines (alignment feature/annotation): + #=GF + Placed above the alignment + + GC Lines (Alignment consensus) + #=GC + Placed below the alignment + + GS Lines (Sequence annotations) + #=GS + + GR Lines (Sequence meta data) + #=GR + +This module is currently being refactored to incorporate Meta data for +sequences and alignments. Annotations are also now added for alignments. + =head1 FEEDBACK =head2 Reporting Bugs @@ -47,13 +70,14 @@ web: http://bugzilla.open-bio.org/ -=head1 AUTHORS - Peter Schattner +=head1 AUTHORS - Peter Schattner, Chris Fields -Email: schattner@alum.mit.edu +Email: schattner@alum.mit.edu, cjfields-at-uiuc-dot-edu =head1 CONTRIBUTORS -Andreas Kahari, ak@ebi.ac.uk +Chris Fields, cjfields-at-uiuc-dot-edu +Andreas Kahari, ak-at-ebi.ac.uk Jason Stajich, jason-at-bioperl.org =head1 APPENDIX @@ -68,9 +92,123 @@ methods. Internal methods are usually preceded with a _ package Bio::AlignIO::stockholm; use strict; +use Bio::Seq::Meta; +use Bio::Annotation::AnnotationFactory; +use Data::Dumper; +use Text::Wrap qw(wrap); use base qw(Bio::AlignIO); +our $STKVERSION = 'STOCKHOLM 1.0'; + +# This maps the two-letter annotation key to a Annotation/parameter/tagname +# combination. Some data is stored using get/set methods ('Methods') The rest +# is mapped to Annotation objects using the parameter for the parsed data +# and the tagname for, well, the Annotation tagname. A few are treated differently +# based on the type of data stored (Reference data in particular). + +our %READMAP = ( + 'AC' => 'Method/accession', + 'ID' => 'Method/id', + 'DE' => 'Method/description', + 'AU' => 'SimpleValue/-value/record_authors', + 'SE' => 'SimpleValue/-value/seed_source', + 'GA' => 'SimpleValue/-value/gathering_threshold', + 'NC' => 'SimpleValue/-value/noise_cutoff', + 'TC' => 'SimpleValue/-value/trusted_cutoff', + 'TP' => 'SimpleValue/-value/entry_type', + 'SQ' => 'SimpleValue/-value/num_sequences', + 'PI' => 'SimpleValue/-value/previous_ids', + 'DC' => 'Comment/-text/database_comment', + 'DR' => 'SimpleValue/-value/database_reference', + 'CC' => 'Comment/-text/alignment_comment', + # Pfam-specific + 'AM' => 'SimpleValue/-value/build_method', + 'NE' => 'SimpleValue/-value/pfam_family_accession', + 'NL' => 'SimpleValue/-value/sequence_start_stop', + # Rfam-specific GF lines + 'SS' => 'SimpleValue/-value/sec_structure_source', + # Reference objects mapped differently + 'RN' => '-number', # reference number is dumped + 'RC' => '-comment', + 'RM' => '-pubmed', + 'RT' => '-title', + 'RA' => '-authors', + 'RL' => '-location', + # Build model mapped differently + 'BM' => '-value', + ); + +# this is the order that annotations are written +our @WRITEORDER = qw(accession + id + description + previous_ids + record_authors + seed_source + sec_structure_source + gathering_threshold + trusted_cutoff + noise_cutoff + entry_type + build_command + build_method + pfam_family_accession + seq_start_stop + reference + database_comment + alignment_comment + num_sequences + ); + +# This maps the tagname back to a tagname-annotation value combination. +# Some data is stored using get/set methods ('Methods'), others +# are mapped b/c of more complex annotation types. + +our %WRITEMAP = ( + 'accession' => 'AC/Method', + 'id' => 'ID/Method', + 'description' => 'DE/Method', + 'record_authors' => 'AU/SimpleValue', + 'seed_source' => 'SE/SimpleValue', + 'build_command' => 'BM/SimpleValue', + 'gathering_threshold' => 'GA/SimpleValue', + 'noise_cutoff' => 'NC/SimpleValue', + 'trusted_cutoff' => 'TC/SimpleValue', + 'entry_type' => 'TP/SimpleValue', + 'num_sequences' => 'SQ/SimpleValue', + 'previous_ids' => 'PI/SimpleValue', + 'database_comment' => 'DC/SimpleValue', + 'database_reference' => 'DR/SimpleValue', + 'reference' => 'RX/Reference', + 'ref_number' => 'RN/number', + 'ref_comment' => 'RC/comment', + 'ref_pubmed' => 'RM/pubmed', + 'ref_title' => 'RT/title', + 'ref_authors' => 'RA/authors', + 'ref_location' => 'RL/location', + 'alignment_comment' => 'CC/Comment', + #Pfam-specific + 'build_method' => 'AM/SimpleValue', + 'pfam_family_accession' => 'NE/SimpleValue', + 'seq_start_stop' => 'NL/SimpleValue', + # Rfam-specific GF lines + 'sec_structure_source' => 'SS/SimpleValue' + ); + +# Add mapping hash do deal with alignment annotations +# GS lines => sequence data +# GC lines are consensus-like, need new method for meta string to be stored in SimpleAlign +# GF lines => Bio::Annotation objects in a Bio::Annotation::Collection +# GR lines => Meta data stored in Bio::Seq::Meta objects + +#sub _initialize { +# my ( $self, @args ) = @_; +# $self->SUPER::_initialize(@args); +# my ( $build_anno ) = $self->_rearrange( [qw(BUILD_ANNO)], @args ); +# $build_anno && $self->build_annotation($build_anno); +#} + =head2 next_aln Title : next_aln @@ -83,10 +221,14 @@ use base qw(Bio::AlignIO); sub next_aln { my $self = shift; - my $entry; + my $line; - my ($start,$end,%align,$name,$seqname,$seq,$count,%desc, - %hash,@c2name, %accession); + my ($start, $end, $id, $name, $seqname, $seq, $count, $tag, $data); + my $seen_rc; + my $refct = 0; + my $bct = 0; + my @c2name; + my (%align, %accession, %desc, %seq_meta, %aln_meta, %annotation); # in stockholm format, every non-blank line that does not start # with '#=' is an alignment segment; the '#=' lines are mark up lines. @@ -94,72 +236,162 @@ sub next_aln { # lines, which give accession numbers for each segment my $aln = Bio::SimpleAlign->new(-source => 'stockholm'); - - while( defined($entry = $self->_readline) ) { - next unless $entry =~ /\w+/; - - if ($entry =~ /^#\s*STOCKHOLM\s+/) { - last; - } else { - $self->throw("Not Stockholm format: Expecting \"# STOCKHOLM 1.0\"; Found \"$_\""); - } + while( defined($line = $self->_readline) ) { + next unless $line =~ /\w+/; + if ($line =~ /^#\s*STOCKHOLM\s+/) { + last; + } else { + $self->throw("Not Stockholm format: Expecting \"# STOCKHOLM 1.0\"; Found \"$_\""); + } } -# -# Next section is same as for selex format -# - while( defined($entry = $self->_readline) ) { - # Double slash (//) signals end of file. The flat Pfam-A data from - # ftp://ftp.sanger.ac.uk/pub/databases/Pfam/Pfam-A.full.gz consists - # of several concatenated Stockholm-formatted files. The following - # line makes it possible to parse it without this module trying to - # read the whole file into memory. Andreas Kähäri 10/3/2003. - last if $entry =~ m{^//}; - - # Extra bonus: Get the name of the alignment. - # Andreas Kähäri 10/3/2003. - if ($entry =~ /^\#=GF\s+AC\s+(\S+)/) { - $aln->id($1); - } elsif( $entry =~ /^\#=GS\s+(\S+)\s+AC\s+(\S+)/ ) { - $accession{$1} = $2; - } elsif( $entry =~ /^\#=GS\s+(\S+)\s+DE\s+(.+)\s*$/ ) { - $desc{$1} .= $2; - } elsif( $entry =~ /^([A-Za-z.\-\*]+)$/ ) { - $align{$name} .= $1; - } elsif( $entry =~ /^([^\#]\S+)\s+([A-Za-z.\-\*]+)\s*/ ) { - ($name,$seq) = ($1,$2); - if( ! defined $align{$name} ) { - push @c2name, $name; - } - $align{$name} .= $seq; - } + + READLINE: + while( defined($line = $self->_readline) ) { + #skip empty lines + next if $line =~ /^\s+$/; + + # Double slash (//) signals end of file. + last if $line =~ m{^//}; + + # GF/GS lines, by convention, should be at the top of the alignment + if ($line =~ m{^\#=GF\s+(\w{2})\s+([^\n]*)$}xms) { + + # alignment annotation + ($tag, $data) = ($1, $2); + if (exists $READMAP{$tag}) { + + # reference data + if (index($tag, 'R') == 0) { + # comments come before numbering, tricky + $refct++ if ( ($tag eq 'RN' && !$seen_rc) || $tag eq 'RC'); + $seen_rc = 1 if $tag eq 'RC'; + # Don't need + next READLINE if $tag eq 'RN'; + # # of ref parameter + $annotation{ 'reference' }->[$refct]->{ $READMAP{$tag} } .= $data.' '; + + # Build commands + } elsif ($tag eq 'BM') { + $bct++; + # # build cmd parameter + $annotation{ 'build_command' }->[$bct]->{ $READMAP{$tag} } .= $data.' '; + + # Everything else (for now) + } else { + # # param/-value/tagname + $annotation{ $READMAP{$tag} } .= $data.' '; + } + + } else { + $self->debug("Unknown tag: $tag:\t$data"); + } + + } elsif( $line =~ m{^\#=GS\s+(\S+)\s+(\w{2})\s+(\S+)}xms ) { + # sequence annotation and data + ($id, $tag, $data) = ($1, $2, $3); + if ($tag eq 'AC') { + $accession{$id} .= $data; + } elsif ($tag eq 'DE') { + $desc{$id} .= $data; + } + # Does not catch other GS-based tags (DR, OS, OC, LO) yet + #else { + # $self->debug("Missed data: $entry"); + #} + } elsif( $line =~ m{^\#=GR\s+(\S+)\s+(\w+)\s+(.+)} ) { + # meta strings per sequence + ($name, $tag, $data) = ($1, $2, $3); + $seq_meta{$name}->{$tag} .= $data; + } elsif( $line =~ m{^\#=GC\s+(\S+)\s+(\S+)}xms ) { + # meta strings per alignment + ($tag, $data) = ($1, $2); + $aln_meta{$tag} .= $data; + } elsif( $line =~ m{^([^\#]\S+)\s+([A-Za-z.\-\*]+)\s*}xms ) { + ($name,$seq) = ($1,$2); + if( ! exists $align{$name} ) { + push @c2name, $name; + } + $align{$name} .= $seq; + } else { + $self->debug("Missed Data: $line"); + } } - + # ok... now we can make the sequences for my $name ( @c2name ) { - if( $name =~ m{(\S+)/(\d+)-(\d+)} ) { - $seqname = $1; - $start = $2; - $end = $3; - } else { - $seqname=$name; - $start = 1; - $end = length($align{$name}); - } - $seq = new Bio::LocatableSeq - ('-seq' => $align{$name}, - '-display_id' => $seqname, - '-start' => $start, - '-end' => $end, - '-description' => $desc{$name}, - '-accession_number' => $accession{$name}, - ); - - $aln->add_seq($seq); + if( $name =~ m{(\S+)/(\d+)-(\d+)}xms ) { + ($seqname, $start, $end) = ($1, $2, $3); + } else { + $seqname=$name; + $start = 1; + $end = length($align{$name}); + } + $seq = Bio::Seq::Meta->new + ('-seq' => $align{$name}, + '-display_id' => $seqname, + '-start' => $start, + '-end' => $end, + '-description' => $desc{$name}, + '-accession_number' => $accession{$name} + ); + if (exists $seq_meta{$name}) { + for my $tag (sort keys %{ $seq_meta{$name} }) { + $seq->named_meta($tag, $seq_meta{$name}->{$tag}); + } + } + $aln->add_seq($seq); } - -# If $end <= 0, we have either reached the end of -# file in or we have encountered some other error -# + + # Make the annotation collection... + + my $coll = Bio::Annotation::Collection->new(); + + for my $tag (sort keys %annotation) { + + # most annotations + if (!ref($annotation{$tag})) { + my ($atype, $aparam, $tagname) = split q(/), $tag; + # remove trailing newline, convert internal newlines to spaces + $annotation{$tag} =~ s{\s+$}{}g; + # split the READTYPE map to determine Annotation type, parameters, etc. + if ($atype eq 'Method') { + $aln->$aparam($annotation{$tag}); + } else { + my $factory = Bio::Annotation::AnnotationFactory->new( + -type => "Bio::Annotation::$atype"); + my $ann = $factory->create_object(-tagname => $tagname, + $aparam => $annotation{$tag}); + $coll->add_Annotation($ann); + } + } + else { + my $data; + my $atype = ($tag eq 'reference') ? 'Reference' : + ($tag eq 'build_command') ? 'SimpleValue' : + 'BadValue'; # this will cause the factory to choke + $self->throw("Bad tag value : $tag.") if $atype eq 'BadValue'; + my $factory = Bio::Annotation::AnnotationFactory->new( + -type => "Bio::Annotation::$atype"); + while(@{ $annotation{$tag} }) { + $data = shift @{ $annotation{$tag} }; + next unless $data; + # remove trailing spaces + my %clean_data = map { + $data->{$_} =~ s{\s+$}{}g; + $_ => $data->{$_}; + } keys %{ $data }; + my $ann = $factory->create_object(%clean_data); + $coll->add_Annotation($tag,$ann); + $refct++; + } + } + } + + # add annotations + $aln->annotation($coll); + + # If $end <= 0, we have either reached the end of + # file in or we have encountered some other error return if ($end <= 0); return $aln; } @@ -177,8 +409,74 @@ sub next_aln { =cut sub write_aln { - my ($self,@aln) = @_; - $self->throw_not_implemented(); + # enable array of SimpleAlign objects as well (see clustalw write_aln()) + my ($self, $aln) = @_; + $self->throw('Need Bio::Align::AlignI object') + if (!$aln || !($aln->isa('Bio::Align::AlignI'))); + + my (@anns, @ann_objs); + my $coll = $aln->annotation; + my ($aln_ann, $seq_ann, $aln_meta, $seq_meta) = + ('#=GF ', '#=GS ', '#=GC ', '#=GR' ); + $self->_print("# $STKVERSION\n\n"); + + for my $param (@WRITEORDER) { + # no point in going through this if there is no annotation! + last if !$coll; + # alignment annotations + my $ct = 1; + $self->throw("Bad parameter: $param") if !exists $WRITEMAP{$param}; + my ($tag, $key) = split q(/), $WRITEMAP{$param}; + if ($key eq 'Method') { + push @anns, $aln->$param; + } else { + @anns = $coll->get_Annotations($param); + } + my $rn = 1; + while (my $ann = shift @anns) { + # using Text::Wrap::wrap() for word wrap + # references + if ($tag eq 'RX') { + for my $rkey (qw(ref_comment ref_number ref_pubmed + ref_title ref_authors ref_location)) { + my $text; + my ($newtag, $method) = split q(/), $WRITEMAP{$rkey}; + if ($rkey eq 'ref_number') { + $text = wrap($aln_ann.$newtag.' ', $aln_ann.$newtag.' ', "[$rn]"); + } else { + $text = wrap($aln_ann.$newtag.' ', $aln_ann.$newtag.' ', $ann->$method) + if $ann->$method; + } + #$self->_print("REFERENCE\n"); + $self->_print("$text\n") if $text; + } + $rn++; + } else { + my $text = wrap($aln_ann.$tag.' ', $aln_ann.$tag.' ', $ann); + $self->_print("$text\n"); + } + } + } + + # now the sequences... + + # copied from pfam.pm for now; will modify to interleave sequences, print + # meta data, etc. + + my ($namestr,$seq,$add); + my ($maxn); + $maxn = $aln->maxdisplayname_length(); + + foreach $seq ( $aln->each_seq() ) { + $namestr = $aln->displayname($seq->get_nse()); + $add = $maxn - length($namestr) + 2; + $namestr .= " " x $add; + $self->_print (sprintf("%s %s\n",$namestr,$seq->seq())) or return; + } + $self->flush() if $self->_flush_on_write && defined $self->_fh; + + $self->_print("//\n"); + return; } -1; +1; \ No newline at end of file diff --git a/Bio/SimpleAlign.pm b/Bio/SimpleAlign.pm index df45034dd..8d8643a55 100644 --- a/Bio/SimpleAlign.pm +++ b/Bio/SimpleAlign.pm @@ -1610,6 +1610,46 @@ sub id { return $self->{'_id'}; } +=head2 accession + + Title : accession + Usage : $myalign->accession("PF00244") + Function : Gets/sets the accession field of the alignment + Returns : An acc string + Argument : An acc string (optional) + +=cut + +sub accession { + my ($self, $acc) = @_; + + if (defined( $acc )) { + $self->{'_accession'} = $acc; + } + + return $self->{'_accession'}; +} + +=head2 description + + Title : description + Usage : $myalign->description("14-3-3 proteins") + Function : Gets/sets the description field of the alignment + Returns : An description string + Argument : An description string (optional) + +=cut + +sub description { + my ($self, $name) = @_; + + if (defined( $name )) { + $self->{'_description'} = $name; + } + + return $self->{'_description'}; +} + =head2 missing_char Title : missing_char diff --git a/t/AlignIO.t b/t/AlignIO.t index 31e8b9cad..e66df0788 100644 --- a/t/AlignIO.t +++ b/t/AlignIO.t @@ -9,7 +9,7 @@ BEGIN { use lib 't/lib'; } use Test::More; - plan tests => 168; + plan tests => 193; } use_ok('Bio::SimpleAlign'); use_ok('Bio::AlignIO'); @@ -42,21 +42,54 @@ isa_ok($aln,'Bio::Align::AlignI'); is($aln->get_seq_by_pos(1)->get_nse, 'QUERY/1-798'); is($aln->no_sequences, 56); -# STOCKHOLM (multiple concatenated files, as Pfam flatfile) +# STOCKHOLM (multiple concatenated files) +# Rfam $str = new Bio::AlignIO ( - '-file' => Bio::Root::IO->catfile("t","data","testaln.stockholm"), + '-file' => Bio::Root::IO->catfile("t","data","rfam_tests.stk"), '-format' => 'stockholm'); isa_ok($str,'Bio::AlignIO'); $aln = $str->next_aln(); isa_ok($aln,'Bio::Align::AlignI'); -is($aln->get_seq_by_pos(1)->get_nse, '1433_LYCES/9-246'); +is($aln->get_seq_by_pos(1)->get_nse, 'Z11765.1/1-89'); +is($aln->accession, 'RF00006'); +is($aln->id, 'Vault'); +is($aln->description,'Vault RNA'); $aln = $str->next_aln(); isa_ok($aln,'Bio::Align::AlignI'); -is($aln->get_seq_by_pos(1)->get_nse, '1433_LYCES/9-246'); +is($aln->get_seq_by_pos(1)->get_nse, 'L43844.1/2-149'); +is($aln->accession, 'RF00007'); +is($aln->id, 'U12'); +is($aln->description,'U12 minor spliceosomal RNA'); $aln = $str->next_aln(); isa_ok($aln,'Bio::Align::AlignI'); -is($aln->get_seq_by_pos(1)->get_nse, '1433_LYCES/9-246'); +is($aln->get_seq_by_pos(1)->get_nse, 'AJ295015.1/58-1'); +is($aln->accession, 'RF00008'); +is($aln->id, 'Hammerhead_3'); +is($aln->description,'Hammerhead ribozyme (type III)'); +# Pfam +$str = new Bio::AlignIO ( + '-file' => Bio::Root::IO->catfile("t","data","pfam_tests.stk"), + '-format' => 'stockholm'); +isa_ok($str,'Bio::AlignIO'); +$aln = $str->next_aln(); +isa_ok($aln,'Bio::Align::AlignI'); +is($aln->get_seq_by_pos(1)->get_nse, 'RAD25_SCHPO/5-240'); +is($aln->accession, 'PF00244.9'); +is($aln->id, '14-3-3'); +is($aln->description,'14-3-3 protein'); +$aln = $str->next_aln(); +isa_ok($aln,'Bio::Align::AlignI'); +is($aln->get_seq_by_pos(1)->get_nse, 'COMB_CLOAB/6-235'); +is($aln->accession, 'PF04029.4'); +is($aln->id, '2-ph_phosp'); +is($aln->description,'2-phosphosulpholactate phosphatase'); +$aln = $str->next_aln(); +isa_ok($aln,'Bio::Align::AlignI'); +is($aln->get_seq_by_pos(1)->get_nse, 'Y278_HAEIN/174-219'); +is($aln->accession, 'PF03475.3'); +is($aln->id, '3-alpha'); +is($aln->description,'3-alpha domain'); # PFAM $str = Bio::AlignIO->new( -- 2.11.4.GIT