2 # BioPerl module for Bio::AlignIO::stockholm
4 # Based on the Bio::SeqIO::stockholm module
5 # by Ewan Birney <birney@ebi.ac.uk>
6 # and Lincoln Stein <lstein@cshl.org>
8 # and the SimpleAlign.pm module of Ewan Birney
10 # Copyright Peter Schattner, Chris Fields
12 # You may distribute this module under the same terms as perl itself
15 # November 6, 2006 - completely refactor read_aln(), add write_aln()
16 # POD documentation - main docs before the code
20 Bio::AlignIO::stockholm - stockholm sequence input/output stream
24 # Do not use this module directly. Use it via the L<Bio::AlignIO> class.
29 my $in = Bio::AlignIO->new(-format => 'stockholm',
30 -file => 't/data/testaln.stockholm');
31 while( my $aln = $in->next_aln ) {
37 This object can transform L<Bio::Align::AlignI> objects to and from
38 stockholm flat file databases. This has been completely refactored
39 from the original stockholm parser to handle annotation data and now
40 includes a write_aln() method for (almost) complete stockholm
43 Stockholm alignment records normally contain additional sequence-based
44 and alignment-based annotation
46 GF Lines (alignment feature/annotation):
47 #=GF <featurename> <Generic per-file annotation, free text>
48 Placed above the alignment
50 GC Lines (Alignment consensus)
51 #=GC <featurename> <Generic per-column annotation, exactly 1
53 Placed below the alignment
55 GS Lines (Sequence annotations)
56 #=GS <seqname> <featurename> <Generic per-sequence annotation, free
59 GR Lines (Sequence meta data)
60 #=GR <seqname> <featurename> <Generic per-sequence AND per-column
61 mark up, exactly 1 character per column>
63 Currently, sequence annotations (those designated with GS tags) are
64 parsed only for accession numbers and descriptions. It is intended that
65 full parsing will be added at some point in the near future along with
66 a builder option for optionally parsing alignment annotation and meta data.
68 The following methods/tags are currently used for storing and writing
69 the alignment annotation data.
73 ----------------------------------------------------------------------
77 ----------------------------------------------------------------------
79 Tag Bio::Annotation TagName Parameters
81 ----------------------------------------------------------------------
82 AU SimpleValue record_authors value
83 SE SimpleValue seed_source value
84 GA SimpleValue gathering_threshold value
85 NC SimpleValue noise_cutoff value
86 TC SimpleValue trusted_cutoff value
87 TP SimpleValue entry_type value
88 SQ SimpleValue num_sequences value
89 PI SimpleValue previous_ids value
90 DC Comment database_comment comment
91 CC Comment alignment_comment comment
92 DR Target dblink database
95 AM SimpleValue build_method value
96 NE SimpleValue pfam_family_accession value
97 NL SimpleValue sequence_start_stop value
98 SS SimpleValue sec_structure_source value
99 BM SimpleValue build_model value
100 RN Reference reference *
101 RC Reference reference comment
102 RM Reference reference pubmed
103 RT Reference reference title
104 RA Reference reference authors
105 RL Reference reference location
106 ----------------------------------------------------------------------
107 * RN is generated based on the number of Bio::Annotation::Reference objects
109 =head2 Custom annotation
111 Some users may want to add custom annotation beyond those mapped above.
112 Currently there are two methods to do so; however, the methods used for adding
113 such annotation may change in the future, particularly if alignment Writer
114 classes are introduced. In particular, do not rely on changing the global
115 variables @WRITEORDER or %WRITEMAP as these may be made private at some point.
117 1) Use (and abuse) the 'custom' tag. The tagname for the object can differ
118 from the tagname used to store the object in the AnnotationCollection.
120 # AnnotationCollection from the SimpleAlign object
121 my $coll = $aln->annotation;
122 my $factory = Bio::Annotation::AnnotationFactory->new(-type =>
123 Bio::Annotation::SimpleValue');
124 my $rfann = $factory->create_object(-value => $str,
125 -tagname => 'mytag');
126 $coll->add_Annotation('custom', $rfann);
127 $rfann = $factory->create_object(-value => 'foo',
129 $coll->add_Annotation('custom', $rfann);
136 #=GF mytag katnayygqelggvnhdyddlakfyfgaglealdffnnkeaaakiinwvaEDTTRGKIQDLV??
137 #=GF mytag TPtd~????LDPETQALLV???????????????????????NAIYFKGRWE?????????~??
138 #=GF mytag ??HEF?A?EMDTKPY??DFQH?TNen?????GRI??????V???KVAM??MF?????????N??
139 #=GF mytag ???DD?VFGYAEL????DE???????L??D??????A??TALELAY??????????????????
140 #=GF mytag ?????????????KG??????Sa???TSMLILLP???????????????D??????????????
141 #=GF mytag ???????????EGTr?????AGLGKLLQ??QL????????SREef??DLNK??L???AH????R
142 #=GF mytag ????????????L????????????????????????????????????????R?????????R
143 #=GF mytag ??QQ???????V???????AVRLPKFSFefefdlkeplknlgmhqafdpnsdvfklmdqavlvi
144 #=GF mytag gdlqhayafkvd????????????????????????????????????????????????????
145 #=GF mytag ????????????????????????????????????????????????????????????????
146 #=GF mytag ????????????????????????????????????????????????????????????????
147 #=GF mytag ????????????????????????????????????????????????????????????????
148 #=GF mytag ?????????????INVDEAG?TEAAAATAAKFVPLSLppkt??????????????????PIEFV
149 #=GF mytag ADRPFAFAIR??????E?PAT?G????SILFIGHVEDPTP?msv?
153 2) Modify the global @WRITEORDER and %WRITEMAP.
155 # AnnotationCollection from the SimpleAlign object
156 my $coll = $aln->annotation;
159 my @order = @Bio::AlignIO::stockholm::WRITEORDER;
160 push @order, 'my_stuff';
161 @Bio::AlignIO::stockholm::WRITEORDER = @order;
163 # make sure new tag maps to something
164 $Bio::AlignIO::stockholm::WRITEMAP{my_stuff} = 'Hobbit/SimpleValue';
166 my $rfann = $factory->create_object(-value => 'Frodo',
167 -tagname => 'Hobbit');
168 $coll->add_Annotation('my_stuff', $rfann);
169 $rfann = $factory->create_object(-value => 'Bilbo',
170 -tagname => 'Hobbit');
171 $coll->add_Annotation('my_stuff', $rfann);
186 Please direct usage questions or support issues to the mailing list:
188 I<bioperl-l@bioperl.org>
190 rather than to the module maintainer directly. Many experienced and
191 reponsive experts will be able look at the problem and quickly
192 address it. Please include a thorough description of the problem
193 with code and data examples if at all possible.
195 =head2 Reporting Bugs
197 Report bugs to the Bioperl bug tracking system to help us keep track
198 the bugs and their resolution. Bug reports can be submitted via the
201 https://github.com/bioperl/bioperl-live/issues
203 =head1 AUTHORS - Chris Fields, Peter Schattner
205 Email: cjfields-at-uiuc-dot-edu, schattner@alum.mit.edu
209 Andreas Kahari, ak-at-ebi.ac.uk
210 Jason Stajich, jason-at-bioperl.org
214 The rest of the documentation details each of the object
215 methods. Internal methods are usually preceded with a _
219 # Let the code begin...
221 package Bio
::AlignIO
::stockholm
;
225 use Bio
::AlignIO
::Handler
::GenericAlignHandler
;
226 use Text
::Wrap
qw(wrap);
228 use base
qw(Bio::AlignIO);
230 my $STKVERSION = 'STOCKHOLM 1.0';
232 # This maps the two-letter annotation key to a Annotation/parameter/tagname
233 # combination. Some data is stored using get/set methods ('Methods') The rest
234 # is mapped to Annotation objects using the parameter for the parsed data
235 # and the tagname for, well, the Annotation tagname. A few are treated differently
236 # based on the type of data stored (Reference data in particular).
241 'DE' => ['DESCRIPTION' => 'DESCRIPTION'],
242 'AU' => ['RECORD_AUTHORS' => 'RECORD_AUTHORS'],
243 'SE' => 'SEED_SOURCE',
244 'BM' => 'BUILD_COMMAND',
245 'GA' => 'GATHERING_THRESHOLD',
246 'NC' => 'NOISE_CUTOFF',
247 'TC' => 'TRUSTED_CUTOFF',
248 'TP' => 'ENTRY_TYPE',
249 'SQ' => 'NUM_SEQUENCES',
250 'PI' => 'PREVIOUS_IDS',
251 'DC' => ['DATABASE_COMMENT' => 'DATABASE_COMMENT'],
253 'RN' => ['REFERENCE' => 'REFERENCE'],
254 'RC' => ['REFERENCE' => 'COMMENT'],
255 'RM' => ['REFERENCE' => 'PUBMED'],
256 'RT' => ['REFERENCE' => 'TITLE'],
257 'RA' => ['REFERENCE' => 'AUTHORS'],
258 'RL' => ['REFERENCE' => 'JOURNAL'],
259 'CC' => ['ALIGNMENT_COMMENT' => 'ALIGNMENT_COMMENT'],
261 'AM' => 'BUILD_METHOD',
262 'NE' => 'PFAM_FAMILY_ACCESSION',
263 'NL' => 'SEQ_START_STOP',
264 # Rfam-specific GF lines
265 #'SS' => 'SEC_STRUCTURE_SOURCE',
266 'SEQUENCE' => 'SEQUENCE'
269 # this is the order that annotations are written
270 our @WRITEORDER = qw(accession
283 pfam_family_accession
294 # This maps the tagname back to a tagname-annotation value combination.
295 # Some data is stored using get/set methods ('Methods'), others
296 # are mapped b/c of more complex annotation types.
299 'accession' => 'AC/Method',
301 'description' => 'DE/Method',
302 'record_authors' => 'AU/SimpleValue',
303 'seed_source' => 'SE/SimpleValue',
304 'build_command' => 'BM/SimpleValue',
305 'gathering_threshold' => 'GA/SimpleValue',
306 'noise_cutoff' => 'NC/SimpleValue',
307 'trusted_cutoff' => 'TC/SimpleValue',
308 'entry_type' => 'TP/SimpleValue',
309 'num_sequences' => 'SQ/SimpleValue',
310 'previous_ids' => 'PI/SimpleValue',
311 'database_comment' => 'DC/SimpleValue',
312 'dblink' => 'DR/DBLink',
313 'reference' => 'RX/Reference',
314 'ref_number' => 'RN/number',
315 'ref_comment' => 'RC/comment',
316 'ref_pubmed' => 'RM/pubmed',
317 'ref_title' => 'RT/title',
318 'ref_authors' => 'RA/authors',
319 'ref_location' => 'RL/location',
320 'alignment_comment' => 'CC/Comment',
321 'seq_annotation' => 'DR/Collection',
323 'build_method' => 'AM/SimpleValue',
324 'pfam_family_accession' => 'NE/SimpleValue',
325 'seq_start_stop' => 'NL/SimpleValue',
326 # Rfam-specific GF lines
327 'sec_structure_source' => 'SS/SimpleValue',
328 # custom; this is used to carry over anything from the input alignment
329 # not mapped to LocatableSeqs or SimpleAlign in a meaningful way
330 'custom' => 'XX/SimpleValue'
333 # This maps the tagname back to a tagname-annotation value combination.
334 # Some data is stored using get/set methods ('Methods'), others
335 # are mapped b/c of more complex annotation types.
340 Usage : my $alignio = Bio::AlignIO->new(-format => 'stockholm'
342 Function: Initialize a new L<Bio::AlignIO::stockholm> reader or writer
343 Returns : L<Bio::AlignIO> object
344 Args : -line_length : length of the line for the alignment block
345 -alphabet : symbol alphabet to set the sequences to. If not set,
346 the parser will try to guess based on the alignment
347 accession (if present), defaulting to 'dna'.
348 -spaces : (optional, def = 1) boolean to add a space in between
349 the "# STOCKHOLM 1.0" header and the annotation and
350 the annotation and the alignment.
355 my ( $self, @args ) = @_;
356 $self->SUPER::_initialize
(@args);
357 my ($handler, $linelength, $spaces) = $self->_rearrange([qw(HANDLER LINE_LENGTH SPACES)],@args);
358 $spaces = defined $spaces ?
$spaces : 1;
359 $self->spaces($spaces);
360 # hash for functions for decoding keys.
361 $handler ?
$self->alignhandler($handler) :
362 $self->alignhandler(Bio
::AlignIO
::Handler
::GenericAlignHandler
->new(
363 -format
=> 'stockholm',
364 -verbose
=> $self->verbose,
366 $linelength && $self->line_length($linelength);
372 Usage : $aln = $stream->next_aln()
373 Function: returns the next alignment in the stream.
374 Returns : L<Bio::Align::AlignI> object
382 my $handler = $self->alignhandler;
383 # advance to alignment header
384 while( defined(my $line = $self->_readline) ) {
385 if ($line =~ m{^\#\s*STOCKHOLM\s+}xmso) {
390 $self->{block_line
} = 0;
391 # go into main body of alignment
392 my ($data_chunk, $isa_primary, $name, $alphabet);
394 while( defined(my $line = $self->_readline) ) {
395 # only blank lines are in between blocks, so reset block line
396 my ($primary_tag, $secondary_tag, $data, $nse, $feat, $align, $concat);
397 if ($line =~ m{^\s*$}xmso) {
398 $self->{block_line
} &&= 0;
403 if (index($line, '//') == 0) {
405 $handler->data_handler($data_chunk);
407 $handler->data_handler({ALIGNMENT
=> 1,
409 DATA
=> $self->alphabet})
413 elsif ($line =~ m{^\#=([A-Z]{2})\s
+([^\n]+?
)\s
*$}xmso
) {
414 ($primary_tag, $data) = ($1, $2);
415 if ($primary_tag eq 'GS' || $primary_tag eq 'GR') {
416 ($nse, $feat, $data) = split(/\s+/, $data, 3);
418 ($feat, $data) = split(/\s+/, $data, 2);
420 $align = ($primary_tag eq 'GF' || $primary_tag eq 'GR') ?
1 : 0;
422 elsif ($line =~ m{^(\S+)\s+([^\s]+)\s*}) {
423 $self->{block_line
}++;
424 ($feat, $nse, $data) = ('SEQUENCE', $1, $2);
427 $self->debug("Missed line : $line\n");
429 $primary_tag ||= ''; # when no #= line is present
432 # array refs where the two values are equal indicate the start of a
433 # primary chunk of data, otherwise it is to be folded into the last
434 # data chunk under a secondary tag. These are also concatenated
435 # to previous values if the
437 if (exists($MAPPING{$feat}) && ref $MAPPING{$feat} eq 'ARRAY') {
438 ($name, $secondary_tag, $isa_primary) = ( $MAPPING{$feat}->[0] eq $MAPPING{$feat}->[1] ) ?
439 ($MAPPING{$feat}->[0], 'DATA', 1) :
440 (@
{ $MAPPING{$feat} }, 0) ;
441 $concat = $last_feat eq $feat ?
1 : 0;
442 } elsif (exists($MAPPING{$feat})) {
443 ($name, $secondary_tag, $isa_primary) = ($MAPPING{$feat}, 'DATA', 1);
444 # catch alphabet here if possible
445 if ($align && $name eq 'ACCESSION' && !$self->alphabet) {
446 if ($data =~ m{^(P|R)F}) {
447 $self->alphabet($1 eq 'R' ?
'rna' : $1 eq 'P' ?
'protein' : undef );
451 $name = ($primary_tag eq 'GR') ?
'NAMED_META' :
452 ($primary_tag eq 'GC') ?
'CONSENSUS_META' :
454 ($secondary_tag, $isa_primary) = ('DATA', 1);
457 # Since we can't determine whether data should be passed into the
458 # Handler until the next round (due to concatenation and combining
459 # data), we always check for the presence of the last chunk when the
460 # occasion calls for it (i.e. when the current data string needs to go
461 # into a new data chunk). If the data needs to be concatenated it is
462 # flagged above and checked below (and passed by if the conditions
465 # We run into a bit of a fencepost problem, (one chunk left over at
466 # the end); that is taken care of above when the end of the record is
469 if ($isa_primary && defined $data_chunk && !$concat) {
470 $handler->data_handler($data_chunk);
473 $data_chunk->{NAME
} = $name; # used for the handler
474 $data_chunk->{ALIGNMENT
} = $align; # flag that determines chunk destination
475 $data_chunk->{$secondary_tag} .= (defined($data_chunk->{$secondary_tag})) ?
477 $data_chunk->{NSE
} = $nse if $nse;
478 if ($name eq 'SEQUENCE' || $name eq 'NAMED_META' || $name eq 'CONSENSUS_META') {
479 $data_chunk->{BLOCK_LINE
} = $self->{block_line
};
480 $data_chunk->{META_TAG
} = $feat if ($name ne 'SEQUENCE');
485 my $aln = $handler->build_alignment;
486 $handler->reset_parameters;
493 Usage : $stream->write_aln(@aln)
494 Function: writes the $aln object into the stream in stockholm format
495 Returns : 1 for success and 0 for error
496 Args : L<Bio::Align::AlignI> object
502 'PDB' => sub {join('; ',($_[0]->database,
503 $_[0]->primary_id.' '.
504 ($_[0]->optional_id || ''),
507 'SCOP' => sub {join('; ',($_[0]->database,
508 $_[0]->primary_id || '',
509 $_[0]->optional_id)).';'},
510 '_DEFAULT_' => sub {join('; ',($_[0]->database,
511 $_[0]->primary_id)).';'},
515 # enable array of SimpleAlign objects as well (see clustalw write_aln())
516 my ($self, @aln) = @_;
518 $self->throw('Need Bio::Align::AlignI object')
519 if (!$aln || !($aln->isa('Bio::Align::AlignI')));
521 my $coll = $aln->annotation;
522 my ($aln_ann, $seq_ann) =
524 $self->_print("# $STKVERSION\n") || return 0;
525 $self->spaces && $self->_print("\n");
529 for my $param (@WRITEORDER) {
531 # no point in going through this if there is no annotation!
533 # alignment annotations
535 $self->throw("Bad parameter: $param") if !exists $WRITEMAP{$param};
536 # get the data, act on it based on the tag
537 my ($tag, $key) = split q
(/), $WRITEMAP{$param};
538 if ($key eq 'Method') {
539 push @anns, $aln->$param;
541 @anns = $coll->get_Annotations($param);
545 for my $ann (@anns) {
546 # using Text::Wrap::wrap() for word wrap
547 my ($text, $alntag, $data);
550 for my $rkey (qw(ref_comment ref_number ref_pubmed
551 ref_title ref_authors ref_location)) {
552 my ($newtag, $method) = split q
(/), $WRITEMAP{$rkey};
553 $alntag = sprintf('%-10s',$aln_ann.$newtag);
554 if ($rkey eq 'ref_number') {
557 $data = $ann->$method;
559 next REFS
unless $data;
560 $text = wrap
($alntag, $alntag, $data);
561 $self->_print("$text\n") or return 0;
566 elsif ($tag eq 'XX') { # custom
567 my $newtag = $ann->tagname;
568 my $tmp = $aln_ann.$newtag;
569 $alntag = sprintf('%-*s',length($tmp) + 1, $tmp);
570 $data = $ann->display_text;
572 elsif ($tag eq 'SQ') {
573 # use the actual number, not the stored Annotation data
574 my $tmp = $aln_ann.$tag;
575 $alntag = sprintf('%-*s',length($tmp) + 1, $tmp);
576 $data = $aln->num_sequences;
578 elsif ($tag eq 'DR') {
579 my $tmp = $aln_ann.$tag;
580 $alntag = sprintf('%-*s',length($tmp) + 1, $tmp);
581 my $db = uc $ann->database;
582 my $cb = exists $LINK_CB{$db} ?
$LINK_CB{$db} : $LINK_CB{_DEFAULT_
};
583 $data = $ann->display_text($cb);
586 my $tmp = $aln_ann.$tag;
587 $alntag = sprintf('%-*s',length($tmp) + 1, $tmp);
588 $data = ref $ann ?
$ann->display_text : $ann;
591 $text = wrap
($alntag, $alntag, $data);
592 $self->_print("$text\n") || return 0;
596 #=GS <seq-id> AC xxxxxx
598 for my $seq ($aln->each_seq) {
599 if (my $acc = $seq->accession_number) {
600 my $text = sprintf("%-4s%-22s%-3s%s\n",$seq_ann,
601 $aln->displayname($seq->get_nse), $tag, $acc);
602 $self->_print($text) || return 0;
606 #=GS <seq-id> DR xxxxxx
608 for my $sf ($aln->get_SeqFeatures) {
609 if (my @links = $sf->annotation->get_Annotations('dblink')) {
610 for my $link (@links) {
611 my $db = uc $link->database;
612 my $cb = exists $LINK_CB{$db} ?
$LINK_CB{$db} : $LINK_CB{_DEFAULT_
};
613 my $text = sprintf("%-4s%-22s%-3s%s\n",$seq_ann,
614 $aln->displayname($sf->entire_seq->get_nse),
616 $link->display_text($cb));
617 $self->_print($text) || return 0;
622 $self->spaces && $self->_print("\n");
623 # now the sequences...
625 my $blocklen = $self->line_length;
626 my $maxlen = $aln->maxdisplayname_length() + 3;
627 my $metalen = $aln->max_metaname_length() || 0;
630 my $alnlen = $aln->length;
631 while ($blockstart < $alnlen) {
632 my $subaln = $aln->slice($blockstart, $blockstart+$blocklen-1 ,1);
633 $self->_print_seqs($subaln,$maxlen,$metalen);
634 $blockstart += $blocklen;
635 $self->_print("\n") unless $blockstart >= $alnlen;
638 $self->_print_seqs($aln,$maxlen,$metalen);
641 $self->_print("//\n") || return 0;
643 $self->flush() if $self->_flush_on_write && defined $self->_fh;
653 Usage : $obj->line_length($newval)
654 Function: Set the alignment output line length
655 Returns : value of line_length
656 Args : newvalue (optional)
661 my ( $self, $value ) = @_;
662 if ( defined $value ) {
663 $self->{'_line_length'} = $value;
665 return $self->{'_line_length'};
671 Usage : $obj->spaces(1)
672 Function: Set the 'spaces' flag, which prints extra newlines between the
673 header and the annotation and the annotation and the alignment
674 Returns : sequence data type
675 Args : newvalue (optional)
681 return $self->{'_spaces'} = shift if @_;
682 return $self->{'_spaces'};
688 Usage : $stream->alignhandler($handler)
689 Function: Get/Set the Bio::HandlerBaseI object
690 Returns : Bio::HandlerBaseI
691 Args : Bio::HandlerBaseI
696 my ($self, $handler) = @_;
698 $self->throw("Not a Bio::HandlerBaseI") unless
699 ref($handler) && $handler->isa("Bio::HandlerBaseI");
700 $self->{'_alignhandler'} = $handler;
702 return $self->{'_alignhandler'};
705 ############# PRIVATE INIT/HANDLER METHODS #############
708 my ($self, $aln, $maxlen, $metalen) = @_;
710 my ($seq_meta, $aln_meta) = ('#=GR','#=GC');
711 # modified (significantly) from AlignIO::pfam
713 my ($namestr,$seq,$add);
715 # pad extra for meta lines
717 for $seq ( $aln->each_seq() ) {
718 my ($s, $e, $str) = ($seq->start, $seq->end, $seq->strand);
719 $namestr = $aln->displayname($seq->get_nse());
720 $self->_print(sprintf("%-*s%s\n",$maxlen+$metalen,
722 $seq->seq())) || return 0;
723 if ($seq->isa('Bio::Seq::MetaI')) {
724 for my $mname ($seq->meta_names) {
725 $self->_print(sprintf("%-*s%s\n",$maxlen+$metalen,
726 $seq_meta.' '.$namestr.' '.$mname,
727 $seq->named_meta($mname))) || return 0;
731 # alignment consensus
732 my $ameta = $aln->consensus_meta;
734 for my $mname ($ameta->meta_names) {
735 $self->_print(sprintf("%-*s%s\n",$maxlen+$metalen,
736 $aln_meta.' '.$mname,
737 $ameta->named_meta($mname))) || return 0;