changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / SeqIO / strider.pm
blob12cde46551384854bfda9254db5b44e6ee8a5844
1 # BioPerl module for Bio::SeqIO::strider
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Malcolm Cook <mec@stowers-institute.org>
7 # You may distribute this module under the same terms as perl itself
9 # _history
10 # April 7th, 2005 Malcolm Cook authored
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqIO::strider - DNA strider sequence input/output stream
18 =head1 SYNOPSIS
20 Do not use this module directly. Use it via the Bio::SeqIO class.
22 =head1 DESCRIPTION
24 This object can transform Bio::Seq objects to and from strider
25 'binary' format, as documented in the strider manual, in which the
26 first 112 bytes are a header, following by the sequence, followed by a
27 sequence description.
29 Note: it does NOT assign any sequence identifier, since they are not
30 contained in the byte stream of the file; the Strider application
31 simply displays the name of the file on disk as the name of the
32 sequence. The caller should set the id, probably based on the name of
33 the file (after possibly cleaning up whitespace, which ought not to be
34 used as the id in most applications).
36 Note: the strider 'comment' is mapped to the BioPerl 'description'
37 (since there is no other text field, and description maps to defline
38 text).
40 =head1 FEEDBACK
42 =head2 Mailing Lists
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to one
46 of the Bioperl mailing lists. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
51 =head2 Support
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
62 =head2 Reporting Bugs
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 the bugs and their resolution. Bug reports can be submitted via the
66 web:
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHORS - Malcolm Cook
72 Email: mec@stowers-institute.org
74 =head1 CONTRIBUTORS
76 Modelled after Bio::SeqIO::fasta by Ewan Birney E<lt>birney@ebi.ac.ukE<gt> and
77 Lincoln Stein E<lt>lstein@cshl.orgE<gt>
79 =head1 APPENDIX
81 The rest of the documentation details each of the object
82 methods. Internal methods are usually preceded with a _
84 =cut
86 # Let the code begin...
88 package Bio::SeqIO::strider;
89 use strict;
90 use warnings;
93 use Bio::Seq::SeqFactory;
94 use Convert::Binary::C;
96 use base qw(Bio::SeqIO);
98 my $c = Convert::Binary::C->new (
99 ByteOrder => 'BigEndian',
100 Alignment => 2
103 my $headerdef;
104 {local ($/);
105 # See this file's __DATA__ section for the c structure definitions
106 # for strider binary header data. Here we slurp it all into $headerdef.
107 $headerdef = <DATA>};
109 $c->parse($headerdef);
111 my $size_F_HEADER = 112;
113 die "expected strider header structure size of $size_F_HEADER" unless $size_F_HEADER eq $c->sizeof('F_HEADER');
115 my %alphabet2type = (
116 # map between BioPerl alphabet and strider
117 # sequence type code.
119 # From Strider Documentation: the sequence type:
120 # 1, 2, 3 and 4 for DNA, DNA Degenerate, RNA and
121 # Protein sequence files, respectively.
123 # TODO: determine 'DNA Degenerate' based on
124 # sequence alphabet?
126 dna => 1,
127 rna => 3,
128 protein => 4,
131 my %type2alphabet = reverse %alphabet2type;
133 sub _initialize {
134 my($self,@args) = @_;
135 $self->SUPER::_initialize(@args);
136 unless ( defined $self->sequence_factory ) {
137 $self->sequence_factory(Bio::Seq::SeqFactory->new(-verbose => $self->verbose(),
138 -type => 'Bio::Seq::RichSeq'));
142 =head2 next_seq
144 Title : next_seq
145 Usage : $seq = $stream->next_seq()
146 Function: returns the next sequence in the stream
147 Returns : Bio::Seq object
148 Args : NONE
150 =cut
152 sub next_seq {
153 my( $self ) = @_;
154 my $fh = $self->_fh;
155 my ($header,$sequence,$fulldesc);
156 eval {read $fh,$header,$size_F_HEADER};
157 $self->throw ("$@ while attempting to reading strider header from " . $self->{'_file'}) if $@;
158 $self->throw("required $size_F_HEADER bytes while reading strider header in " . $self->{'_file'} . " but found: " . length($header))
159 unless $size_F_HEADER == length($header);
160 my $headerdata = $c->unpack('F_HEADER',$header) or return;
161 read $fh,$sequence,$headerdata->{nLength};
162 read $fh,$fulldesc,$headerdata->{com_length};
163 $fulldesc =~ s/\cM/ /g; # gratuitous replacement of mac
164 # linefeed with space.
165 my $seq = $self->sequence_factory->create(
166 # -id => $main::ARGV, #might want to set this in caller to $ARGV.
167 -seq => $sequence,
168 -desc => $fulldesc,
169 -alphabet => $type2alphabet{$headerdata->{type}} || 'dna',
172 return $seq;
175 =head2 write_seq
177 Title : write_seq
178 Usage : $stream->write_seq(@seq)
179 Function: writes the $seq object into the stream
180 Returns : 1 for success and 0 for error
181 Args : array of 1 to n Bio::PrimarySeqI objects
184 =cut
186 sub write_seq {
187 my ($self,@seq) = @_;
188 my $fh = $self->_fh() || *STDOUT; #die "could not determine filehandle in strider.pm";
189 foreach my $seq (@seq) {
190 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
191 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
192 my $headerdata = $c->pack('F_HEADER',{
193 versionNb => 0,
194 type => $alphabet2type{$seq->alphabet} || $alphabet2type{dna},
195 topology => $seq->is_circular ? 1 : 0,
196 nLength => $seq->length,
197 nMinus => 0,
198 com_length => length($seq->desc || ""),
200 print $fh $headerdata, $seq->seq() || "" , $seq->desc || "";
206 __DATA__
208 //The following was taken from the strider 1.4 release notes Appendix (with
209 //some comments gleaned from other parts of manual)
211 struct F_HEADER
213 char versionNb; // the format version number, currently it is set to 0
214 char type; // 1=DNA, 2=DNA Degenerate, 3=RNA or 4=Protein
215 char topology; // linear or circular - 0 for a linear sequence, 1 for a circular one
216 char reserved1;
217 int reserved2;
218 int reserved3;
219 int reserved4;
220 char reserved5;
221 char filler1;
222 short filler2;
223 int filler3;
224 int reserved6;
225 int nLength; // Sequence length - the length the Sequence field (the number of char in the text, each being a base or an aa)
226 int nMinus; // nb of "negative" bases, i.e. the number of bases numbered with negative numbers
227 int reserved7;
228 int reserved8;
229 int reserved9;
230 int reserved10;
231 int reserved11;
232 char reserved12[32];
233 short reserved13;
234 short filler4;
235 char reserved14;
236 char reserved15;
237 char reserved16;
238 char filler5;
239 int com_length; // the length the Comment field (the number of char in the text).
240 int reserved17;
241 int filler6;
242 int filler7;