Bio/SeqIO/strider.pm

   1 # BioPerl module for Bio::SeqIO::strider
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Malcolm Cook <mec@stowers-institute.org>
   6 #
   7 # You may distribute this module under the same terms as perl itself
   8 #
   9 # _history
  10 # April 7th, 2005  Malcolm Cook authored
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::strider - DNA strider sequence input/output stream
  17
  18 =head1 SYNOPSIS
  19
  20 Do not use this module directly.  Use it via the Bio::SeqIO class.
  21
  22 =head1 DESCRIPTION
  23
  24 This object can transform Bio::Seq objects to and from strider
  25 'binary' format, as documented in the strider manual, in which the
  26 first 112 bytes are a header, following by the sequence, followed by a
  27 sequence description.
  28
  29 Note: it does NOT assign any sequence identifier, since they are not
  30 contained in the byte stream of the file; the Strider application
  31 simply displays the name of the file on disk as the name of the
  32 sequence. The caller should set the id, probably based on the name of
  33 the file (after possibly cleaning up whitespace, which ought not to be
  34 used as the id in most applications).
  35
  36 Note: the strider 'comment' is mapped to the BioPerl 'description'
  37 (since there is no other text field, and description maps to defline
  38 text).
  39
  40 =head1 FEEDBACK
  41
  42 =head2 Mailing Lists
  43
  44 User feedback is an integral part of the evolution of this and other
  45 Bioperl modules. Send your comments and suggestions preferably to one
  46 of the Bioperl mailing lists.  Your participation is much appreciated.
  47
  48   bioperl-l@bioperl.org                  - General discussion
  49   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  50
  51 =head2 Support
  52
  53 Please direct usage questions or support issues to the mailing list:
  54
  55 I<bioperl-l@bioperl.org>
  56
  57 rather than to the module maintainer directly. Many experienced and
  58 reponsive experts will be able look at the problem and quickly
  59 address it. Please include a thorough description of the problem
  60 with code and data examples if at all possible.
  61
  62 =head2 Reporting Bugs
  63
  64 Report bugs to the Bioperl bug tracking system to help us keep track
  65 the bugs and their resolution.  Bug reports can be submitted via the
  66 web:
  67
  68   https://github.com/bioperl/bioperl-live/issues
  69
  70 =head1 AUTHORS - Malcolm Cook
  71
  72 Email: mec@stowers-institute.org
  73
  74 =head1 CONTRIBUTORS
  75
  76 Modelled after Bio::SeqIO::fasta by Ewan Birney E<lt>birney@ebi.ac.ukE<gt> and
  77 Lincoln Stein E<lt>lstein@cshl.orgE<gt>
  78
  79 =head1 APPENDIX
  80
  81 The rest of the documentation details each of the object
  82 methods. Internal methods are usually preceded with a _
  83
  84 =cut
  85
  86 # Let the code begin...
  87
  88 package Bio::SeqIO::strider;
  89 use strict;
  90 use warnings;
  91
  92
  93 use Bio::Seq::SeqFactory;
  94 use Convert::Binary::C;
  95
  96 use base qw(Bio::SeqIO);
  97
  98 my $c = Convert::Binary::C->new (
  99                                 ByteOrder => 'BigEndian',
 100                                 Alignment => 2
 101                                );
 102
 103 my $headerdef;
 104 {local ($/);
 105  # See this file's __DATA__ section for the c structure definitions
 106  # for strider binary header data.  Here we slurp it all into $headerdef.
 107  $headerdef = <DATA>};
 108
 109 $c->parse($headerdef);
 110
 111 my $size_F_HEADER = 112;
 112
 113 die "expected strider header structure size of $size_F_HEADER" unless $size_F_HEADER eq $c->sizeof('F_HEADER');
 114
 115 my %alphabet2type = (
 116                      # map between BioPerl alphabet and strider
 117                      # sequence type code.
 118
 119                      # From Strider Documentation: the sequence type:
 120                      # 1, 2, 3 and 4 for DNA, DNA Degenerate, RNA and
 121                      # Protein sequence files, respectively.
 122
 123                      # TODO: determine 'DNA Degenerate' based on
 124                      # sequence alphabet?
 125
 126                      dna => 1,
 127                      rna => 3,
 128                      protein => 4,
 129                     );
 130
 131 my %type2alphabet = reverse %alphabet2type;
 132
 133 sub _initialize {
 134   my($self,@args) = @_;
 135   $self->SUPER::_initialize(@args);
 136   unless ( defined $self->sequence_factory ) {
 137     $self->sequence_factory(Bio::Seq::SeqFactory->new(-verbose => $self->verbose(),
 138                                                       -type => 'Bio::Seq::RichSeq'));
 139   }
 140 }
 141
 142 =head2 next_seq
 143
 144  Title   : next_seq
 145  Usage   : $seq = $stream->next_seq()
 146  Function: returns the next sequence in the stream
 147  Returns : Bio::Seq object
 148  Args    : NONE
 149
 150 =cut
 151
 152 sub next_seq {
 153   my( $self ) = @_;
 154   my $fh =  $self->_fh;
 155   my ($header,$sequence,$fulldesc);
 156   eval {read $fh,$header,$size_F_HEADER};
 157   $self->throw ("$@  while attempting to reading strider header from " . $self->{'_file'}) if $@;
 158   $self->throw("required $size_F_HEADER bytes while reading strider header in " . $self->{'_file'} . " but found: " . length($header))
 159     unless $size_F_HEADER == length($header);
 160   my $headerdata = $c->unpack('F_HEADER',$header) or return;
 161   read $fh,$sequence,$headerdata->{nLength};
 162   read $fh,$fulldesc,$headerdata->{com_length};
 163   $fulldesc =~ s/\cM/ /g;       # gratuitous replacement of mac
 164                                 # linefeed with space.
 165   my $seq = $self->sequence_factory->create(
 166                                             # -id          => $main::ARGV, #might want to set this in caller to $ARGV.
 167                                             -seq         => $sequence,
 168                                             -desc        => $fulldesc,
 169                                             -alphabet    => $type2alphabet{$headerdata->{type}} || 'dna',
 170                                            );
 171
 172   return $seq;
 173 }
 174
 175 =head2 write_seq
 176
 177  Title   : write_seq
 178  Usage   : $stream->write_seq(@seq)
 179  Function: writes the $seq object into the stream
 180  Returns : 1 for success and 0 for error
 181  Args    : array of 1 to n Bio::PrimarySeqI objects
 182
 183
 184 =cut
 185
 186 sub write_seq {
 187   my ($self,@seq) = @_;
 188   my $fh =  $self->_fh() || *STDOUT; #die "could not determine filehandle in strider.pm";
 189   foreach my $seq (@seq) {
 190     $self->throw("Did not provide a valid Bio::PrimarySeqI object")
 191       unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
 192     my  $headerdata = $c->pack('F_HEADER',{
 193                                            versionNb   => 0,
 194                                            type        => $alphabet2type{$seq->alphabet} || $alphabet2type{dna},
 195                                            topology    => $seq->is_circular ? 1 : 0,
 196                                            nLength     => $seq->length,
 197                                            nMinus      => 0,
 198                                            com_length  => length($seq->desc || ""),
 199                                           });
 200     print $fh $headerdata, $seq->seq() || "" , $seq->desc || "";
 201   }
 202 }
 203
 204 1;
 205
 206 __DATA__
 207
 208 //The following was taken from the strider 1.4 release notes Appendix (with
 209 //some comments gleaned from other parts of manual)
 210
 211 struct F_HEADER
 212 {
 213 char versionNb;  // the format version number, currently it is set to 0
 214 char type;       // 1=DNA, 2=DNA Degenerate, 3=RNA or 4=Protein
 215 char topology;   // linear or circular - 0 for a linear sequence, 1 for a circular one
 216 char reserved1;
 217 int reserved2;
 218 int reserved3;
 219 int reserved4;
 220 char reserved5;
 221 char filler1;
 222 short filler2;
 223 int filler3;
 224 int reserved6;
 225 int nLength; // Sequence length -  the length the Sequence field (the number of char in the text, each being a base or an aa)
 226 int nMinus; // nb of "negative" bases, i.e. the number of bases numbered with negative numbers
 227 int reserved7;
 228 int reserved8;
 229 int reserved9;
 230 int reserved10;
 231 int reserved11;
 232 char reserved12[32];
 233 short reserved13;
 234 short filler4;
 235 char reserved14;
 236 char reserved15;
 237 char reserved16;
 238 char filler5;
 239 int com_length; //  the length the Comment field (the number of char in the text).
 240 int reserved17;
 241 int filler6;
 242 int filler7;
 243 };