lib/Bio/SeqIO/fasta.pm

   1 # BioPerl module for Bio::SeqIO::fasta
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   6 #          and Lincoln Stein <lstein@cshl.org>
   7 #
   8 # Copyright Ewan Birney & Lincoln Stein
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11 # _history
  12 # October 18, 1999  Largely rewritten by Lincoln Stein
  13
  14 # POD documentation - main docs before the code
  15
  16 =head1 NAME
  17
  18 Bio::SeqIO::fasta - fasta sequence input/output stream
  19
  20 =head1 SYNOPSIS
  21
  22 Do not use this module directly.  Use it via the Bio::SeqIO class.
  23
  24 =head1 DESCRIPTION
  25
  26 This object can transform Bio::Seq objects to and from fasta flat
  27 file databases.
  28
  29 =head1 FEEDBACK
  30
  31 =head2 Mailing Lists
  32
  33 User feedback is an integral part of the evolution of this and other
  34 Bioperl modules. Send your comments and suggestions preferably to one
  35 of the Bioperl mailing lists.  Your participation is much appreciated.
  36
  37   bioperl-l@bioperl.org                  - General discussion
  38   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  39
  40 =head2 Support
  41
  42 Please direct usage questions or support issues to the mailing list:
  43
  44 I<bioperl-l@bioperl.org>
  45
  46 rather than to the module maintainer directly. Many experienced and
  47 reponsive experts will be able look at the problem and quickly
  48 address it. Please include a thorough description of the problem
  49 with code and data examples if at all possible.
  50
  51 =head2 Reporting Bugs
  52
  53 Report bugs to the Bioperl bug tracking system to help us keep track
  54 the bugs and their resolution.  Bug reports can be submitted via the
  55 web:
  56
  57   https://github.com/bioperl/bioperl-live/issues
  58
  59 =head1 AUTHORS - Ewan Birney & Lincoln Stein
  60
  61 Email: birney@ebi.ac.uk
  62        lstein@cshl.org
  63
  64 =head1 CONTRIBUTORS
  65
  66 Jason Stajich, jason-at-bioperl.org
  67
  68 =head1 APPENDIX
  69
  70 The rest of the documentation details each of the object
  71 methods. Internal methods are usually preceded with a _
  72
  73 =cut
  74
  75 # Let the code begin...
  76
  77 package Bio::SeqIO::fasta;
  78 use strict;
  79 use warnings;
  80
  81 use Bio::Seq::SeqFastaSpeedFactory;
  82
  83 use parent qw(Bio::SeqIO);
  84
  85 sub _initialize {
  86     my ($self, @args) = @_;
  87     $self->SUPER::_initialize(@args);
  88
  89     ## Initialize fasta specific parameters
  90     ## There are some problems with _rearrange. If there's no value for one of
  91     ## the parameters, it will return an empty value (not undef). This means we
  92     ## can't just merge two hashes since the empty values would override the
  93     ## defaults anyway.
  94     my (%defs) = (
  95                   "width"             => 60,
  96                   "block"             => "", # default is same as width
  97                   "preferred_id_type" => "display",
  98                   );
  99     foreach my $param (keys %defs) {
 100         $self->$param( $self->_rearrange([$param], @args) ||
 101                        $defs{$param});
 102     }
 103
 104     unless ( defined $self->sequence_factory ) {
 105         $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new());
 106     }
 107 }
 108
 109 =head2 next_seq
 110
 111  Title   : next_seq
 112  Usage   : $seq = $stream->next_seq()
 113  Function: returns the next sequence in the stream
 114  Returns : Bio::Seq object, or nothing if no more available
 115  Args    : NONE
 116
 117 =cut
 118
 119 sub next_seq {
 120     my( $self ) = @_;
 121     my $seq;
 122     my $alphabet;
 123     local $/ = "\n>";
 124     return unless my $entry = $self->_readline;
 125
 126     # Replacing chomp for s///, since chomp is not working in some cases
 127     $entry =~ s/\n$//;
 128     $entry =~ s/\r$//;
 129     if ($entry =~ m/\A\s*\Z/s)  { # very first one
 130         return unless $entry = $self->_readline;
 131         chomp($entry);
 132     }
 133
 134     # this just checks the initial input; beyond that, due to setting $/ above,
 135     # the > is part of the record separator and is removed
 136     $self->throw("The sequence does not appear to be FASTA format ".
 137         "(lacks a descriptor line '>')") if $. == 1 && $entry !~ /^>/;
 138
 139     $entry =~ s/^>//;
 140
 141     my ($top,$sequence) = split(/\n/,$entry,2);
 142     defined $sequence && $sequence =~ s/>//g;
 143     #my ($top,$sequence) = $entry =~ /^>?(.+?)\n+([^>]*)/s
 144     #    or $self->throw("Can't parse fasta entry");
 145
 146     my ($id,$fulldesc);
 147     if( $top =~ /^\s*(\S+)\s*(.*)/ ) {
 148         ($id,$fulldesc) = ($1,$2);
 149     }
 150
 151     if (defined $id && $id eq '') {$id=$fulldesc;} # FIX incase no space
 152                                                    # between > and name \AE
 153     defined $sequence && $sequence =~ tr/ \t\n\r//d; # Remove whitespace
 154
 155     # for empty sequences we need to know the mol.type
 156     $alphabet = $self->alphabet();
 157     if(defined $sequence && length($sequence) == 0) {
 158         if(! defined($alphabet)) {
 159             # let's default to dna
 160             $alphabet = "dna";
 161         }
 162     }# else {
 163         # we don't need it really, so disable
 164         # we want to keep this if SeqIO alphabet was set by user
 165         # not sure if this could break something
 166         #$alphabet = undef;
 167     #}
 168
 169     $seq = $self->sequence_factory->create(
 170         -seq         => $sequence,
 171         -id          => $id,
 172         # Ewan's note - I don't think this healthy
 173         # but obviously to taste.
 174         #-primary_id  => $id,
 175         -desc        => $fulldesc,
 176         -alphabet    => $alphabet,
 177         -direct      => 1,
 178     );
 179
 180     # if there wasn't one before, set the guessed type
 181     #unless ( defined $alphabet ) {
 182         # don't assume that all our seqs are the same as the first one found
 183         #$self->alphabet($seq->alphabet());
 184     #}
 185     return $seq;
 186
 187 }
 188
 189 =head2 write_seq
 190
 191  Title   : write_seq
 192  Usage   : $stream->write_seq(@seq)
 193  Function: Writes the $seq object into the stream
 194  Returns : 1 for success and 0 for error
 195  Args    : Array of 1 or more Bio::PrimarySeqI objects
 196
 197 =cut
 198
 199 sub write_seq {
 200     my ($self,@seq) = @_;
 201     my $width = $self->width;
 202     my $block = $self->block;
 203
 204     ## take a reference for single string (the sequence) and add the whitespace
 205     local *format_str = sub {
 206         my $str = $_[0];
 207         my @lines = unpack ("(A$width)*", $$str);
 208         if ($block >= $width) {
 209             $$str = join ("\n", @lines)."\n";
 210         } else {
 211             $$str = "";
 212             $$str .= join (" ", unpack ("(A$block)*", $_)) . "\n" foreach (@lines);
 213         }
 214     };
 215
 216     foreach my $seq (@seq) {
 217         $self->throw("Did not provide a valid Bio::PrimarySeqI object")
 218           unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
 219
 220         # Allow for different ids
 221         my $top;
 222         my $id_type = $self->preferred_id_type;
 223         if( $id_type =~ /^acc/i ) {
 224             $top = $seq->accession_number();
 225             if( $id_type =~ /vers/i ) {
 226                 $top .= "." . $seq->version();
 227             }
 228         } elsif($id_type =~ /^displ/i ) {
 229             $self->warn("No whitespace allowed in FASTA ID [". $seq->display_id. "]")
 230               if defined $seq->display_id && $seq->display_id =~ /\s/;
 231             $top = $seq->display_id();
 232             $top = '' unless defined $top;
 233             $self->warn("No whitespace allowed in FASTA ID [". $top. "]")
 234               if defined $top && $top =~ /\s/;
 235         } elsif($id_type =~ /^pri/i ) {
 236             $top = $seq->primary_id();
 237         }
 238
 239         if ($seq->can('desc') and my $desc = $seq->desc()) {
 240             $desc =~ s/\n//g;
 241             $top .= " $desc";
 242         }
 243
 244         if( $seq->isa('Bio::Seq::LargeSeqI') ) {
 245             $self->_print(">$top\n");
 246             # for large seqs, don't call seq(), it defeats the
 247             # purpose of the largeseq functionality.  instead get
 248             # chunks of the seq, $width at a time
 249             my $buff_max = 2000;
 250             my $buff_size = int($buff_max/$width)*$width; #< buffer is even multiple of widths
 251             my $seq_length = $seq->length;
 252             my $num_chunks = int($seq_length/$buff_size+1);
 253             for( my $c = 0; $c < $num_chunks; $c++ ) {
 254                 my $buff_end = $buff_size*($c+1);
 255                 $buff_end = $seq_length if $buff_end > $seq_length;
 256                 my $buff = $seq->subseq($buff_size*$c+1,$buff_end);
 257                 if($buff) {
 258                     format_str (\$buff);
 259                     $self->_print($buff);
 260                 } else {
 261                     $self->_print("\n");
 262                 }
 263             }
 264         } else {
 265             my $str = $seq->seq;
 266             if(defined $str && length($str) > 0) {
 267                 format_str (\$str);
 268             } else {
 269                 $str = "\n";
 270             }
 271             $self->_print (">",$top,"\n",$str) or return;
 272         }
 273     }
 274
 275     $self->flush if $self->_flush_on_write && defined $self->_fh;
 276     return 1;
 277 }
 278
 279 =head2 width
 280
 281  Title   : width
 282  Usage   : $obj->width($newval)
 283  Function: Get/Set the line width for FASTA output (not counting whitespace).
 284  Returns : value of width
 285  Args    : newvalue (optional)
 286
 287 =cut
 288
 289 sub width {
 290     my ($self,$value) = @_;
 291     if (defined $value) {
 292         $self->{'width'} = $value;
 293     }
 294     return $self->{'width'};
 295 }
 296
 297 =head2 block
 298
 299  Title   : block
 300  Usage   : $obj->block($newval)
 301  Function: Get/Set the length of each block for FASTA output. Sequence blocks
 302            will be split with a space. Configuring block, to a value of 10 for
 303            example, allows one to easily identify a position in a sequence by eye.
 304  Default : same value used for width.
 305  Returns : value of block
 306  Args    : newvalue (optional)
 307
 308 =cut
 309
 310 sub block {
 311     my ($self,$value) = @_;
 312     if (defined $value) {
 313         $self->{'block'} = $value;
 314     }
 315     return $self->{'block'} || $self->width;
 316 }
 317
 318 =head2 preferred_id_type
 319
 320  Title   : preferred_id_type
 321  Usage   : $obj->preferred_id_type('accession')
 322  Function: Get/Set the preferred type of identifier to use in the ">ID" position
 323            for FASTA output.
 324  Returns : string, one of values defined in @Bio::SeqIO::fasta::SEQ_ID_TYPES.
 325  Default : display
 326  Args    : string when setting. This must be one of values defined in
 327            @Bio::SeqIO::fasta::SEQ_ID_TYPES. Allowable values:
 328            accession, accession.version, display, primary
 329  Throws  : fatal exception if the supplied id type is not in @SEQ_ID_TYPES.
 330
 331 =cut
 332
 333 our @SEQ_ID_TYPES = qw(accession accession.version display primary);
 334
 335 sub preferred_id_type {
 336     my ($self,$type) = @_;
 337     if (defined $type) {
 338         if( ! grep lc($type) eq $_, @SEQ_ID_TYPES) {
 339             $self->throw(-class=>'Bio::Root::BadParameter',
 340                          -text=>"Invalid ID type \"$type\". Must be one of: @SEQ_ID_TYPES");
 341         }
 342         $self->{'_seq_id_type'} = lc($type);
 343     }
 344     $self->{'_seq_id_type'};
 345 }
 346
 347 1;