Merge remote branch 'upstream/master' into topic/tree_dbsqlite_memoryfix
[bioperl-live.git] / Bio / SeqIO.pm
blobfe498225d02cc301349d26f5094ae23b2fe4daf9
2 # BioPerl module for Bio::SeqIO
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # and Lincoln Stein <lstein@cshl.org>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # _history
14 # October 18, 1999 Largely rewritten by Lincoln Stein
16 # POD documentation - main docs before the code
18 =head1 NAME
20 Bio::SeqIO - Handler for SeqIO Formats
22 =head1 SYNOPSIS
24 use Bio::SeqIO;
26 $in = Bio::SeqIO->new(-file => "inputfilename" ,
27 -format => 'Fasta');
28 $out = Bio::SeqIO->new(-file => ">outputfilename" ,
29 -format => 'EMBL');
31 while ( my $seq = $in->next_seq() ) {
32 $out->write_seq($seq);
35 # Now, to actually get at the sequence object, use the standard Bio::Seq
36 # methods (look at Bio::Seq if you don't know what they are)
38 use Bio::SeqIO;
40 $in = Bio::SeqIO->new(-file => "inputfilename" ,
41 -format => 'genbank');
43 while ( my $seq = $in->next_seq() ) {
44 print "Sequence ",$seq->id, " first 10 bases ",
45 $seq->subseq(1,10), "\n";
49 # The SeqIO system does have a filehandle binding. Most people find this
50 # a little confusing, but it does mean you can write the world's
51 # smallest reformatter
53 use Bio::SeqIO;
55 $in = Bio::SeqIO->newFh(-file => "inputfilename" ,
56 -format => 'Fasta');
57 $out = Bio::SeqIO->newFh(-format => 'EMBL');
59 # World's shortest Fasta<->EMBL format converter:
60 print $out $_ while <$in>;
63 =head1 DESCRIPTION
65 Bio::SeqIO is a handler module for the formats in the SeqIO set (eg,
66 Bio::SeqIO::fasta). It is the officially sanctioned way of getting at
67 the format objects, which most people should use.
69 The Bio::SeqIO system can be thought of like biological file handles.
70 They are attached to filehandles with smart formatting rules (eg,
71 genbank format, or EMBL format, or binary trace file format) and
72 can either read or write sequence objects (Bio::Seq objects, or
73 more correctly, Bio::SeqI implementing objects, of which Bio::Seq is
74 one such object). If you want to know what to do with a Bio::Seq
75 object, read L<Bio::Seq>.
77 The idea is that you request a stream object for a particular format.
78 All the stream objects have a notion of an internal file that is read
79 from or written to. A particular SeqIO object instance is configured
80 for either input or output. A specific example of a stream object is
81 the Bio::SeqIO::fasta object.
83 Each stream object has functions
85 $stream->next_seq();
87 and
89 $stream->write_seq($seq);
91 As an added bonus, you can recover a filehandle that is tied to the
92 SeqIO object, allowing you to use the standard E<lt>E<gt> and print
93 operations to read and write sequence objects:
95 use Bio::SeqIO;
97 $stream = Bio::SeqIO->newFh(-format => 'Fasta',
98 -fh => \*ARGV);
99 # read from standard input or the input filenames
101 while ( $seq = <$stream> ) {
102 # do something with $seq
107 print $stream $seq; # when stream is in output mode
109 This makes the simplest ever reformatter
111 #!/usr/bin/perl
113 $format1 = shift;
114 $format2 = shift || die
115 "Usage: reformat format1 format2 < input > output";
117 use Bio::SeqIO;
119 $in = Bio::SeqIO->newFh(-format => $format1, -fh => \*ARGV );
120 $out = Bio::SeqIO->newFh(-format => $format2 );
121 # Note: you might want to quote -format to keep older
122 # perl's from complaining.
124 print $out $_ while <$in>;
127 =head1 CONSTRUCTORS
129 =head2 Bio::SeqIO-E<gt>new()
131 $seqIO = Bio::SeqIO->new(-file => 'filename', -format=>$format);
132 $seqIO = Bio::SeqIO->new(-fh => \*FILEHANDLE, -format=>$format);
133 $seqIO = Bio::SeqIO->new(-format => $format);
135 The new() class method constructs a new Bio::SeqIO object. The
136 returned object can be used to retrieve or print Seq objects. new()
137 accepts the following parameters:
139 =over 5
141 =item -file
143 A file path to be opened for reading or writing. The usual Perl
144 conventions apply:
146 'file' # open file for reading
147 '>file' # open file for writing
148 '>>file' # open file for appending
149 '+<file' # open file read/write
150 'command |' # open a pipe from the command
151 '| command' # open a pipe to the command
153 =item -fh
155 You may provide new() with a previously-opened filehandle. For
156 example, to read from STDIN:
158 $seqIO = Bio::SeqIO->new(-fh => \*STDIN);
160 Note that you must pass filehandles as references to globs.
162 If neither a filehandle nor a filename is specified, then the module
163 will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt>
164 semantics.
166 A string filehandle is handy if you want to modify the output in the
167 memory, before printing it out. The following program reads in EMBL
168 formatted entries from a file and prints them out in fasta format with
169 some HTML tags:
171 use Bio::SeqIO;
172 use IO::String;
173 my $in = Bio::SeqIO->new(-file => "emblfile",
174 -format => 'EMBL');
175 while ( my $seq = $in->next_seq() ) {
176 # the output handle is reset for every file
177 my $stringio = IO::String->new($string);
178 my $out = Bio::SeqIO->new(-fh => $stringio,
179 -format => 'fasta');
180 # output goes into $string
181 $out->write_seq($seq);
182 # modify $string
183 $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g;
184 # print into STDOUT
185 print $string;
188 =item -format
190 Specify the format of the file. Supported formats include fasta,
191 genbank, embl, swiss (SwissProt), Entrez Gene and tracefile formats
192 such as abi (ABI) and scf. There are many more, for a complete listing
193 see the SeqIO HOWTO (L<http://bioperl.open-bio.org/wiki/HOWTO:SeqIO>).
195 If no format is specified and a filename is given then the module will
196 attempt to deduce the format from the filename suffix. If there is no
197 suffix that Bioperl understands then it will attempt to guess the
198 format based on file content. If this is unsuccessful then SeqIO will
199 throw a fatal error.
201 The format name is case-insensitive: 'FASTA', 'Fasta' and 'fasta' are
202 all valid.
204 Currently, the tracefile formats (except for SCF) require installation
205 of the external Staden "io_lib" package, as well as the
206 Bio::SeqIO::staden::read package available from the bioperl-ext
207 repository.
209 =item -alphabet
211 Sets the alphabet ('dna', 'rna', or 'protein'). When the alphabet is
212 set then Bioperl will not attempt to guess what the alphabet is. This
213 may be important because Bioperl does not always guess correctly.
215 =item -flush
217 By default, all files (or filehandles) opened for writing sequences
218 will be flushed after each write_seq() (making the file immediately
219 usable). If you do not need this facility and would like to marginally
220 improve the efficiency of writing multiple sequences to the same file
221 (or filehandle), pass the -flush option '0' or any other value that
222 evaluates as defined but false:
224 my $gb = Bio::SeqIO->new(-file => "<gball.gbk",
225 -format => "gb");
226 my $fa = Bio::SeqIO->new(-file => ">gball.fa",
227 -format => "fasta",
228 -flush => 0); # go as fast as we can!
229 while($seq = $gb->next_seq) { $fa->write_seq($seq) }
232 =back
234 =head2 Bio::SeqIO-E<gt>newFh()
236 $fh = Bio::SeqIO->newFh(-fh => \*FILEHANDLE, -format=>$format);
237 $fh = Bio::SeqIO->newFh(-format => $format);
238 # etc.
240 This constructor behaves like new(), but returns a tied filehandle
241 rather than a Bio::SeqIO object. You can read sequences from this
242 object using the familiar E<lt>E<gt> operator, and write to it using
243 print(). The usual array and $_ semantics work. For example, you can
244 read all sequence objects into an array like this:
246 @sequences = <$fh>;
248 Other operations, such as read(), sysread(), write(), close(), and
249 printf() are not supported.
251 =head1 OBJECT METHODS
253 See below for more detailed summaries. The main methods are:
255 =head2 $sequence = $seqIO-E<gt>next_seq()
257 Fetch the next sequence from the stream, or nothing if no more.
259 =head2 $seqIO-E<gt>write_seq($sequence [,$another_sequence,...])
261 Write the specified sequence(s) to the stream.
263 =head2 TIEHANDLE(), READLINE(), PRINT()
265 These provide the tie interface. See L<perltie> for more details.
267 =head1 FEEDBACK
269 =head2 Mailing Lists
271 User feedback is an integral part of the evolution of this and other
272 Bioperl modules. Send your comments and suggestions preferably to one
273 of the Bioperl mailing lists.
275 Your participation is much appreciated.
277 bioperl-l@bioperl.org - General discussion
278 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
280 =head2 Support
282 Please direct usage questions or support issues to the mailing list:
284 bioperl-l@bioperl.org
286 rather than to the module maintainer directly. Many experienced and
287 responsive experts will be able look at the problem and quickly
288 address it. Please include a thorough description of the problem
289 with code and data examples if at all possible.
291 =head2 Reporting Bugs
293 Report bugs to the Bioperl bug tracking system to help us keep track
294 the bugs and their resolution. Bug reports can be submitted via the
295 web:
297 http://bugzilla.open-bio.org/
299 =head1 AUTHOR - Ewan Birney, Lincoln Stein
301 Email birney@ebi.ac.uk
302 lstein@cshl.org
304 =head1 APPENDIX
306 The rest of the documentation details each of the object
307 methods. Internal methods are usually preceded with a _
309 =cut
311 #' Let the code begin...
313 package Bio::SeqIO;
315 use strict;
317 use Bio::Factory::FTLocationFactory;
318 use Bio::Seq::SeqBuilder;
319 use Bio::Tools::GuessSeqFormat;
320 use Symbol;
322 use base qw(Bio::Root::Root Bio::Root::IO Bio::Factory::SequenceStreamI);
324 my %valid_alphabet_cache;
326 =head2 new
328 Title : new
329 Usage : $stream = Bio::SeqIO->new(-file => $filename,
330 -format => 'Format')
331 Function: Returns a new sequence stream
332 Returns : A Bio::SeqIO stream initialised with the appropriate format
333 Args : Named parameters:
334 -file => $filename
335 -fh => filehandle to attach to
336 -format => format
338 Additional arguments may be used to set factories and
339 builders involved in the sequence object creation. None of
340 these must be provided, they all have reasonable defaults.
341 -seqfactory the Bio::Factory::SequenceFactoryI object
342 -locfactory the Bio::Factory::LocationFactoryI object
343 -objbuilder the Bio::Factory::ObjectBuilderI object
345 See L<Bio::SeqIO::Handler>
347 =cut
349 my $entry = 0;
351 sub new {
352 my ($caller,@args) = @_;
353 my $class = ref($caller) || $caller;
355 # or do we want to call SUPER on an object if $caller is an
356 # object?
357 if( $class =~ /Bio::SeqIO::(\S+)/ ) {
358 my ($self) = $class->SUPER::new(@args);
359 $self->_initialize(@args);
360 return $self;
361 } else {
363 my %param = @args;
364 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
366 if (!defined($param{-file}) && !defined($param{-fh})) {
367 $class->throw("file argument provided, but with an undefined value") if exists($param{'-file'});
368 $class->throw("fh argument provided, but with an undefined value") if (exists($param{'-fh'}));
371 my $format = $param{'-format'} ||
372 $class->_guess_format( $param{-file} || $ARGV[0] );
374 if( ! $format ) {
375 if ($param{-file}) {
376 $format = Bio::Tools::GuessSeqFormat->new(-file => $param{-file}||$ARGV[0] )->guess;
377 } elsif ($param{-fh}) {
378 $format = Bio::Tools::GuessSeqFormat->new(-fh => $param{-fh}||$ARGV[0] )->guess;
381 $class->throw(sprintf("Unknown format given or could not determine it [%s]",$format || ''))
382 unless $format;
383 $format = "\L$format"; # normalize capitalization to lower case
385 if ($format =~ /-/) {
386 ($format, my $variant) = split('-', $format, 2);
387 push @args, (-variant => $variant);
389 return unless( $class->_load_format_module($format) );
390 return "Bio::SeqIO::$format"->new(@args);
394 =head2 newFh
396 Title : newFh
397 Usage : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format')
398 Function: does a new() followed by an fh()
399 Example : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format')
400 $sequence = <$fh>; # read a sequence object
401 print $fh $sequence; # write a sequence object
402 Returns : filehandle tied to the Bio::SeqIO::Fh class
403 Args :
405 See L<Bio::SeqIO::Fh>
407 =cut
409 sub newFh {
410 my $class = shift;
411 return unless my $self = $class->new(@_);
412 return $self->fh;
415 =head2 fh
417 Title : fh
418 Usage : $obj->fh
419 Function:
420 Example : $fh = $obj->fh; # make a tied filehandle
421 $sequence = <$fh>; # read a sequence object
422 print $fh $sequence; # write a sequence object
423 Returns : filehandle tied to Bio::SeqIO class
424 Args : none
426 =cut
429 sub fh {
430 my $self = shift;
431 my $class = ref($self) || $self;
432 my $s = Symbol::gensym;
433 tie $$s,$class,$self;
434 return $s;
437 # _initialize is chained for all SeqIO classes
439 sub _initialize {
440 my($self, @args) = @_;
442 # flush is initialized by the Root::IO init
444 my ($seqfact,$locfact,$objbuilder, $alphabet) =
445 $self->_rearrange([qw(SEQFACTORY
446 LOCFACTORY
447 OBJBUILDER
448 ALPHABET)
449 ], @args);
451 $locfact = Bio::Factory::FTLocationFactory->new(-verbose => $self->verbose)
452 if ! $locfact;
453 $objbuilder = Bio::Seq::SeqBuilder->new(-verbose => $self->verbose)
454 unless $objbuilder;
455 $self->sequence_builder($objbuilder);
456 $self->location_factory($locfact);
458 # note that this should come last because it propagates the sequence
459 # factory to the sequence builder
460 $seqfact && $self->sequence_factory($seqfact);
462 #bug 2160
463 $alphabet && $self->alphabet($alphabet);
466 # initialize the IO part
467 $self->_initialize_io(@args);
470 =head2 next_seq
472 Title : next_seq
473 Usage : $seq = stream->next_seq
474 Function: Reads the next sequence object from the stream and returns it.
476 Certain driver modules may encounter entries in the stream
477 that are either misformatted or that use syntax not yet
478 understood by the driver. If such an incident is
479 recoverable, e.g., by dismissing a feature of a feature
480 table or some other non-mandatory part of an entry, the
481 driver will issue a warning. In the case of a
482 non-recoverable situation an exception will be thrown. Do
483 not assume that you can resume parsing the same stream
484 after catching the exception. Note that you can always turn
485 recoverable errors into exceptions by calling
486 $stream->verbose(2).
488 Returns : a Bio::Seq sequence object, or nothing if no more sequences
489 are available
491 Args : none
493 See L<Bio::Root::RootI>, L<Bio::Factory::SeqStreamI>, L<Bio::Seq>
495 =cut
497 sub next_seq {
498 my ($self, $seq) = @_;
499 $self->throw("Sorry, you cannot read from a generic Bio::SeqIO object.");
502 =head2 write_seq
504 Title : write_seq
505 Usage : $stream->write_seq($seq)
506 Function: writes the $seq object into the stream
507 Returns : 1 for success and 0 for error
508 Args : Bio::Seq object
510 =cut
512 sub write_seq {
513 my ($self, $seq) = @_;
514 $self->throw("Sorry, you cannot write to a generic Bio::SeqIO object.");
518 =head2 alphabet
520 Title : alphabet
521 Usage : $self->alphabet($newval)
522 Function: Set/get the molecule type for the Seq objects to be created.
523 Example : $seqio->alphabet('protein')
524 Returns : value of alphabet: 'dna', 'rna', or 'protein'
525 Args : newvalue (optional)
526 Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein'
528 =cut
530 sub alphabet {
531 my ($self, $value) = @_;
533 if ( defined $value) {
534 $value = lc $value;
535 unless ($valid_alphabet_cache{$value}) {
536 # instead of hard-coding the allowed values once more, we check by
537 # creating a dummy sequence object
538 eval {
539 require Bio::PrimarySeq;
540 my $seq = Bio::PrimarySeq->new('-verbose' => $self->verbose,
541 '-alphabet' => $value);
543 if ($@) {
544 $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values.");
546 $valid_alphabet_cache{$value} = 1;
548 $self->{'alphabet'} = $value;
550 return $self->{'alphabet'};
553 =head2 _load_format_module
555 Title : _load_format_module
556 Usage : *INTERNAL SeqIO stuff*
557 Function: Loads up (like use) a module at run time on demand
558 Example :
559 Returns :
560 Args :
562 =cut
564 sub _load_format_module {
565 my ($self, $format) = @_;
566 my $module = "Bio::SeqIO::" . $format;
567 my $ok;
569 eval {
570 $ok = $self->_load_module($module);
572 if ( $@ ) {
573 print STDERR <<END;
574 $self: $format cannot be found
575 Exception $@
576 For more information about the SeqIO system please see the SeqIO docs.
577 This includes ways of checking for formats at compile time, not run time
581 return $ok;
584 =head2 _concatenate_lines
586 Title : _concatenate_lines
587 Usage : $s = _concatenate_lines($line, $continuation_line)
588 Function: Private. Concatenates two strings assuming that the second stems
589 from a continuation line of the first. Adds a space between both
590 unless the first ends with a dash.
592 Takes care of either arg being empty.
593 Example :
594 Returns : A string.
595 Args :
597 =cut
599 sub _concatenate_lines {
600 my ($self, $s1, $s2) = @_;
602 $s1 .= " " if($s1 && ($s1 !~ /-$/) && $s2);
603 return ($s1 ? $s1 : "") . ($s2 ? $s2 : "");
606 =head2 _filehandle
608 Title : _filehandle
609 Usage : $obj->_filehandle($newval)
610 Function: This method is deprecated. Call _fh() instead.
611 Example :
612 Returns : value of _filehandle
613 Args : newvalue (optional)
615 =cut
617 sub _filehandle {
618 my ($self,@args) = @_;
619 return $self->_fh(@args);
622 =head2 _guess_format
624 Title : _guess_format
625 Usage : $obj->_guess_format($filename)
626 Function: guess format based on file suffix
627 Example :
628 Returns : guessed format of filename (lower case)
629 Args :
630 Notes : formats that _filehandle() will guess include fasta,
631 genbank, scf, pir, embl, raw, gcg, ace, bsml, swissprot,
632 fastq and phd/phred
634 =cut
636 sub _guess_format {
637 my $class = shift;
638 return unless $_ = shift;
639 return 'abi' if /\.ab[i1]$/i;
640 return 'ace' if /\.ace$/i;
641 return 'alf' if /\.alf$/i;
642 return 'bsml' if /\.(bsm|bsml)$/i;
643 return 'ctf' if /\.ctf$/i;
644 return 'embl' if /\.(embl|ebl|emb|dat)$/i;
645 return 'entrezgene' if /\.asn$/i;
646 return 'exp' if /\.exp$/i;
647 return 'fasta' if /\.(fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa)$/i;
648 return 'fastq' if /\.fastq$/i;
649 return 'gcg' if /\.gcg$/i;
650 return 'genbank' if /\.(gb|gbank|genbank|gbk|gbs)$/i;
651 return 'phd' if /\.(phd|phred)$/i;
652 return 'pir' if /\.pir$/i;
653 return 'pln' if /\.pln$/i;
654 return 'raw' if /\.(txt)$/i;
655 return 'scf' if /\.scf$/i;
656 return 'swiss' if /\.(swiss|sp)$/i;
658 # from Strider 1.4 Release Notes: The file name extensions used by
659 # Strider 1.4 are ".xdna", ".xdgn", ".xrna" and ".xprt" for DNA,
660 # DNA Degenerate, RNA and Protein Sequence Files, respectively
661 return 'strider' if /\.(xdna|xdgn|xrna|xprt)$/i;
663 return 'ztr' if /\.ztr$/i;
666 sub DESTROY {
667 my $self = shift;
668 $self->close();
671 sub TIEHANDLE {
672 my ($class,$val) = @_;
673 return bless {'seqio' => $val}, $class;
676 sub READLINE {
677 my $self = shift;
678 return $self->{'seqio'}->next_seq() unless wantarray;
679 my (@list, $obj);
680 push @list, $obj while $obj = $self->{'seqio'}->next_seq();
681 return @list;
684 sub PRINT {
685 my $self = shift;
686 $self->{'seqio'}->write_seq(@_);
689 =head2 sequence_factory
691 Title : sequence_factory
692 Usage : $seqio->sequence_factory($seqfactory)
693 Function: Get/Set the Bio::Factory::SequenceFactoryI
694 Returns : Bio::Factory::SequenceFactoryI
695 Args : [optional] Bio::Factory::SequenceFactoryI
697 =cut
699 sub sequence_factory{
700 my ($self,$obj) = @_;
701 if( defined $obj ) {
702 if( ! ref($obj) || ! $obj->isa('Bio::Factory::SequenceFactoryI') ) {
703 $self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)."::sequence_factory()");
705 $self->{'_seqio_seqfactory'} = $obj;
706 my $builder = $self->sequence_builder();
707 if($builder && $builder->can('sequence_factory') &&
708 (! $builder->sequence_factory())) {
709 $builder->sequence_factory($obj);
712 $self->{'_seqio_seqfactory'};
715 =head2 object_factory
717 Title : object_factory
718 Usage : $obj->object_factory($newval)
719 Function: This is an alias to sequence_factory with a more generic name.
720 Example :
721 Returns : value of object_factory (a scalar)
722 Args : on set, new value (a scalar or undef, optional)
724 =cut
726 sub object_factory{
727 return shift->sequence_factory(@_);
730 =head2 sequence_builder
732 Title : sequence_builder
733 Usage : $seqio->sequence_builder($seqfactory)
734 Function: Get/Set the Bio::Factory::ObjectBuilderI used to build sequence
735 objects.
737 If you do not set the sequence object builder yourself, it
738 will in fact be an instance of L<Bio::Seq::SeqBuilder>, and
739 you may use all methods documented there to configure it.
741 Returns : a Bio::Factory::ObjectBuilderI compliant object
742 Args : [optional] a Bio::Factory::ObjectBuilderI compliant object
744 =cut
746 sub sequence_builder{
747 my ($self,$obj) = @_;
748 if( defined $obj ) {
749 if( ! ref($obj) || ! $obj->isa('Bio::Factory::ObjectBuilderI') ) {
750 $self->throw("Must provide a valid Bio::Factory::ObjectBuilderI object to ".ref($self)."::sequence_builder()");
752 $self->{'_object_builder'} = $obj;
754 $self->{'_object_builder'};
757 =head2 location_factory
759 Title : location_factory
760 Usage : $seqio->location_factory($locfactory)
761 Function: Get/Set the Bio::Factory::LocationFactoryI object to be used for
762 location string parsing
763 Returns : a Bio::Factory::LocationFactoryI implementing object
764 Args : [optional] on set, a Bio::Factory::LocationFactoryI implementing
765 object.
767 =cut
769 sub location_factory{
770 my ($self,$obj) = @_;
771 if( defined $obj ) {
772 if( ! ref($obj) || ! $obj->isa('Bio::Factory::LocationFactoryI') ) {
773 $self->throw("Must provide a valid Bio::Factory::LocationFactoryI" .
774 " object to ".ref($self)."->location_factory()");
776 $self->{'_seqio_locfactory'} = $obj;
778 $self->{'_seqio_locfactory'};