Add dna_to_aa_aln method and tests
[bioperl-live.git] / Bio / Perl.pm
blobdbbc0acd78b31ac30ef594c1bd22c0ce225b7dd6
2 # BioPerl module for Bio::Perl
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Perl - Functional access to BioPerl for people who don't know objects
18 =head1 SYNOPSIS
20 use Bio::Perl;
22 # will guess file format from extension
23 $seq_object = read_sequence($filename);
25 # forces genbank format
26 $seq_object = read_sequence($filename,'genbank');
28 # reads an array of sequences
29 @seq_object_array = read_all_sequences($filename,'fasta');
31 # sequences are Bio::Seq objects, so the following methods work
32 # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm'
34 print "Sequence name is ",$seq_object->display_id,"\n";
35 print "Sequence acc is ",$seq_object->accession_number,"\n";
36 print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
38 # get the whole sequence as a single string
40 $sequence_as_a_string = $seq_object->seq();
42 # writing sequences
44 write_sequence(">$filename",'genbank',$seq_object);
46 write_sequence(">$filename",'genbank',@seq_object_array);
48 # making a new sequence from just a string
50 $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
51 "myname","AL12232");
53 # getting a sequence from a database (assumes internet connection)
55 $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
57 $seq_object = get_sequence('embl',"AI129902");
59 $seq_object = get_sequence('genbank',"AI129902");
61 # BLAST a sequence (assummes an internet connection)
63 $blast_report = blast_sequence($seq_object);
65 write_blast(">blast.out",$blast_report);
68 =head1 DESCRIPTION
70 Easy first time access to BioPerl via functions.
72 =head1 FEEDBACK
74 =head2 Mailing Lists
76 User feedback is an integral part of the evolution of this and other
77 Bioperl modules. Send your comments and suggestions preferably to one
78 of the Bioperl mailing lists. Your participation is much appreciated.
80 bioperl-l@bioperl.org
82 =head2 Support
84 Please direct usage questions or support issues to the mailing list:
86 I<bioperl-l@bioperl.org>
88 rather than to the module maintainer directly. Many experienced and
89 reponsive experts will be able look at the problem and quickly
90 address it. Please include a thorough description of the problem
91 with code and data examples if at all possible.
93 =head2 Reporting Bugs
95 Report bugs to the Bioperl bug tracking system to help us keep track
96 the bugs and their resolution. Bug reports can be submitted via the web:
98 https://redmine.open-bio.org/projects/bioperl/
100 =head1 AUTHOR - Ewan Birney
102 Email birney@ebi.ac.uk
104 =head1 APPENDIX
106 The rest of the documentation details each of the object methods.
107 Internal methods are usually preceded with a _
109 =cut
112 # Let the code begin...
115 package Bio::Perl;
116 use vars qw(@EXPORT @EXPORT_OK $DBOKAY);
117 use strict;
118 use Carp;
120 use Bio::SeqIO;
121 use Bio::Seq;
122 use Bio::Root::Version '$VERSION';
123 BEGIN {
124 eval {
125 require Bio::DB::EMBL;
126 require Bio::DB::GenBank;
127 require Bio::DB::SwissProt;
128 require Bio::DB::RefSeq;
129 require Bio::DB::GenPept;
131 if( $@ ) {
132 $DBOKAY = 0;
133 } else {
134 $DBOKAY = 1;
138 use base qw(Exporter);
140 @EXPORT = qw(read_sequence read_all_sequences write_sequence
141 new_sequence get_sequence translate translate_as_string
142 reverse_complement revcom revcom_as_string
143 reverse_complement_as_string blast_sequence write_blast);
145 @EXPORT_OK = @EXPORT;
148 =head2 read_sequence
150 Title : read_sequence
151 Usage : $seq = read_sequence('sequences.fa')
152 $seq = read_sequence($filename,'genbank');
154 # pipes are fine
155 $seq = read_sequence("my_fetching_program $id |",'fasta');
157 Function: Reads the top sequence from the file. If no format is given, it will
158 try to guess the format from the filename. If a format is given, it
159 forces that format. The filename can be any valid perl open() string
160 - in particular, you can put in pipes
162 Returns : A Bio::Seq object. A quick synopsis:
163 $seq_object->display_id - name of the sequence
164 $seq_object->seq - sequence as a string
166 Args : Two strings, first the filename - any Perl open() string is ok
167 Second string is the format, which is optional
169 For more information on Seq objects see L<Bio::Seq>.
171 =cut
173 sub read_sequence{
174 my ($filename,$format) = @_;
176 if( !defined $filename ) {
177 confess "read_sequence($filename) - usage incorrect";
180 my $seqio;
182 if( defined $format ) {
183 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
184 } else {
185 $seqio = Bio::SeqIO->new( '-file' => $filename);
188 my $seq = $seqio->next_seq();
190 return $seq;
194 =head2 read_all_sequences
196 Title : read_all_sequences
197 Usage : @seq_object_array = read_all_sequences($filename);
198 @seq_object_array = read_all_sequences($filename,'genbank');
200 Function: Just as the function above, but reads all the sequences in the
201 file and loads them into an array.
203 For very large files, you will run out of memory. When this
204 happens, you've got to use the SeqIO system directly (this is
205 not so hard! Don't worry about it!).
207 Returns : array of Bio::Seq objects
209 Args : two strings, first the filename (any open() string is ok)
210 second the format (which is optional)
212 See L<Bio::SeqIO> and L<Bio::Seq> for more information
214 =cut
216 sub read_all_sequences{
217 my ($filename,$format) = @_;
219 if( !defined $filename ) {
220 confess "read_all_sequences($filename) - usage incorrect";
223 my $seqio;
225 if( defined $format ) {
226 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
227 } else {
228 $seqio = Bio::SeqIO->new( '-file' => $filename);
231 my @seq_array;
233 while( my $seq = $seqio->next_seq() ) {
234 push(@seq_array,$seq);
237 return @seq_array;
241 =head2 write_sequence
243 Title : write_sequence
244 Usage : write_sequence(">new_file.gb",'genbank',$seq)
245 write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects)
247 Function: writes sequences in the specified format
249 Returns : true
251 Args : filename as a string, must provide an open() output file
252 format as a string
253 one or more sequence objects
256 =cut
258 sub write_sequence{
259 my ($filename,$format,@sequence_objects) = @_;
261 if( scalar(@sequence_objects) == 0 ) {
262 confess("write_sequence(filename,format,sequence_object)");
265 my $error = 0;
266 my $seqname = "sequence1";
268 # catch users who haven't passed us a filename we can open
269 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
270 $filename = ">".$filename;
273 my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format);
275 foreach my $seq ( @sequence_objects ) {
276 my $seq_obj;
278 if( !ref $seq ) {
279 if( length $seq > 50 ) {
280 # odds are this is a sequence as a string, and someone has not figured out
281 # how to make objects. Warn him/her and then make a sequence object
282 # from this
283 if( $error == 0 ) {
284 carp("WARNING: You have put in a long string into write_sequence.\n".
285 "I suspect this means that this is the actual sequence\n".
286 "In the future try the\n".
287 " new_sequence method of this module to make a new sequence object.\n".
288 "Doing this for you here\n");
289 $error = 1;
292 $seq_obj = new_sequence($seq,$seqname);
293 $seqname++;
294 } else {
295 confess("You have a non object [$seq] passed to write_sequence. It maybe that you".
296 "want to use new_sequence to make this string into a sequence object?");
298 } else {
299 if( !$seq->isa("Bio::SeqI") ) {
300 confess("object [$seq] is not a Bio::Seq object; can't write it out");
302 $seq_obj = $seq;
305 # finally... we get to write out the sequence!
306 $seqio->write_seq($seq_obj);
311 =head2 new_sequence
313 Title : new_sequence
314 Usage : $seq_obj = new_sequence("GATTACA", "kino-enzyme");
316 Function: Construct a sequency object from sequence string
317 Returns : A Bio::Seq object
319 Args : sequence string
320 name string (optional, default "no-name-for-sequence")
321 accession - accession number (optional, no default)
323 =cut
325 sub new_sequence{
326 my ($seq,$name,$accession) = @_;
328 if( !defined $seq ) {
329 confess("new_sequence(sequence_as_string) usage");
332 $name ||= "no-name-for-sequence";
334 my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name);
336 $accession && $seq_object->accession_number($accession);
338 return $seq_object;
341 =head2 blast_sequence
343 Title : blast_sequence
344 Usage : $blast_result = blast_sequence($seq)
345 $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
347 Function: If the computer has Internet accessibility, blasts
348 the sequence using the NCBI BLAST server against nrdb.
350 It chooses the flavour of BLAST on the basis of the sequence.
352 This function uses Bio::Tools::Run::RemoteBlast, which itself
353 use Bio::SearchIO - as soon as you want to know more, check out
354 these modules
355 Returns : Bio::Search::Result::GenericResult.pm
357 Args : Either a string of protein letters or nucleotides, or a
358 Bio::Seq object
360 =cut
362 sub blast_sequence {
363 my ($seq,$verbose) = @_;
365 if( !defined $verbose ) {
366 $verbose = 1;
369 if( !ref $seq ) {
370 $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id');
371 } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
372 croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
375 require Bio::Tools::Run::RemoteBlast;
377 my $prog = ( $seq->alphabet eq 'protein' ) ? 'blastp' : 'blastn';
378 my $e_val= '1e-10';
380 my @params = ( '-prog' => $prog,
381 '-expect' => $e_val,
382 '-readmethod' => 'SearchIO' );
384 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
386 my $r = $factory->submit_blast($seq);
387 if( $verbose ) {
388 print STDERR "Submitted Blast for [".$seq->id."] ";
390 sleep 5;
392 my $result;
394 LOOP :
395 while( my @rids = $factory->each_rid) {
396 foreach my $rid ( @rids ) {
397 my $rc = $factory->retrieve_blast($rid);
398 if( !ref($rc) ) {
399 if( $rc < 0 ) {
400 $factory->remove_rid($rid);
402 if( $verbose ) {
403 print STDERR ".";
405 sleep 10;
406 } else {
407 $result = $rc->next_result();
408 $factory->remove_rid($rid);
409 last LOOP;
414 if( $verbose ) {
415 print STDERR "\n";
417 return $result;
420 =head2 write_blast
422 Title : write_blast
423 Usage : write_blast($filename,$blast_report);
425 Function: Writes a BLAST result object (or more formally
426 a SearchIO result object) out to a filename
427 in BLAST-like format
429 Returns : none
431 Args : filename as a string
432 Bio::SearchIO::Results object
434 =cut
436 sub write_blast {
437 my ($filename,$blast) = @_;
439 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
440 $filename = ">".$filename;
443 my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename);
445 $output->write_result($blast);
449 =head2 get_sequence
451 Title : get_sequence
452 Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
454 Function: If the computer has Internet access this method gets
455 the sequence from Internet accessible databases. Currently
456 this supports Swissprot ('swiss'), EMBL ('embl'), GenBank
457 ('genbank'), GenPept ('genpept'), and RefSeq ('refseq').
459 Swissprot and EMBL are more robust than GenBank fetching.
461 If the user is trying to retrieve a RefSeq entry from
462 GenBank/EMBL, the query is silently redirected.
464 Returns : A Bio::Seq object
466 Args : database type - one of swiss, embl, genbank, genpept, or
467 refseq
469 =cut
471 my $genbank_db = undef;
472 my $genpept_db = undef;
473 my $embl_db = undef;
474 my $swiss_db = undef;
475 my $refseq_db = undef;
477 sub get_sequence{
478 my ($db_type,$identifier) = @_;
479 if( ! $DBOKAY ) {
480 confess ("Your system does not have one of LWP, HTTP::Request::Common, IO::String\n".
481 "installed so the DB retrieval method is not available.\n".
482 "Full error message is:\n $!\n");
483 return;
485 $db_type = lc($db_type);
487 my $db;
489 if( $db_type =~ /genbank/ ) {
490 if( !defined $genbank_db ) {
491 $genbank_db = Bio::DB::GenBank->new();
493 $db = $genbank_db;
495 if( $db_type =~ /genpept/ ) {
496 if( !defined $genpept_db ) {
497 $genpept_db = Bio::DB::GenPept->new();
499 $db = $genpept_db;
502 if( $db_type =~ /swiss/ ) {
503 if( !defined $swiss_db ) {
504 $swiss_db = Bio::DB::SwissProt->new();
506 $db = $swiss_db;
509 if( $db_type =~ /embl/ ) {
510 if( !defined $embl_db ) {
511 $embl_db = Bio::DB::EMBL->new();
513 $db = $embl_db;
516 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and
517 $identifier =~ /^\s*N\S+_/)) {
518 if( !defined $refseq_db ) {
519 $refseq_db = Bio::DB::RefSeq->new();
521 $db = $refseq_db;
524 my $seq;
526 if( $identifier =~ /^\w+\d+$/ ) {
527 $seq = $db->get_Seq_by_acc($identifier);
528 } else {
529 $seq = $db->get_Seq_by_id($identifier);
532 return $seq;
536 =head2 translate
538 Title : translate
539 Usage : $seqobj = translate($seq_or_string_scalar)
541 Function: translates a DNA sequence object OR just a plain
542 string of DNA to amino acids
543 Returns : A Bio::Seq object
545 Args : Either a sequence object or a string of
546 just DNA sequence characters
548 =cut
550 sub translate {
551 my ($scalar) = shift;
553 my $obj;
554 if( ref $scalar ) {
555 if( !$scalar->isa("Bio::PrimarySeqI") ) {
556 confess("Expecting a sequence object not a $scalar");
557 } else {
558 $obj= $scalar;
560 } else {
561 # check this looks vaguely like DNA
562 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
563 if( $n < length($scalar) * 0.85 ) {
564 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
566 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
568 return $obj->translate();
572 =head2 translate_as_string
574 Title : translate_as_string
575 Usage : $seqstring = translate_as_string($seq_or_string_scalar)
577 Function: translates a DNA sequence object OR just a plain
578 string of DNA to amino acids
579 Returns : A string of just amino acids
581 Args : Either a sequence object or a string of
582 just DNA sequence characters
584 =cut
586 sub translate_as_string {
587 my ($scalar) = shift;
588 my $obj = Bio::Perl::translate($scalar);
589 return $obj->seq;
593 =head2 reverse_complement
595 Title : reverse_complement
596 Usage : $seqobj = reverse_complement($seq_or_string_scalar)
598 Function: reverse complements a string or sequence argument
599 producing a Bio::Seq - if you want a string, you
600 can use reverse_complement_as_string
601 Returns : A Bio::Seq object
603 Args : Either a sequence object or a string of
604 just DNA sequence characters
606 =cut
608 sub reverse_complement {
609 my ($scalar) = shift;
611 my $obj;
613 if( ref $scalar ) {
614 if( !$scalar->isa("Bio::PrimarySeqI") ) {
615 confess("Expecting a sequence object not a $scalar");
616 } else {
617 $obj= $scalar;
620 } else {
622 # check this looks vaguely like DNA
623 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
625 if( $n < length($scalar) * 0.85 ) {
626 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
629 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
632 return $obj->revcom();
635 =head2 revcom
637 Title : revcom
638 Usage : $seqobj = revcom($seq_or_string_scalar)
640 Function: reverse complements a string or sequence argument
641 producing a Bio::Seq - if you want a string, you
642 can use reverse_complement_as_string
644 This is an alias for reverse_complement
645 Returns : A Bio::Seq object
647 Args : Either a sequence object or a string of
648 just DNA sequence characters
650 =cut
652 sub revcom {
653 return &Bio::Perl::reverse_complement(@_);
657 =head2 reverse_complement_as_string
659 Title : reverse_complement_as_string
660 Usage : $string = reverse_complement_as_string($seq_or_string_scalar)
662 Function: reverse complements a string or sequence argument
663 producing a string
664 Returns : A string of DNA letters
666 Args : Either a sequence object or a string of
667 just DNA sequence characters
669 =cut
671 sub reverse_complement_as_string {
672 my ($scalar) = shift;
673 my $obj = &Bio::Perl::reverse_complement($scalar);
674 return $obj->seq;
678 =head2 revcom_as_string
680 Title : revcom_as_string
681 Usage : $string = revcom_as_string($seq_or_string_scalar)
683 Function: reverse complements a string or sequence argument
684 producing a string
685 Returns : A string of DNA letters
687 Args : Either a sequence object or a string of
688 just DNA sequence characters
690 =cut
692 sub revcom_as_string {
693 my ($scalar) = shift;
694 my $obj = &Bio::Perl::reverse_complement($scalar);
695 return $obj->seq;