a few more updates
[bioperl-live.git] / Bio / Perl.pm
blob60fa1cbd62418e3d43e7b86cb99ce427d7f63be9
2 # BioPerl module for Bio::Perl
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Perl - Functional access to BioPerl for people who don't know objects
18 =head1 SYNOPSIS
20 use Bio::Perl;
22 # will guess file format from extension
23 $seq_object = read_sequence($filename);
25 # forces genbank format
26 $seq_object = read_sequence($filename,'genbank');
28 # reads an array of sequences
29 @seq_object_array = read_all_sequences($filename,'fasta');
31 # sequences are Bio::Seq objects, so the following methods work
32 # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm'
34 print "Sequence name is ",$seq_object->display_id,"\n";
35 print "Sequence acc is ",$seq_object->accession_number,"\n";
36 print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
38 # get the whole sequence as a single string
40 $sequence_as_a_string = $seq_object->seq();
42 # writing sequences
44 write_sequence(">$filename",'genbank',$seq_object);
46 write_sequence(">$filename",'genbank',@seq_object_array);
48 # making a new sequence from just a string
50 $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
51 "myname","AL12232");
53 # getting a sequence from a database (assumes internet connection)
55 $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
57 $seq_object = get_sequence('embl',"AI129902");
59 $seq_object = get_sequence('genbank',"AI129902");
61 # BLAST a sequence (assummes an internet connection)
63 $blast_report = blast_sequence($seq_object);
65 write_blast(">blast.out",$blast_report);
68 =head1 DESCRIPTION
70 Easy first time access to BioPerl via functions.
72 =head1 FEEDBACK
74 =head2 Mailing Lists
76 User feedback is an integral part of the evolution of this and other
77 Bioperl modules. Send your comments and suggestions preferably to one
78 of the Bioperl mailing lists. Your participation is much appreciated.
80 bioperl-l@bioperl.org
82 =head2 Support
84 Please direct usage questions or support issues to the mailing list:
86 I<bioperl-l@bioperl.org>
88 rather than to the module maintainer directly. Many experienced and
89 reponsive experts will be able look at the problem and quickly
90 address it. Please include a thorough description of the problem
91 with code and data examples if at all possible.
93 =head2 Reporting Bugs
95 Report bugs to the Bioperl bug tracking system to help us keep track
96 the bugs and their resolution. Bug reports can be submitted via the web:
98 https://redmine.open-bio.org/projects/bioperl/
100 =head1 AUTHOR - Ewan Birney
102 Email birney@ebi.ac.uk
104 =head1 APPENDIX
106 The rest of the documentation details each of the object methods.
107 Internal methods are usually preceded with a _
109 =cut
112 # Let the code begin...
115 package Bio::Perl;
116 use vars qw(@EXPORT @EXPORT_OK $DBOKAY);
117 use strict;
118 use Carp;
120 use Bio::SeqIO;
121 use Bio::Seq;
122 use Bio::Root::Version '$VERSION';
123 BEGIN {
124 eval {
125 require Bio::DB::EMBL;
126 require Bio::DB::GenBank;
127 require Bio::DB::SwissProt;
128 require Bio::DB::RefSeq;
129 require Bio::DB::GenPept;
131 if( $@ ) {
132 $DBOKAY = 0;
133 } else {
134 $DBOKAY = 1;
138 use base qw(Exporter);
140 @EXPORT = qw(read_sequence read_all_sequences write_sequence
141 new_sequence get_sequence translate translate_as_string
142 reverse_complement revcom revcom_as_string
143 reverse_complement_as_string blast_sequence write_blast);
145 @EXPORT_OK = @EXPORT;
148 =head2 read_sequence
150 Title : read_sequence
151 Usage : $seq = read_sequence('sequences.fa')
152 $seq = read_sequence($filename,'genbank');
154 # pipes are fine
155 $seq = read_sequence("my_fetching_program $id |",'fasta');
157 Function: Reads the top sequence from the file. If no format is given, it will
158 try to guess the format from the filename. If a format is given, it
159 forces that format. The filename can be any valid perl open() string
160 - in particular, you can put in pipes
162 Returns : A Bio::Seq object. A quick synopsis:
163 $seq_object->display_id - name of the sequence
164 $seq_object->seq - sequence as a string
166 Args : Two strings, first the filename - any Perl open() string is ok
167 Second string is the format, which is optional
169 For more information on Seq objects see L<Bio::Seq>.
171 =cut
173 sub read_sequence{
174 my ($filename,$format) = @_;
176 if( !defined $filename ) {
177 confess "read_sequence($filename) - usage incorrect";
180 my $seqio;
182 if( defined $format ) {
183 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
184 } else {
185 $seqio = Bio::SeqIO->new( '-file' => $filename);
188 my $seq = $seqio->next_seq();
190 return $seq;
194 =head2 read_all_sequences
196 Title : read_all_sequences
197 Usage : @seq_object_array = read_all_sequences($filename);
198 @seq_object_array = read_all_sequences($filename,'genbank');
200 Function: Just as the function above, but reads all the sequences in the
201 file and loads them into an array.
203 For very large files, you will run out of memory. When this
204 happens, you've got to use the SeqIO system directly (this is
205 not so hard! Don't worry about it!).
207 Returns : array of Bio::Seq objects
209 Args : two strings, first the filename (any open() string is ok)
210 second the format (which is optional)
212 See L<Bio::SeqIO> and L<Bio::Seq> for more information
214 =cut
216 sub read_all_sequences{
217 my ($filename,$format) = @_;
219 if( !defined $filename ) {
220 confess "read_all_sequences($filename) - usage incorrect";
223 my $seqio;
225 if( defined $format ) {
226 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
227 } else {
228 $seqio = Bio::SeqIO->new( '-file' => $filename);
231 my @seq_array;
233 while( my $seq = $seqio->next_seq() ) {
234 push(@seq_array,$seq);
237 return @seq_array;
241 =head2 write_sequence
243 Title : write_sequence
244 Usage : write_sequence(">new_file.gb",'genbank',$seq)
245 write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects)
247 Function: writes sequences in the specified format
249 Returns : true
251 Args : filename as a string, must provide an open() output file
252 format as a string
253 one or more sequence objects
256 =cut
258 sub write_sequence{
259 my ($filename,$format,@sequence_objects) = @_;
261 if( scalar(@sequence_objects) == 0 ) {
262 confess("write_sequence(filename,format,sequence_object)");
265 my $error = 0;
266 my $seqname = "sequence1";
268 # catch users who haven't passed us a filename we can open
269 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
270 $filename = ">".$filename;
273 my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format);
275 foreach my $seq ( @sequence_objects ) {
276 my $seq_obj;
278 if( !ref $seq ) {
279 if( length $seq > 50 ) {
280 # odds are this is a sequence as a string, and someone has not figured out
281 # how to make objects. Warn him/her and then make a sequence object
282 # from this
283 if( $error == 0 ) {
284 carp("WARNING: You have put in a long string into write_sequence.\nI suspect this means that this is the actual sequence\nIn the future try the\n new_sequence method of this module to make a new sequence object.\nDoing this for you here\n");
285 $error = 1;
288 $seq_obj = new_sequence($seq,$seqname);
289 $seqname++;
290 } else {
291 confess("You have a non object [$seq] passed to write_sequence. It maybe that you want to use new_sequence to make this string into a sequence object?");
293 } else {
294 if( !$seq->isa("Bio::SeqI") ) {
295 confess("object [$seq] is not a Bio::Seq object; can't write it out");
297 $seq_obj = $seq;
300 # finally... we get to write out the sequence!
301 $seqio->write_seq($seq_obj);
306 =head2 new_sequence
308 Title : new_sequence
309 Usage : $seq_obj = new_sequence("GATTACA", "kino-enzyme");
311 Function: Construct a sequency object from sequence string
312 Returns : A Bio::Seq object
314 Args : sequence string
315 name string (optional, default "no-name-for-sequence")
316 accession - accession number (optional, no default)
318 =cut
320 sub new_sequence{
321 my ($seq,$name,$accession) = @_;
323 if( !defined $seq ) {
324 confess("new_sequence(sequence_as_string) usage");
327 $name ||= "no-name-for-sequence";
329 my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name);
331 $accession && $seq_object->accession_number($accession);
333 return $seq_object;
336 =head2 blast_sequence
338 Title : blast_sequence
339 Usage : $blast_result = blast_sequence($seq)
340 $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
342 Function: If the computer has Internet accessibility, blasts
343 the sequence using the NCBI BLAST server against nrdb.
345 It chooses the flavour of BLAST on the basis of the sequence.
347 This function uses Bio::Tools::Run::RemoteBlast, which itself
348 use Bio::SearchIO - as soon as you want to know more, check out
349 these modules
350 Returns : Bio::Search::Result::GenericResult.pm
352 Args : Either a string of protein letters or nucleotides, or a
353 Bio::Seq object
355 =cut
357 sub blast_sequence {
358 my ($seq,$verbose) = shift;
360 if( !defined $verbose ) {
361 $verbose = 1;
364 if( !ref $seq ) {
365 $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id');
366 } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
367 croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
370 require Bio::Tools::Run::RemoteBlast;
372 my $prog = 'blastp';
373 my $e_val= '1e-10';
375 my @params = ( '-prog' => $prog,
376 '-expect' => $e_val,
377 '-readmethod' => 'SearchIO' );
379 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
381 my $r = $factory->submit_blast($seq);
382 if( $verbose ) {
383 print STDERR "Submitted Blast for [".$seq->id."] ";
385 sleep 5;
387 my $result;
389 LOOP :
390 while( my @rids = $factory->each_rid) {
391 foreach my $rid ( @rids ) {
392 my $rc = $factory->retrieve_blast($rid);
393 if( !ref($rc) ) {
394 if( $rc < 0 ) {
395 $factory->remove_rid($rid);
397 if( $verbose ) {
398 print STDERR ".";
400 sleep 10;
401 } else {
402 $result = $rc->next_result();
403 $factory->remove_rid($rid);
404 last LOOP;
409 if( $verbose ) {
410 print STDERR "\n";
412 return $result;
415 =head2 write_blast
417 Title : write_blast
418 Usage : write_blast($filename,$blast_report);
420 Function: Writes a BLAST result object (or more formally
421 a SearchIO result object) out to a filename
422 in BLAST-like format
424 Returns : none
426 Args : filename as a string
427 Bio::SearchIO::Results object
429 =cut
431 sub write_blast {
432 my ($filename,$blast) = @_;
434 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
435 $filename = ">".$filename;
438 my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename);
440 $output->write_result($blast);
444 =head2 get_sequence
446 Title : get_sequence
447 Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
449 Function: If the computer has Internet access this method gets
450 the sequence from Internet accessible databases. Currently
451 this supports Swissprot ('swiss'), EMBL ('embl'), GenBank
452 ('genbank'), GenPept ('genpept'), and RefSeq ('refseq').
454 Swissprot and EMBL are more robust than GenBank fetching.
456 If the user is trying to retrieve a RefSeq entry from
457 GenBank/EMBL, the query is silently redirected.
459 Returns : A Bio::Seq object
461 Args : database type - one of swiss, embl, genbank, genpept, or
462 refseq
464 =cut
466 my $genbank_db = undef;
467 my $genpept_db = undef;
468 my $embl_db = undef;
469 my $swiss_db = undef;
470 my $refseq_db = undef;
472 sub get_sequence{
473 my ($db_type,$identifier) = @_;
474 if( ! $DBOKAY ) {
475 confess ("Your system does not have one of LWP, HTTP::Request::Common, IO::String installed so the DB retrieval method is not available. \nFull error message is:\n $!\n");
476 return;
478 $db_type = lc($db_type);
480 my $db;
482 if( $db_type =~ /genbank/ ) {
483 if( !defined $genbank_db ) {
484 $genbank_db = Bio::DB::GenBank->new();
486 $db = $genbank_db;
488 if( $db_type =~ /genpept/ ) {
489 if( !defined $genpept_db ) {
490 $genpept_db = Bio::DB::GenPept->new();
492 $db = $genpept_db;
495 if( $db_type =~ /swiss/ ) {
496 if( !defined $swiss_db ) {
497 $swiss_db = Bio::DB::SwissProt->new();
499 $db = $swiss_db;
502 if( $db_type =~ /embl/ ) {
503 if( !defined $embl_db ) {
504 $embl_db = Bio::DB::EMBL->new();
506 $db = $embl_db;
509 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and
510 $identifier =~ /^\s*N\S+_/)) {
511 if( !defined $refseq_db ) {
512 $refseq_db = Bio::DB::RefSeq->new();
514 $db = $refseq_db;
517 my $seq;
519 if( $identifier =~ /^\w+\d+$/ ) {
520 $seq = $db->get_Seq_by_acc($identifier);
521 } else {
522 $seq = $db->get_Seq_by_id($identifier);
525 return $seq;
529 =head2 translate
531 Title : translate
532 Usage : $seqobj = translate($seq_or_string_scalar)
534 Function: translates a DNA sequence object OR just a plain
535 string of DNA to amino acids
536 Returns : A Bio::Seq object
538 Args : Either a sequence object or a string of
539 just DNA sequence characters
541 =cut
543 sub translate {
544 my ($scalar) = shift;
546 my $obj;
548 if( ref $scalar ) {
549 if( !$scalar->isa("Bio::PrimarySeqI") ) {
550 confess("Expecting a sequence object not a $scalar");
551 } else {
552 $obj= $scalar;
556 } else {
558 # check this looks vaguely like DNA
559 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
561 if( $n < length($scalar) * 0.85 ) {
562 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
565 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
568 return $obj->translate();
572 =head2 translate_as_string
574 Title : translate_as_string
575 Usage : $seqstring = translate_as_string($seq_or_string_scalar)
577 Function: translates a DNA sequence object OR just a plain
578 string of DNA to amino acids
579 Returns : A string of just amino acids
581 Args : Either a sequence object or a string of
582 just DNA sequence characters
584 =cut
586 sub translate_as_string {
587 my ($scalar) = shift;
589 my $obj = Bio::Perl::translate($scalar);
591 return $obj->seq;
595 =head2 reverse_complement
597 Title : reverse_complement
598 Usage : $seqobj = reverse_complement($seq_or_string_scalar)
600 Function: reverse complements a string or sequence argument
601 producing a Bio::Seq - if you want a string, you
602 can use reverse_complement_as_string
603 Returns : A Bio::Seq object
605 Args : Either a sequence object or a string of
606 just DNA sequence characters
608 =cut
610 sub reverse_complement {
611 my ($scalar) = shift;
613 my $obj;
615 if( ref $scalar ) {
616 if( !$scalar->isa("Bio::PrimarySeqI") ) {
617 confess("Expecting a sequence object not a $scalar");
618 } else {
619 $obj= $scalar;
623 } else {
625 # check this looks vaguely like DNA
626 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
628 if( $n < length($scalar) * 0.85 ) {
629 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
632 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
635 return $obj->revcom();
638 =head2 revcom
640 Title : revcom
641 Usage : $seqobj = revcom($seq_or_string_scalar)
643 Function: reverse complements a string or sequence argument
644 producing a Bio::Seq - if you want a string, you
645 can use reverse_complement_as_string
647 This is an alias for reverse_complement
648 Returns : A Bio::Seq object
650 Args : Either a sequence object or a string of
651 just DNA sequence characters
653 =cut
655 sub revcom {
656 return &Bio::Perl::reverse_complement(@_);
660 =head2 reverse_complement_as_string
662 Title : reverse_complement_as_string
663 Usage : $string = reverse_complement_as_string($seq_or_string_scalar)
665 Function: reverse complements a string or sequence argument
666 producing a string
667 Returns : A string of DNA letters
669 Args : Either a sequence object or a string of
670 just DNA sequence characters
672 =cut
674 sub reverse_complement_as_string {
675 my ($scalar) = shift;
677 my $obj = &Bio::Perl::reverse_complement($scalar);
679 return $obj->seq;
683 =head2 revcom_as_string
685 Title : revcom_as_string
686 Usage : $string = revcom_as_string($seq_or_string_scalar)
688 Function: reverse complements a string or sequence argument
689 producing a string
690 Returns : A string of DNA letters
692 Args : Either a sequence object or a string of
693 just DNA sequence characters
695 =cut
697 sub revcom_as_string {
698 my ($scalar) = shift;
700 my $obj = &Bio::Perl::reverse_complement($scalar);
702 return $obj->seq;