[bug 2714]
[bioperl-live.git] / Bio / Perl.pm
bloba2ef632d535c2402fc8bd1f8d8767efa2b292e8e
1 # $Id$
3 # BioPerl module for Bio::Perl
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Perl - Functional access to BioPerl for people who don't know objects
17 =head1 SYNOPSIS
19 use Bio::Perl;
21 # will guess file format from extension
22 $seq_object = read_sequence($filename);
24 # forces genbank format
25 $seq_object = read_sequence($filename,'genbank');
27 # reads an array of sequences
28 @seq_object_array = read_all_sequences($filename,'fasta');
30 # sequences are Bio::Seq objects, so the following methods work
31 # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm'
33 print "Sequence name is ",$seq_object->display_id,"\n";
34 print "Sequence acc is ",$seq_object->accession_number,"\n";
35 print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
37 # get the whole sequence as a single string
39 $sequence_as_a_string = $seq_object->seq();
41 # writing sequences
43 write_sequence(">$filename",'genbank',$seq_object);
45 write_sequence(">$filename",'genbank',@seq_object_array);
47 # making a new sequence from just a string
49 $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
50 "myname","AL12232");
52 # getting a sequence from a database (assumes internet connection)
54 $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
56 $seq_object = get_sequence('embl',"AI129902");
58 $seq_object = get_sequence('genbank',"AI129902");
60 # BLAST a sequence (assummes an internet connection)
62 $blast_report = blast_sequence($seq_object);
64 write_blast(">blast.out",$blast_report);
67 =head1 DESCRIPTION
69 Easy first time access to BioPerl via functions.
71 =head1 FEEDBACK
73 =head2 Mailing Lists
75 User feedback is an integral part of the evolution of this and other
76 Bioperl modules. Send your comments and suggestions preferably to one
77 of the Bioperl mailing lists. Your participation is much appreciated.
79 bioperl-l@bioperl.org
81 =head2 Reporting Bugs
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 the bugs and their resolution. Bug reports can be submitted via the web:
86 http://bugzilla.open-bio.org/
88 =head1 AUTHOR - Ewan Birney
90 Email birney@ebi.ac.uk
92 =head1 APPENDIX
94 The rest of the documentation details each of the object methods.
95 Internal methods are usually preceded with a _
97 =cut
100 # Let the code begin...
103 package Bio::Perl;
104 use vars qw(@EXPORT @EXPORT_OK $DBOKAY);
105 use strict;
106 use Carp;
108 use Bio::SeqIO;
109 use Bio::Seq;
110 use Bio::Root::Version '$VERSION';
111 BEGIN {
112 eval {
113 require Bio::DB::EMBL;
114 require Bio::DB::GenBank;
115 require Bio::DB::SwissProt;
116 require Bio::DB::RefSeq;
117 require Bio::DB::GenPept;
119 if( $@ ) {
120 $DBOKAY = 0;
121 } else {
122 $DBOKAY = 1;
126 use base qw(Exporter);
128 @EXPORT = qw(read_sequence read_all_sequences write_sequence
129 new_sequence get_sequence translate translate_as_string
130 reverse_complement revcom revcom_as_string
131 reverse_complement_as_string blast_sequence write_blast);
133 @EXPORT_OK = @EXPORT;
136 =head2 read_sequence
138 Title : read_sequence
139 Usage : $seq = read_sequence('sequences.fa')
140 $seq = read_sequence($filename,'genbank');
142 # pipes are fine
143 $seq = read_sequence("my_fetching_program $id |",'fasta');
145 Function: Reads the top sequence from the file. If no format is given, it will
146 try to guess the format from the filename. If a format is given, it
147 forces that format. The filename can be any valid perl open() string
148 - in particular, you can put in pipes
150 Returns : A Bio::Seq object. A quick synopsis:
151 $seq_object->display_id - name of the sequence
152 $seq_object->seq - sequence as a string
154 Args : Two strings, first the filename - any Perl open() string is ok
155 Second string is the format, which is optional
157 For more information on Seq objects see L<Bio::Seq>.
159 =cut
161 sub read_sequence{
162 my ($filename,$format) = @_;
164 if( !defined $filename ) {
165 confess "read_sequence($filename) - usage incorrect";
168 my $seqio;
170 if( defined $format ) {
171 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
172 } else {
173 $seqio = Bio::SeqIO->new( '-file' => $filename);
176 my $seq = $seqio->next_seq();
178 return $seq;
182 =head2 read_all_sequences
184 Title : read_all_sequences
185 Usage : @seq_object_array = read_all_sequences($filename);
186 @seq_object_array = read_all_sequences($filename,'genbank');
188 Function: Just as the function above, but reads all the sequences in the
189 file and loads them into an array.
191 For very large files, you will run out of memory. When this
192 happens, you've got to use the SeqIO system directly (this is
193 not so hard! Don't worry about it!).
195 Returns : array of Bio::Seq objects
197 Args : two strings, first the filename (any open() string is ok)
198 second the format (which is optional)
200 See L<Bio::SeqIO> and L<Bio::Seq> for more information
202 =cut
204 sub read_all_sequences{
205 my ($filename,$format) = @_;
207 if( !defined $filename ) {
208 confess "read_all_sequences($filename) - usage incorrect";
211 my $seqio;
213 if( defined $format ) {
214 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
215 } else {
216 $seqio = Bio::SeqIO->new( '-file' => $filename);
219 my @seq_array;
221 while( my $seq = $seqio->next_seq() ) {
222 push(@seq_array,$seq);
225 return @seq_array;
229 =head2 write_sequence
231 Title : write_sequence
232 Usage : write_sequence(">new_file.gb",'genbank',$seq)
233 write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects)
235 Function: writes sequences in the specified format
237 Returns : true
239 Args : filename as a string, must provide an open() output file
240 format as a string
241 one or more sequence objects
244 =cut
246 sub write_sequence{
247 my ($filename,$format,@sequence_objects) = @_;
249 if( scalar(@sequence_objects) == 0 ) {
250 confess("write_sequence(filename,format,sequence_object)");
253 my $error = 0;
254 my $seqname = "sequence1";
256 # catch users who haven't passed us a filename we can open
257 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
258 $filename = ">".$filename;
261 my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format);
263 foreach my $seq ( @sequence_objects ) {
264 my $seq_obj;
266 if( !ref $seq ) {
267 if( length $seq > 50 ) {
268 # odds are this is a sequence as a string, and someone has not figured out
269 # how to make objects. Warn him/her and then make a sequence object
270 # from this
271 if( $error == 0 ) {
272 carp("WARNING: You have put in a long string into write_sequence.\nI suspect this means that this is the actual sequence\nIn the future try the\n new_sequence method of this module to make a new sequence object.\nDoing this for you here\n");
273 $error = 1;
276 $seq_obj = new_sequence($seq,$seqname);
277 $seqname++;
278 } else {
279 confess("You have a non object [$seq] passed to write_sequence. It maybe that you want to use new_sequence to make this string into a sequence object?");
281 } else {
282 if( !$seq->isa("Bio::SeqI") ) {
283 confess("object [$seq] is not a Bio::Seq object; can't write it out");
285 $seq_obj = $seq;
288 # finally... we get to write out the sequence!
289 $seqio->write_seq($seq_obj);
294 =head2 new_sequence
296 Title : new_sequence
297 Usage : $seq_obj = new_sequence("GATTACA", "kino-enzyme");
299 Function: Construct a sequency object from sequence string
300 Returns : A Bio::Seq object
302 Args : sequence string
303 name string (optional, default "no-name-for-sequence")
304 accession - accession number (optional, no default)
306 =cut
308 sub new_sequence{
309 my ($seq,$name,$accession) = @_;
311 if( !defined $seq ) {
312 confess("new_sequence(sequence_as_string) usage");
315 $name ||= "no-name-for-sequence";
317 my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name);
319 $accession && $seq_object->accession_number($accession);
321 return $seq_object;
324 =head2 blast_sequence
326 Title : blast_sequence
327 Usage : $blast_result = blast_sequence($seq)
328 $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
330 Function: If the computer has Internet accessibility, blasts
331 the sequence using the NCBI BLAST server against nrdb.
333 It chooses the flavour of BLAST on the basis of the sequence.
335 This function uses Bio::Tools::Run::RemoteBlast, which itself
336 use Bio::SearchIO - as soon as you want to know more, check out
337 these modules
338 Returns : Bio::Search::Result::GenericResult.pm
340 Args : Either a string of protein letters or nucleotides, or a
341 Bio::Seq object
343 =cut
345 sub blast_sequence {
346 my ($seq,$verbose) = shift;
348 if( !defined $verbose ) {
349 $verbose = 1;
352 if( !ref $seq ) {
353 $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id');
354 } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
355 croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
358 require Bio::Tools::Run::RemoteBlast;
360 my $prog = 'blastp';
361 my $e_val= '1e-10';
363 my @params = ( '-prog' => $prog,
364 '-expect' => $e_val,
365 '-readmethod' => 'SearchIO' );
367 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
369 my $r = $factory->submit_blast($seq);
370 if( $verbose ) {
371 print STDERR "Submitted Blast for [".$seq->id."] ";
373 sleep 5;
375 my $result;
377 LOOP :
378 while( my @rids = $factory->each_rid) {
379 foreach my $rid ( @rids ) {
380 my $rc = $factory->retrieve_blast($rid);
381 if( !ref($rc) ) {
382 if( $rc < 0 ) {
383 $factory->remove_rid($rid);
385 if( $verbose ) {
386 print STDERR ".";
388 sleep 10;
389 } else {
390 $result = $rc->next_result();
391 $factory->remove_rid($rid);
392 last LOOP;
397 if( $verbose ) {
398 print STDERR "\n";
400 return $result;
403 =head2 write_blast
405 Title : write_blast
406 Usage : write_blast($filename,$blast_report);
408 Function: Writes a BLAST result object (or more formally
409 a SearchIO result object) out to a filename
410 in BLAST-like format
412 Returns : none
414 Args : filename as a string
415 Bio::SearchIO::Results object
417 =cut
419 sub write_blast {
420 my ($filename,$blast) = @_;
422 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
423 $filename = ">".$filename;
426 my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename);
428 $output->write_result($blast);
432 =head2 get_sequence
434 Title : get_sequence
435 Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
437 Function: If the computer has Internet access this method gets
438 the sequence from Internet accessible databases. Currently
439 this supports Swissprot ('swiss'), EMBL ('embl'), GenBank
440 ('genbank'), GenPept ('genpept'), and RefSeq ('refseq').
442 Swissprot and EMBL are more robust than GenBank fetching.
444 If the user is trying to retrieve a RefSeq entry from
445 GenBank/EMBL, the query is silently redirected.
447 Returns : A Bio::Seq object
449 Args : database type - one of swiss, embl, genbank, genpept, or
450 refseq
452 =cut
454 my $genbank_db = undef;
455 my $genpept_db = undef;
456 my $embl_db = undef;
457 my $swiss_db = undef;
458 my $refseq_db = undef;
460 sub get_sequence{
461 my ($db_type,$identifier) = @_;
462 if( ! $DBOKAY ) {
463 confess ("Your system does not have one of LWP, HTTP::Request::Common, IO::String installed so the DB retrieval method is not available. \nFull error message is:\n $!\n");
464 return;
466 $db_type = lc($db_type);
468 my $db;
470 if( $db_type =~ /genbank/ ) {
471 if( !defined $genbank_db ) {
472 $genbank_db = Bio::DB::GenBank->new();
474 $db = $genbank_db;
476 if( $db_type =~ /genpept/ ) {
477 if( !defined $genpept_db ) {
478 $genpept_db = Bio::DB::GenPept->new();
480 $db = $genpept_db;
483 if( $db_type =~ /swiss/ ) {
484 if( !defined $swiss_db ) {
485 $swiss_db = Bio::DB::SwissProt->new();
487 $db = $swiss_db;
490 if( $db_type =~ /embl/ ) {
491 if( !defined $embl_db ) {
492 $embl_db = Bio::DB::EMBL->new();
494 $db = $embl_db;
497 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and
498 $identifier =~ /^\s*N\S+_/)) {
499 if( !defined $refseq_db ) {
500 $refseq_db = Bio::DB::RefSeq->new();
502 $db = $refseq_db;
505 my $seq;
507 if( $identifier =~ /^\w+\d+$/ ) {
508 $seq = $db->get_Seq_by_acc($identifier);
509 } else {
510 $seq = $db->get_Seq_by_id($identifier);
513 return $seq;
517 =head2 translate
519 Title : translate
520 Usage : $seqobj = translate($seq_or_string_scalar)
522 Function: translates a DNA sequence object OR just a plain
523 string of DNA to amino acids
524 Returns : A Bio::Seq object
526 Args : Either a sequence object or a string of
527 just DNA sequence characters
529 =cut
531 sub translate {
532 my ($scalar) = shift;
534 my $obj;
536 if( ref $scalar ) {
537 if( !$scalar->isa("Bio::PrimarySeqI") ) {
538 confess("Expecting a sequence object not a $scalar");
539 } else {
540 $obj= $scalar;
544 } else {
546 # check this looks vaguely like DNA
547 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
549 if( $n < length($scalar) * 0.85 ) {
550 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
553 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
556 return $obj->translate();
560 =head2 translate_as_string
562 Title : translate_as_string
563 Usage : $seqstring = translate_as_string($seq_or_string_scalar)
565 Function: translates a DNA sequence object OR just a plain
566 string of DNA to amino acids
567 Returns : A string of just amino acids
569 Args : Either a sequence object or a string of
570 just DNA sequence characters
572 =cut
574 sub translate_as_string {
575 my ($scalar) = shift;
577 my $obj = Bio::Perl::translate($scalar);
579 return $obj->seq;
583 =head2 reverse_complement
585 Title : reverse_complement
586 Usage : $seqobj = reverse_complement($seq_or_string_scalar)
588 Function: reverse complements a string or sequence argument
589 producing a Bio::Seq - if you want a string, you
590 can use reverse_complement_as_string
591 Returns : A Bio::Seq object
593 Args : Either a sequence object or a string of
594 just DNA sequence characters
596 =cut
598 sub reverse_complement {
599 my ($scalar) = shift;
601 my $obj;
603 if( ref $scalar ) {
604 if( !$scalar->isa("Bio::PrimarySeqI") ) {
605 confess("Expecting a sequence object not a $scalar");
606 } else {
607 $obj= $scalar;
611 } else {
613 # check this looks vaguely like DNA
614 my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ );
616 if( $n < length($scalar) * 0.85 ) {
617 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
620 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
623 return $obj->revcom();
626 =head2 revcom
628 Title : revcom
629 Usage : $seqobj = revcom($seq_or_string_scalar)
631 Function: reverse complements a string or sequence argument
632 producing a Bio::Seq - if you want a string, you
633 can use reverse_complement_as_string
635 This is an alias for reverse_complement
636 Returns : A Bio::Seq object
638 Args : Either a sequence object or a string of
639 just DNA sequence characters
641 =cut
643 sub revcom {
644 return &Bio::Perl::reverse_complement(@_);
648 =head2 reverse_complement_as_string
650 Title : reverse_complement_as_string
651 Usage : $string = reverse_complement_as_string($seq_or_string_scalar)
653 Function: reverse complements a string or sequence argument
654 producing a string
655 Returns : A string of DNA letters
657 Args : Either a sequence object or a string of
658 just DNA sequence characters
660 =cut
662 sub reverse_complement_as_string {
663 my ($scalar) = shift;
665 my $obj = &Bio::Perl::reverse_complement($scalar);
667 return $obj->seq;
671 =head2 revcom_as_string
673 Title : revcom_as_string
674 Usage : $string = revcom_as_string($seq_or_string_scalar)
676 Function: reverse complements a string or sequence argument
677 producing a string
678 Returns : A string of DNA letters
680 Args : Either a sequence object or a string of
681 just DNA sequence characters
683 =cut
685 sub revcom_as_string {
686 my ($scalar) = shift;
688 my $obj = &Bio::Perl::reverse_complement($scalar);
690 return $obj->seq;