2 # BioPerl module for Bio::Perl
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Perl - Functional access to BioPerl for people who don't know objects
22 # will guess file format from extension
23 $seq_object = read_sequence($filename);
25 # forces genbank format
26 $seq_object = read_sequence($filename,'genbank');
28 # reads an array of sequences
29 @seq_object_array = read_all_sequences($filename,'fasta');
31 # sequences are Bio::Seq objects, so the following methods work
32 # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm'
34 print "Sequence name is ",$seq_object->display_id,"\n";
35 print "Sequence acc is ",$seq_object->accession_number,"\n";
36 print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
38 # get the whole sequence as a single string
40 $sequence_as_a_string = $seq_object->seq();
44 write_sequence(">$filename",'genbank',$seq_object);
46 write_sequence(">$filename",'genbank',@seq_object_array);
48 # making a new sequence from just a string
50 $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
53 # getting a sequence from a database (assumes internet connection)
55 $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
57 $seq_object = get_sequence('embl',"AI129902");
59 $seq_object = get_sequence('genbank',"AI129902");
61 # BLAST a sequence (assummes an internet connection)
63 $blast_report = blast_sequence($seq_object);
65 write_blast(">blast.out",$blast_report);
70 Easy first time access to BioPerl via functions.
76 User feedback is an integral part of the evolution of this and other
77 Bioperl modules. Send your comments and suggestions preferably to one
78 of the Bioperl mailing lists. Your participation is much appreciated.
84 Please direct usage questions or support issues to the mailing list:
86 I<bioperl-l@bioperl.org>
88 rather than to the module maintainer directly. Many experienced and
89 reponsive experts will be able look at the problem and quickly
90 address it. Please include a thorough description of the problem
91 with code and data examples if at all possible.
95 Report bugs to the Bioperl bug tracking system to help us keep track
96 the bugs and their resolution. Bug reports can be submitted via the web:
98 https://redmine.open-bio.org/projects/bioperl/
100 =head1 AUTHOR - Ewan Birney
102 Email birney@ebi.ac.uk
106 The rest of the documentation details each of the object methods.
107 Internal methods are usually preceded with a _
112 # Let the code begin...
116 use vars
qw(@EXPORT @EXPORT_OK $DBOKAY);
122 use Bio::Root::Version '$VERSION';
125 require Bio::DB::EMBL;
126 require Bio::DB::GenBank;
127 require Bio::DB::SwissProt;
128 require Bio::DB::RefSeq;
129 require Bio::DB::GenPept;
138 use base qw(Exporter);
140 @EXPORT = qw(read_sequence read_all_sequences write_sequence
141 new_sequence get_sequence translate translate_as_string
142 reverse_complement revcom revcom_as_string
143 reverse_complement_as_string blast_sequence write_blast);
145 @EXPORT_OK = @EXPORT;
150 Title : read_sequence
151 Usage : $seq = read_sequence('sequences.fa')
152 $seq = read_sequence($filename,'genbank');
155 $seq = read_sequence("my_fetching_program $id |",'fasta');
157 Function: Reads the top sequence from the file. If no format is given, it will
158 try to guess the format from the filename. If a format is given, it
159 forces that format. The filename can be any valid perl open() string
160 - in particular, you can put in pipes
162 Returns : A Bio::Seq object. A quick synopsis:
163 $seq_object->display_id - name of the sequence
164 $seq_object->seq - sequence as a string
166 Args : Two strings, first the filename - any Perl open() string is ok
167 Second string is the format, which is optional
169 For more information on Seq objects see L<Bio::Seq>.
174 my ($filename,$format) = @_;
176 if( !defined $filename ) {
177 confess
"read_sequence($filename) - usage incorrect";
182 if( defined $format ) {
183 $seqio = Bio
::SeqIO
->new( '-file' => $filename, '-format' => $format);
185 $seqio = Bio
::SeqIO
->new( '-file' => $filename);
188 my $seq = $seqio->next_seq();
194 =head2 read_all_sequences
196 Title : read_all_sequences
197 Usage : @seq_object_array = read_all_sequences($filename);
198 @seq_object_array = read_all_sequences($filename,'genbank');
200 Function: Just as the function above, but reads all the sequences in the
201 file and loads them into an array.
203 For very large files, you will run out of memory. When this
204 happens, you've got to use the SeqIO system directly (this is
205 not so hard! Don't worry about it!).
207 Returns : array of Bio::Seq objects
209 Args : two strings, first the filename (any open() string is ok)
210 second the format (which is optional)
212 See L<Bio::SeqIO> and L<Bio::Seq> for more information
216 sub read_all_sequences
{
217 my ($filename,$format) = @_;
219 if( !defined $filename ) {
220 confess
"read_all_sequences($filename) - usage incorrect";
225 if( defined $format ) {
226 $seqio = Bio
::SeqIO
->new( '-file' => $filename, '-format' => $format);
228 $seqio = Bio
::SeqIO
->new( '-file' => $filename);
233 while( my $seq = $seqio->next_seq() ) {
234 push(@seq_array,$seq);
241 =head2 write_sequence
243 Title : write_sequence
244 Usage : write_sequence(">new_file.gb",'genbank',$seq)
245 write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects)
247 Function: writes sequences in the specified format
251 Args : filename as a string, must provide an open() output file
253 one or more sequence objects
259 my ($filename,$format,@sequence_objects) = @_;
261 if( scalar(@sequence_objects) == 0 ) {
262 confess
("write_sequence(filename,format,sequence_object)");
266 my $seqname = "sequence1";
268 # catch users who haven't passed us a filename we can open
269 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
270 $filename = ">".$filename;
273 my $seqio = Bio
::SeqIO
->new('-file' => $filename, '-format' => $format);
275 foreach my $seq ( @sequence_objects ) {
279 if( length $seq > 50 ) {
280 # odds are this is a sequence as a string, and someone has not figured out
281 # how to make objects. Warn him/her and then make a sequence object
284 carp
("WARNING: You have put in a long string into write_sequence.\n".
285 "I suspect this means that this is the actual sequence\n".
286 "In the future try the\n".
287 " new_sequence method of this module to make a new sequence object.\n".
288 "Doing this for you here\n");
292 $seq_obj = new_sequence
($seq,$seqname);
295 confess
("You have a non object [$seq] passed to write_sequence. It maybe that you".
296 "want to use new_sequence to make this string into a sequence object?");
299 if( !$seq->isa("Bio::SeqI") ) {
300 confess
("object [$seq] is not a Bio::Seq object; can't write it out");
305 # finally... we get to write out the sequence!
306 $seqio->write_seq($seq_obj);
314 Usage : $seq_obj = new_sequence("GATTACA", "kino-enzyme");
316 Function: Construct a sequency object from sequence string
317 Returns : A Bio::Seq object
319 Args : sequence string
320 name string (optional, default "no-name-for-sequence")
321 accession - accession number (optional, no default)
326 my ($seq,$name,$accession) = @_;
328 if( !defined $seq ) {
329 confess
("new_sequence(sequence_as_string) usage");
332 $name ||= "no-name-for-sequence";
334 my $seq_object = Bio
::Seq
->new( -seq
=> $seq, -id
=> $name);
336 $accession && $seq_object->accession_number($accession);
341 =head2 blast_sequence
343 Title : blast_sequence
344 Usage : $blast_result = blast_sequence($seq)
345 $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
347 Function: If the computer has Internet accessibility, blasts
348 the sequence using the NCBI BLAST server against nrdb.
350 It chooses the flavour of BLAST on the basis of the sequence.
352 This function uses Bio::Tools::Run::RemoteBlast, which itself
353 use Bio::SearchIO - as soon as you want to know more, check out
355 Returns : Bio::Search::Result::GenericResult.pm
357 Args : Either a string of protein letters or nucleotides, or a
363 my ($seq,$verbose) = @_;
365 if( !defined $verbose ) {
370 $seq = Bio
::Seq
->new( -seq
=> $seq, -id
=> 'blast-sequence-temp-id');
371 } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
372 croak
("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
375 require Bio
::Tools
::Run
::RemoteBlast
;
377 my $prog = ( $seq->alphabet eq 'protein' ) ?
'blastp' : 'blastn';
380 my @params = ( '-prog' => $prog,
382 '-readmethod' => 'SearchIO' );
384 my $factory = Bio
::Tools
::Run
::RemoteBlast
->new(@params);
386 my $r = $factory->submit_blast($seq);
388 print STDERR
"Submitted Blast for [".$seq->id."] ";
395 while( my @rids = $factory->each_rid) {
396 foreach my $rid ( @rids ) {
397 my $rc = $factory->retrieve_blast($rid);
400 $factory->remove_rid($rid);
407 $result = $rc->next_result();
408 $factory->remove_rid($rid);
423 Usage : write_blast($filename,$blast_report);
425 Function: Writes a BLAST result object (or more formally
426 a SearchIO result object) out to a filename
431 Args : filename as a string
432 Bio::SearchIO::Results object
437 my ($filename,$blast) = @_;
439 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
440 $filename = ">".$filename;
443 my $output = Bio
::SearchIO
->new( -output_format
=> 'blast', -file
=> $filename);
445 $output->write_result($blast);
452 Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
454 Function: If the computer has Internet access this method gets
455 the sequence from Internet accessible databases. Currently
456 this supports Swissprot ('swiss'), EMBL ('embl'), GenBank
457 ('genbank'), GenPept ('genpept'), and RefSeq ('refseq').
459 Swissprot and EMBL are more robust than GenBank fetching.
461 If the user is trying to retrieve a RefSeq entry from
462 GenBank/EMBL, the query is silently redirected.
464 Returns : A Bio::Seq object
466 Args : database type - one of swiss, embl, genbank, genpept, or
471 my $genbank_db = undef;
472 my $genpept_db = undef;
474 my $swiss_db = undef;
475 my $refseq_db = undef;
478 my ($db_type,$identifier) = @_;
480 confess
("Your system does not have one of LWP, HTTP::Request::Common, IO::String\n".
481 "installed so the DB retrieval method is not available.\n".
482 "Full error message is:\n $!\n");
485 $db_type = lc($db_type);
489 if( $db_type =~ /genbank/ ) {
490 if( !defined $genbank_db ) {
491 $genbank_db = Bio
::DB
::GenBank
->new();
495 if( $db_type =~ /genpept/ ) {
496 if( !defined $genpept_db ) {
497 $genpept_db = Bio
::DB
::GenPept
->new();
502 if( $db_type =~ /swiss/ ) {
503 if( !defined $swiss_db ) {
504 $swiss_db = Bio
::DB
::SwissProt
->new();
509 if( $db_type =~ /embl/ ) {
510 if( !defined $embl_db ) {
511 $embl_db = Bio
::DB
::EMBL
->new();
516 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and
517 $identifier =~ /^\s*N\S+_/)) {
518 if( !defined $refseq_db ) {
519 $refseq_db = Bio
::DB
::RefSeq
->new();
526 if( $identifier =~ /^\w+\d+$/ ) {
527 $seq = $db->get_Seq_by_acc($identifier);
529 $seq = $db->get_Seq_by_id($identifier);
539 Usage : $seqobj = translate($seq_or_string_scalar)
541 Function: translates a DNA sequence object OR just a plain
542 string of DNA to amino acids
543 Returns : A Bio::Seq object
545 Args : Either a sequence object or a string of
546 just DNA sequence characters
551 my ($scalar) = shift;
555 if( !$scalar->isa("Bio::PrimarySeqI") ) {
556 confess
("Expecting a sequence object not a $scalar");
561 # check this looks vaguely like DNA
562 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
563 if( $n < length($scalar) * 0.85 ) {
564 confess
("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
566 $obj = Bio
::PrimarySeq
->new(-id
=> 'internalbioperlseq',-seq
=> $scalar);
568 return $obj->translate();
572 =head2 translate_as_string
574 Title : translate_as_string
575 Usage : $seqstring = translate_as_string($seq_or_string_scalar)
577 Function: translates a DNA sequence object OR just a plain
578 string of DNA to amino acids
579 Returns : A string of just amino acids
581 Args : Either a sequence object or a string of
582 just DNA sequence characters
586 sub translate_as_string
{
587 my ($scalar) = shift;
588 my $obj = Bio
::Perl
::translate
($scalar);
593 =head2 reverse_complement
595 Title : reverse_complement
596 Usage : $seqobj = reverse_complement($seq_or_string_scalar)
598 Function: reverse complements a string or sequence argument
599 producing a Bio::Seq - if you want a string, you
600 can use reverse_complement_as_string
601 Returns : A Bio::Seq object
603 Args : Either a sequence object or a string of
604 just DNA sequence characters
608 sub reverse_complement
{
609 my ($scalar) = shift;
614 if( !$scalar->isa("Bio::PrimarySeqI") ) {
615 confess
("Expecting a sequence object not a $scalar");
622 # check this looks vaguely like DNA
623 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
625 if( $n < length($scalar) * 0.85 ) {
626 confess
("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
629 $obj = Bio
::PrimarySeq
->new(-id
=> 'internalbioperlseq',-seq
=> $scalar);
632 return $obj->revcom();
638 Usage : $seqobj = revcom($seq_or_string_scalar)
640 Function: reverse complements a string or sequence argument
641 producing a Bio::Seq - if you want a string, you
642 can use reverse_complement_as_string
644 This is an alias for reverse_complement
645 Returns : A Bio::Seq object
647 Args : Either a sequence object or a string of
648 just DNA sequence characters
653 return &Bio
::Perl
::reverse_complement
(@_);
657 =head2 reverse_complement_as_string
659 Title : reverse_complement_as_string
660 Usage : $string = reverse_complement_as_string($seq_or_string_scalar)
662 Function: reverse complements a string or sequence argument
664 Returns : A string of DNA letters
666 Args : Either a sequence object or a string of
667 just DNA sequence characters
671 sub reverse_complement_as_string
{
672 my ($scalar) = shift;
673 my $obj = &Bio
::Perl
::reverse_complement
($scalar);
678 =head2 revcom_as_string
680 Title : revcom_as_string
681 Usage : $string = revcom_as_string($seq_or_string_scalar)
683 Function: reverse complements a string or sequence argument
685 Returns : A string of DNA letters
687 Args : Either a sequence object or a string of
688 just DNA sequence characters
692 sub revcom_as_string
{
693 my ($scalar) = shift;
694 my $obj = &Bio
::Perl
::reverse_complement
($scalar);