3 # BioPerl module for Bio::Perl
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Perl - Functional access to BioPerl for people who don't know objects
21 # will guess file format from extension
22 $seq_object = read_sequence($filename);
24 # forces genbank format
25 $seq_object = read_sequence($filename,'genbank');
27 # reads an array of sequences
28 @seq_object_array = read_all_sequences($filename,'fasta');
30 # sequences are Bio::Seq objects, so the following methods work
31 # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm'
33 print "Sequence name is ",$seq_object->display_id,"\n";
34 print "Sequence acc is ",$seq_object->accession_number,"\n";
35 print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
37 # get the whole sequence as a single string
39 $sequence_as_a_string = $seq_object->seq();
43 write_sequence(">$filename",'genbank',$seq_object);
45 write_sequence(">$filename",'genbank',@seq_object_array);
47 # making a new sequence from just a string
49 $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
52 # getting a sequence from a database (assumes internet connection)
54 $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
56 $seq_object = get_sequence('embl',"AI129902");
58 $seq_object = get_sequence('genbank',"AI129902");
60 # BLAST a sequence (assummes an internet connection)
62 $blast_report = blast_sequence($seq_object);
64 write_blast(">blast.out",$blast_report);
69 Easy first time access to BioPerl via functions.
75 User feedback is an integral part of the evolution of this and other
76 Bioperl modules. Send your comments and suggestions preferably to one
77 of the Bioperl mailing lists. Your participation is much appreciated.
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 the bugs and their resolution. Bug reports can be submitted via the web:
86 http://bugzilla.open-bio.org/
88 =head1 AUTHOR - Ewan Birney
90 Email birney@ebi.ac.uk
94 The rest of the documentation details each of the object methods.
95 Internal methods are usually preceded with a _
100 # Let the code begin...
104 use vars
qw(@EXPORT @EXPORT_OK $DBOKAY);
110 use Bio::Root::Version '$VERSION';
113 require Bio::DB::EMBL;
114 require Bio::DB::GenBank;
115 require Bio::DB::SwissProt;
116 require Bio::DB::RefSeq;
117 require Bio::DB::GenPept;
126 use base qw(Exporter);
128 @EXPORT = qw(read_sequence read_all_sequences write_sequence
129 new_sequence get_sequence translate translate_as_string
130 reverse_complement revcom revcom_as_string
131 reverse_complement_as_string blast_sequence write_blast);
133 @EXPORT_OK = @EXPORT;
138 Title : read_sequence
139 Usage : $seq = read_sequence('sequences.fa')
140 $seq = read_sequence($filename,'genbank');
143 $seq = read_sequence("my_fetching_program $id |",'fasta');
145 Function: Reads the top sequence from the file. If no format is given, it will
146 try to guess the format from the filename. If a format is given, it
147 forces that format. The filename can be any valid perl open() string
148 - in particular, you can put in pipes
150 Returns : A Bio::Seq object. A quick synopsis:
151 $seq_object->display_id - name of the sequence
152 $seq_object->seq - sequence as a string
154 Args : Two strings, first the filename - any Perl open() string is ok
155 Second string is the format, which is optional
157 For more information on Seq objects see L<Bio::Seq>.
162 my ($filename,$format) = @_;
164 if( !defined $filename ) {
165 confess
"read_sequence($filename) - usage incorrect";
170 if( defined $format ) {
171 $seqio = Bio
::SeqIO
->new( '-file' => $filename, '-format' => $format);
173 $seqio = Bio
::SeqIO
->new( '-file' => $filename);
176 my $seq = $seqio->next_seq();
182 =head2 read_all_sequences
184 Title : read_all_sequences
185 Usage : @seq_object_array = read_all_sequences($filename);
186 @seq_object_array = read_all_sequences($filename,'genbank');
188 Function: Just as the function above, but reads all the sequences in the
189 file and loads them into an array.
191 For very large files, you will run out of memory. When this
192 happens, you've got to use the SeqIO system directly (this is
193 not so hard! Don't worry about it!).
195 Returns : array of Bio::Seq objects
197 Args : two strings, first the filename (any open() string is ok)
198 second the format (which is optional)
200 See L<Bio::SeqIO> and L<Bio::Seq> for more information
204 sub read_all_sequences
{
205 my ($filename,$format) = @_;
207 if( !defined $filename ) {
208 confess
"read_all_sequences($filename) - usage incorrect";
213 if( defined $format ) {
214 $seqio = Bio
::SeqIO
->new( '-file' => $filename, '-format' => $format);
216 $seqio = Bio
::SeqIO
->new( '-file' => $filename);
221 while( my $seq = $seqio->next_seq() ) {
222 push(@seq_array,$seq);
229 =head2 write_sequence
231 Title : write_sequence
232 Usage : write_sequence(">new_file.gb",'genbank',$seq)
233 write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects)
235 Function: writes sequences in the specified format
239 Args : filename as a string, must provide an open() output file
241 one or more sequence objects
247 my ($filename,$format,@sequence_objects) = @_;
249 if( scalar(@sequence_objects) == 0 ) {
250 confess
("write_sequence(filename,format,sequence_object)");
254 my $seqname = "sequence1";
256 # catch users who haven't passed us a filename we can open
257 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
258 $filename = ">".$filename;
261 my $seqio = Bio
::SeqIO
->new('-file' => $filename, '-format' => $format);
263 foreach my $seq ( @sequence_objects ) {
267 if( length $seq > 50 ) {
268 # odds are this is a sequence as a string, and someone has not figured out
269 # how to make objects. Warn him/her and then make a sequence object
272 carp
("WARNING: You have put in a long string into write_sequence.\nI suspect this means that this is the actual sequence\nIn the future try the\n new_sequence method of this module to make a new sequence object.\nDoing this for you here\n");
276 $seq_obj = new_sequence
($seq,$seqname);
279 confess
("You have a non object [$seq] passed to write_sequence. It maybe that you want to use new_sequence to make this string into a sequence object?");
282 if( !$seq->isa("Bio::SeqI") ) {
283 confess
("object [$seq] is not a Bio::Seq object; can't write it out");
288 # finally... we get to write out the sequence!
289 $seqio->write_seq($seq_obj);
297 Usage : $seq_obj = new_sequence("GATTACA", "kino-enzyme");
299 Function: Construct a sequency object from sequence string
300 Returns : A Bio::Seq object
302 Args : sequence string
303 name string (optional, default "no-name-for-sequence")
304 accession - accession number (optional, no default)
309 my ($seq,$name,$accession) = @_;
311 if( !defined $seq ) {
312 confess
("new_sequence(sequence_as_string) usage");
315 $name ||= "no-name-for-sequence";
317 my $seq_object = Bio
::Seq
->new( -seq
=> $seq, -id
=> $name);
319 $accession && $seq_object->accession_number($accession);
324 =head2 blast_sequence
326 Title : blast_sequence
327 Usage : $blast_result = blast_sequence($seq)
328 $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
330 Function: If the computer has Internet accessibility, blasts
331 the sequence using the NCBI BLAST server against nrdb.
333 It chooses the flavour of BLAST on the basis of the sequence.
335 This function uses Bio::Tools::Run::RemoteBlast, which itself
336 use Bio::SearchIO - as soon as you want to know more, check out
338 Returns : Bio::Search::Result::GenericResult.pm
340 Args : Either a string of protein letters or nucleotides, or a
346 my ($seq,$verbose) = shift;
348 if( !defined $verbose ) {
353 $seq = Bio
::Seq
->new( -seq
=> $seq, -id
=> 'blast-sequence-temp-id');
354 } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
355 croak
("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
358 require Bio
::Tools
::Run
::RemoteBlast
;
363 my @params = ( '-prog' => $prog,
365 '-readmethod' => 'SearchIO' );
367 my $factory = Bio
::Tools
::Run
::RemoteBlast
->new(@params);
369 my $r = $factory->submit_blast($seq);
371 print STDERR
"Submitted Blast for [".$seq->id."] ";
378 while( my @rids = $factory->each_rid) {
379 foreach my $rid ( @rids ) {
380 my $rc = $factory->retrieve_blast($rid);
383 $factory->remove_rid($rid);
390 $result = $rc->next_result();
391 $factory->remove_rid($rid);
406 Usage : write_blast($filename,$blast_report);
408 Function: Writes a BLAST result object (or more formally
409 a SearchIO result object) out to a filename
414 Args : filename as a string
415 Bio::SearchIO::Results object
420 my ($filename,$blast) = @_;
422 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
423 $filename = ">".$filename;
426 my $output = Bio
::SearchIO
->new( -output_format
=> 'blast', -file
=> $filename);
428 $output->write_result($blast);
435 Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
437 Function: If the computer has Internet access this method gets
438 the sequence from Internet accessible databases. Currently
439 this supports Swissprot ('swiss'), EMBL ('embl'), GenBank
440 ('genbank'), GenPept ('genpept'), and RefSeq ('refseq').
442 Swissprot and EMBL are more robust than GenBank fetching.
444 If the user is trying to retrieve a RefSeq entry from
445 GenBank/EMBL, the query is silently redirected.
447 Returns : A Bio::Seq object
449 Args : database type - one of swiss, embl, genbank, genpept, or
454 my $genbank_db = undef;
455 my $genpept_db = undef;
457 my $swiss_db = undef;
458 my $refseq_db = undef;
461 my ($db_type,$identifier) = @_;
463 confess
("Your system does not have one of LWP, HTTP::Request::Common, IO::String installed so the DB retrieval method is not available. \nFull error message is:\n $!\n");
466 $db_type = lc($db_type);
470 if( $db_type =~ /genbank/ ) {
471 if( !defined $genbank_db ) {
472 $genbank_db = Bio
::DB
::GenBank
->new();
476 if( $db_type =~ /genpept/ ) {
477 if( !defined $genpept_db ) {
478 $genpept_db = Bio
::DB
::GenPept
->new();
483 if( $db_type =~ /swiss/ ) {
484 if( !defined $swiss_db ) {
485 $swiss_db = Bio
::DB
::SwissProt
->new();
490 if( $db_type =~ /embl/ ) {
491 if( !defined $embl_db ) {
492 $embl_db = Bio
::DB
::EMBL
->new();
497 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and
498 $identifier =~ /^\s*N\S+_/)) {
499 if( !defined $refseq_db ) {
500 $refseq_db = Bio
::DB
::RefSeq
->new();
507 if( $identifier =~ /^\w+\d+$/ ) {
508 $seq = $db->get_Seq_by_acc($identifier);
510 $seq = $db->get_Seq_by_id($identifier);
520 Usage : $seqobj = translate($seq_or_string_scalar)
522 Function: translates a DNA sequence object OR just a plain
523 string of DNA to amino acids
524 Returns : A Bio::Seq object
526 Args : Either a sequence object or a string of
527 just DNA sequence characters
532 my ($scalar) = shift;
537 if( !$scalar->isa("Bio::PrimarySeqI") ) {
538 confess
("Expecting a sequence object not a $scalar");
546 # check this looks vaguely like DNA
547 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
549 if( $n < length($scalar) * 0.85 ) {
550 confess
("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
553 $obj = Bio
::PrimarySeq
->new(-id
=> 'internalbioperlseq',-seq
=> $scalar);
556 return $obj->translate();
560 =head2 translate_as_string
562 Title : translate_as_string
563 Usage : $seqstring = translate_as_string($seq_or_string_scalar)
565 Function: translates a DNA sequence object OR just a plain
566 string of DNA to amino acids
567 Returns : A string of just amino acids
569 Args : Either a sequence object or a string of
570 just DNA sequence characters
574 sub translate_as_string
{
575 my ($scalar) = shift;
577 my $obj = Bio
::Perl
::translate
($scalar);
583 =head2 reverse_complement
585 Title : reverse_complement
586 Usage : $seqobj = reverse_complement($seq_or_string_scalar)
588 Function: reverse complements a string or sequence argument
589 producing a Bio::Seq - if you want a string, you
590 can use reverse_complement_as_string
591 Returns : A Bio::Seq object
593 Args : Either a sequence object or a string of
594 just DNA sequence characters
598 sub reverse_complement
{
599 my ($scalar) = shift;
604 if( !$scalar->isa("Bio::PrimarySeqI") ) {
605 confess
("Expecting a sequence object not a $scalar");
613 # check this looks vaguely like DNA
614 my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ );
616 if( $n < length($scalar) * 0.85 ) {
617 confess
("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
620 $obj = Bio
::PrimarySeq
->new(-id
=> 'internalbioperlseq',-seq
=> $scalar);
623 return $obj->revcom();
629 Usage : $seqobj = revcom($seq_or_string_scalar)
631 Function: reverse complements a string or sequence argument
632 producing a Bio::Seq - if you want a string, you
633 can use reverse_complement_as_string
635 This is an alias for reverse_complement
636 Returns : A Bio::Seq object
638 Args : Either a sequence object or a string of
639 just DNA sequence characters
644 return &Bio
::Perl
::reverse_complement
(@_);
648 =head2 reverse_complement_as_string
650 Title : reverse_complement_as_string
651 Usage : $string = reverse_complement_as_string($seq_or_string_scalar)
653 Function: reverse complements a string or sequence argument
655 Returns : A string of DNA letters
657 Args : Either a sequence object or a string of
658 just DNA sequence characters
662 sub reverse_complement_as_string
{
663 my ($scalar) = shift;
665 my $obj = &Bio
::Perl
::reverse_complement
($scalar);
671 =head2 revcom_as_string
673 Title : revcom_as_string
674 Usage : $string = revcom_as_string($seq_or_string_scalar)
676 Function: reverse complements a string or sequence argument
678 Returns : A string of DNA letters
680 Args : Either a sequence object or a string of
681 just DNA sequence characters
685 sub revcom_as_string
{
686 my ($scalar) = shift;
688 my $obj = &Bio
::Perl
::reverse_complement
($scalar);