maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / LiveSeq / IO / BioPerl.pm
blob564a0c1013c6fb43d6a310ecd214b5aca71eea7c
2 # bioperl module for Bio::LiveSeq::IO::BioPerl
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
8 # Copyright Joseph Insana
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::LiveSeq::IO::BioPerl - Loader for LiveSeq from EMBL entries with BioPerl
18 =head1 SYNOPSIS
20 my $db="EMBL";
21 my $file="../data/M20132";
22 my $id="HSANDREC";
24 my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -file=>"$file");
25 # or
26 my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -id=>"$id");
28 my @translationobjects=$loader->entry2liveseq();
30 my $genename="AR";
31 my $gene=$loader->gene2liveseq(-gene_name => "$genename",
32 -getswissprotinfo => 0);
34 #NOTE1: The only -db now supported is EMBL. Hence it defaults to EMBL.
35 #NOTE2: -file requires a filename (and path if necessary) containing an
36 # EMBL entry
37 # -id will use Bio::DB::EMBL.pm to fetch the sequence from the web,
38 # (bioperl wraparound to [w]getz from SRS)
39 #NOTE3: To retrieve the swissprot (if possible) attached to the embl entry
40 # (to get protein domains at dna level), only Bio::DB::EMBL.pm
41 # is supported under BioPerl. Refer to Bio::LiveSeq::IO::SRS
42 # otherwise.
43 #NOTE4: NOTE3 is not implemented yet for bioperl, working on it
46 =head1 DESCRIPTION
48 This package uses BioPerl (SeqIO) to fetch a sequence database entry,
49 analyse it and create LiveSeq objects out of it.
51 A filename (or an ID that will fetch entry through the web) has to be passed
52 to this package which will return references to all translation objects
53 created from the EMBL entry. References to Transcription, DNA and Exon
54 objects can all be retrieved departing from these.
56 Alternatively, a specific "gene" name can be specified, together with
57 the embl-acc ID. This will create a LiveSeq::Gene object with all
58 relevant gene features attached/created.
60 ATTENTION: if web fetching is requested, the package HTTP::Request needs
61 to be installed.
64 =head1 AUTHOR - Joseph A.L. Insana
66 Email: Insana@ebi.ac.uk, jinsana@gmx.net
68 =head1 APPENDIX
70 The rest of the documentation details each of the object
71 methods. Internal methods are usually preceded with a _
73 =cut
75 # Let the code begin...
77 package Bio::LiveSeq::IO::BioPerl;
79 # TODO->TOCHECK
80 # each_secondary_access not working
81 # why array from each_tag_value($qual) ? When will there be more than one
82 # element in such array?
83 # what is the annotation object? ($seqobj->annotation)
84 # unsatisfied by both BioPerl binomial and SRS "org" to retrieve Organism info
86 use strict;
87 use Carp qw(cluck croak carp);
88 use vars qw($DBEMBLLOADED);
89 use Bio::SeqIO; # for -file entry loading
91 # Note, the following requires HTTP::Request. If the modules are not installed
92 # uncomment the following and use only -filename and don't request swissprotinfo
93 eval {
94 require Bio::DB::EMBL; # for -id entry loading
95 $DBEMBLLOADED = 1;
99 use base qw(Bio::LiveSeq::IO::Loader);
101 # This package can in the future host other databases loading subroutines.
102 # e.g. ensembl2hash
104 =head2 load
106 Title : load
107 Usage : my $filename="../data/M20132";
108 $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -file=>"$filename");
110 $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -id=>"HSANDREC");
112 Function: loads an entry with BioPerl from a database into a hash
113 Returns : reference to a new object of class IO::BioPerl holding an entry
114 Errorcode 0
115 Args : an filename containing an EMBL entry OR an ID or ACCESSION code
117 =cut
119 sub load {
120 my ($thing, %args) = @_;
121 my $class = ref($thing) || $thing;
122 my ($obj,%loader);
124 my ($db,$filename,$id)=($args{-db},$args{-file},$args{-id});
126 if (defined($db)) {
127 unless ($db eq "EMBL") {
128 carp "Note: only EMBL now supported!";
129 return(0);
131 } else {
132 $db="EMBL";
135 if (defined($id) && defined($filename)) {
136 carp "You can either specify a -id or a -filename!";
137 return(0);
140 unless (defined($id) || defined($filename)) {
141 carp "You must specify either a -id or a -filename!";
142 return(0);
145 my $hashref;
146 if ($db eq "EMBL") {
147 my $test_transl=0; # change to 0 to avoid comparison of translation
149 # these can be changed for future needs
150 my @embl_valid_feature_names=qw(CDS CDS_span exon prim_transcript intron repeat_unit repeat_region mRNA);
151 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
153 # dunno yet how to implement test_transl again....
154 # probably on a one-on-one basis with each translation?
155 if ($test_transl) {
156 push (@embl_valid_qual_names,"translation"); # needed for test_transl
159 my $seqobj; # bioperl sequence object, to be passed to embl2hash
161 if (defined($filename)) {
162 my $stream = Bio::SeqIO->new('-file' => $filename, '-format' => 'EMBL');
163 $seqobj = $stream->next_seq();
164 } else { # i.e. if -id
166 if( $DBEMBLLOADED ) {
167 my $embl = Bio::DB::EMBL->new();
168 $seqobj = $embl->get_Seq_by_id($id); # EMBL ID or ACC
169 } else {
170 my $root = Bio::Root::Root->new();
171 $root->warn("Must have HTTP::Request::Common installed, cannot run load without the -filename option specified, see docs for Bio::LiveSeq::IO::BioPerl");
172 return;
176 $hashref=&embl2hash($seqobj,\@embl_valid_feature_names,\@embl_valid_qual_names);
178 unless ($hashref) { return (0); }
180 %loader = (db => $db, filename => $filename, id => $id, hash => $hashref);
181 $obj = \%loader;
182 $obj = bless $obj, $class;
183 return $obj;
186 =head2 embl2hash
188 Title : embl2hash
189 Function: retrieves with BioPerl an EMBL entry, parses it and creates
190 a hash that contains all the information.
191 Returns : a reference to a hash
192 Errorcode: 0
193 Args : a BioPerl Sequence Object (from file or web fetching)
194 two array references to skip features and qualifiers (for
195 performance)
196 Example: @valid_features=qw(CDS exon prim_transcript mRNA);
197 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
198 $hashref=&embl2hash($seqobj,\@valid_features,\@valid_qualifiers);
200 =cut
202 # arguments: Bioperl $seqobj
203 # to skip features and qualifiers (for performance), two array
204 # references must be passed (this can change into string arguments to
205 # be passed....)
206 # returns: a reference to a hash containing the important features requested
207 sub embl2hash {
208 my $seqobj=$_[0];
209 my %valid_features; my %valid_names;
210 if ($_[1]) {
211 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
213 if ($_[2]) {
214 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
217 my $annobj = $seqobj->annotation(); # what's this?
219 my $entry_Sequence = lc($seqobj->seq()); # SRS returns lowercase
221 my $entry_ID = $seqobj->display_id;
222 my $entry_AccNumber = $seqobj->accession; # or maybe accession_number ?
223 my $secondary_acc; # to fetch the other acc numbers
224 foreach $secondary_acc ($seqobj->get_secondary_accessions) { # not working!
225 $entry_AccNumber .= " $secondary_acc";
227 my $entry_Molecule = $seqobj->molecule; # this alone returns molec+division
228 my $entry_Division = $seqobj->division;
229 # fixed: now Molecule works in BioPerl, no need for next lines
230 #my @Molecule=split(" ",$entry_Molecule);
231 #my $entry_Division = pop(@Molecule); # only division
232 #$entry_Molecule = join(" ",@Molecule); # only molecule
233 my $entry_Description = $seqobj->desc;
235 my $speciesobj = $seqobj->species;
236 my $entry_Organism = $speciesobj->binomial;
238 my $entry_SeqLength = $seqobj->length;
240 # put into the hash
241 my %entryhash;
242 $entryhash{ID}=$entry_ID;
243 $entryhash{AccNumber}=$entry_AccNumber;
244 $entryhash{Molecule}=$entry_Molecule;
245 $entryhash{Division}=$entry_Division;
246 $entryhash{Description}=$entry_Description;
247 $entryhash{Organism}=$entry_Organism;
248 $entryhash{Sequence}=$entry_Sequence;
249 $entryhash{SeqLength}=$entry_SeqLength;
251 my @topfeatures=$seqobj->top_SeqFeatures();
252 # create features array
253 my $featuresnumber= scalar(@topfeatures);
254 $entryhash{FeaturesNumber}=$featuresnumber;
255 my $feature_name;
256 my @feature_qual_names; my @feature_qual_value;
257 my ($feature_qual_name,$feature_qual_number);
258 my @features;
260 my ($feat,$qual,$subfeat);
261 my @subfeat;
262 my $i=0;
263 foreach $feat (@topfeatures) {
264 my %feature;
265 $feature_name = $feat->primary_tag;
266 unless ($valid_features{$feature_name}) {
267 #print "skipping $feature_name\n";
268 next;
270 # works ok with 0.6.2
271 # if ($feature_name eq "CDS_span") { # case of CDS with various exons 0.6.2
272 # $feature_name="CDS"; # 0.6.2
273 my $featlocation=$feat->location; # 0.7
274 if (($feature_name eq "CDS")&&($featlocation->isa('Bio::Location::SplitLocationI'))) { # case of CDS with various exons BioPerl 0.7
275 # @subfeat=$feat->sub_SeqFeature; # 0.6.2
276 @subfeat=$featlocation->sub_Location(); # 0.7
277 my @transcript;
278 foreach $subfeat (@subfeat) {
279 my @range;
280 if ($subfeat->strand == -1) {
281 @range=($subfeat->end,$subfeat->start,$subfeat->strand);
282 } else {
283 @range=($subfeat->start,$subfeat->end,$subfeat->strand);
285 push (@transcript,\@range);
287 $feature{range}=\@transcript;
288 } else {
289 my @range;
290 ($feat->strand == -1) ? (@range = ($feat->end, $feat->start, $feat->strand) ) :
291 (@range = ( $feat->start,$feat->end,$feat->strand) );
292 # works ok with 0.6.2
293 if ($feature_name eq "CDS") { # case of single exon CDS (CDS name but not split location)
294 my @transcript=(\@range);
295 $feature{range}=\@transcript;
296 } else { # all other range features
297 $feature{range}=\@range;
300 $feature{location}="deprecated";
302 $feature{position}=$i;
303 $feature{name}=$feature_name;
305 @feature_qual_names= $feat->all_tags();
306 $feature_qual_number= scalar(@feature_qual_names);
308 $feature{qual_number}=$feature_qual_number;
310 my %feature_qualifiers;
311 for $qual (@feature_qual_names) {
312 $feature_qual_name=$qual;
313 unless ($valid_names{$feature_qual_name}) {
314 next;
316 @feature_qual_value=$feat->each_tag_value($qual);
317 #print "$qual => @feature_qual_value \n";
318 $feature_qualifiers{$feature_qual_name}=$feature_qual_value[0]; # ?
319 # maybe the whole array should be entered, not just the 1st element?
320 # what could be the other elements? TOCHECK!
322 $feature{qualifiers}=\%feature_qualifiers;
323 push (@features,\%feature); # array of features
324 $i++;
326 $entryhash{Features}=\@features; # put this also into the hash
328 my @cds; # array just of CDSs
329 for $i (0..$#features) {
330 if ($features[$i]->{'name'} eq "CDS") {
331 push(@cds,$features[$i]);
334 $entryhash{CDS}=\@cds; # put this also into the hash
335 return (\%entryhash);
338 =head2 novelaasequence2gene
340 Title : novelaasequence2gene
341 Usage : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
342 : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
343 -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141");
344 : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
345 -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141",
346 -translation_table => "2",
347 -gene_name => "tyr-kinase");
349 Function: creates LiveSeq objects from a novel amino acid sequence,
350 using codon usage information (loaded from a file) to choose
351 codons according to relative frequencies.
352 If a codon_usage information is not specified,
353 the default is to use Homo sapiens data (taxonomy ID 9606).
354 If a translation_table ID is not specified, it will default to 1
355 (standard code).
356 Returns : reference to a Gene object containing references to LiveSeq objects
357 Errorcode 0
358 Args : string containing an amino acid sequence
359 string (optional) with codon usage data (64 integer numbers)
360 string (optional) specifying a gene_name
361 integer (optional) specifying a translation_table ID
363 =cut
365 sub novelaasequence2gene {
366 my ($self, %args) = @_;
367 my ($gene_name,$cusg_data,$aasequence,$ttabid)=($args{-gene_name},$args{-cusg_data},$args{-aasequence},$args{-translation_table});
369 my @species_codon_usage;
370 unless ($aasequence) {
371 carp "aasequence not given";
372 return (0);
374 unless ($gene_name) {
375 $gene_name="Novel Unknown";
377 unless ($ttabid) {
378 $ttabid=1;
380 unless ($cusg_data) {
381 @species_codon_usage=
382 qw(68664 118404 126679 51100 125600 123646 75667 210903 435317
383 139009 79303 135218 128429 192616 49456 161556 211962 131222
384 162837 213626 69346 140780 182506 219428 76684 189374 173010
385 310626 82647 202329 180955 250410 180001 118798 76398 160764
386 317359 119013 262630 359627 218376 186915 130857 377006 162826
387 113684 317703 441298 287040 245435 174805 133427 134523 108740
388 225633 185619 78463 240138 174021 244236 142435 8187 5913
389 14381); # updated 21Jul2000
390 } else {
391 @species_codon_usage=split(/ /,$cusg_data);
394 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
395 return ($gene);