Bio/LiveSeq/IO/Loader.pm

   1 #
   2 # bioperl module for Bio::LiveSeq::IO::Loader
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
   7 #
   8 # Copyright Joseph Insana
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11 #
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq
  17
  18 =head1 SYNOPSIS
  19
  20   #documentation needed
  21
  22 =head1 DESCRIPTION
  23
  24 This package holds common methods used by BioPerl and file loaders.
  25 It contains methods to create LiveSeq objects out of entire entries or from a
  26 localized sequence region surrounding a particular gene.
  27
  28 =head1 AUTHOR - Joseph A.L. Insana
  29
  30 Email:  Insana@ebi.ac.uk, jinsana@gmx.net
  31
  32 =head1 APPENDIX
  33
  34 The rest of the documentation details each of the object
  35 methods. Internal methods are usually preceded with a _
  36
  37 =cut
  38
  39 # Let the code begin...
  40
  41 package Bio::LiveSeq::IO::Loader;
  42
  43 use strict;
  44 use Carp qw(cluck croak carp);
  45 use Bio::LiveSeq::DNA;
  46 use Bio::LiveSeq::Exon;
  47 use Bio::LiveSeq::Transcript ;
  48 use Bio::LiveSeq::Translation;
  49 use Bio::LiveSeq::Gene;
  50 use Bio::LiveSeq::Intron;
  51 use Bio::LiveSeq::Prim_Transcript;
  52 use Bio::LiveSeq::Repeat_Region;
  53 use Bio::LiveSeq::Repeat_Unit;
  54 use Bio::LiveSeq::AARange;
  55 use Bio::Tools::CodonTable;
  56
  57 =head2 entry2liveseq
  58
  59   Title   : entry2liveseq
  60   Usage   : @translationobjects=$loader->entry2liveseq();
  61           : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0);
  62   Function: creates LiveSeq objects from an entry previously loaded
  63   Returns : array of references to objects of class Translation
  64   Errorcode 0
  65   Args    : optional boolean flag to avoid the retrieval of SwissProt
  66             information for all Transcripts containing SwissProt x-reference
  67             default is 1 (to retrieve those information and create AARange
  68             LiveSeq objects)
  69   Note    : this method can get really slow for big entries. The lightweight
  70             gene2liveseq method is recommended
  71
  72 =cut
  73
  74 sub entry2liveseq {
  75   my ($self, %args) = @_;
  76   my ($getswissprotinfo)=($args{-getswissprotinfo});
  77   if (defined($getswissprotinfo)) {
  78     if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
  79       carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
  80       $getswissprotinfo=0;
  81     }
  82   } else {
  83     $getswissprotinfo=1;
  84   }
  85   my $hashref=$self->{'hash'};
  86   unless ($hashref) { return (0); }
  87   my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo);
  88   my $test_transl=0;
  89   if ($test_transl) { $self->test_transl($hashref,\@translationobjects);}
  90   return @translationobjects;
  91 }
  92
  93 =head2 novelaasequence2gene
  94
  95   Title   : novelaasequence2gene
  96   Usage   : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
  97           : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
  98                                              -taxon => 9606,
  99                                              -gene_name => "tyr-kinase");
 100
 101   Function: creates LiveSeq objects from a novel amino acid sequence,
 102             using codon usage database to choose codons according to
 103             relative frequencies.
 104             If a taxon ID is not specified, the default is to use the human
 105             one (taxonomy ID 9606).
 106   Returns : reference to a Gene object containing references to LiveSeq objects
 107   Errorcode 0
 108   Args    : string containing an amino acid sequence
 109             integer (optional) with a taxonomy ID
 110             string specifying a gene name
 111
 112 =cut
 113
 114 =head2 gene2liveseq
 115
 116   Title   : gene2liveseq
 117   Usage   : $gene=$loader->gene2liveseq(-gene_name => "gene name");
 118           : $gene=$loader->gene2liveseq(-gene_name => "gene name",
 119                                         -flanking => 64);
 120           : $gene=$loader->gene2liveseq(-gene_name => "gene name",
 121                                         -getswissprotinfo => 0);
 122           : $gene=$loader->gene2liveseq(-position => 4);
 123
 124   Function: creates LiveSeq objects from an entry previously loaded
 125             It is a "light weight" creation because it creates
 126             a LiveSequence just for the interesting region in an entry
 127             (instead than for the total entry, like in entry2liveseq) and for
 128             the flanking regions up to 500 nucleotides (default) or up to
 129             the specified amount of nucleotides (given as argument) around the
 130             Gene.
 131   Returns : reference to a Gene object containing possibly alternative
 132             Transcripts.
 133   Errorcode 0
 134   Args    : string containing the gene name as in the EMBL feature qualifier
 135             integer (optional) "flanking": amount of flanking bases to be kept
 136             boolean (optional) "getswissprotinfo": if set to "0" it will avoid
 137              trying to fetch information from a crossreference to a SwissProt
 138              entry, avoding the process of creation of AARange objects
 139              It is "1" (on) by default
 140
 141             Alternative to a gene_name, a position can be given: an
 142             integer (1-) containing the position of the desired CDS in the
 143             loaded entry
 144
 145 =cut
 146
 147 sub gene2liveseq {
 148   my ($self, %args) = @_;
 149   my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position});
 150   my $input;
 151   unless (($gene_name)||($cds_position)) {
 152     carp "Gene_Name or Position not specified for gene2liveseq loading function";
 153     return (0);
 154   }
 155   if (($gene_name)&&($cds_position)) {
 156     carp "Gene_Name and Position cannot be given together";
 157     return (0);
 158   } elsif ($gene_name) {
 159     $input=$gene_name;
 160   } else {
 161     $input="cds-position:".$cds_position;
 162   }
 163
 164   if (defined($getswissprotinfo)) {
 165     if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
 166       carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
 167       $getswissprotinfo=0;
 168     }
 169   } else {
 170     $getswissprotinfo=1;
 171   }
 172
 173   if (defined($flanking)) {
 174     unless ($flanking >= 0) {
 175       carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function";
 176       return (0);
 177     }
 178   } else {
 179     $flanking=500; # the default flanking length
 180   }
 181   my $hashref=$self->{'hash'};
 182   unless ($hashref) { return (0); }
 183   my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo);
 184   unless ($gene) { # if $gene == 0 it means problems in hash2gene
 185     carp "gene2liveseq produced error";
 186     return (0);
 187   }
 188   return $gene;
 189 }
 190
 191 # TODO: update so that it will work even if CDS is not only accepted FEATURE!!
 192 # this method is for now deprecated and not supported
 193 sub test_transl {
 194   my ($self,$entry)=@_;
 195   my @features=@{$entry->{'Features'}};
 196   my @translationobjects=@{$_[1]};
 197   my ($i,$translation);
 198   my ($obj_transl,$hash_transl);
 199   my @cds=@{$entry->{'CDS'}};
 200   foreach $translation (@translationobjects) {
 201     $obj_transl=$translation->seq;
 202     $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'};
 203     #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*"
 204     unless ($obj_transl eq $hash_transl) {
 205       cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i";
 206       carp "\nEntry's transl: ",$hash_transl,"\n";
 207       carp "\nObject's transl: ",$obj_transl,"\n";
 208       exit;
 209     }
 210     $i++;
 211   }
 212 }
 213
 214 # argument: hashref containing the EMBL entry datas,
 215 #           getswissprotinfo boolean flag
 216 # creates the liveseq objects
 217 # returns: an array of Translation object references
 218 sub hash2liveseq {
 219   my ($self,$entry,$getswissprotinfo)=@_;
 220   my $i;
 221   my @transcripts;
 222   my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} );
 223   $dna->alphabet(lc($entry->{'Molecule'}));
 224   $dna->display_id($entry->{'ID'});
 225   $dna->accession_number($entry->{'AccNumber'});
 226   $dna->desc($entry->{'Description'});
 227   my @cds=@{$entry->{'CDS'}};
 228   my ($swissacc,$swisshash); my @swisshashes;
 229   for $i (0..$#cds) {
 230     #my @transcript=@{$cds[$i]->{'range'}};
 231     #$transcript=\@transcript;
 232     #push (@transcripts,$transcript);
 233     push (@transcripts,$cds[$i]->{'range'});
 234     if ($getswissprotinfo) {
 235       $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'};
 236       $swisshash=$self->get_swisshash($swissacc);
 237       #$self->printswissprot($swisshash); # DEBUG
 238       push (@swisshashes,$swisshash);
 239     }
 240   }
 241   my @translations=($self->transexonscreation($dna,\@transcripts));
 242   my $translation; my $j=0;
 243   foreach $translation (@translations) {
 244     if ($swisshashes[$j]) { # if not 0
 245       $self->swisshash2liveseq($swisshashes[$j],$translation);
 246     }
 247     $j++;
 248   }
 249   return (@translations);
 250 }
 251
 252 # only features pertaining to a specified gene are created
 253 # only the sequence of the gene and appropriate context flanking regions
 254 # are created as chain
 255 # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag
 256 # returns: reference to Gene object
 257 #
 258 # Note: if entry contains just one CDS, all the features get added
 259 #       this is useful because often the features in these entries do not
 260 #       carry the /gene qualifier
 261 #
 262 # errorcode: 0
 263 sub hash2gene {
 264   my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_;
 265   my $entryfeature;
 266   my $genefeatureshash;
 267
 268   my @cds=@{$entry->{'CDS'}};
 269
 270   # checking if a position has been given instead than a gene_name
 271   if (index($input,"cds-position:") == 0 ) {
 272     my $cds_position=substr($input,13); # extracting the cds position
 273     if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) {
 274       $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo);
 275     }
 276   } else {
 277     $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo);
 278   }
 279
 280   unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found
 281     my @genes=$self->genes($entry);
 282     my $cds_number=scalar(@cds);
 283     warn "Warning! Not even one genefeature found for /$input/....
 284     The genes present in this entry are:\n\t@genes\n
 285     The number of CDS in this entry is:\n\t$cds_number\n";
 286     return(0);
 287   }
 288
 289   # get max and min, check flankings
 290   my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries"
 291   my $seqlength=$entry->{'SeqLength'};
 292   my ($mindna,$maxdna); # some flanking region next to the gene "boundaries"
 293   if ($min-$flanking < 1) {
 294     $mindna=1;
 295   } else {
 296     $mindna=$min-$flanking;
 297   }
 298   if ($max+$flanking > $seqlength) {
 299     $maxdna=$seqlength;
 300   } else {
 301     $maxdna=$max+$flanking;
 302   }
 303   my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1);
 304
 305   # create LiveSeq objects
 306
 307   # create DNA
 308   my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna);
 309   $dna->alphabet(lc($entry->{'Molecule'}));
 310   $dna->source($entry->{'Organism'});
 311   $dna->display_id($entry->{'ID'});
 312   $dna->accession_number($entry->{'AccNumber'});
 313   $dna->desc($entry->{'Description'});
 314
 315   my @transcripts=@{$genefeatureshash->{'transcripts'}};
 316   # create Translations, Transcripts, Exons out of the CDS
 317   unless (@transcripts) {
 318     cluck "no CDS feature found for /$input/....";
 319     return(0);
 320   }
 321   my @translationobjs=$self->transexonscreation($dna,\@transcripts);
 322   my @transcriptobjs;
 323
 324   # get the Transcript obj_refs
 325   my $translation;
 326   my $j=0;
 327   my @ttables=@{$genefeatureshash->{'ttables'}};
 328   my @swisshashes=@{$genefeatureshash->{'swisshashes'}};
 329   foreach $translation (@translationobjs) {
 330     push(@transcriptobjs,$translation->get_Transcript);
 331     if ($ttables[$j]) { # if not undef
 332       $translation->get_Transcript->translation_table($ttables[$j]);
 333     #} else { # DEBUG
 334     #  print "\n\t\tno translation table information....\n";
 335     }
 336     if ($swisshashes[$j]) { # if not 0
 337       $self->swisshash2liveseq($swisshashes[$j],$translation);
 338     }
 339     $j++;
 340   }
 341
 342   my %gene; # this is the hash to store created object references
 343   $gene{DNA}=$dna;
 344   $gene{Transcripts}=\@transcriptobjs;
 345   $gene{Translations}=\@translationobjs;
 346
 347   my @exonobjs; my @intronobjs;
 348   my @repeatunitobjs; my @repeatregionobjs;
 349   my @primtranscriptobjs;
 350
 351   my ($object,$range,$start,$end,$strand);
 352
 353   my @exons=@{$genefeatureshash->{'exons'}};
 354   my @exondescs=@{$genefeatureshash->{'exondescs'}};
 355   if (@exons) {
 356     my $exoncount = 0;
 357     foreach $range (@exons) {
 358       ($start,$end,$strand)=@{$range};
 359       $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
 360       if ($object != -1) {
 361         $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount];
 362         $exoncount++;
 363         push (@exonobjs,$object);
 364       } else {
 365         $exoncount++;
 366       }
 367     }
 368     $gene{Exons}=\@exonobjs;
 369   }
 370   my @introns=@{$genefeatureshash->{'introns'}};
 371   my @introndescs=@{$genefeatureshash->{'introndescs'}};
 372   if (@introns) {
 373     my $introncount = 0;
 374     foreach $range (@introns) {
 375       ($start,$end,$strand)=@{$range};
 376       $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
 377       if ($object != -1) {
 378         $object->desc($introndescs[$introncount]);
 379         $introncount++;
 380         push (@intronobjs,$object);
 381       } else {
 382         $introncount++;
 383       }
 384     }
 385     $gene{Introns}=\@intronobjs;
 386   }
 387   my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}};
 388   if (@prim_transcripts) {
 389     foreach $range (@prim_transcripts) {
 390       ($start,$end,$strand)=@{$range};
 391       $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
 392       if ($object != -1) { push (@primtranscriptobjs,$object); }
 393     }
 394     $gene{Prim_Transcripts}=\@primtranscriptobjs;
 395   }
 396   my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}};
 397   my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}};
 398   if (@repeat_regions) {
 399     my $k=0;
 400     foreach $range (@repeat_regions) {
 401       ($start,$end,$strand)=@{$range};
 402       $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
 403       if ($object != -1) {
 404         $object->desc($repeat_regions_family[$k]);
 405         $k++;
 406         push (@repeatregionobjs,$object);
 407       } else {
 408         $k++;
 409       }
 410     }
 411     $gene{Repeat_Regions}=\@repeatregionobjs;
 412   }
 413   my @repeat_units=@{$genefeatureshash->{'repeat_units'}};
 414   my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}};
 415   if (@repeat_units) {
 416     my $k=0;
 417     foreach $range (@repeat_units) {
 418       ($start,$end,$strand)=@{$range};
 419       $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
 420       if ($object != -1) {
 421         $object->desc($repeat_units_family[$k]);
 422         $k++;
 423         push (@repeatunitobjs,$object);
 424       } else {
 425         $k++;
 426       }
 427     }
 428     $gene{Repeat_Units}=\@repeatunitobjs;
 429   }
 430
 431   # create the Gene
 432   my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos
 433   return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene,
 434                                   -upbound=>$min,-downbound=>$max));
 435 }
 436
 437 # maybe this function will be moved to general utility package
 438 # argument: array of numbers
 439 # returns: (min,max) numbers in the array
 440 sub rangeofarray {
 441   my $self=shift;
 442   my @array=@_;
 443   #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n";
 444   my ($max,$min,$element);
 445   $min=$max=shift(@array);
 446   foreach $element (@array) {
 447       $element = 0 unless defined $element;
 448     if ($element < $min) {
 449       $min=$element;
 450     }
 451     if ($element > $max) {
 452       $max=$element;
 453     }
 454   }
 455   #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n";
 456   return ($min,$max);
 457 }
 458
 459
 460 # argument: reference to DNA object, reference to array of transcripts
 461 # returns: an array of Translation object references
 462 sub transexonscreation {
 463   my $self=shift;
 464   my $dna=$_[0];
 465   my @transcripts=@{$_[1]};
 466
 467   my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj);
 468   my $translationobj;
 469   my @translationobjects;
 470   foreach $transcript (@transcripts) {
 471     foreach $exonref (@{$transcript}) {
 472       ($start,$end,$strand)=@{$exonref};
 473       #print "Creating Exon: start $start end $end strand $strand\n";
 474       $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
 475       #push (@exonobjects,$exonobj);
 476       push (@transexons,$exonobj);
 477     }
 478     $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons );
 479     if ($transcriptobj != -1) {
 480       $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj);
 481       @transexons=(); # cleans it
 482       #push (@transcriptobjects,$transcriptobj);
 483       push (@translationobjects,$translationobj);
 484     }
 485   }
 486   return (@translationobjects);
 487 }
 488
 489 #sub printgene {
 490 # deleted. Some functionality placed in Gene->printfeaturesnum
 491
 492 =head2 printswissprot
 493
 494   Title   : printswissprot
 495   Usage   : $loader->printswissprot($hashref);
 496   Function: prints out all information loaded from a database entry into the
 497             loader. Mainly used for testing purposes.
 498   Args    : a hashref containing the SWISSPROT entry datas
 499   Note    : the hashref can be obtained with a call to the method
 500                $loader->get_swisshash()      (BioPerl via Bio::DB::EMBL.pm)
 501             that takes as argument a string like "SWISS-PROT:P10275"
 502
 503 =cut
 504
 505 # argument: hashref containing the SWISSPROT entry datas
 506 # prints out that hash, showing the information loaded
 507 sub printswissprot {
 508   my ($self,$entry)=@_;
 509   unless ($entry) {
 510     return;
 511   }
 512   printf "ID: %s\n",
 513       $entry->{'ID'};
 514   printf "ACC: %s\n",
 515       $entry->{'AccNumber'};
 516   printf "GENE: %s\n",
 517       $entry->{'Gene'};
 518   printf "DES: %s\n",
 519       $entry->{'Description'};
 520   printf "ORG: %s\n",
 521       $entry->{'Organism'};
 522   printf "SEQLN: %s\n",
 523       $entry->{'SeqLength'};
 524   printf "SEQ: %s\n",
 525       substr($entry->{'Sequence'},0,64);
 526   if ($entry->{'Features'}) {
 527     my @features=@{$entry->{'Features'}};
 528     my $i;
 529     for $i (0..$#features) {
 530       print "|",$features[$i]->{'name'},"|";
 531       print " at ",$features[$i]->{'location'},": ";
 532       print "",$features[$i]->{'desc'},"\n";
 533     }
 534   }
 535 }
 536
 537 =head2 printembl
 538
 539   Title   : printembl
 540   Usage   : $loader->printembl();
 541   Function: prints out all information loaded from a database entry into the
 542             loader. Mainly used for testing purposes.
 543   Args    : none
 544
 545 =cut
 546
 547 # argument: hashref containing the EMBL entry datas
 548 # prints out that hash, showing the information loaded
 549 sub printembl {
 550   my ($self,$entry)=@_;
 551   unless ($entry) {
 552     $entry=$self->{'hash'};
 553   }
 554   my ($i,$featurename);
 555   printf "ID: %s\n",
 556       $entry->{'ID'};
 557   printf "ACC: %s\n",
 558       $entry->{'AccNumber'};
 559   printf "MOL: %s\n",
 560       $entry->{'Molecule'};
 561   printf "DIV: %s\n",
 562       $entry->{'Division'};
 563   printf "DES: %s\n",
 564       $entry->{'Description'};
 565   printf "ORG: %s\n",
 566       $entry->{'Organism'};
 567   printf "SEQLN: %s\n",
 568       $entry->{'SeqLength'};
 569   printf "SEQ: %s\n",
 570       substr($entry->{'Sequence'},0,64);
 571   my @features=@{$entry->{'Features'}};
 572   my @cds=@{$entry->{'CDS'}};
 573   print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n";
 574   my ($exonref,$transcript);
 575   my ($qualifiernumber,$qualifiers,$key);
 576   my ($start,$end,$strand);
 577   my $j=0;
 578   for $i (0..$#features) {
 579     $featurename=$features[$i]->{'name'};
 580     if ($featurename eq "CDS") {
 581       print "|CDS| number $j at feature position: $i\n";
 582       #print $features[$i]->{'location'},"\n";
 583       $transcript=$features[$i]->{'range'};
 584       foreach $exonref (@{$transcript}) {
 585         ($start,$end,$strand)=@{$exonref};
 586         print "\tExon: start $start end $end strand $strand\n";
 587       }
 588       $j++;
 589     } else {
 590       print "|$featurename| at feature position: $i\n";
 591       print "\trange: ";
 592       print join("\t",@{$features[$i]->{'range'}}),"\n";
 593       #print $features[$i]->{'location'},"\n";
 594     }
 595     $qualifiernumber=$features[$i]->{'qual_number'};
 596     $qualifiers=$features[$i]->{'qualifiers'}; # hash
 597     foreach $key (keys (%{$qualifiers})) {
 598     print "\t\t",$key,": ";
 599       print $qualifiers->{$key},"\n";
 600     }
 601   }
 602 }
 603
 604 =head2 genes
 605
 606   Title   : genes
 607   Usage   : $loader->genes();
 608   Function: Returns an array of gene_names (strings) contained in the loaded
 609             entry.
 610   Args    : none
 611
 612 =cut
 613
 614 # argument: entryhashref
 615 # returns: array of genenames found in the entry
 616 sub genes {
 617   my ($self,$entry)=@_;
 618   unless ($entry) {
 619     $entry=$self->{'hash'};
 620   }
 621   my @entryfeatures=@{$entry->{'Features'}};
 622   my ($genename,$genenames,$entryfeature);
 623   for $entryfeature (@entryfeatures) {
 624     $genename=$entryfeature->{'qualifiers'}->{'gene'};
 625     if ($genename) {
 626       if (index($genenames,$genename) == -1) { # if name is new
 627         $genenames .= $genename . " "; # add the name
 628       }
 629     }
 630   }
 631   return (split(/ /,$genenames)); # assumes no space inbetween each genename
 632 }
 633
 634 # arguments: swisshash, translation objref
 635 # adds information to the Translation, creates AARange objects, sets the
 636 # aa_range attribute on the Translation, pointing to those objects
 637 sub swisshash2liveseq {
 638   my ($self,$entry,$translation)=@_;
 639   my $translength=$translation->length;
 640   $translation->desc($translation->desc . $entry->{'Description'});
 641   $translation->display_id("SWISSPROT:" . $entry->{'ID'});
 642   $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'});
 643   $translation->name($entry->{'Gene'});
 644   $translation->source($entry->{'Organism'});
 645   my @aarangeobjects;
 646   my ($start,$end,$aarangeobj,$feature);
 647   my @features; my @newfeatures;
 648   if ($entry->{'Features'}) {
 649     @features=@{$entry->{'Features'}};
 650   }
 651   my $cleavedmet=0;
 652   # check for cleaved Met
 653   foreach $feature (@features) {
 654     if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) {
 655       $cleavedmet=1;
 656       $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence
 657     } else {
 658       push(@newfeatures,$feature);
 659     }
 660   }
 661
 662   my $swissseq=$entry->{'Sequence'};
 663   my $liveseqtransl=$translation->seq;
 664   chop $liveseqtransl; # to take away the trailing STOP "*"
 665   my $translated=substr($liveseqtransl,$cleavedmet);
 666
 667   my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met
 668
 669   if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed?
 670     print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n";
 671     carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!";
 672     return;
 673   }
 674
 675   #my $i=0; # debug
 676   @features=@newfeatures;
 677   foreach $feature (@features) {
 678     #print "Processing SwissProtFeature: $i\n"; # debug
 679     ($start,$end)=split(/ /,$feature->{'location'});
 680     # Note: cleavedmet is taken in account for updating numbering
 681     $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -description => $feature->{'description'}, -translation => $translation, -translength => $translength);
 682     if ($aarangeobj != -1) {
 683       push(@aarangeobjects,$aarangeobj);
 684     }
 685     # $i++; # debug
 686   }
 687   $translation->{'aa_ranges'}=\@aarangeobjects;
 688 }
 689
 690 # if there is no SRS support, the default will be to return 0
 691 # i.e. this function is overridden in SRS package
 692 sub get_swisshash {
 693   return (0);
 694 }
 695
 696 # Args: $entry hashref, gene_name OR cds_position (undef is used to
 697 # choose between the two), getswissprotinfo boolean flag
 698 # Returns: an hash holding various arrayref used in the hash2gene method
 699 # Function: examines the nucleotide entry, identifying features belonging
 700 # to the gene (defined either by its name or by the position of its CDS in
 701 # the entry)
 702
 703 sub _findgenefeatures {
 704   my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_;
 705
 706   my @entryfeatures=@{$entry->{'Features'}};
 707   my @exons; my @introns; my @prim_transcripts; my @transcripts;
 708   my @repeat_units; my @repeat_regions;
 709   my @repeat_units_family; my @repeat_regions_family; my $rpt_family;
 710   my $entryfeature; my @genefeatures;
 711   my $desc; my @exondescs; my @introndescs;
 712
 713   # for swissprot xreference
 714   my ($swissacc,$swisshash); my @swisshashes;
 715
 716   # for translation_tables
 717   my @ttables;
 718
 719   # to create labels
 720   my ($name,$exon);
 721   my @range; my @cdsexons; my @labels;
 722
 723   # maybe here also could be added special case when there is no CDS feature
 724   # in the entry (e.g. tRNA entry -> TOCHECK).
 725   # let's deal with the special case in which there is just one gene per entry
 726   # usually without /gene qualifier
 727   my @cds=@{$entry->{'CDS'}};
 728
 729   my $skipgenematch=0;
 730   if (scalar(@cds) == 1) {
 731     #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features.";
 732     $skipgenematch=1;
 733   }
 734
 735   my ($cds_begin,$cds_end,$proximity);
 736   if ($cds_position) { # if a position has been requested
 737     my @cds_exons=@{$cds[$cds_position-1]->{'range'}};
 738     ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS
 739     $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'};
 740     # DEBUG
 741     unless ($skipgenematch) {
 742       carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------";
 743     }
 744     $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS
 745   }
 746
 747   for $entryfeature (@entryfeatures) { # get only features for the desired gene
 748     if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) {
 749       push(@genefeatures,$entryfeature);
 750
 751       my @range=@{$entryfeature->{'range'}};
 752       $name=$entryfeature->{'name'};
 753       my %qualifierhash=%{$entryfeature->{'qualifiers'}};
 754       if ($name eq "CDS") { # that has range containing array of exons
 755
 756         # swissprot crossindexing (if without SRS support it will fill array
 757         # with zeros and do nothing
 758         if ($getswissprotinfo) {
 759           $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'};
 760           $swisshash=$self->get_swisshash($swissacc);
 761           #$self->printswissprot($swisshash); # DEBUG
 762           push (@swisshashes,$swisshash);
 763         }
 764
 765         push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified
 766
 767         # create labels array
 768         for $exon (@range) {
 769           push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS
 770         }
 771         push (@transcripts,$entryfeature->{'range'});
 772       } else {
 773         # "simplifying" the joinedlocation features. I.e. changing them from
 774         # multijoined ones to simple plain start-end features, taking only
 775         # the start of the first "exon" and the end of the last "exon" as
 776         # start and end of the entire feature
 777         if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location
 778           @range=($range[0]->[0],$range[-1]->[1]);
 779         }
 780         push(@labels,$range[0],$range[1]); # start and end of every feature
 781         if ($name eq "exon") {
 782           $desc=$entryfeature->{'qualifiers'}->{'number'};
 783           if ($entryfeature->{'qualifiers'}->{'note'}) {
 784             if ($desc) {
 785               $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
 786             } else {
 787               $desc = $entryfeature->{'qualifiers'}->{'note'};
 788             }
 789           }
 790           push (@exondescs,$desc || "unknown");
 791           push(@exons,\@range);
 792         }
 793         if ($name eq "intron") {
 794           $desc=$entryfeature->{'qualifiers'}->{'number'};
 795           if ($desc) {
 796             $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
 797           } else {
 798             $desc = $entryfeature->{'qualifiers'}->{'note'};
 799           }
 800           push (@introndescs,$desc || "unknown");
 801           push(@introns,\@range);
 802         }
 803         if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); }
 804         if ($name eq "repeat_unit") { push(@repeat_units,\@range);
 805           $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
 806           push (@repeat_units_family,$rpt_family || "unknown");
 807         }
 808         if ($name eq "repeat_region") { push(@repeat_regions,\@range);
 809           $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
 810           push (@repeat_regions_family,$rpt_family || "unknown");
 811         }
 812       }
 813     }
 814   }
 815   unless ($gene_name) { $gene_name="cds-position:".$cds_position; }
 816   my %genefeatureshash;
 817   $genefeatureshash{gene_name}=$gene_name;
 818   $genefeatureshash{genefeatures}=\@genefeatures;
 819   $genefeatureshash{labels}=\@labels;
 820   $genefeatureshash{ttables}=\@ttables;
 821   $genefeatureshash{swisshashes}=\@swisshashes;
 822   $genefeatureshash{transcripts}=\@transcripts;
 823   $genefeatureshash{exons}=\@exons;
 824   $genefeatureshash{exondescs}=\@exondescs;
 825   $genefeatureshash{introns}=\@introns;
 826   $genefeatureshash{introndescs}=\@introndescs;
 827   $genefeatureshash{prim_transcripts}=\@prim_transcripts;
 828   $genefeatureshash{repeat_units}=\@repeat_units;
 829   $genefeatureshash{repeat_regions}=\@repeat_regions;
 830   $genefeatureshash{repeat_units_family}=\@repeat_units_family;
 831   $genefeatureshash{repeat_regions_family}=\@repeat_regions_family;
 832   return (\%genefeatureshash);
 833 }
 834
 835
 836 # used by _findgenefeatures, when a CDS at a certain position is requested,
 837 # to retrieve only features quite close to the wanted CDS.
 838 # Args: range hashref, begin and end positions of the CDS, $proximity
 839 # $proximity holds the maximum distance between the extremes of the CDS
 840 # and of the feature under exam.
 841 # Returns: boolean
 842 sub _checkfeatureproximity {
 843   my ($self,$range,$cds_begin,$cds_end,$proximity)=@_;
 844   my @range=@{$range};
 845   my ($begin,$end,$strand);
 846   if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons
 847     ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]);
 848   } else {
 849     ($begin,$end,$strand)=@range;
 850   }
 851   if ($cds_begin > $cds_end) { # i.e. reverse strand CDS
 852     ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries
 853   }
 854   if ($strand == -1) { # reverse strand
 855     ($begin,$end)=($end,$begin); # swap boundaries
 856   }
 857   if (($cds_begin-$end)>$proximity) {
 858     carp "--DEBUG-- feature rejected: begin $begin end $end -------";
 859     return (0);
 860   }
 861   if (($begin-$cds_end)>$proximity) {
 862     carp "--DEBUG-- feature rejected: begin $begin end $end -------";
 863     return (0);
 864   }
 865   carp "--DEBUG-- feature accepted: begin $begin end $end -------";
 866   return (1); # otherwise ok, feature considered next to CDS
 867 }
 868
 869
 870 # function that calls the external program "align" (on the fasta2 package)
 871 # to create an alignment between two sequences, returning the aligned
 872 # strings and the codes for the identity (:: ::::)
 873
 874 sub _get_alignment {
 875   my ($self,$seq1,$seq2)=@_;
 876
 877   my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
 878   my $fastafile1="/tmp/tmpfastafile1";
 879   my $fastafile2="/tmp/tmpfastafile2";
 880   my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut
 881   my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>$null | $grepcut"; # ALIGN
 882   open my $TMPFASTAFILE1, '>', $fastafile1 or croak "Could not write file '$fastafile1' for aa alignment: $!";
 883   open my $TMPFASTAFILE2, '>', $fastafile2 or croak "Could not write file '$fastafile2' for aa alignment: $!";
 884   print $TMPFASTAFILE1 ">firstseq\n$seq1\n";
 885   print $TMPFASTAFILE2 ">secondseq\n$seq2\n";
 886   close $TMPFASTAFILE1;
 887   close $TMPFASTAFILE2;
 888   my $alignment=`$alignprogram`;
 889   my @alignlines=split(/\n/,$alignment);
 890   my ($linecount,$seq1_aligned,$seq2_aligned,$codes);
 891   for ($linecount=0; $linecount < @alignlines; $linecount+=3) {
 892     $seq1_aligned .= $alignlines[$linecount];
 893     $codes .= $alignlines[$linecount+1];
 894     $seq2_aligned .= $alignlines[$linecount+2];
 895   }
 896   return ($seq1_aligned,$seq2_aligned,$codes);
 897 }
 898
 899 # common part of the function to create a novel liveseq gene structure
 900 # from an amino acid sequence, using codon usage frequencies
 901 # args: codon_usage_array transltableid aasequence gene_name
 902 sub _common_novelaasequence2gene {
 903   my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_;
 904   my @species_codon_usage=@{$species_codon_usage};
 905   my @codon_usage_label=
 906       qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg
 907       tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga
 908       ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa
 909       gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag
 910       tga);
 911   my ($i,$j);
 912   my %codon_usage_value;
 913   my $aa_codon_total;
 914   for ($i=0;$i<64;$i++) {
 915     $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i];
 916   }
 917
 918   my $CodonTable  = Bio::Tools::CodonTable->new ( -id => $ttabid );
 919   my @aminoacids = split(//,uc($aasequence));
 920   my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon);
 921   for $i (@aminoacids) {
 922     @alt_codons = $CodonTable->revtranslate($i);
 923     unless (@alt_codons) {
 924       carp "No reverse translation possible for aminoacid \'$i\'";
 925       $dnasequence .= "???";
 926     } else {
 927       $aa_codon_total=0;
 928       for $j (@alt_codons) {
 929         $aa_codon_total+=$codon_usage_value{$j};
 930       }
 931       # print "aminoacid $i, codonchoice: "; # verbose
 932       #$partial=0;
 933       #for $j (@alt_codons) {
 934         #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total;
 935         #$partial+=($codon_usage_value{$j}/$aa_codon_total);
 936       #}
 937       #print "\n";
 938       $dice=rand(1);
 939       #print "roulette: $dice\n"; # verbose
 940       $partial=0;
 941       $chosen_codon="";
 942       CODONCHOICE:
 943       for $j (0..@alt_codons) { # last one not accounted
 944         $thiscodon=$alt_codons[$j];
 945         $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total);
 946         if ($dice < $relativeusage+$partial) {
 947           $chosen_codon=$thiscodon;
 948           last CODONCHOICE;
 949         } else {
 950           $partial += $relativeusage;
 951         }
 952       }
 953       unless ($chosen_codon) {
 954         $chosen_codon = $alt_codons[-1]; # the last one
 955       }
 956       # print ".....adding $chosen_codon\n"; # verbose
 957       $dnasequence .= $chosen_codon;
 958     }
 959   }
 960
 961   my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence);
 962   my $min=1;
 963   my $max=length($dnasequence);
 964   my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1);
 965   my @exons=($exon);
 966   my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons);
 967   $transcript->translation_table($ttabid);
 968   my @transcripts=($transcript);
 969   my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript);
 970   my @translations=($translation);
 971   my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations);
 972   my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max);
 973
 974   # creation of gene
 975   unless ($gene) { # if $gene == 0 it means problems in hash2gene
 976     carp "Error in Gene creation phase";
 977     return (0);
 978   }
 979   return $gene;
 980 }
 981
 982 1;