Bio/Tools/Est2Genome.pm

   1 #
   2 # BioPerl module for Bio::Tools::Est2Genome
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-at-bioperl.org>
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::Est2Genome - Parse est2genome output, makes simple Bio::SeqFeature::Generic objects
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::Tools::Est2Genome;
  21
  22   my $featureiter = Bio::Tools::Est2Genome->new(-file => 'output.est2genome');
  23
  24   # This is going to be fixed to use the SeqAnalysisI next_feature
  25   # Method eventually when we have the objects to put the data in
  26   # properly
  27   while( my $f = $featureiter->parse_next_gene ) {
  28    # process Bio::SeqFeature::Generic objects here
  29   }
  30
  31 =head1 DESCRIPTION
  32
  33 This module is a parser for C<est2genome> [EMBOSS] alignments of est/cdna
  34 sequence to genomic DNA.  This is generally accepted as the best
  35 program for predicting splice sites based on est/dnas (as far as I know).
  36
  37 This module currently does not try pull out the ungapped alignments
  38 (Segment) but may in the future.
  39
  40 =head1 FEEDBACK
  41
  42 =head2 Mailing Lists
  43
  44 User feedback is an integral part of the evolution of this and other
  45 Bioperl modules. Send your comments and suggestions preferably to
  46 the Bioperl mailing list.  Your participation is much appreciated.
  47
  48   bioperl-l@bioperl.org                  - General discussion
  49   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  50
  51 =head2 Support
  52
  53 Please direct usage questions or support issues to the mailing list:
  54
  55 I<bioperl-l@bioperl.org>
  56
  57 rather than to the module maintainer directly. Many experienced and
  58 reponsive experts will be able look at the problem and quickly
  59 address it. Please include a thorough description of the problem
  60 with code and data examples if at all possible.
  61
  62 =head2 Reporting Bugs
  63
  64 Report bugs to the Bioperl bug tracking system to help us keep track
  65 of the bugs and their resolution. Bug reports can be submitted the
  66 web:
  67
  68   https://github.com/bioperl/bioperl-live/issues
  69
  70 =head1 AUTHOR - Jason Stajich
  71
  72 Email jason-at-bioperl.org
  73
  74 =head1 APPENDIX
  75
  76 The rest of the documentation details each of the object methods.
  77 Internal methods are usually preceded with a _
  78
  79 =cut
  80
  81
  82 # Let the code begin...
  83
  84
  85 package Bio::Tools::Est2Genome;
  86 use strict;
  87
  88 # Object preamble - inherits from Bio::Root::Root
  89
  90 use Bio::Root::Root;
  91 use Bio::SeqFeature::Gene::Exon;
  92 use Bio::SeqFeature::Gene::Transcript;
  93 use Bio::SeqFeature::Gene::Intron;
  94 use Bio::SeqFeature::Gene::GeneStructure;
  95 use Bio::SeqFeature::SimilarityPair;
  96
  97 use base qw(Bio::Tools::AnalysisResult);
  98
  99 =head2 new
 100
 101  Title   : new
 102  Usage   : my $obj = Bio::Tools::Est2Genome->new();
 103  Function: Builds a new Bio::Tools::Est2Genome object
 104  Returns : an instance of Bio::Tools::Est2Genome
 105  Args    : -file => 'output.est2genome' or
 106            -fh   => \*EST2GENOMEOUTPUT
 107            -genomefirst => 1  # genome was the first input (not standard)
 108
 109 =cut
 110
 111 sub _initialize_state {
 112     my($self,@args) = @_;
 113
 114     # call the inherited method first
 115     my $make = $self->SUPER::_initialize_state(@args);
 116
 117     my ($genome_is_first) = $self->_rearrange([qw(GENOMEFIRST)], @args);
 118
 119     delete($self->{'_genome_is_first'});
 120     $self->{'_genome_is_first'} = $genome_is_first if(defined($genome_is_first));
 121     $self->analysis_method("est2genome");
 122 }
 123
 124 =head2 analysis_method
 125
 126  Usage     : $sim4->analysis_method();
 127  Purpose   : Inherited method. Overridden to ensure that the name matches
 128              /est2genome/i.
 129  Returns   : String
 130  Argument  : n/a
 131
 132 =cut
 133
 134 #-------------
 135 sub analysis_method {
 136 #-------------
 137     my ($self, $method) = @_;
 138     if($method && ($method !~ /est2genome/i)) {
 139         $self->throw("method $method not supported in " . ref($self));
 140     }
 141     return $self->SUPER::analysis_method($method);
 142 }
 143
 144 =head2 parse_next_gene
 145
 146  Title   : parse_next_gene
 147  Usage   : @gene = $est2genome_result->parse_next_gene;
 148            foreach $exon (@exons) {
 149                # do something
 150            }
 151
 152  Function: Parses the next alignments of the est2genome result file and
 153            returns the found exons as an array of
 154            Bio::SeqFeature::SimilarityPair objects. Call
 155            this method repeatedly until an empty array is returned to get the
 156            results for all alignments.
 157
 158            The $exon->seq_id() attribute will be set to the identifier of the
 159            respective sequence for both sequences.
 160            The length is accessible via the seqlength()
 161            attribute of $exon->query() and
 162            $exon->est_hit().
 163  Returns : An array (or array reference) of Bio::SeqFeature::SimilarityPair and Bio::SeqFeature::Generic objects
 164            or Bio::SeqFeature::Gene::GeneStructure
 165  Args    : flag(1/0) indicating to return Bio::SeqFeature::Gene::GeneStructure or Bio::SeqFeature::SimilarityPair
 166            defaults to 0
 167
 168 =cut
 169
 170 sub parse_next_gene {
 171    my ($self,$return_gene) = @_;
 172    return $self->_parse_gene_struct if $return_gene;
 173    my $seensegment = 0;
 174    my @features;
 175    my ($qstrand,$hstrand) = (1,1);
 176    my $lasthseqname;
 177    while( defined($_ = $self->_readline) ) {
 178        if( /Note Best alignment is between (reversed|forward) est and (reversed|forward) genome, (but|and) splice\s+sites imply\s+(forward gene|REVERSED GENE)/) {
 179            if( $seensegment ) {
 180                $self->_pushback($_);
 181                return wantarray ? @features : \@features;
 182            }
 183            $hstrand = -1 if $1 eq 'reversed';
 184            $qstrand = -1 if $4 eq 'REVERSED GENE';
 185            #$self->debug( "1=$1, 2=$2, 4=$4\n");
 186        }
 187        elsif( /^Exon/ ) {
 188            my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,
 189                $hstart,$hend, $hseqname) = split;
 190            $lasthseqname = $hseqname;
 191            my $query = Bio::SeqFeature::Similarity->new(-primary => $name,
 192                                                        -source  => $self->analysis_method,
 193                                                        -seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
 194                                                        -start   => $qstart,
 195                                                        -end     => $qend,
 196                                                        -strand  => $qstrand,
 197                                                        -score   => $score,
 198                                                        -tag => {
 199 #                                                          'Location' => "$hstart..$hend",
 200                                                            'Sequence' => "$hseqname",
 201                                                            'identity' => $perc_ident,
 202                                                            }
 203                                                        );
 204            my $hit = Bio::SeqFeature::Similarity->new(-primary => 'exon_hit',
 205                                                      -source  => $self->analysis_method,
 206                                                      -seq_id => $hseqname,
 207                                                      -start   => $hstart,
 208                                                      -end     => $hend,
 209                                                      -strand  => $hstrand,
 210                                                      -score   => $score,
 211                                                      -tag => {
 212 #                                                        'Location' => "$qstart..$qend",
 213                                                          'Sequence' => "$qseqname",
 214                                                          'identity' => $perc_ident,
 215                                                      }
 216                                                      );
 217            push @features, Bio::SeqFeature::SimilarityPair->new
 218                (-query => $query,
 219                 -hit   => $hit,
 220                 -source => $self->analysis_method);
 221        } elsif( /^([\-\+\?])(Intron)/) {
 222            my ($name,$score,$perc_ident,$qstart,$qend,$qseqname) = split;
 223            push @features, Bio::SeqFeature::Generic->new(-primary => $2,
 224                                                         -source => $self->analysis_method,
 225                                                         -start => $qstart,
 226                                                         -end   => $qend,
 227                                                         -strand => $qstrand,
 228                                                         -score  => $score,
 229                                                         -seq_id => $qseqname,
 230                                                         -tag => {
 231                                                          'identity' => $perc_ident,
 232                                                             'Sequence' => $lasthseqname});
 233        } elsif( /^Span/ ) {
 234        } elsif( /^Segment/ ) {
 235            $seensegment = 1;
 236        } elsif( /^\s+$/ ) { # do nothing
 237        } else {
 238            $self->warn( "unknown line $_\n");
 239        }
 240    }
 241    return unless( @features );
 242    return wantarray ? @features : \@features;
 243 }
 244
 245 sub _parse_gene_struct {
 246    my ($self) = @_;
 247    my $seensegment = 0;
 248    my @features;
 249    my ($qstrand,$hstrand) = (1,1);
 250    my $lasthseqname;
 251    my $gene = Bio::SeqFeature::Gene::GeneStructure->new(-source => $self->analysis_method);
 252    my $transcript = Bio::SeqFeature::Gene::Transcript->new(-source => $self->analysis_method);
 253    my @suppf;
 254    my @exon;
 255    while( defined($_ = $self->_readline) ) {
 256        if( /Note Best alignment is between (reversed|forward) est and (reversed|forward) genome, (but|and) splice\s+sites imply\s+(forward gene|REVERSED GENE)/) {
 257               if( $seensegment ) {
 258                $self->_pushback($_);
 259                return $gene;
 260               }
 261            $hstrand = -1 if $1 eq 'reversed';
 262            $qstrand = -1 if $4 eq 'REVERSED GENE';
 263        }
 264        elsif( /^Exon/ ) {
 265            my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,$hstart,$hend, $hseqname) = split;
 266            $lasthseqname = $hseqname;
 267            my $exon = Bio::SeqFeature::Gene::Exon->new(-primary => $name,
 268                                                        -source  => $self->analysis_method,
 269                                                        -seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
 270                                                        -start   => $qstart,
 271                                                        -end     => $qend,
 272                                                        -strand  => $qstrand,
 273                                                        -score   => $score,
 274                                                        -tag => {
 275                             #'Location' => "$hstart..$hend",
 276                                                          'identity' => $perc_ident,
 277                                                                    'Sequence' => "$hseqname",
 278                                                                       }
 279                                                        );
 280           $transcript->seq_id($qseqname) unless $transcript->seq_id;
 281           $exon->add_tag_value('phase',0);
 282           push @exon, $exon;
 283
 284        } elsif( /^([\-\+\?])(Intron)/) {
 285          next; #intron auto matically built from exons..hope thats ok..
 286        } elsif( /^Span/ ) {
 287        } elsif( /^Segment/ ) {
 288             my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,$hstart,$hend, $hseqname) = split;
 289                  my $query = Bio::SeqFeature::Similarity->new(-primary => $name,
 290                                                        -source  => $self->analysis_method,
 291                                                        -seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
 292                                                        -start   => $qstart,
 293                                                        -end     => $qend,
 294                                                        -strand  => $qstrand,
 295                                                        -score   => $score,
 296                                                        -tag => {
 297 #                                                          'Location' => "$hstart..$hend",
 298                                                            'Sequence' => "$hseqname",
 299                                                          'identity' => $perc_ident,
 300                                                            }
 301                                                      );
 302            my $hit = Bio::SeqFeature::Similarity->new(-primary => 'exon_hit',
 303                                                                                              -source  => $self->analysis_method,
 304                                                                                              -seq_id => $hseqname,
 305                                                                                              -start   => $hstart,
 306                                                                                              -end     => $hend,
 307                                                                                              -strand  => $hstrand,
 308                                                                                              -score   => $score,
 309                                                                                              -tag => {
 310                                                             #   'Location' => "$qstart..$qend",
 311                                                                                                  'Sequence' => "$qseqname",
 312                                                          'identity' => $perc_ident,
 313                                                                                                 }
 314                                                      );
 315                    my $support =  Bio::SeqFeature::SimilarityPair->new(-query => $query,
 316                                                                         -hit   => $hit,
 317                                                                         -source => $self->analysis_method);
 318              push @suppf, $support;
 319        } elsif( /^\s+$/ ) { # do nothing
 320        } else {
 321            $self->warn( "unknown line $_\n");
 322        }
 323    }
 324    return unless $#exon >=0;
 325    foreach my $e(@exon){
 326     my @add;
 327     foreach my $sf(@suppf){
 328       if($sf->overlaps($e)){
 329           push @add,$sf;
 330       }
 331     }
 332     $e->add_tag_value('supporting_feature',@add);
 333     $transcript->add_exon($e);
 334   }
 335
 336    $gene->add_transcript($transcript);
 337    $gene->seq_id($transcript->seq_id);
 338    return $gene;
 339 }
 340
 341 =head2 next_feature
 342
 343  Title   : next_feature
 344  Usage   : $seqfeature = $obj->next_feature();
 345  Function: Returns the next feature available in the analysis result, or
 346            undef if there are no more features.
 347  Example :
 348  Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
 349            more features.
 350  Args    : none
 351
 352 =cut
 353
 354 sub next_feature {
 355     my ($self) = shift;
 356     $self->throw("We haven't really done this right, yet, use parse_next_gene");
 357 }
 358
 359
 360 1;