Bio/SeqIO/chaos.pm

   1 # $Id$
   2 # $Date$
   3 #
   4 # BioPerl module for Bio::SeqIO::chaos
   5 #
   6 # Chris Mungall <cjm@fruitfly.org>
   7 #
   8 # You may distribute this module under the same terms as perl itself
   9
  10 # POD documentation - main docs before the code
  11
  12 =head1 NAME
  13
  14 Bio::SeqIO::chaos - chaos sequence input/output stream
  15
  16 =head1 SYNOPSIS
  17
  18     #In general you will not want to use this module directly;
  19     #use the chaosxml format via SeqIO
  20
  21     $outstream = Bio::SeqIO->new(-file => $filename,
  22                                  -format => 'chaosxml');
  23
  24     while ( my $seq = $instream->next_seq() ) {
  25        $outstream->write_seq($seq);
  26     }
  27
  28 =head1 DESCRIPTION
  29
  30 This is the guts of L<Bio::SeqIO::chaosxml> - please refer to the
  31 documentation for this module
  32
  33 B<CURRENTLY WRITE ONLY>
  34
  35 ChaosXML is an XML mapping of the chado relational database; for more
  36 information, see http://www.fruitfly.org/chaos-xml
  37
  38 chaos can be represented in various syntaxes - XML, S-Expressions or
  39 indented text. You should see the relevant SeqIO file. You will
  40 probably want to use L<Bio::SeqIO::chaosxml>, which is a wrapper to
  41 this module.
  42
  43 =head2 USING STAG OBJECTS
  44
  45 B<non-standard bioperl stuff you dont necessarily need to know follows>
  46
  47 This module (in write mode) is an B<event producer> - it generates XML
  48 events via the L<Data::Stag> module. If you only care about the final
  49 end-product xml, use L<Bio::SeqIO::chaosxml>
  50
  51 You can treat the resulting chaos-xml stream as stag XML objects;
  52
  53     $outstream = Bio::SeqIO->new(-file => $filename, -format => 'chaos');
  54
  55     while ( my $seq = $instream->next_seq() ) {
  56        $outstream->write_seq($seq);
  57     }
  58     my $chaos = $outstream->handler->stag;
  59     # stag provides get/set methods for xml elements
  60     # (these are chaos objects, not bioperl objects)
  61     my @features = $chaos->get_feature;
  62     my @feature_relationships = $chaos->get_feature_relationships;
  63     # stag objects can be queried with functional-programming
  64     # style queries
  65     my @features_in_range =
  66       $chaos->where('feature',
  67                     sub {
  68                          my $featureloc = shift->get_featureloc;
  69                          $featureloc->strand == 1 &&
  70                          $featureloc->nbeg > 10000 &&
  71                          $featureloc->nend < 20000;
  72                     });
  73     foreach my $feature (@features_in_range) {
  74       my $featureloc = $feature->get_featureloc;
  75       printf "%s [%d->%d on %s]\n",
  76         $feature->sget_name,
  77         $featureloc->sget_nbeg,
  78         $featureloc->sget_end,
  79         $featureloc->sget_srcfeature_id;
  80     }
  81
  82 =head1 MODULES REQUIRED
  83
  84 L<Data::Stag>
  85
  86 Downloadable from CPAN; see also http://stag.sourceforge.net
  87
  88 =head1 FEEDBACK
  89
  90 =head2 Mailing Lists
  91
  92 User feedback is an integral part of the evolution of this and other
  93 Bioperl modules. Send your comments and suggestions preferably to one
  94 of the Bioperl mailing lists.  Your participation is much appreciated.
  95
  96   bioperl-l@bioperl.org                  - General discussion
  97   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  98
  99 =head2 Reporting Bugs
 100
 101 Report bugs to the Bioperl bug tracking system to help us keep track
 102 the bugs and their resolution.
 103 Bug reports can be submitted via the web:
 104
 105   http://bugzilla.bioperl.org
 106
 107 =head1 AUTHOR - Chris Mungall
 108
 109 Email cjm@fruitfly.org
 110
 111 =head1 APPENDIX
 112
 113 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
 114
 115 =cut
 116
 117 # Let the code begin...
 118
 119 package Bio::SeqIO::chaos;
 120 use strict;
 121
 122 use Bio::SeqFeature::Generic;
 123 use Bio::Species;
 124 use Bio::Seq::SeqFactory;
 125 use Bio::Annotation::Collection;
 126 use Bio::Annotation::Comment;
 127 use Bio::Annotation::Reference;
 128 use Bio::Annotation::DBLink;
 129 use Bio::SeqFeature::Tools::TypeMapper;
 130 use Bio::SeqFeature::Tools::FeatureNamer;
 131 use Bio::SeqFeature::Tools::IDHandler;
 132 use Data::Stag qw(:all);
 133
 134 use base qw(Bio::SeqIO);
 135
 136 our $TM = 'Bio::SeqFeature::Tools::TypeMapper';
 137 our $FNAMER = 'Bio::SeqFeature::Tools::FeatureNamer';
 138 our $IDH = 'Bio::SeqFeature::Tools::IDHandler';
 139
 140 sub _initialize {
 141     my($self,@args) = @_;
 142
 143     $self->SUPER::_initialize(@args);
 144     if( ! defined $self->sequence_factory ) {
 145         $self->sequence_factory(Bio::Seq::SeqFactory->new
 146                                 (-verbose => $self->verbose(),
 147                                  -type => 'Bio::Seq::RichSeq'));
 148     }
 149     my $wclass = $self->default_handler_class;
 150     $self->handler($wclass);
 151     if ($self->_fh) {
 152         $self->handler->fh($self->_fh);
 153     }
 154     $self->{_end_of_data} = 0;
 155     $self->_type_by_id_h({});
 156     my $t = time;
 157     my $ppt = localtime $t;
 158     $self->handler->S("chaos");
 159     $self->handler->ev(chaos_metadata=>[
 160                                         [chaos_version=>1],
 161                                         [chaos_flavour=>'bioperl'],
 162                                         [feature_unique_key=>'feature_id'],
 163                                         [equiv_chado_release=>'chado_1_01'],
 164                                         [export_unixtime=>$t],
 165                                         [export_localtime=>$ppt],
 166                                         [export_host=>$ENV{HOST}],
 167                                         [export_user=>$ENV{USER}],
 168                                         [export_perl5lib=>$ENV{PERL5LIB}],
 169                                         [export_program=>$0],
 170                                         [export_module=>'Bio::SeqIO::chaos'],
 171                                         [export_module_cvs_id=>'$Id$'],
 172                                        ]);
 173
 174     return;
 175 }
 176
 177 sub DESTROY {
 178     my $self = shift;
 179     $self->end_of_data();
 180     $self->SUPER::DESTROY();
 181 }
 182
 183 sub end_of_data {
 184     my $self = shift;
 185     return if $self->{_end_of_data};
 186     $self->{_end_of_data} = 1;
 187     $self->handler->E("chaos");
 188 }
 189
 190 sub default_handler_class {
 191     return Data::Stag->makehandler;
 192 }
 193
 194 =head2 context_namespace
 195
 196  Title   : context_namespace
 197  Usage   : $obj->context_namespace($newval)
 198  Function:
 199  Example :
 200  Returns : value of context_namespace (a scalar)
 201  Args    : on set, new value (a scalar or undef, optional)
 202
 203 IDs will be preceeded with the context namespace
 204
 205 =cut
 206
 207 sub context_namespace{
 208     my $self = shift;
 209
 210     return $self->{'context_namespace'} = shift if @_;
 211     return $self->{'context_namespace'};
 212 }
 213
 214
 215 =head2 next_seq
 216
 217  Title   : next_seq
 218  Usage   : $seq = $stream->next_seq()
 219  Function: returns the next sequence in the stream
 220  Returns : Bio::Seq object
 221  Args    :
 222
 223 =cut
 224
 225 sub next_seq {
 226     my ($self,@args) = @_;
 227     my $seq = $self->sequence_factory->create
 228         (
 229          #         '-verbose' =>$self->verbose(),
 230          #       %params,
 231          #       -seq => $seqc,
 232          #       -annotation => $annotation,
 233          #       -features => \@features
 234         );
 235     return $seq;
 236 }
 237
 238 sub handler {
 239     my $self = shift;
 240     $self->{_handler} = shift if @_;
 241     return $self->{_handler};
 242 }
 243
 244
 245 =head2 write_seq
 246
 247  Title   : write_seq
 248  Usage   : $stream->write_seq($seq)
 249  Function: writes the $seq object (must be seq) to the stream
 250  Returns : 1 for success and 0 for error
 251  Args    : Bio::Seq
 252
 253
 254 =cut
 255
 256 sub write_seq {
 257     my ($self,$seq) = @_;
 258
 259     if( !defined $seq ) {
 260         $self->throw("Attempting to write with no seq!");
 261     }
 262
 263     if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
 264         $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
 265     }
 266
 267     # get a handler - must inherit from Data::Stag::BaseHandler;
 268     my $w = $self->handler;
 269
 270     # start of data
 271     ###    $w->S("chaos_block");
 272
 273     my $seq_chaos_feature_id;
 274
 275     # different seq objects have different version accessors -
 276     # weird but true
 277     my $version = $seq->can('seq_version') ? $seq->seq_version : $seq->version;
 278
 279     my $accversion = $seq->accession_number;
 280     if ($version) {
 281         $accversion .= ".$version";
 282     }
 283
 284     if ($accversion) {
 285         $seq_chaos_feature_id = $accversion;
 286     }
 287     else {
 288         $seq_chaos_feature_id = $self->get_chaos_feature_id($seq);
 289         $accversion = $seq_chaos_feature_id;
 290     }
 291
 292     # All ids must have a namespace prefix
 293     if ($seq_chaos_feature_id !~ /:/) {
 294         $seq_chaos_feature_id = "GenericSeqDB:$seq_chaos_feature_id";
 295     }
 296
 297 #    if ($seq->accession_number eq 'unknown') {
 298 #        $seq_chaos_feature_id = $self->get_chaos_feature_id('contig', $seq);
 299 #    }
 300
 301     my $haplotype;
 302     if ($seq->desc =~ /haplotype(.*)/i) {
 303         # yikes, no consistent way to specify haplotype in gb
 304         $haplotype = $1;
 305         $haplotype =~ s/\s+/_/g;
 306         $haplotype =~ s/\W+//g;
 307     }
 308
 309     my $OS;
 310     # Organism lines
 311     if (my $spec = $seq->species) {
 312         my ($species, $genus, @class) = $spec->classification();
 313         $OS = "$genus $species";
 314         if (my $ssp = $spec->sub_species) {
 315             $OS .= " $ssp";
 316         }
 317         $self->genus_species($OS);
 318         if( $spec->common_name ) {
 319             my $common = $spec->common_name;
 320             # genbank parser sets species->common_name to
 321             # be "Genus Species (common name)" which is wrong;
 322             # we will correct for this; if common_name is set
 323             # correctly then carry on
 324             if ($common =~ /\((.*)\)/) {
 325                 $common = $1;
 326             }
 327             $OS .= " (".$common.")";
 328         }
 329     }
 330     if ($OS) {
 331         $self->organismstr($OS);
 332     }
 333     if ($haplotype) {
 334         # genus_species is part of uniquename - add haplotype
 335         # to make it genuinely unique
 336         $self->genus_species($self->genus_species .= " $haplotype");
 337     }
 338
 339     my $uname = $self->make_uniquename($self->genus_species, $accversion);
 340
 341     # data structure representing the core sequence for this record
 342     my $seqnode =
 343       Data::Stag->new(feature=>[
 344                                 [feature_id=>$seq_chaos_feature_id],
 345                                 [dbxrefstr=>'SEQDB:'.$accversion],
 346                                 [name=>$seq->display_name],
 347                                 [uniquename=>$uname],
 348                                 [residues=>$seq->seq],
 349                                ]);
 350
 351     # soft properties
 352     my %prop = ();
 353
 354     $seqnode->set_type('databank_entry');
 355
 356     map {
 357         $prop{$_} = $seq->$_() if $seq->can($_);
 358     } qw(desc keywords division molecule is_circular);
 359     $prop{dates} = join("; ", $seq->get_dates) if $seq->can("get_dates");
 360
 361     local($^W) = 0;   # supressing warnings about uninitialized fields.
 362
 363     # Reference lines
 364     my $count = 1;
 365     foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
 366         # TODO
 367     }
 368     # Comment lines
 369
 370     $seqnode->add_featureprop([[type=>'haplotype'],[value=>$haplotype]])
 371       if $haplotype;
 372     foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
 373         $seqnode->add_featureprop([[type=>'comment'],[value=>$comment->text]]);
 374     }
 375     if ($OS) {
 376         $seqnode->set_organismstr($OS);
 377     }
 378
 379     my @sfs = $seq->get_SeqFeatures;
 380
 381     # genbank usually includes a 'source' feature - we just
 382     # migrate the data from this to the actual source feature
 383     my @sources = grep {$_->primary_tag eq 'source'} @sfs;
 384     @sfs = grep {$_->primary_tag ne 'source'} @sfs;
 385     $self->throw(">1 source types") if @sources > 1;
 386     my $source = shift @sources;
 387     if ($source) {
 388
 389         my $tempw = Data::Stag->makehandler;
 390         $self->write_sf($source, $seq_chaos_feature_id, $tempw);
 391         my $snode = $tempw->stag;
 392         $seqnode->add($_->name, $_->data)
 393           foreach ($snode->get_featureprop,
 394                    $snode->get_feature_dbxref);
 395
 396     }
 397
 398
 399     # throw the writer an event
 400     $w->ev(@$seqnode);
 401
 402     $seqnode = undef;      # free memory
 403
 404     # make events for all the features within the record
 405     foreach my $sf ( @sfs ) {
 406         $FNAMER->name_feature($sf);
 407         $FNAMER->name_contained_features($sf);
 408         $self->write_sf($sf, $seq_chaos_feature_id);
 409     }
 410
 411     # data end
 412     ### $w->E("chaos_block");
 413     return 1;
 414 }
 415
 416
 417 sub organismstr{
 418     my $self = shift;
 419
 420     return $self->{'organismstr'} = shift if @_;
 421     return $self->{'organismstr'};
 422 }
 423
 424
 425 sub genus_species{
 426     my $self = shift;
 427
 428     return $self->{'genus_species'} = shift if @_;
 429     return $self->{'genus_species'};
 430 }
 431
 432
 433 # maps ID to type
 434 sub _type_by_id_h {
 435     my $self = shift;
 436     $self->{_type_by_id_h} = shift if @_;
 437     return $self->{_type_by_id_h};
 438 }
 439
 440
 441
 442 # ----
 443 # writes a seq feature
 444 # ----
 445
 446 sub write_sf {
 447     my $self = shift;
 448     my $sf = shift;
 449     my $seq_chaos_feature_id = shift;
 450     my $w = shift || $self->handler;
 451
 452     my %props =
 453       map {
 454           lc($_)=>[$sf->each_tag_value($_)]
 455       } $sf->all_tags;
 456
 457     my $loc = $sf->location;
 458     my $name = $FNAMER->generate_feature_name($sf);
 459     my $type = $sf->primary_tag;
 460
 461     # The CDS (eg in a genbank feature) implicitly represents
 462     # the protein
 463     $type =~ s/CDS/polypeptide/;
 464
 465     my @subsfs = $sf->sub_SeqFeature;
 466     my @locnodes = ();
 467     my $sid = $loc->is_remote ? $loc->seq_id : $seq_chaos_feature_id;
 468
 469     my $CREATE_SPLIT_SFS = 0;
 470
 471     if($CREATE_SPLIT_SFS &&
 472        $loc->isa("Bio::Location::SplitLocationI") ) {
 473         # turn splitlocs into subfeatures
 474         my $n = 1;
 475         push(@subsfs,
 476              map {
 477                  my $ssf =
 478                    Bio::SeqFeature::Generic->new(
 479
 480                                                  -start=>$_->start,
 481                                                  -end=>$_->end,
 482                                                  -strand=>$_->strand,
 483                                                  -primary=>$self->subpartof($type),
 484                                               );
 485                  if ($_->is_remote) {
 486                      $ssf->location->is_remote(1);
 487                      $ssf->location->seq_id($_->seq_id);
 488                  }
 489                  $ssf;
 490                } $loc->each_Location);
 491     }
 492     elsif( $loc->isa("Bio::Location::RemoteLocationI") ) {
 493         # turn splitlocs into subfeatures
 494         my $n = 1;
 495         push(@subsfs,
 496              map {
 497                  Bio::SeqFeature::Generic->new(
 498 #                                               -name=>$name.'.'.$n++,
 499                                                -start=>$_->start,
 500                                                -end=>$_->end,
 501                                                -strand=>$_->strand,
 502                                                -primary=>$self->subpartof($type),
 503                                               )
 504                } $loc->each_Location);
 505     }
 506     else {
 507         my ($beg, $end, $strand) = $self->bp2ib($loc);
 508         if (!$strand) {
 509             use Data::Dumper;
 510             print Dumper $sf, $loc;
 511             $self->throw("($beg, $end, $strand) - no strand\n");
 512         }
 513         @locnodes = (
 514                      [featureloc=>[
 515                                    [nbeg=>$beg],
 516                                    [nend=>$end],
 517                                    [strand=>$strand],
 518                                    [srcfeature_id=>$sid],
 519                                    [locgroup=>0],
 520                                    [rank=>0],
 521                                   ]
 522                      ]
 523                     );
 524     }
 525     my $feature_id = $self->get_chaos_feature_id($sf);
 526
 527     delete $props{id} if $props{id};
 528     # do something with genbank stuff
 529     my $pid = $props{'protein_id'};
 530     my $tn = $props{'translation'};
 531     my @xrefs = @{$props{'db_xref'} || []};
 532     if ($pid) {
 533         push(@xrefs, "protein:$pid->[0]");
 534     }
 535
 536     my $org = $props{organism} ? $props{organism}->[0] : undef;
 537     if (!$org && $self->organismstr) {
 538         $org = $self->organismstr;
 539     }
 540     my $uname = $name ? $name.'/'.$feature_id : $feature_id;
 541     if ($self->genus_species && $name) {
 542         $uname = $self->make_uniquename($self->genus_species, $name);
 543     }
 544     if (!$uname) {
 545         $self->throw("cannot make uniquename for $feature_id $name");
 546     }
 547     $self->_type_by_id_h->{$feature_id} = $type;
 548     my $fnode =
 549       [feature=>[
 550                  [feature_id=>$feature_id],
 551                  $name ? ([name=>$name]) : (),
 552                  [uniquename=>$uname],
 553                  [type=>$type],
 554                  $tn ? ([residues=>$tn->[0]],
 555                         [seqlen=>length($tn->[0])],
 556                         #####[md5checksum=>md5checksum($tn->[0])],
 557                        ) :(),
 558                  $org ? ([organismstr=>$org]) : (),
 559                  @locnodes,
 560                  (map {
 561                      [feature_dbxref=>[
 562                                        [dbxrefstr=>$_]
 563                                       ]
 564                      ]
 565                  } @xrefs),
 566                  (map {
 567                      my $k = $_;
 568                      my $rank=0;
 569                      map { [featureprop=>[[type=>$k],[value=>$_],[rank=>$rank++]]] } @{$props{$k}}
 570                  } keys %props),
 571                 ]];
 572     $w->ev(@$fnode);
 573
 574     my $rank = 0;
 575     if (@subsfs) {
 576         # strand is always determined by FIRST feature listed
 577         # (see genbank entry for trans-spliced mod(mdg4) AE003734)
 578         my $strand = $subsfs[0];
 579
 580         # almost all the time, all features are on same strand
 581         my @sfs_on_main_strand = grep {$_->strand == $strand} @subsfs;
 582         my @sfs_on_other_strand = grep {$_->strand != $strand} @subsfs;
 583
 584         sort_by_strand($strand, \@sfs_on_main_strand);
 585         sort_by_strand(0-$strand, \@sfs_on_other_strand);
 586         @subsfs = (@sfs_on_main_strand, @sfs_on_other_strand);
 587
 588         foreach my $ssf (@subsfs) {
 589             my $ssfid = $self->write_sf($ssf, $sid);
 590             #my $rtype = 'part_of';
 591             my $rtype =
 592               $TM->get_relationship_type_by_parent_child($sf,$ssf);
 593             if ($ssf->primary_tag eq 'CDS') {
 594                 $rtype = 'derives_from';
 595             }
 596             $w->ev(feature_relationship=>[
 597                                           [subject_id=>$ssfid],
 598                                           [object_id=>$feature_id],
 599                                           [type=>$rtype],
 600                                           [rank=>$rank++],
 601                                          ]
 602                   );
 603         }
 604     }
 605     else {
 606         # parents not stored as bioperl containment hierarchy
 607         my @parent_ids = @{$props{parent} || []};
 608         foreach my $parent_id (@parent_ids) {
 609             my $ptype =
 610               $self->_type_by_id_h->{$parent_id} || 'unknown';
 611             my $rtype =
 612               $TM->get_relationship_type_by_parent_child($ptype,$type);
 613             $w->ev(feature_relationship=>[
 614                                           [subject_id=>$feature_id],
 615                                           [object_id=>$parent_id],
 616                                           [type=>$rtype],
 617                                           [rank=>$rank++],
 618                                          ]
 619                   );
 620         }
 621     }
 622     return $feature_id;
 623 }
 624
 625 sub sort_by_strand {
 626     my $strand = shift || 1;
 627     my $sfs = shift;
 628     @$sfs = sort { ($a->start <=> $b->start) * $strand } @$sfs;
 629     return;
 630 }
 631
 632 sub make_uniquename {
 633     my $self = shift;
 634     my $org = shift;
 635     my $name = shift;
 636
 637     my $os = $org;
 638     $os =~ s/\s+/_/g;
 639     $os =~ s/\(/_/g;
 640     $os =~ s/\)/_/g;
 641     $os =~ s/_+/_/g;
 642     $os =~ s/^_+//g;
 643     $os =~ s/_+$//g;
 644     return "$os:$name";
 645 }
 646
 647
 648 sub get_chaos_feature_id {
 649     my $self = shift;
 650     my $ob = shift;
 651
 652     my $id;
 653     if ($ob->isa("Bio::SeqI")) {
 654         $id = $ob->accession_number . '.' . ($ob->can('seq_version') ? $ob->seq_version : $ob->version);
 655     }
 656     else {
 657         $ob->isa("Bio::SeqFeatureI") || $self->throw("$ob must be either SeqI or SeqFeatureI");
 658
 659         if ($ob->primary_id) {
 660             $id = $ob->primary_id;
 661         }
 662         else {
 663             eval {
 664                 $id = $IDH->generate_unique_persistent_id($ob);
 665             };
 666             if ($@) {
 667                 $self->warn($@);
 668                 $id = "$ob"; # last resort - use memory pointer ref
 669                 # will not be persistent, but will be unique
 670             }
 671         }
 672     }
 673     if (!$id) {
 674         if ($ob->isa("Bio::SeqFeatureI")) {
 675             $id = $IDH->generate_unique_persistent_id($ob);
 676         }
 677         else {
 678             $self->throw("Cannot generate a unique persistent ID for a Seq without either primary_id or accession");
 679         }
 680     }
 681     if ($id) {
 682         $id = $self->context_namespace ? $self->context_namespace . ":" . $id : $id;
 683
 684     }
 685     return $id;
 686 }
 687
 688 # interbase and directional semantics
 689 sub bp2ib {
 690     my $self = shift;
 691     my $loc = shift;
 692     my ($s, $e, $str) =
 693       ref($loc) eq "ARRAY" ? (@$loc) : ($loc->start, $loc->end, $loc->strand);
 694     $s--;
 695     if ($str < 0) {
 696         ($s, $e) = ($e, $s);
 697     }
 698     return ($s, $e, $str || 1);
 699 }
 700
 701 sub subpartof {
 702     my $self = shift;
 703     my $type = 'partof_'.shift;
 704     $type =~ s/partof_CDS/CDS_exon/;
 705     $type =~ s/partof_protein/CDS_exon/;
 706     $type =~ s/partof_polypeptide/CDS_exon/;
 707     $type =~ s/partof_\w*RNA/exon/;
 708     return $type;
 709 }
 710
 711 1;