[bug 2714]
[bioperl-live.git] / Bio / SeqIO / chaos.pm
blob1aaab58d2e4fab939507e459cefff326960a7619
1 # $Id$
2 # $Date$
4 # BioPerl module for Bio::SeqIO::chaos
6 # Chris Mungall <cjm@fruitfly.org>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::SeqIO::chaos - chaos sequence input/output stream
16 =head1 SYNOPSIS
18 #In general you will not want to use this module directly;
19 #use the chaosxml format via SeqIO
21 $outstream = Bio::SeqIO->new(-file => $filename,
22 -format => 'chaosxml');
24 while ( my $seq = $instream->next_seq() ) {
25 $outstream->write_seq($seq);
28 =head1 DESCRIPTION
30 This is the guts of L<Bio::SeqIO::chaosxml> - please refer to the
31 documentation for this module
33 B<CURRENTLY WRITE ONLY>
35 ChaosXML is an XML mapping of the chado relational database; for more
36 information, see http://www.fruitfly.org/chaos-xml
38 chaos can be represented in various syntaxes - XML, S-Expressions or
39 indented text. You should see the relevant SeqIO file. You will
40 probably want to use L<Bio::SeqIO::chaosxml>, which is a wrapper to
41 this module.
43 =head2 USING STAG OBJECTS
45 B<non-standard bioperl stuff you dont necessarily need to know follows>
47 This module (in write mode) is an B<event producer> - it generates XML
48 events via the L<Data::Stag> module. If you only care about the final
49 end-product xml, use L<Bio::SeqIO::chaosxml>
51 You can treat the resulting chaos-xml stream as stag XML objects;
53 $outstream = Bio::SeqIO->new(-file => $filename, -format => 'chaos');
55 while ( my $seq = $instream->next_seq() ) {
56 $outstream->write_seq($seq);
58 my $chaos = $outstream->handler->stag;
59 # stag provides get/set methods for xml elements
60 # (these are chaos objects, not bioperl objects)
61 my @features = $chaos->get_feature;
62 my @feature_relationships = $chaos->get_feature_relationships;
63 # stag objects can be queried with functional-programming
64 # style queries
65 my @features_in_range =
66 $chaos->where('feature',
67 sub {
68 my $featureloc = shift->get_featureloc;
69 $featureloc->strand == 1 &&
70 $featureloc->nbeg > 10000 &&
71 $featureloc->nend < 20000;
72 });
73 foreach my $feature (@features_in_range) {
74 my $featureloc = $feature->get_featureloc;
75 printf "%s [%d->%d on %s]\n",
76 $feature->sget_name,
77 $featureloc->sget_nbeg,
78 $featureloc->sget_end,
79 $featureloc->sget_srcfeature_id;
82 =head1 MODULES REQUIRED
84 L<Data::Stag>
86 Downloadable from CPAN; see also http://stag.sourceforge.net
88 =head1 FEEDBACK
90 =head2 Mailing Lists
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to one
94 of the Bioperl mailing lists. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 =head2 Reporting Bugs
101 Report bugs to the Bioperl bug tracking system to help us keep track
102 the bugs and their resolution.
103 Bug reports can be submitted via the web:
105 http://bugzilla.bioperl.org
107 =head1 AUTHOR - Chris Mungall
109 Email cjm@fruitfly.org
111 =head1 APPENDIX
113 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
115 =cut
117 # Let the code begin...
119 package Bio::SeqIO::chaos;
120 use strict;
122 use Bio::SeqFeature::Generic;
123 use Bio::Species;
124 use Bio::Seq::SeqFactory;
125 use Bio::Annotation::Collection;
126 use Bio::Annotation::Comment;
127 use Bio::Annotation::Reference;
128 use Bio::Annotation::DBLink;
129 use Bio::SeqFeature::Tools::TypeMapper;
130 use Bio::SeqFeature::Tools::FeatureNamer;
131 use Bio::SeqFeature::Tools::IDHandler;
132 use Data::Stag qw(:all);
134 use base qw(Bio::SeqIO);
136 our $TM = 'Bio::SeqFeature::Tools::TypeMapper';
137 our $FNAMER = 'Bio::SeqFeature::Tools::FeatureNamer';
138 our $IDH = 'Bio::SeqFeature::Tools::IDHandler';
140 sub _initialize {
141 my($self,@args) = @_;
143 $self->SUPER::_initialize(@args);
144 if( ! defined $self->sequence_factory ) {
145 $self->sequence_factory(Bio::Seq::SeqFactory->new
146 (-verbose => $self->verbose(),
147 -type => 'Bio::Seq::RichSeq'));
149 my $wclass = $self->default_handler_class;
150 $self->handler($wclass);
151 if ($self->_fh) {
152 $self->handler->fh($self->_fh);
154 $self->{_end_of_data} = 0;
155 $self->_type_by_id_h({});
156 my $t = time;
157 my $ppt = localtime $t;
158 $self->handler->S("chaos");
159 $self->handler->ev(chaos_metadata=>[
160 [chaos_version=>1],
161 [chaos_flavour=>'bioperl'],
162 [feature_unique_key=>'feature_id'],
163 [equiv_chado_release=>'chado_1_01'],
164 [export_unixtime=>$t],
165 [export_localtime=>$ppt],
166 [export_host=>$ENV{HOST}],
167 [export_user=>$ENV{USER}],
168 [export_perl5lib=>$ENV{PERL5LIB}],
169 [export_program=>$0],
170 [export_module=>'Bio::SeqIO::chaos'],
171 [export_module_cvs_id=>'$Id$'],
174 return;
177 sub DESTROY {
178 my $self = shift;
179 $self->end_of_data();
180 $self->SUPER::DESTROY();
183 sub end_of_data {
184 my $self = shift;
185 return if $self->{_end_of_data};
186 $self->{_end_of_data} = 1;
187 $self->handler->E("chaos");
190 sub default_handler_class {
191 return Data::Stag->makehandler;
194 =head2 context_namespace
196 Title : context_namespace
197 Usage : $obj->context_namespace($newval)
198 Function:
199 Example :
200 Returns : value of context_namespace (a scalar)
201 Args : on set, new value (a scalar or undef, optional)
203 IDs will be preceeded with the context namespace
205 =cut
207 sub context_namespace{
208 my $self = shift;
210 return $self->{'context_namespace'} = shift if @_;
211 return $self->{'context_namespace'};
215 =head2 next_seq
217 Title : next_seq
218 Usage : $seq = $stream->next_seq()
219 Function: returns the next sequence in the stream
220 Returns : Bio::Seq object
221 Args :
223 =cut
225 sub next_seq {
226 my ($self,@args) = @_;
227 my $seq = $self->sequence_factory->create
229 # '-verbose' =>$self->verbose(),
230 # %params,
231 # -seq => $seqc,
232 # -annotation => $annotation,
233 # -features => \@features
235 return $seq;
238 sub handler {
239 my $self = shift;
240 $self->{_handler} = shift if @_;
241 return $self->{_handler};
245 =head2 write_seq
247 Title : write_seq
248 Usage : $stream->write_seq($seq)
249 Function: writes the $seq object (must be seq) to the stream
250 Returns : 1 for success and 0 for error
251 Args : Bio::Seq
254 =cut
256 sub write_seq {
257 my ($self,$seq) = @_;
259 if( !defined $seq ) {
260 $self->throw("Attempting to write with no seq!");
263 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
264 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
267 # get a handler - must inherit from Data::Stag::BaseHandler;
268 my $w = $self->handler;
270 # start of data
271 ### $w->S("chaos_block");
273 my $seq_chaos_feature_id;
275 # different seq objects have different version accessors -
276 # weird but true
277 my $version = $seq->can('seq_version') ? $seq->seq_version : $seq->version;
279 my $accversion = $seq->accession_number;
280 if ($version) {
281 $accversion .= ".$version";
284 if ($accversion) {
285 $seq_chaos_feature_id = $accversion;
287 else {
288 $seq_chaos_feature_id = $self->get_chaos_feature_id($seq);
289 $accversion = $seq_chaos_feature_id;
292 # All ids must have a namespace prefix
293 if ($seq_chaos_feature_id !~ /:/) {
294 $seq_chaos_feature_id = "GenericSeqDB:$seq_chaos_feature_id";
297 # if ($seq->accession_number eq 'unknown') {
298 # $seq_chaos_feature_id = $self->get_chaos_feature_id('contig', $seq);
301 my $haplotype;
302 if ($seq->desc =~ /haplotype(.*)/i) {
303 # yikes, no consistent way to specify haplotype in gb
304 $haplotype = $1;
305 $haplotype =~ s/\s+/_/g;
306 $haplotype =~ s/\W+//g;
309 my $OS;
310 # Organism lines
311 if (my $spec = $seq->species) {
312 my ($species, $genus, @class) = $spec->classification();
313 $OS = "$genus $species";
314 if (my $ssp = $spec->sub_species) {
315 $OS .= " $ssp";
317 $self->genus_species($OS);
318 if( $spec->common_name ) {
319 my $common = $spec->common_name;
320 # genbank parser sets species->common_name to
321 # be "Genus Species (common name)" which is wrong;
322 # we will correct for this; if common_name is set
323 # correctly then carry on
324 if ($common =~ /\((.*)\)/) {
325 $common = $1;
327 $OS .= " (".$common.")";
330 if ($OS) {
331 $self->organismstr($OS);
333 if ($haplotype) {
334 # genus_species is part of uniquename - add haplotype
335 # to make it genuinely unique
336 $self->genus_species($self->genus_species .= " $haplotype");
339 my $uname = $self->make_uniquename($self->genus_species, $accversion);
341 # data structure representing the core sequence for this record
342 my $seqnode =
343 Data::Stag->new(feature=>[
344 [feature_id=>$seq_chaos_feature_id],
345 [dbxrefstr=>'SEQDB:'.$accversion],
346 [name=>$seq->display_name],
347 [uniquename=>$uname],
348 [residues=>$seq->seq],
351 # soft properties
352 my %prop = ();
354 $seqnode->set_type('databank_entry');
356 map {
357 $prop{$_} = $seq->$_() if $seq->can($_);
358 } qw(desc keywords division molecule is_circular);
359 $prop{dates} = join("; ", $seq->get_dates) if $seq->can("get_dates");
361 local($^W) = 0; # supressing warnings about uninitialized fields.
363 # Reference lines
364 my $count = 1;
365 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
366 # TODO
368 # Comment lines
370 $seqnode->add_featureprop([[type=>'haplotype'],[value=>$haplotype]])
371 if $haplotype;
372 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
373 $seqnode->add_featureprop([[type=>'comment'],[value=>$comment->text]]);
375 if ($OS) {
376 $seqnode->set_organismstr($OS);
379 my @sfs = $seq->get_SeqFeatures;
381 # genbank usually includes a 'source' feature - we just
382 # migrate the data from this to the actual source feature
383 my @sources = grep {$_->primary_tag eq 'source'} @sfs;
384 @sfs = grep {$_->primary_tag ne 'source'} @sfs;
385 $self->throw(">1 source types") if @sources > 1;
386 my $source = shift @sources;
387 if ($source) {
389 my $tempw = Data::Stag->makehandler;
390 $self->write_sf($source, $seq_chaos_feature_id, $tempw);
391 my $snode = $tempw->stag;
392 $seqnode->add($_->name, $_->data)
393 foreach ($snode->get_featureprop,
394 $snode->get_feature_dbxref);
399 # throw the writer an event
400 $w->ev(@$seqnode);
402 $seqnode = undef; # free memory
404 # make events for all the features within the record
405 foreach my $sf ( @sfs ) {
406 $FNAMER->name_feature($sf);
407 $FNAMER->name_contained_features($sf);
408 $self->write_sf($sf, $seq_chaos_feature_id);
411 # data end
412 ### $w->E("chaos_block");
413 return 1;
417 sub organismstr{
418 my $self = shift;
420 return $self->{'organismstr'} = shift if @_;
421 return $self->{'organismstr'};
425 sub genus_species{
426 my $self = shift;
428 return $self->{'genus_species'} = shift if @_;
429 return $self->{'genus_species'};
433 # maps ID to type
434 sub _type_by_id_h {
435 my $self = shift;
436 $self->{_type_by_id_h} = shift if @_;
437 return $self->{_type_by_id_h};
442 # ----
443 # writes a seq feature
444 # ----
446 sub write_sf {
447 my $self = shift;
448 my $sf = shift;
449 my $seq_chaos_feature_id = shift;
450 my $w = shift || $self->handler;
452 my %props =
453 map {
454 lc($_)=>[$sf->each_tag_value($_)]
455 } $sf->all_tags;
457 my $loc = $sf->location;
458 my $name = $FNAMER->generate_feature_name($sf);
459 my $type = $sf->primary_tag;
461 # The CDS (eg in a genbank feature) implicitly represents
462 # the protein
463 $type =~ s/CDS/polypeptide/;
465 my @subsfs = $sf->sub_SeqFeature;
466 my @locnodes = ();
467 my $sid = $loc->is_remote ? $loc->seq_id : $seq_chaos_feature_id;
469 my $CREATE_SPLIT_SFS = 0;
471 if($CREATE_SPLIT_SFS &&
472 $loc->isa("Bio::Location::SplitLocationI") ) {
473 # turn splitlocs into subfeatures
474 my $n = 1;
475 push(@subsfs,
476 map {
477 my $ssf =
478 Bio::SeqFeature::Generic->new(
480 -start=>$_->start,
481 -end=>$_->end,
482 -strand=>$_->strand,
483 -primary=>$self->subpartof($type),
485 if ($_->is_remote) {
486 $ssf->location->is_remote(1);
487 $ssf->location->seq_id($_->seq_id);
489 $ssf;
490 } $loc->each_Location);
492 elsif( $loc->isa("Bio::Location::RemoteLocationI") ) {
493 # turn splitlocs into subfeatures
494 my $n = 1;
495 push(@subsfs,
496 map {
497 Bio::SeqFeature::Generic->new(
498 # -name=>$name.'.'.$n++,
499 -start=>$_->start,
500 -end=>$_->end,
501 -strand=>$_->strand,
502 -primary=>$self->subpartof($type),
504 } $loc->each_Location);
506 else {
507 my ($beg, $end, $strand) = $self->bp2ib($loc);
508 if (!$strand) {
509 use Data::Dumper;
510 print Dumper $sf, $loc;
511 $self->throw("($beg, $end, $strand) - no strand\n");
513 @locnodes = (
514 [featureloc=>[
515 [nbeg=>$beg],
516 [nend=>$end],
517 [strand=>$strand],
518 [srcfeature_id=>$sid],
519 [locgroup=>0],
520 [rank=>0],
525 my $feature_id = $self->get_chaos_feature_id($sf);
527 delete $props{id} if $props{id};
528 # do something with genbank stuff
529 my $pid = $props{'protein_id'};
530 my $tn = $props{'translation'};
531 my @xrefs = @{$props{'db_xref'} || []};
532 if ($pid) {
533 push(@xrefs, "protein:$pid->[0]");
536 my $org = $props{organism} ? $props{organism}->[0] : undef;
537 if (!$org && $self->organismstr) {
538 $org = $self->organismstr;
540 my $uname = $name ? $name.'/'.$feature_id : $feature_id;
541 if ($self->genus_species && $name) {
542 $uname = $self->make_uniquename($self->genus_species, $name);
544 if (!$uname) {
545 $self->throw("cannot make uniquename for $feature_id $name");
547 $self->_type_by_id_h->{$feature_id} = $type;
548 my $fnode =
549 [feature=>[
550 [feature_id=>$feature_id],
551 $name ? ([name=>$name]) : (),
552 [uniquename=>$uname],
553 [type=>$type],
554 $tn ? ([residues=>$tn->[0]],
555 [seqlen=>length($tn->[0])],
556 #####[md5checksum=>md5checksum($tn->[0])],
557 ) :(),
558 $org ? ([organismstr=>$org]) : (),
559 @locnodes,
560 (map {
561 [feature_dbxref=>[
562 [dbxrefstr=>$_]
565 } @xrefs),
566 (map {
567 my $k = $_;
568 my $rank=0;
569 map { [featureprop=>[[type=>$k],[value=>$_],[rank=>$rank++]]] } @{$props{$k}}
570 } keys %props),
572 $w->ev(@$fnode);
574 my $rank = 0;
575 if (@subsfs) {
576 # strand is always determined by FIRST feature listed
577 # (see genbank entry for trans-spliced mod(mdg4) AE003734)
578 my $strand = $subsfs[0];
580 # almost all the time, all features are on same strand
581 my @sfs_on_main_strand = grep {$_->strand == $strand} @subsfs;
582 my @sfs_on_other_strand = grep {$_->strand != $strand} @subsfs;
584 sort_by_strand($strand, \@sfs_on_main_strand);
585 sort_by_strand(0-$strand, \@sfs_on_other_strand);
586 @subsfs = (@sfs_on_main_strand, @sfs_on_other_strand);
588 foreach my $ssf (@subsfs) {
589 my $ssfid = $self->write_sf($ssf, $sid);
590 #my $rtype = 'part_of';
591 my $rtype =
592 $TM->get_relationship_type_by_parent_child($sf,$ssf);
593 if ($ssf->primary_tag eq 'CDS') {
594 $rtype = 'derives_from';
596 $w->ev(feature_relationship=>[
597 [subject_id=>$ssfid],
598 [object_id=>$feature_id],
599 [type=>$rtype],
600 [rank=>$rank++],
605 else {
606 # parents not stored as bioperl containment hierarchy
607 my @parent_ids = @{$props{parent} || []};
608 foreach my $parent_id (@parent_ids) {
609 my $ptype =
610 $self->_type_by_id_h->{$parent_id} || 'unknown';
611 my $rtype =
612 $TM->get_relationship_type_by_parent_child($ptype,$type);
613 $w->ev(feature_relationship=>[
614 [subject_id=>$feature_id],
615 [object_id=>$parent_id],
616 [type=>$rtype],
617 [rank=>$rank++],
622 return $feature_id;
625 sub sort_by_strand {
626 my $strand = shift || 1;
627 my $sfs = shift;
628 @$sfs = sort { ($a->start <=> $b->start) * $strand } @$sfs;
629 return;
632 sub make_uniquename {
633 my $self = shift;
634 my $org = shift;
635 my $name = shift;
637 my $os = $org;
638 $os =~ s/\s+/_/g;
639 $os =~ s/\(/_/g;
640 $os =~ s/\)/_/g;
641 $os =~ s/_+/_/g;
642 $os =~ s/^_+//g;
643 $os =~ s/_+$//g;
644 return "$os:$name";
648 sub get_chaos_feature_id {
649 my $self = shift;
650 my $ob = shift;
652 my $id;
653 if ($ob->isa("Bio::SeqI")) {
654 $id = $ob->accession_number . '.' . ($ob->can('seq_version') ? $ob->seq_version : $ob->version);
656 else {
657 $ob->isa("Bio::SeqFeatureI") || $self->throw("$ob must be either SeqI or SeqFeatureI");
659 if ($ob->primary_id) {
660 $id = $ob->primary_id;
662 else {
663 eval {
664 $id = $IDH->generate_unique_persistent_id($ob);
666 if ($@) {
667 $self->warn($@);
668 $id = "$ob"; # last resort - use memory pointer ref
669 # will not be persistent, but will be unique
673 if (!$id) {
674 if ($ob->isa("Bio::SeqFeatureI")) {
675 $id = $IDH->generate_unique_persistent_id($ob);
677 else {
678 $self->throw("Cannot generate a unique persistent ID for a Seq without either primary_id or accession");
681 if ($id) {
682 $id = $self->context_namespace ? $self->context_namespace . ":" . $id : $id;
685 return $id;
688 # interbase and directional semantics
689 sub bp2ib {
690 my $self = shift;
691 my $loc = shift;
692 my ($s, $e, $str) =
693 ref($loc) eq "ARRAY" ? (@$loc) : ($loc->start, $loc->end, $loc->strand);
694 $s--;
695 if ($str < 0) {
696 ($s, $e) = ($e, $s);
698 return ($s, $e, $str || 1);
701 sub subpartof {
702 my $self = shift;
703 my $type = 'partof_'.shift;
704 $type =~ s/partof_CDS/CDS_exon/;
705 $type =~ s/partof_protein/CDS_exon/;
706 $type =~ s/partof_polypeptide/CDS_exon/;
707 $type =~ s/partof_\w*RNA/exon/;
708 return $type;