Couple more minor edits ...
[bioperl-live.git] / Bio / SeqIO / chaos.pm
blob622bafa2ffe567fdd9ff261cbe4557f8a42f76b0
2 # BioPerl module for Bio::SeqIO::chaos
4 # Chris Mungall <cjm@fruitfly.org>
6 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
10 =head1 NAME
12 Bio::SeqIO::chaos - chaos sequence input/output stream
14 =head1 SYNOPSIS
16 #In general you will not want to use this module directly;
17 #use the chaosxml format via SeqIO
19 $outstream = Bio::SeqIO->new(-file => $filename,
20 -format => 'chaosxml');
22 while ( my $seq = $instream->next_seq() ) {
23 $outstream->write_seq($seq);
26 =head1 DESCRIPTION
28 This is the guts of L<Bio::SeqIO::chaosxml> - please refer to the
29 documentation for this module
31 B<CURRENTLY WRITE ONLY>
33 ChaosXML is an XML mapping of the chado relational database; for more
34 information, see http://www.fruitfly.org/chaos-xml
36 chaos can be represented in various syntaxes - XML, S-Expressions or
37 indented text. You should see the relevant SeqIO file. You will
38 probably want to use L<Bio::SeqIO::chaosxml>, which is a wrapper to
39 this module.
41 =head2 USING STAG OBJECTS
43 B<non-standard bioperl stuff you don't necessarily need to know follows>
45 This module (in write mode) is an B<event producer> - it generates XML
46 events via the L<Data::Stag> module. If you only care about the final
47 end-product xml, use L<Bio::SeqIO::chaosxml>
49 You can treat the resulting chaos-xml stream as stag XML objects;
51 $outstream = Bio::SeqIO->new(-file => $filename, -format => 'chaos');
53 while ( my $seq = $instream->next_seq() ) {
54 $outstream->write_seq($seq);
56 my $chaos = $outstream->handler->stag;
57 # stag provides get/set methods for xml elements
58 # (these are chaos objects, not bioperl objects)
59 my @features = $chaos->get_feature;
60 my @feature_relationships = $chaos->get_feature_relationships;
61 # stag objects can be queried with functional-programming
62 # style queries
63 my @features_in_range =
64 $chaos->where('feature',
65 sub {
66 my $featureloc = shift->get_featureloc;
67 $featureloc->strand == 1 &&
68 $featureloc->nbeg > 10000 &&
69 $featureloc->nend < 20000;
70 });
71 foreach my $feature (@features_in_range) {
72 my $featureloc = $feature->get_featureloc;
73 printf "%s [%d->%d on %s]\n",
74 $feature->sget_name,
75 $featureloc->sget_nbeg,
76 $featureloc->sget_end,
77 $featureloc->sget_srcfeature_id;
80 =head1 MODULES REQUIRED
82 L<Data::Stag>
84 Downloadable from CPAN; see also http://stag.sourceforge.net
86 =head1 FEEDBACK
88 =head2 Mailing Lists
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
97 =head2 Support
99 Please direct usage questions or support issues to the mailing list:
101 I<bioperl-l@bioperl.org>
103 rather than to the module maintainer directly. Many experienced and
104 reponsive experts will be able look at the problem and quickly
105 address it. Please include a thorough description of the problem
106 with code and data examples if at all possible.
108 =head2 Reporting Bugs
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 the bugs and their resolution.
112 Bug reports can be submitted via the web:
114 https://github.com/bioperl/bioperl-live/issues
116 =head1 AUTHOR - Chris Mungall
118 Email cjm@fruitfly.org
120 =head1 APPENDIX
122 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
124 =cut
126 # Let the code begin...
128 package Bio::SeqIO::chaos;
129 use strict;
131 use Bio::SeqFeature::Generic;
132 use Bio::Species;
133 use Bio::Seq::SeqFactory;
134 use Bio::Annotation::Collection;
135 use Bio::Annotation::Comment;
136 use Bio::Annotation::Reference;
137 use Bio::Annotation::DBLink;
138 use Bio::SeqFeature::Tools::TypeMapper;
139 use Bio::SeqFeature::Tools::FeatureNamer;
140 use Bio::SeqFeature::Tools::IDHandler;
141 use Data::Stag qw(:all);
143 use base qw(Bio::SeqIO);
145 our $TM = 'Bio::SeqFeature::Tools::TypeMapper';
146 our $FNAMER = 'Bio::SeqFeature::Tools::FeatureNamer';
147 our $IDH = 'Bio::SeqFeature::Tools::IDHandler';
149 sub _initialize {
150 my($self,@args) = @_;
152 $self->SUPER::_initialize(@args);
153 if( ! defined $self->sequence_factory ) {
154 $self->sequence_factory(Bio::Seq::SeqFactory->new
155 (-verbose => $self->verbose(),
156 -type => 'Bio::Seq::RichSeq'));
158 my $wclass = $self->default_handler_class;
159 $self->handler($wclass);
160 if ($self->_fh) {
161 $self->handler->fh($self->_fh);
163 $self->{_end_of_data} = 0;
164 $self->_type_by_id_h({});
165 my $t = time;
166 my $ppt = localtime $t;
167 $self->handler->S("chaos");
168 $self->handler->ev(chaos_metadata=>[
169 [chaos_version=>1],
170 [chaos_flavour=>'bioperl'],
171 [feature_unique_key=>'feature_id'],
172 [equiv_chado_release=>'chado_1_01'],
173 [export_unixtime=>$t],
174 [export_localtime=>$ppt],
175 [export_host=>$ENV{HOST}],
176 [export_user=>$ENV{USER}],
177 [export_perl5lib=>$ENV{PERL5LIB}],
178 [export_program=>$0],
179 [export_module=>'Bio::SeqIO::chaos'],
180 [export_module_cvs_id=>'$Id$'],
183 return;
186 sub DESTROY {
187 my $self = shift;
188 $self->end_of_data();
189 $self->SUPER::DESTROY();
192 sub end_of_data {
193 my $self = shift;
194 return if $self->{_end_of_data};
195 $self->{_end_of_data} = 1;
196 $self->handler->E("chaos");
199 sub default_handler_class {
200 return Data::Stag->makehandler;
203 =head2 context_namespace
205 Title : context_namespace
206 Usage : $obj->context_namespace($newval)
207 Function:
208 Example :
209 Returns : value of context_namespace (a scalar)
210 Args : on set, new value (a scalar or undef, optional)
212 IDs will be preceded with the context namespace
214 =cut
216 sub context_namespace{
217 my $self = shift;
219 return $self->{'context_namespace'} = shift if @_;
220 return $self->{'context_namespace'};
224 =head2 next_seq
226 Title : next_seq
227 Usage : $seq = $stream->next_seq()
228 Function: returns the next sequence in the stream
229 Returns : Bio::Seq object
230 Args :
232 =cut
234 sub next_seq {
235 my ($self,@args) = @_;
236 my $seq = $self->sequence_factory->create
238 # '-verbose' =>$self->verbose(),
239 # %params,
240 # -seq => $seqc,
241 # -annotation => $annotation,
242 # -features => \@features
244 return $seq;
247 sub handler {
248 my $self = shift;
249 $self->{_handler} = shift if @_;
250 return $self->{_handler};
254 =head2 write_seq
256 Title : write_seq
257 Usage : $stream->write_seq($seq)
258 Function: writes the $seq object (must be seq) to the stream
259 Returns : 1 for success and 0 for error
260 Args : Bio::Seq
263 =cut
265 sub write_seq {
266 my ($self,$seq) = @_;
268 if( !defined $seq ) {
269 $self->throw("Attempting to write with no seq!");
272 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
273 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
276 # get a handler - must inherit from Data::Stag::BaseHandler;
277 my $w = $self->handler;
279 # start of data
280 ### $w->S("chaos_block");
282 my $seq_chaos_feature_id;
284 # different seq objects have different version accessors -
285 # weird but true
286 my $version = $seq->can('seq_version') ? $seq->seq_version : $seq->version;
288 my $accversion = $seq->accession_number;
289 if ($version) {
290 $accversion .= ".$version";
293 if ($accversion) {
294 $seq_chaos_feature_id = $accversion;
296 else {
297 $seq_chaos_feature_id = $self->get_chaos_feature_id($seq);
298 $accversion = $seq_chaos_feature_id;
301 # All ids must have a namespace prefix
302 if ($seq_chaos_feature_id !~ /:/) {
303 $seq_chaos_feature_id = "GenericSeqDB:$seq_chaos_feature_id";
306 # if ($seq->accession_number eq 'unknown') {
307 # $seq_chaos_feature_id = $self->get_chaos_feature_id('contig', $seq);
310 my $haplotype;
311 if ($seq->desc =~ /haplotype(.*)/i) {
312 # yikes, no consistent way to specify haplotype in gb
313 $haplotype = $1;
314 $haplotype =~ s/\s+/_/g;
315 $haplotype =~ s/\W+//g;
318 my $OS;
319 # Organism lines
320 if (my $spec = $seq->species) {
321 my ($species, $genus, @class) = $spec->classification();
322 $OS = "$genus $species";
323 if (my $ssp = $spec->sub_species) {
324 $OS .= " $ssp";
326 $self->genus_species($OS);
327 if( $spec->common_name ) {
328 my $common = $spec->common_name;
329 # genbank parser sets species->common_name to
330 # be "Genus Species (common name)" which is wrong;
331 # we will correct for this; if common_name is set
332 # correctly then carry on
333 if ($common =~ /\((.*)\)/) {
334 $common = $1;
336 $OS .= " (".$common.")";
339 if ($OS) {
340 $self->organismstr($OS);
342 if ($haplotype) {
343 # genus_species is part of uniquename - add haplotype
344 # to make it genuinely unique
345 $self->genus_species($self->genus_species .= " $haplotype");
348 my $uname = $self->make_uniquename($self->genus_species, $accversion);
350 # data structure representing the core sequence for this record
351 my $seqnode =
352 Data::Stag->new(feature=>[
353 [feature_id=>$seq_chaos_feature_id],
354 [dbxrefstr=>'SEQDB:'.$accversion],
355 [name=>$seq->display_name],
356 [uniquename=>$uname],
357 [residues=>$seq->seq],
360 # soft properties
361 my %prop = ();
363 $seqnode->set_type('databank_entry');
365 map {
366 $prop{$_} = $seq->$_() if $seq->can($_);
367 } qw(desc keywords division molecule is_circular);
368 $prop{dates} = join("; ", $seq->get_dates) if $seq->can("get_dates");
370 local($^W) = 0; # supressing warnings about uninitialized fields.
372 # Reference lines
373 my $count = 1;
374 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
375 # TODO
377 # Comment lines
379 $seqnode->add_featureprop([[type=>'haplotype'],[value=>$haplotype]])
380 if $haplotype;
381 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
382 $seqnode->add_featureprop([[type=>'comment'],[value=>$comment->text]]);
384 if ($OS) {
385 $seqnode->set_organismstr($OS);
388 my @sfs = $seq->get_SeqFeatures;
390 # genbank usually includes a 'source' feature - we just
391 # migrate the data from this to the actual source feature
392 my @sources = grep {$_->primary_tag eq 'source'} @sfs;
393 @sfs = grep {$_->primary_tag ne 'source'} @sfs;
394 $self->throw(">1 source types") if @sources > 1;
395 my $source = shift @sources;
396 if ($source) {
398 my $tempw = Data::Stag->makehandler;
399 $self->write_sf($source, $seq_chaos_feature_id, $tempw);
400 my $snode = $tempw->stag;
401 $seqnode->add($_->name, $_->data)
402 foreach ($snode->get_featureprop,
403 $snode->get_feature_dbxref);
408 # throw the writer an event
409 $w->ev(@$seqnode);
411 $seqnode = undef; # free memory
413 # make events for all the features within the record
414 foreach my $sf ( @sfs ) {
415 $FNAMER->name_feature($sf);
416 $FNAMER->name_contained_features($sf);
417 $self->write_sf($sf, $seq_chaos_feature_id);
420 # data end
421 ### $w->E("chaos_block");
422 return 1;
426 sub organismstr{
427 my $self = shift;
429 return $self->{'organismstr'} = shift if @_;
430 return $self->{'organismstr'};
434 sub genus_species{
435 my $self = shift;
437 return $self->{'genus_species'} = shift if @_;
438 return $self->{'genus_species'};
442 # maps ID to type
443 sub _type_by_id_h {
444 my $self = shift;
445 $self->{_type_by_id_h} = shift if @_;
446 return $self->{_type_by_id_h};
451 # ----
452 # writes a seq feature
453 # ----
455 sub write_sf {
456 my $self = shift;
457 my $sf = shift;
458 my $seq_chaos_feature_id = shift;
459 my $w = shift || $self->handler;
461 my %props =
462 map {
463 lc($_)=>[$sf->each_tag_value($_)]
464 } $sf->all_tags;
466 my $loc = $sf->location;
467 my $name = $FNAMER->generate_feature_name($sf);
468 my $type = $sf->primary_tag;
470 # The CDS (eg in a genbank feature) implicitly represents
471 # the protein
472 $type =~ s/CDS/polypeptide/;
474 my @subsfs = $sf->sub_SeqFeature;
475 my @locnodes = ();
476 my $sid = $loc->is_remote ? $loc->seq_id : $seq_chaos_feature_id;
478 my $CREATE_SPLIT_SFS = 0;
480 if($CREATE_SPLIT_SFS &&
481 $loc->isa("Bio::Location::SplitLocationI") ) {
482 # turn splitlocs into subfeatures
483 my $n = 1;
484 push(@subsfs,
485 map {
486 my $ssf =
487 Bio::SeqFeature::Generic->new(
489 -start=>$_->start,
490 -end=>$_->end,
491 -strand=>$_->strand,
492 -primary=>$self->subpartof($type),
494 if ($_->is_remote) {
495 $ssf->location->is_remote(1);
496 $ssf->location->seq_id($_->seq_id);
498 $ssf;
499 } $loc->each_Location);
501 elsif( $loc->isa("Bio::Location::RemoteLocationI") ) {
502 # turn splitlocs into subfeatures
503 my $n = 1;
504 push(@subsfs,
505 map {
506 Bio::SeqFeature::Generic->new(
507 # -name=>$name.'.'.$n++,
508 -start=>$_->start,
509 -end=>$_->end,
510 -strand=>$_->strand,
511 -primary=>$self->subpartof($type),
513 } $loc->each_Location);
515 else {
516 my ($beg, $end, $strand) = $self->bp2ib($loc);
517 if (!$strand) {
518 use Data::Dumper;
519 print Dumper $sf, $loc;
520 $self->throw("($beg, $end, $strand) - no strand\n");
522 @locnodes = (
523 [featureloc=>[
524 [nbeg=>$beg],
525 [nend=>$end],
526 [strand=>$strand],
527 [srcfeature_id=>$sid],
528 [locgroup=>0],
529 [rank=>0],
534 my $feature_id = $self->get_chaos_feature_id($sf);
536 delete $props{id} if $props{id};
537 # do something with genbank stuff
538 my $pid = $props{'protein_id'};
539 my $tn = $props{'translation'};
540 my @xrefs = @{$props{'db_xref'} || []};
541 if ($pid) {
542 push(@xrefs, "protein:$pid->[0]");
545 my $org = $props{organism} ? $props{organism}->[0] : undef;
546 if (!$org && $self->organismstr) {
547 $org = $self->organismstr;
549 my $uname = $name ? $name.'/'.$feature_id : $feature_id;
550 if ($self->genus_species && $name) {
551 $uname = $self->make_uniquename($self->genus_species, $name);
553 if (!$uname) {
554 $self->throw("cannot make uniquename for $feature_id $name");
556 $self->_type_by_id_h->{$feature_id} = $type;
557 my $fnode =
558 [feature=>[
559 [feature_id=>$feature_id],
560 $name ? ([name=>$name]) : (),
561 [uniquename=>$uname],
562 [type=>$type],
563 $tn ? ([residues=>$tn->[0]],
564 [seqlen=>length($tn->[0])],
565 #####[md5checksum=>md5checksum($tn->[0])],
566 ) :(),
567 $org ? ([organismstr=>$org]) : (),
568 @locnodes,
569 (map {
570 [feature_dbxref=>[
571 [dbxrefstr=>$_]
574 } @xrefs),
575 (map {
576 my $k = $_;
577 my $rank=0;
578 map { [featureprop=>[[type=>$k],[value=>$_],[rank=>$rank++]]] } @{$props{$k}}
579 } keys %props),
581 $w->ev(@$fnode);
583 my $rank = 0;
584 if (@subsfs) {
585 # strand is always determined by FIRST feature listed
586 # (see genbank entry for trans-spliced mod(mdg4) AE003734)
587 my $strand = $subsfs[0];
589 # almost all the time, all features are on same strand
590 my @sfs_on_main_strand = grep {$_->strand == $strand} @subsfs;
591 my @sfs_on_other_strand = grep {$_->strand != $strand} @subsfs;
593 sort_by_strand($strand, \@sfs_on_main_strand);
594 sort_by_strand(0-$strand, \@sfs_on_other_strand);
595 @subsfs = (@sfs_on_main_strand, @sfs_on_other_strand);
597 foreach my $ssf (@subsfs) {
598 my $ssfid = $self->write_sf($ssf, $sid);
599 #my $rtype = 'part_of';
600 my $rtype =
601 $TM->get_relationship_type_by_parent_child($sf,$ssf);
602 if ($ssf->primary_tag eq 'CDS') {
603 $rtype = 'derives_from';
605 $w->ev(feature_relationship=>[
606 [subject_id=>$ssfid],
607 [object_id=>$feature_id],
608 [type=>$rtype],
609 [rank=>$rank++],
614 else {
615 # parents not stored as bioperl containment hierarchy
616 my @parent_ids = @{$props{parent} || []};
617 foreach my $parent_id (@parent_ids) {
618 my $ptype =
619 $self->_type_by_id_h->{$parent_id} || 'unknown';
620 my $rtype =
621 $TM->get_relationship_type_by_parent_child($ptype,$type);
622 $w->ev(feature_relationship=>[
623 [subject_id=>$feature_id],
624 [object_id=>$parent_id],
625 [type=>$rtype],
626 [rank=>$rank++],
631 return $feature_id;
634 sub sort_by_strand {
635 my $strand = shift || 1;
636 my $sfs = shift;
637 @$sfs = sort { ($a->start <=> $b->start) * $strand } @$sfs;
638 return;
641 sub make_uniquename {
642 my $self = shift;
643 my $org = shift;
644 my $name = shift;
646 my $os = $org;
647 $os =~ s/\s+/_/g;
648 $os =~ s/\(/_/g;
649 $os =~ s/\)/_/g;
650 $os =~ s/_+/_/g;
651 $os =~ s/^_+//g;
652 $os =~ s/_+$//g;
653 return "$os:$name";
657 sub get_chaos_feature_id {
658 my $self = shift;
659 my $ob = shift;
661 my $id;
662 if ($ob->isa("Bio::SeqI")) {
663 $id = $ob->accession_number . '.' . ($ob->can('seq_version') ? $ob->seq_version : $ob->version);
665 else {
666 $ob->isa("Bio::SeqFeatureI") || $self->throw("$ob must be either SeqI or SeqFeatureI");
668 if ($ob->primary_id) {
669 $id = $ob->primary_id;
671 else {
672 eval {
673 $id = $IDH->generate_unique_persistent_id($ob);
675 if ($@) {
676 $self->warn($@);
677 $id = "$ob"; # last resort - use memory pointer ref
678 # will not be persistent, but will be unique
682 if (!$id) {
683 if ($ob->isa("Bio::SeqFeatureI")) {
684 $id = $IDH->generate_unique_persistent_id($ob);
686 else {
687 $self->throw("Cannot generate a unique persistent ID for a Seq without either primary_id or accession");
690 if ($id) {
691 $id = $self->context_namespace ? $self->context_namespace . ":" . $id : $id;
694 return $id;
697 # interbase and directional semantics
698 sub bp2ib {
699 my $self = shift;
700 my $loc = shift;
701 my ($s, $e, $str) =
702 ref($loc) eq "ARRAY" ? (@$loc) : ($loc->start, $loc->end, $loc->strand);
703 $s--;
704 if ($str < 0) {
705 ($s, $e) = ($e, $s);
707 return ($s, $e, $str || 1);
710 sub subpartof {
711 my $self = shift;
712 my $type = 'partof_'.shift;
713 $type =~ s/partof_CDS/CDS_exon/;
714 $type =~ s/partof_protein/CDS_exon/;
715 $type =~ s/partof_polypeptide/CDS_exon/;
716 $type =~ s/partof_\w*RNA/exon/;
717 return $type;