4 # BioPerl module for Bio::SeqIO::chaos
6 # Chris Mungall <cjm@fruitfly.org>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
14 Bio::SeqIO::chaos - chaos sequence input/output stream
18 #In general you will not want to use this module directly;
19 #use the chaosxml format via SeqIO
21 $outstream = Bio::SeqIO->new(-file => $filename,
22 -format => 'chaosxml');
24 while ( my $seq = $instream->next_seq() ) {
25 $outstream->write_seq($seq);
30 This is the guts of L<Bio::SeqIO::chaosxml> - please refer to the
31 documentation for this module
33 B<CURRENTLY WRITE ONLY>
35 ChaosXML is an XML mapping of the chado relational database; for more
36 information, see http://www.fruitfly.org/chaos-xml
38 chaos can be represented in various syntaxes - XML, S-Expressions or
39 indented text. You should see the relevant SeqIO file. You will
40 probably want to use L<Bio::SeqIO::chaosxml>, which is a wrapper to
43 =head2 USING STAG OBJECTS
45 B<non-standard bioperl stuff you dont necessarily need to know follows>
47 This module (in write mode) is an B<event producer> - it generates XML
48 events via the L<Data::Stag> module. If you only care about the final
49 end-product xml, use L<Bio::SeqIO::chaosxml>
51 You can treat the resulting chaos-xml stream as stag XML objects;
53 $outstream = Bio::SeqIO->new(-file => $filename, -format => 'chaos');
55 while ( my $seq = $instream->next_seq() ) {
56 $outstream->write_seq($seq);
58 my $chaos = $outstream->handler->stag;
59 # stag provides get/set methods for xml elements
60 # (these are chaos objects, not bioperl objects)
61 my @features = $chaos->get_feature;
62 my @feature_relationships = $chaos->get_feature_relationships;
63 # stag objects can be queried with functional-programming
65 my @features_in_range =
66 $chaos->where('feature',
68 my $featureloc = shift->get_featureloc;
69 $featureloc->strand == 1 &&
70 $featureloc->nbeg > 10000 &&
71 $featureloc->nend < 20000;
73 foreach my $feature (@features_in_range) {
74 my $featureloc = $feature->get_featureloc;
75 printf "%s [%d->%d on %s]\n",
77 $featureloc->sget_nbeg,
78 $featureloc->sget_end,
79 $featureloc->sget_srcfeature_id;
82 =head1 MODULES REQUIRED
86 Downloadable from CPAN; see also http://stag.sourceforge.net
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to one
94 of the Bioperl mailing lists. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
101 Report bugs to the Bioperl bug tracking system to help us keep track
102 the bugs and their resolution.
103 Bug reports can be submitted via the web:
105 http://bugzilla.bioperl.org
107 =head1 AUTHOR - Chris Mungall
109 Email cjm@fruitfly.org
113 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
117 # Let the code begin...
119 package Bio
::SeqIO
::chaos
;
122 use Bio
::SeqFeature
::Generic
;
124 use Bio
::Seq
::SeqFactory
;
125 use Bio
::Annotation
::Collection
;
126 use Bio
::Annotation
::Comment
;
127 use Bio
::Annotation
::Reference
;
128 use Bio
::Annotation
::DBLink
;
129 use Bio
::SeqFeature
::Tools
::TypeMapper
;
130 use Bio
::SeqFeature
::Tools
::FeatureNamer
;
131 use Bio
::SeqFeature
::Tools
::IDHandler
;
132 use Data
::Stag
qw(:all);
134 use base
qw(Bio::SeqIO);
136 our $TM = 'Bio::SeqFeature::Tools::TypeMapper';
137 our $FNAMER = 'Bio::SeqFeature::Tools::FeatureNamer';
138 our $IDH = 'Bio::SeqFeature::Tools::IDHandler';
141 my($self,@args) = @_;
143 $self->SUPER::_initialize
(@args);
144 if( ! defined $self->sequence_factory ) {
145 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new
146 (-verbose
=> $self->verbose(),
147 -type
=> 'Bio::Seq::RichSeq'));
149 my $wclass = $self->default_handler_class;
150 $self->handler($wclass);
152 $self->handler->fh($self->_fh);
154 $self->{_end_of_data
} = 0;
155 $self->_type_by_id_h({});
157 my $ppt = localtime $t;
158 $self->handler->S("chaos");
159 $self->handler->ev(chaos_metadata
=>[
161 [chaos_flavour
=>'bioperl'],
162 [feature_unique_key
=>'feature_id'],
163 [equiv_chado_release
=>'chado_1_01'],
164 [export_unixtime
=>$t],
165 [export_localtime
=>$ppt],
166 [export_host
=>$ENV{HOST
}],
167 [export_user
=>$ENV{USER
}],
168 [export_perl5lib
=>$ENV{PERL5LIB
}],
169 [export_program
=>$0],
170 [export_module
=>'Bio::SeqIO::chaos'],
171 [export_module_cvs_id
=>'$Id$'],
179 $self->end_of_data();
180 $self->SUPER::DESTROY
();
185 return if $self->{_end_of_data
};
186 $self->{_end_of_data
} = 1;
187 $self->handler->E("chaos");
190 sub default_handler_class
{
191 return Data
::Stag
->makehandler;
194 =head2 context_namespace
196 Title : context_namespace
197 Usage : $obj->context_namespace($newval)
200 Returns : value of context_namespace (a scalar)
201 Args : on set, new value (a scalar or undef, optional)
203 IDs will be preceeded with the context namespace
207 sub context_namespace
{
210 return $self->{'context_namespace'} = shift if @_;
211 return $self->{'context_namespace'};
218 Usage : $seq = $stream->next_seq()
219 Function: returns the next sequence in the stream
220 Returns : Bio::Seq object
226 my ($self,@args) = @_;
227 my $seq = $self->sequence_factory->create
229 # '-verbose' =>$self->verbose(),
232 # -annotation => $annotation,
233 # -features => \@features
240 $self->{_handler
} = shift if @_;
241 return $self->{_handler
};
248 Usage : $stream->write_seq($seq)
249 Function: writes the $seq object (must be seq) to the stream
250 Returns : 1 for success and 0 for error
257 my ($self,$seq) = @_;
259 if( !defined $seq ) {
260 $self->throw("Attempting to write with no seq!");
263 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
264 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
267 # get a handler - must inherit from Data::Stag::BaseHandler;
268 my $w = $self->handler;
271 ### $w->S("chaos_block");
273 my $seq_chaos_feature_id;
275 # different seq objects have different version accessors -
277 my $version = $seq->can('seq_version') ?
$seq->seq_version : $seq->version;
279 my $accversion = $seq->accession_number;
281 $accversion .= ".$version";
285 $seq_chaos_feature_id = $accversion;
288 $seq_chaos_feature_id = $self->get_chaos_feature_id($seq);
289 $accversion = $seq_chaos_feature_id;
292 # All ids must have a namespace prefix
293 if ($seq_chaos_feature_id !~ /:/) {
294 $seq_chaos_feature_id = "GenericSeqDB:$seq_chaos_feature_id";
297 # if ($seq->accession_number eq 'unknown') {
298 # $seq_chaos_feature_id = $self->get_chaos_feature_id('contig', $seq);
302 if ($seq->desc =~ /haplotype(.*)/i) {
303 # yikes, no consistent way to specify haplotype in gb
305 $haplotype =~ s/\s+/_/g;
306 $haplotype =~ s/\W+//g;
311 if (my $spec = $seq->species) {
312 my ($species, $genus, @class) = $spec->classification();
313 $OS = "$genus $species";
314 if (my $ssp = $spec->sub_species) {
317 $self->genus_species($OS);
318 if( $spec->common_name ) {
319 my $common = $spec->common_name;
320 # genbank parser sets species->common_name to
321 # be "Genus Species (common name)" which is wrong;
322 # we will correct for this; if common_name is set
323 # correctly then carry on
324 if ($common =~ /\((.*)\)/) {
327 $OS .= " (".$common.")";
331 $self->organismstr($OS);
334 # genus_species is part of uniquename - add haplotype
335 # to make it genuinely unique
336 $self->genus_species($self->genus_species .= " $haplotype");
339 my $uname = $self->make_uniquename($self->genus_species, $accversion);
341 # data structure representing the core sequence for this record
343 Data
::Stag
->new(feature
=>[
344 [feature_id
=>$seq_chaos_feature_id],
345 [dbxrefstr
=>'SEQDB:'.$accversion],
346 [name
=>$seq->display_name],
347 [uniquename
=>$uname],
348 [residues
=>$seq->seq],
354 $seqnode->set_type('databank_entry');
357 $prop{$_} = $seq->$_() if $seq->can($_);
358 } qw(desc keywords division molecule is_circular);
359 $prop{dates
} = join("; ", $seq->get_dates) if $seq->can("get_dates");
361 local($^W
) = 0; # supressing warnings about uninitialized fields.
365 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
370 $seqnode->add_featureprop([[type
=>'haplotype'],[value
=>$haplotype]])
372 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
373 $seqnode->add_featureprop([[type
=>'comment'],[value
=>$comment->text]]);
376 $seqnode->set_organismstr($OS);
379 my @sfs = $seq->get_SeqFeatures;
381 # genbank usually includes a 'source' feature - we just
382 # migrate the data from this to the actual source feature
383 my @sources = grep {$_->primary_tag eq 'source'} @sfs;
384 @sfs = grep {$_->primary_tag ne 'source'} @sfs;
385 $self->throw(">1 source types") if @sources > 1;
386 my $source = shift @sources;
389 my $tempw = Data
::Stag
->makehandler;
390 $self->write_sf($source, $seq_chaos_feature_id, $tempw);
391 my $snode = $tempw->stag;
392 $seqnode->add($_->name, $_->data)
393 foreach ($snode->get_featureprop,
394 $snode->get_feature_dbxref);
399 # throw the writer an event
402 $seqnode = undef; # free memory
404 # make events for all the features within the record
405 foreach my $sf ( @sfs ) {
406 $FNAMER->name_feature($sf);
407 $FNAMER->name_contained_features($sf);
408 $self->write_sf($sf, $seq_chaos_feature_id);
412 ### $w->E("chaos_block");
420 return $self->{'organismstr'} = shift if @_;
421 return $self->{'organismstr'};
428 return $self->{'genus_species'} = shift if @_;
429 return $self->{'genus_species'};
436 $self->{_type_by_id_h
} = shift if @_;
437 return $self->{_type_by_id_h
};
443 # writes a seq feature
449 my $seq_chaos_feature_id = shift;
450 my $w = shift || $self->handler;
454 lc($_)=>[$sf->each_tag_value($_)]
457 my $loc = $sf->location;
458 my $name = $FNAMER->generate_feature_name($sf);
459 my $type = $sf->primary_tag;
461 # The CDS (eg in a genbank feature) implicitly represents
463 $type =~ s/CDS/polypeptide/;
465 my @subsfs = $sf->sub_SeqFeature;
467 my $sid = $loc->is_remote ?
$loc->seq_id : $seq_chaos_feature_id;
469 my $CREATE_SPLIT_SFS = 0;
471 if($CREATE_SPLIT_SFS &&
472 $loc->isa("Bio::Location::SplitLocationI") ) {
473 # turn splitlocs into subfeatures
478 Bio
::SeqFeature
::Generic
->new(
483 -primary
=>$self->subpartof($type),
486 $ssf->location->is_remote(1);
487 $ssf->location->seq_id($_->seq_id);
490 } $loc->each_Location);
492 elsif( $loc->isa("Bio::Location::RemoteLocationI") ) {
493 # turn splitlocs into subfeatures
497 Bio
::SeqFeature
::Generic
->new(
498 # -name=>$name.'.'.$n++,
502 -primary
=>$self->subpartof($type),
504 } $loc->each_Location);
507 my ($beg, $end, $strand) = $self->bp2ib($loc);
510 print Dumper
$sf, $loc;
511 $self->throw("($beg, $end, $strand) - no strand\n");
518 [srcfeature_id
=>$sid],
525 my $feature_id = $self->get_chaos_feature_id($sf);
527 delete $props{id
} if $props{id
};
528 # do something with genbank stuff
529 my $pid = $props{'protein_id'};
530 my $tn = $props{'translation'};
531 my @xrefs = @
{$props{'db_xref'} || []};
533 push(@xrefs, "protein:$pid->[0]");
536 my $org = $props{organism
} ?
$props{organism
}->[0] : undef;
537 if (!$org && $self->organismstr) {
538 $org = $self->organismstr;
540 my $uname = $name ?
$name.'/'.$feature_id : $feature_id;
541 if ($self->genus_species && $name) {
542 $uname = $self->make_uniquename($self->genus_species, $name);
545 $self->throw("cannot make uniquename for $feature_id $name");
547 $self->_type_by_id_h->{$feature_id} = $type;
550 [feature_id
=>$feature_id],
551 $name ?
([name
=>$name]) : (),
552 [uniquename
=>$uname],
554 $tn ?
([residues
=>$tn->[0]],
555 [seqlen
=>length($tn->[0])],
556 #####[md5checksum=>md5checksum($tn->[0])],
558 $org ?
([organismstr
=>$org]) : (),
569 map { [featureprop
=>[[type
=>$k],[value
=>$_],[rank
=>$rank++]]] } @
{$props{$k}}
576 # strand is always determined by FIRST feature listed
577 # (see genbank entry for trans-spliced mod(mdg4) AE003734)
578 my $strand = $subsfs[0];
580 # almost all the time, all features are on same strand
581 my @sfs_on_main_strand = grep {$_->strand == $strand} @subsfs;
582 my @sfs_on_other_strand = grep {$_->strand != $strand} @subsfs;
584 sort_by_strand
($strand, \
@sfs_on_main_strand);
585 sort_by_strand
(0-$strand, \
@sfs_on_other_strand);
586 @subsfs = (@sfs_on_main_strand, @sfs_on_other_strand);
588 foreach my $ssf (@subsfs) {
589 my $ssfid = $self->write_sf($ssf, $sid);
590 #my $rtype = 'part_of';
592 $TM->get_relationship_type_by_parent_child($sf,$ssf);
593 if ($ssf->primary_tag eq 'CDS') {
594 $rtype = 'derives_from';
596 $w->ev(feature_relationship
=>[
597 [subject_id
=>$ssfid],
598 [object_id
=>$feature_id],
606 # parents not stored as bioperl containment hierarchy
607 my @parent_ids = @
{$props{parent
} || []};
608 foreach my $parent_id (@parent_ids) {
610 $self->_type_by_id_h->{$parent_id} || 'unknown';
612 $TM->get_relationship_type_by_parent_child($ptype,$type);
613 $w->ev(feature_relationship
=>[
614 [subject_id
=>$feature_id],
615 [object_id
=>$parent_id],
626 my $strand = shift || 1;
628 @
$sfs = sort { ($a->start <=> $b->start) * $strand } @
$sfs;
632 sub make_uniquename
{
648 sub get_chaos_feature_id
{
653 if ($ob->isa("Bio::SeqI")) {
654 $id = $ob->accession_number . '.' . ($ob->can('seq_version') ?
$ob->seq_version : $ob->version);
657 $ob->isa("Bio::SeqFeatureI") || $self->throw("$ob must be either SeqI or SeqFeatureI");
659 if ($ob->primary_id) {
660 $id = $ob->primary_id;
664 $id = $IDH->generate_unique_persistent_id($ob);
668 $id = "$ob"; # last resort - use memory pointer ref
669 # will not be persistent, but will be unique
674 if ($ob->isa("Bio::SeqFeatureI")) {
675 $id = $IDH->generate_unique_persistent_id($ob);
678 $self->throw("Cannot generate a unique persistent ID for a Seq without either primary_id or accession");
682 $id = $self->context_namespace ?
$self->context_namespace . ":" . $id : $id;
688 # interbase and directional semantics
693 ref($loc) eq "ARRAY" ?
(@
$loc) : ($loc->start, $loc->end, $loc->strand);
698 return ($s, $e, $str || 1);
703 my $type = 'partof_'.shift;
704 $type =~ s/partof_CDS/CDS_exon/;
705 $type =~ s/partof_protein/CDS_exon/;
706 $type =~ s/partof_polypeptide/CDS_exon/;
707 $type =~ s/partof_\w*RNA/exon/;