1 # BioPerl module for Bio::ClusterIO::dbsnp
3 # Copyright Allen Day <allenday@ucla.edu>, Stan Nelson <snelson@ucla.edu>
4 # Human Genetics, UCLA Medical School, University of California, Los Angeles
6 # POD documentation - main docs before the code
10 Bio::ClusterIO::dbsnp - dbSNP input stream
14 Do not use this module directly. Use it via the Bio::ClusterIO class.
18 Parse dbSNP XML files, one refSNP entry at a time. Note this handles dbSNP
19 output generated by NCBI's eutils and does NOT parse output derived from
20 SNP's XML format (found at ftp://ftp.ncbi.nih.gov/snp/).
26 User feedback is an integral part of the evolution of this and other
27 Bioperl modules. Send your comments and suggestions preferably to one
28 of the Bioperl mailing lists. Your participation is much appreciated.
30 bioperl-l@bioperl.org - General discussion
31 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
35 Please direct usage questions or support issues to the mailing list:
37 I<bioperl-l@bioperl.org>
39 rather than to the module maintainer directly. Many experienced and
40 responsive experts will be able to look at the problem and quickly
41 address it. Please include a thorough description of the problem
42 with code and data examples if at all possible.
46 Report bugs to the Bioperl bug tracking system to help us keep track
47 of the bugs and their resolution. Bug reports can be submitted via the web:
50 https://github.com/bioperl/bioperl-live/issues
54 Allen Day E<lt>allenday@ucla.eduE<gt>
58 The rest of the documentation details each of the object
59 methods. Internal methods are usually preceded with a _
63 # Let the code begin...
64 package Bio
::ClusterIO
::dbsnp
;
68 use Bio
::Variation
::SNP
;
72 use Time
::HiRes
qw(tv_interval gettimeofday);
74 use base
qw(Bio::ClusterIO);
79 #the ones commented out i haven't written methods for yet... -Allen
81 # 'Rs_taxId' => 'tax_id',
82 # 'Rs_organism' => 'organism',
83 'Rs_snpType' => {'type' => 'value'},
84 'Rs_sequence_observed' => 'observed',
85 'Rs_sequence_seq5' => 'seq_5',
86 'Rs_sequence_seq3' => 'seq_3',
87 # 'Rs_sequence_exemplarSs' => 'exemplar_subsnp',
88 'Rs_create_build' => 'ncbi_build',
89 #?? 'Rs_update_build' => 'ncbi_build',
90 # 'NSE-rs_ncbi-num-chr-hits' => 'ncbi_chr_hits',
91 # 'NSE-rs_ncbi-num-ctg-hits' => 'ncbi_ctg_hits',
92 # 'NSE-rs_ncbi-num-seq-loc' => 'ncbi_seq_loc',
93 # 'NSE-rs_ncbi-mapweight' => 'ncbi_mapweight',
94 # 'NSE-rs_ucsc-build-id' => 'ucsc_build',
95 # 'NSE-rs_ucsc-num-chr-hits' => 'ucsc_chr_hits',
96 # 'NSE-rs_ucsc-num-seq-loc' => 'ucsc_ctg_hits',
97 # 'NSE-rs_ucsc-mapweight' => 'ucsc_mapweight',
99 'Rs_het_value' => 'heterozygous',
100 'Rs_het-stdError' => 'heterozygous_SE',
101 'Rs_validation' => {'validated' => 'value'}, #??
102 # 'NSE-rs_genotype' => {'genotype' => 'value'},
104 'Ss_handle' => 'handle',
105 'Ss_batchId' => 'batch_id',
106 'Ss_locSnpId' => 'id',
107 # 'Ss_locSnpId' => 'loc_id',
108 # 'Ss_orient' => {'orient' => 'value'},
109 # 'Ss_buildId' => 'build',
110 'Ss_methodClass' => {'method' => 'value'},
111 # 'NSE-ss_accession_E' => 'accession',
112 # 'NSE-ss_comment_E' => 'comment',
113 # 'NSE-ss_genename' => 'gene_name',
114 # 'NSE-ss_assay-5_E' => 'seq_5',
115 # 'NSE-ss_assay-3_E' => 'seq_3',
116 # 'NSE-ss_observed' => 'observed',
118 # 'NSE-ss-popinfo_type' => 'pop_type',
119 # 'NSE-ss-popinfo_batch-id' => 'pop_batch_id',
120 # 'NSE-ss-popinfo_pop-name' => 'pop_name',
121 # 'NSE-ss-popinfo_samplesize' => 'pop_samplesize',
122 # 'NSE-ss_popinfo_est-het' => 'pop_est_heterozygous',
123 # 'NSE-ss_popinfo_est-het-se-sq' => 'pop_est_heterozygous_se_sq',
125 # 'NSE-ss-alleleinfo_type' => 'allele_type',
126 # 'NSE-ss-alleleinfo_batch-id' => 'allele_batch_id',
127 # 'NSE-ss-alleleinfo_pop-id' => 'allele_pop_id',
128 # 'NSE-ss-alleleinfo_snp-allele' => 'allele_snp',
129 # 'NSE-ss-alleleinfo_other-allele' => 'allele_other',
130 # 'NSE-ss-alleleinfo_freq' => 'allele_freq',
131 # 'NSE-ss-alleleinfo_count' => 'allele_count',
133 # 'NSE-rsContigHit_contig-id' => 'contig_hit',
134 # 'NSE-rsContigHit_accession' => 'accession_hit',
135 # 'NSE-rsContigHit_version' => 'version',
136 # 'NSE-rsContigHit_chromosome' => 'chromosome_hit',
138 # 'NSE-rsMaploc_asn-from' => 'asn_from',
139 # 'NSE-rsMaploc_asn-to' => 'asn_to',
140 # 'NSE-rsMaploc_loc-type' => {'loc_type' => 'value'},
141 # 'NSE-rsMaploc_hit-quality' => {'hit_quality' => 'value'},
142 # 'NSE-rsMaploc_orient' => {'orient' => 'value'},
143 # 'NSE-rsMaploc_physmap-str' => 'phys_from',
144 # 'NSE-rsMaploc_physmap-int' => 'phys_to',
146 'FxnSet_geneId' => 'locus_id', # does the code realise that there can be multiple of these
147 'FxnSet_symbol' => 'symbol',
148 'FxnSet_mrnaAcc' => 'mrna',
149 'FxnSet_protAcc' => 'protein',
150 'FxnSet_fxnClass' => {'functional_class' => 'value'},
154 #there are lots more, but i don't need them at the moment... -Allen
158 my ($self,@args) = @_;
159 $self->SUPER::_initialize
(@args);
160 my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args);
161 defined $usetempfile && $self->use_tempfile($usetempfile);
163 # start up the parser factory
164 my $parserfactory = XML
::SAX
::ParserFactory
->parser(
166 $self->{'_xmlparser'} = $parserfactory;
167 $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0);
173 Usage : $dbsnp = $stream->next_cluster()
174 Function: returns the next refSNP in the stream
175 Returns : Bio::Variation::SNP object representing composite refSNP
176 and its component subSNP(s).
182 #Adapted from Jason's blastxml.pm
185 # you shouldn't have to preparse this; the XML is well-formed and refers
186 # accurately to a remote DTD/schema
193 if( $self->use_tempfile ) {
194 $tfh = IO
::File
->new_tmpfile or $self->throw("Unable to open temp file: $!");
199 while( defined( $_ = $self->_readline ) ){
200 #skip to beginning of refSNP entry
201 if($_ !~ m{<Rs[^>]*>} && $start){
203 } elsif($_ =~ m{<Rs[^>]*>} && $start){
214 #and stop at the end of the refSNP entry
215 last if $_ =~ m{</Rs>};
218 #if we didn't find a start tag
224 %parser_args = ('Source' => { 'ByteStream' => $tfh },
227 %parser_args = ('Source' => { 'String' => $data },
234 if( $DEBUG ) { $starttime = [ Time
::HiRes
::gettimeofday
() ]; }
237 $result = $self->{'_xmlparser'}->parse(%parser_args);
241 $self->warn("error in parsing a report:\n $@");
246 $self->debug( sprintf("parsing took %f seconds\n", Time
::HiRes
::tv_interval
($starttime)));
249 return $self->refsnp;
256 =head2 start_document
258 Title : start_document
259 Usage : $parser->start_document;
260 Function: SAX method to indicate starting to parse a new document.
261 Creates a Bio::Variation::SNP
269 $self->{refsnp
} = Bio
::Variation
::SNP
->new;
273 return shift->{refsnp
};
279 Usage : $parser->end_document;
280 Function: SAX method to indicate finishing parsing a new document
287 my ($self,@args) = @_;
292 Title : start_element
293 Usage : $parser->start_element($data)
294 Function: SAX method to indicate starting a new element
296 Args : hash ref for data
301 my ($self,$data) = @_;
302 my $nm = $data->{'Name'};
303 my $at = $data->{'Attributes'}->{'{}value'};
305 #$self->debug(Dumper($at)) if $nm = ;
308 $self->refsnp->add_subsnp;
312 if(my $type = $MAPPING{$nm}){
313 if(ref $type eq 'HASH'){
314 #okay, this is nasty. what can you do?
315 $self->{will_handle
} = (keys %$type)[0];
316 $self->{last_data
} = $at->{Value
};
318 $self->{will_handle
} = $type;
319 $self->{last_data
} = undef;
322 undef $self->{will_handle
};
329 Usage : $parser->end_element($data)
330 Function: Signals finishing an element
332 Args : hash ref for data
337 my ($self,$data) = @_;
338 my $nm = $data->{'Name'};
339 my $at = $data->{'Attributes'};
341 my $method = $self->{will_handle
};
343 if($nm =~ /^Rs/ or $nm =~ /^NSE-SeqLoc/ or $nm =~ /^FxnSet/){
344 $self->refsnp->$method($self->{last_data
});
345 } elsif ($nm =~ /^Ss/){
346 $self->refsnp->subsnp->$method($self->{last_data
});
354 Usage : $parser->characters($data)
355 Function: Signals new characters to be processed
356 Returns : characters read
357 Args : hash ref with the key 'Data'
362 my ($self,$data) = @_;
363 $self->{last_data
} = $data->{Data
}
364 if $data->{Data
} =~ /\S/; #whitespace is meaningless -ad
370 Usage : $obj->use_tempfile($newval)
371 Function: Get/Set boolean flag on whether or not to use a tempfile
373 Returns : value of use_tempfile
374 Args : newvalue (optional)
379 my ($self,$value) = @_;
380 if( defined $value) {
381 $self->{'_use_tempfile'} = $value;
383 return $self->{'_use_tempfile'};