Couple more minor edits ...
[bioperl-live.git] / Bio / ClusterIO / dbsnp.pm
blobe2072f33aa5455fa1f04a202116b9c1dc298ab52
1 # BioPerl module for Bio::ClusterIO::dbsnp
3 # Copyright Allen Day <allenday@ucla.edu>, Stan Nelson <snelson@ucla.edu>
4 # Human Genetics, UCLA Medical School, University of California, Los Angeles
6 # POD documentation - main docs before the code
8 =head1 NAME
10 Bio::ClusterIO::dbsnp - dbSNP input stream
12 =head1 SYNOPSIS
14 Do not use this module directly. Use it via the Bio::ClusterIO class.
16 =head1 DESCRIPTION
18 Parse dbSNP XML files, one refSNP entry at a time. Note this handles dbSNPp
19 output generated by NBCI's eutils and does NOT parse output derived from
20 SNP's XML format (found at ftp://ftp.ncbi.nih.gov/snp/).
22 =head1 FEEDBACK
24 =head2 Mailing Lists
26 User feedback is an integral part of the evolution of this and other
27 Bioperl modules. Send your comments and suggestions preferably to one
28 of the Bioperl mailing lists. Your participation is much appreciated.
30 bioperl-l@bioperl.org - General discussion
31 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
33 =head2 Support
35 Please direct usage questions or support issues to the mailing list:
37 I<bioperl-l@bioperl.org>
39 rather than to the module maintainer directly. Many experienced and
40 reponsive experts will be able look at the problem and quickly
41 address it. Please include a thorough description of the problem
42 with code and data examples if at all possible.
44 =head2 Reporting Bugs
46 Report bugs to the Bioperl bug tracking system to help us keep track
47 the bugs and their resolution. Bug reports can be submitted via the
48 web:
50 https://github.com/bioperl/bioperl-live/issues
52 =head1 AUTHOR
54 Allen Day E<lt>allenday@ucla.eduE<gt>
56 =head1 APPENDIX
58 The rest of the documentation details each of the object
59 methods. Internal methods are usually preceded with a _
61 =cut
63 # Let the code begin...
64 package Bio::ClusterIO::dbsnp;
66 use strict;
67 use Bio::Root::Root;
68 use Bio::Variation::SNP;
69 use XML::SAX;
70 use Data::Dumper;
71 use IO::File;
72 use Time::HiRes qw(tv_interval gettimeofday);
74 use base qw(Bio::ClusterIO);
76 our $DEBUG = 0;
78 our %MAPPING = (
79 #the ones commented out i haven't written methods for yet... -Allen
80 'Rs_rsId' => 'id',
81 # 'Rs_taxId' => 'tax_id',
82 # 'Rs_organism' => 'organism',
83 'Rs_snpType' => {'type' => 'value'},
84 'Rs_sequence_observed' => 'observed',
85 'Rs_sequence_seq5' => 'seq_5',
86 'Rs_sequence_seq3' => 'seq_3',
87 # 'Rs_sequence_exemplarSs' => 'exemplar_subsnp',
88 'Rs_create_build' => 'ncbi_build',
89 #?? 'Rs_update_build' => 'ncbi_build',
90 # 'NSE-rs_ncbi-num-chr-hits' => 'ncbi_chr_hits',
91 # 'NSE-rs_ncbi-num-ctg-hits' => 'ncbi_ctg_hits',
92 # 'NSE-rs_ncbi-num-seq-loc' => 'ncbi_seq_loc',
93 # 'NSE-rs_ncbi-mapweight' => 'ncbi_mapweight',
94 # 'NSE-rs_ucsc-build-id' => 'ucsc_build',
95 # 'NSE-rs_ucsc-num-chr-hits' => 'ucsc_chr_hits',
96 # 'NSE-rs_ucsc-num-seq-loc' => 'ucsc_ctg_hits',
97 # 'NSE-rs_ucsc-mapweight' => 'ucsc_mapweight',
99 'Rs_het_value' => 'heterozygous',
100 'Rs_het-stdError' => 'heterozygous_SE',
101 'Rs_validation' => {'validated' => 'value'}, #??
102 # 'NSE-rs_genotype' => {'genotype' => 'value'},
104 'Ss_handle' => 'handle',
105 'Ss_batchId' => 'batch_id',
106 'Ss_locSnpId' => 'id',
107 # 'Ss_locSnpId' => 'loc_id',
108 # 'Ss_orient' => {'orient' => 'value'},
109 # 'Ss_buildId' => 'build',
110 'Ss_methodClass' => {'method' => 'value'},
111 # 'NSE-ss_accession_E' => 'accession',
112 # 'NSE-ss_comment_E' => 'comment',
113 # 'NSE-ss_genename' => 'gene_name',
114 # 'NSE-ss_assay-5_E' => 'seq_5',
115 # 'NSE-ss_assay-3_E' => 'seq_3',
116 # 'NSE-ss_observed' => 'observed',
118 # 'NSE-ss-popinfo_type' => 'pop_type',
119 # 'NSE-ss-popinfo_batch-id' => 'pop_batch_id',
120 # 'NSE-ss-popinfo_pop-name' => 'pop_name',
121 # 'NSE-ss-popinfo_samplesize' => 'pop_samplesize',
122 # 'NSE-ss_popinfo_est-het' => 'pop_est_heterozygous',
123 # 'NSE-ss_popinfo_est-het-se-sq' => 'pop_est_heterozygous_se_sq',
125 # 'NSE-ss-alleleinfo_type' => 'allele_type',
126 # 'NSE-ss-alleleinfo_batch-id' => 'allele_batch_id',
127 # 'NSE-ss-alleleinfo_pop-id' => 'allele_pop_id',
128 # 'NSE-ss-alleleinfo_snp-allele' => 'allele_snp',
129 # 'NSE-ss-alleleinfo_other-allele' => 'allele_other',
130 # 'NSE-ss-alleleinfo_freq' => 'allele_freq',
131 # 'NSE-ss-alleleinfo_count' => 'allele_count',
133 # 'NSE-rsContigHit_contig-id' => 'contig_hit',
134 # 'NSE-rsContigHit_accession' => 'accession_hit',
135 # 'NSE-rsContigHit_version' => 'version',
136 # 'NSE-rsContigHit_chromosome' => 'chromosome_hit',
138 # 'NSE-rsMaploc_asn-from' => 'asn_from',
139 # 'NSE-rsMaploc_asn-to' => 'asn_to',
140 # 'NSE-rsMaploc_loc-type' => {'loc_type' => 'value'},
141 # 'NSE-rsMaploc_hit-quality' => {'hit_quality' => 'value'},
142 # 'NSE-rsMaploc_orient' => {'orient' => 'value'},
143 # 'NSE-rsMaploc_physmap-str' => 'phys_from',
144 # 'NSE-rsMaploc_physmap-int' => 'phys_to',
146 'FxnSet_geneId' => 'locus_id', # does the code realise that there can be multiple of these
147 'FxnSet_symbol' => 'symbol',
148 'FxnSet_mrnaAcc' => 'mrna',
149 'FxnSet_protAcc' => 'protein',
150 'FxnSet_fxnClass' => {'functional_class' => 'value'},
152 #...
153 #...
154 #there are lots more, but i don't need them at the moment... -Allen
157 sub _initialize{
158 my ($self,@args) = @_;
159 $self->SUPER::_initialize(@args);
160 my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args);
161 defined $usetempfile && $self->use_tempfile($usetempfile);
163 # start up the parser factory
164 my $parserfactory = XML::SAX::ParserFactory->parser(
165 Handler => $self);
166 $self->{'_xmlparser'} = $parserfactory;
167 $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0);
170 =head2 next_cluster
172 Title : next_cluster
173 Usage : $dbsnp = $stream->next_cluster()
174 Function: returns the next refSNP in the stream
175 Returns : Bio::Variation::SNP object representing composite refSNP
176 and its component subSNP(s).
177 Args : NONE
179 =cut
182 #Adapted from Jason's blastxml.pm
185 # you shouldn't have to preparse this; the XML is well-formed and refers
186 # accurately to a remote DTD/schema
188 sub next_cluster {
189 my $self = shift;
190 my $data = '';
191 my($tfh);
193 if( $self->use_tempfile ) {
194 $tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!");
195 $tfh->autoflush(1);
198 my $start = 1;
199 while( defined( $_ = $self->_readline ) ){
200 #skip to beginning of refSNP entry
201 if($_ !~ m{<Rs[^>]*>} && $start){
202 next;
203 } elsif($_ =~ m{<Rs[^>]*>} && $start){
204 $start = 0;
207 #slurp up the data
208 if( defined $tfh ) {
209 print $tfh $_;
210 } else {
211 $data .= $_;
214 #and stop at the end of the refSNP entry
215 last if $_ =~ m{</Rs>};
218 #if we didn't find a start tag
219 return if $start;
221 my %parser_args;
222 if( defined $tfh ) {
223 seek($tfh,0,0);
224 %parser_args = ('Source' => { 'ByteStream' => $tfh },
225 'Handler' => $self);
226 } else {
227 %parser_args = ('Source' => { 'String' => $data },
228 'Handler' => $self);
231 my $starttime;
232 my $result;
234 if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; }
236 eval {
237 $result = $self->{'_xmlparser'}->parse(%parser_args);
240 if( $@ ) {
241 $self->warn("error in parsing a report:\n $@");
242 $result = undef;
245 if( $DEBUG ) {
246 $self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime)));
249 return $self->refsnp;
252 =head2 SAX methods
254 =cut
256 =head2 start_document
258 Title : start_document
259 Usage : $parser->start_document;
260 Function: SAX method to indicate starting to parse a new document.
261 Creates a Bio::Variation::SNP
262 Returns : none
263 Args : none
265 =cut
267 sub start_document{
268 my ($self) = @_;
269 $self->{refsnp} = Bio::Variation::SNP->new;
272 sub refsnp {
273 return shift->{refsnp};
276 =head2 end_document
278 Title : end_document
279 Usage : $parser->end_document;
280 Function: SAX method to indicate finishing parsing a new document
281 Returns : none
282 Args : none
284 =cut
286 sub end_document{
287 my ($self,@args) = @_;
290 =head2 start_element
292 Title : start_element
293 Usage : $parser->start_element($data)
294 Function: SAX method to indicate starting a new element
295 Returns : none
296 Args : hash ref for data
298 =cut
300 sub start_element{
301 my ($self,$data) = @_;
302 my $nm = $data->{'Name'};
303 my $at = $data->{'Attributes'}->{'{}value'};
305 #$self->debug(Dumper($at)) if $nm = ;
307 if($nm eq 'Ss'){
308 $self->refsnp->add_subsnp;
309 return;
312 if(my $type = $MAPPING{$nm}){
313 if(ref $type eq 'HASH'){
314 #okay, this is nasty. what can you do?
315 $self->{will_handle} = (keys %$type)[0];
316 $self->{last_data} = $at->{Value};
317 } else {
318 $self->{will_handle} = $type;
319 $self->{last_data} = undef;
321 } else {
322 undef $self->{will_handle};
326 =head2 end_element
328 Title : end_element
329 Usage : $parser->end_element($data)
330 Function: Signals finishing an element
331 Returns : none
332 Args : hash ref for data
334 =cut
336 sub end_element {
337 my ($self,$data) = @_;
338 my $nm = $data->{'Name'};
339 my $at = $data->{'Attributes'};
341 my $method = $self->{will_handle};
342 if($method){
343 if($nm =~ /^Rs/ or $nm =~ /^NSE-SeqLoc/ or $nm =~ /^FxnSet/){
344 $self->refsnp->$method($self->{last_data});
345 } elsif ($nm =~ /^Ss/){
346 $self->refsnp->subsnp->$method($self->{last_data});
351 =head2 characters
353 Title : characters
354 Usage : $parser->characters($data)
355 Function: Signals new characters to be processed
356 Returns : characters read
357 Args : hash ref with the key 'Data'
359 =cut
361 sub characters{
362 my ($self,$data) = @_;
363 $self->{last_data} = $data->{Data}
364 if $data->{Data} =~ /\S/; #whitespace is meaningless -ad
367 =head2 use_tempfile
369 Title : use_tempfile
370 Usage : $obj->use_tempfile($newval)
371 Function: Get/Set boolean flag on whether or not use a tempfile
372 Example :
373 Returns : value of use_tempfile
374 Args : newvalue (optional)
376 =cut
378 sub use_tempfile{
379 my ($self,$value) = @_;
380 if( defined $value) {
381 $self->{'_use_tempfile'} = $value;
383 return $self->{'_use_tempfile'};