Merge pull request #111 from adsj/master
[bioperl-live.git] / examples / searchio / waba2gff3.pl
blobee43f04843fd860285ba7895109868f8d8d64d71
1 #!/usr/bin/perl
3 =head1 NAME
5 waba2gff3.pl - convert waba output into GFF3 suitable for Gbrowse
7 =head1 DESCRIPTION
9 This script turns WABA output into GFF3 output for the query sequence.
10 If you need to get this where the Hit sequence is the reference
11 sequence you'll want to flip-flop the code to use hit instead of
12 query. I didn't try and make it that general yet.
14 I don't (yet) know how the 'score' field is calculate by Wormbase
15 folks for WABA data in their GFF dumps. I'm checking on that but it
16 shouldn't make a difference for Gbrowse.
18 =head1 AUTHOR
20 Jason Stajich, jason-at-bioperl-dot-org
21 Duke University,
23 =head1 LICENSE
25 This script is available under the Perl Artistic License meaning you
26 can do with it what you wish.
28 Please do tell me about bugs or improvements so I can roll those back
29 in for other people to use.
32 =cut
36 use strict;
37 use Bio::SearchIO;
38 use Bio::SeqFeature::Generic;
39 use Bio::Tools::GFF;
40 use Getopt::Long;
42 my %States = ('1' => 'coding',
43 '2' => 'coding',
44 '3' => 'coding',
45 'L' => 'weak',
46 'H' => 'strong',
48 my ($infile,$outfile,$verbose,$version);
49 $version = 3;
50 my $ptag = 'nucleotide_match';
51 GetOptions(
52 'i|input:s' => \$infile,
53 'o|output:s' => \$outfile,
54 'v|verbose' => \$verbose,
55 'version' => \$version,
56 'p|primary|primary_tag:s' => \$ptag,
58 $infile = shift unless $infile;
60 my $in;
62 if( $infile ) {
63 $in = new Bio::SearchIO(-verbose => $verbose,
64 -format => 'waba',
65 -file => $infile);
66 } else {
67 $in = new Bio::SearchIO(-verbose => $verbose,
68 -format => 'waba',
69 -fh => \*ARGV);
72 my $out;
73 if( defined $outfile) {
74 $out = new Bio::Tools::GFF(-gff_version => $version,
75 -file => ">$outfile",
76 -verbose => $verbose);
77 } else {
78 $out = new Bio::Tools::GFF(-gff_version => $version,
79 -verbose => $verbose);
82 while( my $r = $in->next_result ) {
83 while( my $hit = $r->next_hit ) {
84 while( my $hsp = $hit->next_hsp ) {
85 # now split this HSP up into pieces
86 my ($qs,$qe,$hs,$he)= ($hsp->query->start,
87 $hsp->query->end,
88 $hsp->hit->start,
89 $hsp->hit->end);
90 my $i = 0;
91 # grab the HMM states from Jim's WABA output
92 my $stateseq = $hsp->hmmstate_string;
93 my $state_len = length($stateseq);
94 my ($piece,$gap,@pieces);
95 $piece = {'length' => 0,
96 'str' => '',
97 'start' => $i};
98 $gap = 0;
100 # parse the state string, finding the gaps (Q and T states)
101 # runs of Non Q or T letters indicate a 'piece'
102 while($i < $state_len ) {
103 my $char = substr($stateseq,$i,1);
104 if($char =~ /[QT]/ ) {
105 $gap++;
106 } elsif( $gap ) {
107 # just finished a gap
108 $piece->{'length'} = length($piece->{'str'});
109 push @pieces, $piece;
110 $piece = {'length' => 0,
111 'str' => '',
112 'start' => $i };
113 $gap = 0;
114 } else {
115 $piece->{'str'} .= $char;
117 $i++;
119 # for each piece, this could be made up of things either
120 # as H,L, or 123 state.
121 # In retrospect this could all probably be contained in a
122 # single loop, but now I'm feeling lazy. I had just converted this
123 # from using 'split' in the first place if you want to know
124 # why it is structured this way....
125 for my $piece ( @pieces ) {
127 my $len = $piece->{length};
128 my $start = $piece->{start};
129 my $end = $start + $len;
130 my ($j) = 0;
131 my $state = substr($piece->{str},$j++,1);
132 warn("start is $start end is $end len is $len\n") if $verbose;
133 my ($set,@sets) = ($state);
134 while( $j < $len ) {
135 my $char = substr($piece->{str},$j++,1);
136 next unless( $char);
137 if( ($char =~ /\d/ && $state =~ /\d/) ||
138 ($char =~ /\w/ && $char eq $state) ) {
139 $set .= $char;
140 } else {
141 push @sets, $set;
142 $set = $state = $char;
145 push @sets, $set;
146 for my $set (@sets ) {
147 my $c = substr($set,0,1);
148 if( ! $c ) {
149 warn("no char for '$set'\n") if $verbose;
150 next;
152 my $type ='waba_'.$States{$c};
153 my $f = Bio::SeqFeature::Generic->new(
154 -start => $qs + $start,
155 -end => $qs + $start + length($set),
156 -strand=> $hsp->query->strand,
157 -seq_id=> $hsp->query->seq_id,
158 -score => $hsp->query->score,
159 -primary_tag => $ptag,
160 -source_tag => $type,
161 -tag => {
162 'ID' => $hsp->hit->seq_id
164 $f->add_tag_value('ID',$hs+$start,$hs+$start+$f->length);
165 $out->write_feature($f);
166 $start += $f->length+1;