changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / Assembly / IO / bowtie.pm
blobcc5149859369626873965e2cfc85719bc3768baa
2 # BioPerl module for Bio::Assembly::IO::sam
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Dan Kortschak <dan.kortschak adelaide.edu.au>
8 # Copyright Dan Kortschak
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Assembly::IO::bowtie - An IO module for assemblies in Bowtie format *BETA*
18 =head1 SYNOPSIS
20 $aio = Bio::Assembly::IO( -file => "mybowtie.bowtie",
21 -index => "myindex",
22 -format => "bowtie");
23 $assy = $aio->next_assembly;
25 =head1 DESCRIPTION
27 This is a read-only IO module designed to convert Bowtie
28 (L<http://bowtie-bio.sourceforge.net/>) formatted alignments to
29 L<Bio::Assembly::Scaffold> representations, containing
30 L<Bio::Assembly::Contig> and L<Bio::Assembly::Singlet> objects.
31 It is a wrapper that converts the Bowtie format to BAM format taken
32 by the Bio::Assembly::IO::sam module which in turn uses lstein's
33 L<Bio::DB::Sam> to parse binary formatted SAM (.bam) files guided by a
34 reference sequence fasta database.
36 Some information is lost in conversion from bowtie format to SAM/BAM format
37 that is provided by Bowtie using the SAM output option and the conversion
38 to SAM format from bowtie format is slower than using bowtie's SAM option.
39 If you plan to use SAM/BAM format it is preferable to use this Bowtie
40 option rather than convert the format after the fact.
42 See the Bio::Assembly::IO::sam documentation for relevant details.
44 =head1 DETAILS
46 =over
48 =item * Required files
50 A bowtie (C<.bowtie>) alignment and the bowtie index or fasta
51 file used to generate the alignment are required.
53 =item * Compressed files
55 ...can be specified directly , if L<IO::Uncompress::Gunzip> is
56 installed. Get it from your local CPAN mirror.
58 =back
60 =head1 FEEDBACK
62 =head2 Mailing Lists
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to
66 the Bioperl mailing list. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71 =head2 Support
73 Please direct usage questions or support issues to the mailing list:
75 L<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
82 =head2 Reporting Bugs
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 of the bugs and their resolution. Bug reports can be submitted via
86 the web:
88 https://github.com/bioperl/bioperl-live/issues
90 =head1 AUTHOR - Dan Kortschak
92 Email dan.kortschak adelaide.edu.au
94 =head1 APPENDIX
96 The rest of the documentation details each of the object methods.
97 Internal methods are usually preceded with a _
99 =cut
101 # Let the code begin...
104 package Bio::Assembly::IO::bowtie;
105 use strict;
106 use warnings;
108 # Object preamble - inherits from Bio::Root::Root
110 use Bio::SeqIO;
111 use Bio::Tools::Run::Samtools;
112 use Bio::Assembly::IO;
114 use base qw( Bio::Assembly::IO );
116 our $HD = "\@HD\tVN:1.0\tSO:unsorted\n";
117 our $PG = "\@PG\tID=Bowtie\n";
119 our $HAVE_IO_UNCOMPRESS;
120 our $HAVE_BOWTIE;
122 BEGIN {
123 # check requirements
124 eval "require Bio::Tools::Run::Bowtie; \$HAVE_BOWTIE = 1";
125 unless ( eval "require IO::Uncompress::Gunzip; \$HAVE_IO_UNCOMPRESS = 1") {
126 Bio::Root::Root->warn("IO::Uncompress::Gunzip is not available; you'll have to do your decompression by hand.");
130 =head2 new()
132 Title : new
133 Usage : my $obj = new Bio::Assembly::IO::bowtie();
134 Function: Builds a new Bio::Assembly::IO object
135 Returns : an instance of Bio::Assembly::IO
136 Args : hash of options:
138 -file => bowtie_output_file
139 -index => bowtie_index or fasta_file used to create index
140 -no_head => boolean skip SAM header
141 -no_sq => boolean skip SQ lines of SAM header
143 Note : bowtie_output and fasta files may be gzipped
145 =cut
147 sub new {
148 my $class = shift;
149 my @args = @_;
150 my $self = $class->SUPER::new(@args);
151 $self->_initialize(@args);
152 $self->{'_tempdir'} = $self->tempdir(CLEANUP=>1);
153 my ($file, $index, $no_head, $no_sq) = $self->_rearrange([qw(FILE INDEX NO_HEAD NO_SQ)], @args);
154 $file =~ s/^<//;
155 $self->{'_no_head'} = $no_head;
156 $self->{'_no_sq'} = $no_sq;
158 # get the sequence so Bio::DB::Sam can work with it
159 my $refdb;
160 my $inspector;
161 if (-e $index && -r _ ) {
162 $refdb = ($index =~ m/\.gz[^.]*$/) ? $self->_uncompress($index) : $index;
163 my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$refdb);
164 $self->throw("'$index' is not a fasta file.")
165 unless $guesser->guess =~ m/^fasta$/;
166 } elsif ($HAVE_BOWTIE) {
167 $inspector = Bio::Tools::Run::Bowtie->new( -command => 'inspect' );
168 $refdb = $inspector->run($index);
169 } else {
170 $self->throw("Bio::Tools::Run::Bowtie is not available - cannot extract refdb from index.");
173 my $bam_file = $self->_make_bam($self->_bowtie_to_sam($file, $refdb));
174 my $sam = Bio::Assembly::IO->new( -file => "<$bam_file", -refdb => $refdb , -format => 'sam' );
176 return $sam;
179 sub _bowtie_to_sam {
180 my ($self, $file, $refdb) = @_;
182 $self->throw("'$file' does not exist or is not readable.")
183 unless ( -e $file && -r _ );
185 if ($file =~ m/\.gz[^.]*$/) {
186 $file = $self->_uncompress($file);
187 $self->close;
188 open my $fh, '<', $file or $self->throw("Could not read file '$file': $!");
189 $self->file($file);
190 $self->_fh($fh);
193 my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$file);
194 $self->throw("'$file' is not a bowtie formatted file.") unless $guesser->guess =~ m/^bowtie$/;
196 my %SQ;
197 my $mapq = 255;
198 my $in_pair;
199 my @mate_line;
200 my $mlen;
202 # create temp file for working
203 my ($sam_tmp_h, $sam_tmp_f) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.sam' );
205 while (my $line = $self->_readline) {
206 chomp($line);
207 my ($qname,$strand,$rname,$pos,$seq,$qual,$m,$details)=split("\cI",$line);
208 $SQ{$rname} = 1;
210 my $paired_f = ($qname =~ m#/[12]#) ? 0x03 : 0;
211 my $strand_f = ($strand eq '-') ? 0x10 : 0;
212 my $op_strand_f = ($strand eq '+' && $paired_f) ? 0x20 : 0;
213 my $first_f = ($qname =~ m#/1#) ? 0x40 : 0;
214 my $second_f = ($qname =~ m#/2#) ? 0x80 : 0;
215 my $flag = $paired_f | $strand_f | $op_strand_f | $first_f | $second_f;
217 $pos++;
218 my $len = length $seq;
219 die unless $len == length $qual;
220 my $cigar = $len.'M';
221 my @detail = split(',',$details);
222 my $dist = 'NM:i:'.scalar @detail;
224 my @mismatch;
225 my $last_pos = 0;
226 for (@detail) {
227 m/(\d+):(\w)>\w/;
228 my $err = ($1-$last_pos);
229 $last_pos = $1+1;
230 push @mismatch,($err,$2);
232 push @mismatch, $len-$last_pos;
233 @mismatch = reverse @mismatch if $strand eq '-';
234 my $mismatch = join('',('MD:Z:',@mismatch));
236 if ($paired_f) {
237 my $mrnm = '=';
238 if ($in_pair) {
239 my $mpos = $mate_line[3];
240 $mate_line[7] = $pos;
241 my $isize = $mpos-$pos-$len;
242 $mate_line[8] = -$isize;
243 print $sam_tmp_h join("\t",@mate_line),"\n";
244 print $sam_tmp_h join("\t",$qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, $mpos, $isize, $seq, $qual, $mismatch, $dist),"\n";
245 $in_pair = 0;
246 } else {
247 $mlen = $len;
248 @mate_line = ($qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, undef, undef, $seq, $qual, $mismatch, $dist);
249 $in_pair = 1;
251 } else {
252 my $mrnm = '*';
253 my $mpos = 0;
254 my $isize = 0;
255 print $sam_tmp_h join("\t",$qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, $mpos, $isize, $seq, $qual, $mismatch, $dist),"\n";
259 $sam_tmp_h->close;
261 return $sam_tmp_f if $self->{'_no_head'};
263 my ($samh, $samf) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.sam' );
265 # print header
266 print $samh $HD;
268 # print sequence dictionary
269 unless ($self->{'_no_sq'}) {
270 my $db = Bio::SeqIO->new( -file => $refdb, -format => 'fasta' );
271 while ( my $seq = $db->next_seq() ) {
272 $SQ{$seq->id} = $seq->length if $SQ{$seq->id};
275 map { print $samh join("\t", ('@SQ', "SN:$_", "LN:$SQ{$_}")), "\n" } keys %SQ;
278 # print program
279 print $samh $PG;
281 # print alignments
282 open $sam_tmp_h, '<', $sam_tmp_f or
283 $self->throw("Could not read file '$sam_tmp_f': $!");
285 print $samh $_ while (<$sam_tmp_h>);
287 close($sam_tmp_h);
288 $samh->close;
290 return $samf;
293 sub _uncompress {
294 my ($self, $file) = @_;
296 unless ($HAVE_IO_UNCOMPRESS) {
297 croak( "IO::Uncompress::Gunzip not available, can't expand '$file'" );
299 my ($tfh, $tf) = $self->tempfile( -dir => $self->{'_tempdir'} );
300 my $z = IO::Uncompress::Gunzip->new($file);
301 while (my $block = $z->getline) { print $tfh $block }
302 close $tfh;
303 $file = $tf;
305 return $file
308 sub _make_bam {
309 my ($self, $file) = @_;
311 $self->throw("'$file' does not exist or is not readable")
312 unless ( -e $file && -r _ );
314 # make a sorted bam file from a sam file input
315 my ($bamh, $bamf) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.bam' );
316 my ($srth, $srtf) = $self->tempfile( -dir => $self->{'_tempdir'}, -suffix => '.srt' );
317 $_->close for ($bamh, $srth);
319 my $samt = Bio::Tools::Run::Samtools->new( -command => 'view',
320 -sam_input => 1,
321 -bam_output => 1 );
323 $samt->run( -bam => $file, -out => $bamf );
325 $samt = Bio::Tools::Run::Samtools->new( -command => 'sort' );
327 $samt->run( -bam => $bamf, -pfx => $srtf);
329 return $srtf.'.bam'