2 # BioPerl module for Bio::Assembly::IO::sam
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Dan Kortschak <dan.kortschak adelaide.edu.au>
8 # Copyright Dan Kortschak
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Assembly::IO::bowtie - An IO module for assemblies in Bowtie format *BETA*
20 $aio = Bio::Assembly::IO( -file => "mybowtie.bowtie",
23 $assy = $aio->next_assembly;
27 This is a read-only IO module designed to convert Bowtie
28 (L<http://bowtie-bio.sourceforge.net/>) formatted alignments to
29 L<Bio::Assembly::Scaffold> representations, containing
30 L<Bio::Assembly::Contig> and L<Bio::Assembly::Singlet> objects.
31 It is a wrapper that converts the Bowtie format to BAM format taken
32 by the Bio::Assembly::IO::sam module which in turn uses lstein's
33 L<Bio::DB::Sam> to parse binary formatted SAM (.bam) files guided by a
34 reference sequence fasta database.
36 Some information is lost in conversion from bowtie format to SAM/BAM format
37 that is provided by Bowtie using the SAM output option and the conversion
38 to SAM format from bowtie format is slower than using bowtie's SAM option.
39 If you plan to use SAM/BAM format it is preferable to use this Bowtie
40 option rather than convert the format after the fact.
42 See the Bio::Assembly::IO::sam documentation for relevant details.
48 =item * Required files
50 A bowtie (C<.bowtie>) alignment and the bowtie index or fasta
51 file used to generate the alignment are required.
53 =item * Compressed files
55 ...can be specified directly , if L<IO::Uncompress::Gunzip> is
56 installed. Get it from your local CPAN mirror.
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to
66 the Bioperl mailing list. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 Please direct usage questions or support issues to the mailing list:
75 L<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 of the bugs and their resolution. Bug reports can be submitted via
88 https://redmine.open-bio.org/projects/bioperl/
90 =head1 AUTHOR - Dan Kortschak
92 Email dan.kortschak adelaide.edu.au
96 The rest of the documentation details each of the object methods.
97 Internal methods are usually preceded with a _
101 # Let the code begin...
104 package Bio
::Assembly
::IO
::bowtie
;
108 # Object preamble - inherits from Bio::Root::Root
111 use Bio
::Tools
::Run
::Samtools
;
112 use Bio
::Assembly
::IO
;
114 use base
qw( Bio::Assembly::IO );
116 our $HD = "\@HD\tVN:1.0\tSO:unsorted\n";
117 our $PG = "\@PG\tID=Bowtie\n";
119 our $HAVE_IO_UNCOMPRESS;
124 eval "require Bio::Tools::Run::Bowtie; \$HAVE_BOWTIE = 1";
125 unless ( eval "require IO::Uncompress::Gunzip; \$HAVE_IO_UNCOMPRESS = 1") {
126 Bio
::Root
::Root
->warn("IO::Uncompress::Gunzip is not available; you'll have to do your decompression by hand.");
133 Usage : my $obj = new Bio::Assembly::IO::bowtie();
134 Function: Builds a new Bio::Assembly::IO object
135 Returns : an instance of Bio::Assembly::IO
136 Args : hash of options:
138 -file => bowtie_output_file
139 -index => bowtie_index or fasta_file used to create index
140 -no_head => boolean skip SAM header
141 -no_sq => boolean skip SQ lines of SAM header
143 Note : bowtie_output and fasta files may be gzipped
150 my $self = $class->SUPER::new
(@args);
151 $self->_initialize(@args);
152 $self->{'_tempdir'} = $self->tempdir(CLEANUP
=>1);
153 my ($file, $index, $no_head, $no_sq) = $self->_rearrange([qw(FILE INDEX NO_HEAD NO_SQ)], @args);
155 $self->{'_no_head'} = $no_head;
156 $self->{'_no_sq'} = $no_sq;
158 # get the sequence so Bio::DB::Sam can work with it
161 if (-e
$index && -r _
) {
162 $refdb = ($index =~ m/\.gz[^.]*$/) ?
$self->_uncompress($index) : $index;
163 my $guesser = Bio
::Tools
::GuessSeqFormat
->new(-file
=>$refdb);
164 $self->throw("'$index' is not a fasta file.")
165 unless $guesser->guess =~ m/^fasta$/;
166 } elsif ($HAVE_BOWTIE) {
167 $inspector = Bio
::Tools
::Run
::Bowtie
->new( -command
=> 'inspect' );
168 $refdb = $inspector->run($index);
170 $self->throw("Bio::Tools::Run::Bowtie is not available - cannot extract refdb from index.");
173 my $bam_file = $self->_make_bam($self->_bowtie_to_sam($file, $refdb));
174 my $sam = Bio
::Assembly
::IO
->new( -file
=> "<$bam_file", -refdb
=> $refdb , -format
=> 'sam' );
180 my ($self, $file, $refdb) = @_;
182 $self->throw("'$file' does not exist or is not readable.")
183 unless ( -e
$file && -r _
);
185 if ($file =~ m/\.gz[^.]*$/) {
186 $file = $self->_uncompress($file);
188 open my $fh, '<', $file or $self->throw("Could not read file '$file': $!");
193 my $guesser = Bio
::Tools
::GuessSeqFormat
->new(-file
=>$file);
194 $self->throw("'$file' is not a bowtie formatted file.") unless $guesser->guess =~ m/^bowtie$/;
202 # create temp file for working
203 my ($sam_tmp_h, $sam_tmp_f) = $self->tempfile( -dir
=> $self->{'_tempdir'}, -suffix
=> '.sam' );
205 while (my $line = $self->_readline) {
207 my ($qname,$strand,$rname,$pos,$seq,$qual,$m,$details)=split("\cI",$line);
210 my $paired_f = ($qname =~ m
#/[12]#) ? 0x03 : 0;
211 my $strand_f = ($strand eq '-') ?
0x10 : 0;
212 my $op_strand_f = ($strand eq '+' && $paired_f) ?
0x20 : 0;
213 my $first_f = ($qname =~ m
#/1#) ? 0x40 : 0;
214 my $second_f = ($qname =~ m
#/2#) ? 0x80 : 0;
215 my $flag = $paired_f | $strand_f | $op_strand_f | $first_f | $second_f;
218 my $len = length $seq;
219 die unless $len == length $qual;
220 my $cigar = $len.'M';
221 my @detail = split(',',$details);
222 my $dist = 'NM:i:'.scalar @detail;
228 my $err = ($1-$last_pos);
230 push @mismatch,($err,$2);
232 push @mismatch, $len-$last_pos;
233 @mismatch = reverse @mismatch if $strand eq '-';
234 my $mismatch = join('',('MD:Z:',@mismatch));
239 my $mpos = $mate_line[3];
240 $mate_line[7] = $pos;
241 my $isize = $mpos-$pos-$len;
242 $mate_line[8] = -$isize;
243 print $sam_tmp_h join("\t",@mate_line),"\n";
244 print $sam_tmp_h join("\t",$qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, $mpos, $isize, $seq, $qual, $mismatch, $dist),"\n";
248 @mate_line = ($qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, undef, undef, $seq, $qual, $mismatch, $dist);
255 print $sam_tmp_h join("\t",$qname, $flag, $rname, $pos, $mapq, $cigar, $mrnm, $mpos, $isize, $seq, $qual, $mismatch, $dist),"\n";
261 return $sam_tmp_f if $self->{'_no_head'};
263 my ($samh, $samf) = $self->tempfile( -dir
=> $self->{'_tempdir'}, -suffix
=> '.sam' );
268 # print sequence dictionary
269 unless ($self->{'_no_sq'}) {
270 my $db = Bio
::SeqIO
->new( -file
=> $refdb, -format
=> 'fasta' );
271 while ( my $seq = $db->next_seq() ) {
272 $SQ{$seq->id} = $seq->length if $SQ{$seq->id};
275 map { print $samh join("\t", ('@SQ', "SN:$_", "LN:$SQ{$_}")), "\n" } keys %SQ;
282 open $sam_tmp_h, '<', $sam_tmp_f or
283 $self->throw("Could not read file '$sam_tmp_f': $!");
285 print $samh $_ while (<$sam_tmp_h>);
294 my ($self, $file) = @_;
296 unless ($HAVE_IO_UNCOMPRESS) {
297 croak
( "IO::Uncompress::Gunzip not available, can't expand '$file'" );
299 my ($tfh, $tf) = $self->tempfile( -dir
=> $self->{'_tempdir'} );
300 my $z = IO
::Uncompress
::Gunzip
->new($file);
301 while (my $block = $z->getline) { print $tfh $block }
309 my ($self, $file) = @_;
311 $self->throw("'$file' does not exist or is not readable")
312 unless ( -e
$file && -r _
);
314 # make a sorted bam file from a sam file input
315 my ($bamh, $bamf) = $self->tempfile( -dir
=> $self->{'_tempdir'}, -suffix
=> '.bam' );
316 my ($srth, $srtf) = $self->tempfile( -dir
=> $self->{'_tempdir'}, -suffix
=> '.srt' );
317 $_->close for ($bamh, $srth);
319 my $samt = Bio
::Tools
::Run
::Samtools
->new( -command
=> 'view',
323 $samt->run( -bam
=> $file, -out
=> $bamf );
325 $samt = Bio
::Tools
::Run
::Samtools
->new( -command
=> 'sort' );
327 $samt->run( -bam
=> $bamf, -pfx
=> $srtf);