2 # BioPerl module for Bio::Assembly::IO::maq
4 # Copyright by Mark A. Jensen
6 # You may distribute this module under the same terms as Perl itself
8 # POD documentation - main docs before the code
12 Bio::Assembly::IO::maq - Driver to read assembly files in maq format *BETA*
16 # convert the native maq map format to plain text
17 $ maq mapview all.map > all.maq
19 # Building an input stream
20 use Bio::Assembly::IO;
22 # Assembly loading methods
23 my $asmio = Bio::Assembly::IO->new( -file => 'all.maq',
25 my $scaffold = $asmio->next_assembly;
30 This package loads and writes map information in/from C<maq> map files converted by the C<maq mapview> utility. This module is a driver module for
31 Bio::Assembly::IO input/output.
33 Parsing is based on Heng Li's description of C<maq mapview> output, found
34 at the C<maq> manpage: L<http://maq.sourceforge.net/maq-manpage.shtml>.
36 The basic C<maq> workflow is: map reads to a reference sequence (with
37 C<maq map>), then create a consensus from the map (with C<maq
38 assemble>). To read a complete assembly with this module, the
39 following files need to be available:
41 [basename].maq : created by maq mapview [basename].map > [basename].maq
42 [basename].cns.fastq : created as follows
43 $ maq assemble [basename].cns [refseq].bfa [basename].map
44 $ maq cns2fq [basename].cns > [basename].cns.fastq
46 C<maq> produces only one "contig"; all reads map to the reference
47 sequence, which covers everything. This module breaks the reads into
48 contigs by dividing the C<maq> consensus into pieces for which there
49 are contiguous non-zero quality values.
51 The module C<Bio::Tools::Run::Maq> will help in this process (eventually).
53 This module has no write capability.
57 Assemblies are loaded into Bio::Assembly::Scaffold objects composed of
58 Bio::Assembly::Contig and Bio::Assembly::Singlet objects. Contigs are
59 not explicitly specified in C<map> files; the division of the map into
60 contigs is calculated in this module.
62 Additional assembly information is stored as features. Contig objects have
63 SeqFeature information associated with the primary_tag:
65 _main_contig_feature:$contig_id -> misc contig information
67 Read objects have sub_seqFeature information associated with the
70 _main_read_feature:$read_id -> misc read information
72 Singlets are contigs of a single sequence, as calculated within this module.
73 They are cataloged separately, as specified in L<Bio::Assembly::Scaffold>.
81 Add pod descriptions of maq descriptive data (currently SeqFeatures
82 added to each contig component)
86 Add features describing the aggregate status of reads and contigs based
87 on the maq "paired flag"
95 User feedback is an integral part of the evolution of this and other
96 Bioperl modules. Send your comments and suggestions preferably to the
97 Bioperl mailing lists Your participation is much appreciated.
99 bioperl-l@bioperl.org - General discussion
100 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
104 Please direct usage questions or support issues to the mailing list:
106 I<bioperl-l@bioperl.org>
108 rather than to the module maintainer directly. Many experienced and
109 reponsive experts will be able look at the problem and quickly
110 address it. Please include a thorough description of the problem
111 with code and data examples if at all possible.
113 =head2 Reporting Bugs
115 Report bugs to the BioPerl bug tracking system to help us keep track
116 the bugs and their resolution. Bug reports can be submitted via email
119 bioperl-bugs@bio.perl.org
120 https://github.com/bioperl/bioperl-live/issues
122 =head1 AUTHOR - Mark A. Jensen
124 Email maj -at- fortinbras -dot- us
129 Further improvements by Florent Angly
130 (florent dot angly at gmail dot com)
132 =head1 ACKNOWLEDGEMENT
134 Code and some POD text ripped liberally from Florent Angly's
135 L<Bio::Assembly::IO::tigr>.
139 The rest of the documentation details each of the object
140 methods. Internal methods are usually preceded with a "_".
144 package Bio
::Assembly
::IO
::maq
;
147 use Bio
::Seq
::Quality
;
148 use Bio
::Seq
::PrimaryQual
;
149 use Bio
::LocatableSeq
;
150 use Bio
::Assembly
::IO
;
151 use Bio
::Assembly
::Scaffold
;
152 use Bio
::Assembly
::Contig
;
153 use Bio
::Assembly
::Singlet
;
158 use base
qw(Bio::Assembly::IO);
160 # paired flag constants
162 FF
=> 1, FR
=> 2, RF
=> 4, RR
=> 8, PE
=> 16,
163 XC
=> 32, UN
=> 64, CP
=> 18
166 my $progname = 'maq';
170 Title : next_assembly
171 Usage : $scaffold = $stream->next_assembly()
172 Function: return the assembly defined by the map and cns files
173 Returns : Bio::Assembly::Scaffold object
181 my $assembly = Bio
::Assembly
::Scaffold
->new( -progname
=> $progname );
183 # Load contigs and singlets in the scaffold
184 while ( my $obj = $self->next_contig()) {
185 # Add contig /singlet to assembly
186 if ($obj->isa('Bio::Assembly::Singlet')) { # a singlet
187 $assembly->add_singlet($obj);
189 $assembly->add_contig($obj);
200 Usage : $scaffold = $stream->next_contig()
201 Function: Returns the next contig or singlet in the ACE stream.
202 Returns : a Bio::Assembly::Contig or Bio::Assembly::Single object
208 my $self = shift; # object reference
210 # Read the file of consensus sequences if it has not already been done for
211 # this Bio:::Assembly::IO stream already
212 if (not defined $self->_cons) {
213 $self->_parse_cns_file or
214 $self->throw("Associated maq consensus file is not available");
217 # Contig and read related
221 # Loop over all assembly file lines
222 while ($_ = $self->_readline) {
226 # mapview format parsing ; every line is a read...
228 @readinfo{ qw(read_name chr posn strand insert_size
229 paired_flag map_qual se_map_qual alt_map_qual
230 num_mm_best_hit sum_qual_mm_best_hit zero_mm_hits
231 one_mm_hits read_len seqstr qualstr) } = split(/\s+/);
234 my @qual = map { ord($_)-33 } split('', $readinfo{qualstr
});
235 $readinfo{seq
} = Bio
::Seq
::Quality
->new(
236 -id
=> $readinfo{read_name
},
237 -seq
=> $readinfo{seqstr
},
241 if ( not defined $contiginfo{start
} ) {
242 # First read of new contig or singlet
243 $contiginfo{'seqnum'} = 1;
244 $contiginfo{'qualobj'} = $self->_next_cons;
245 $contiginfo{'start'} = $contiginfo{'qualobj'}->start;
246 $contiginfo{'end'} = $contiginfo{'qualobj'}->end;
247 $contiginfo{'asmbl_id'} = 'maq_assy['.$self->_basename.']/'.$contiginfo{start
}.'-'.$contiginfo{end
};
248 # It may be a singlet, but assume it's a contig for now
249 $contigobj = $self->_init_contig(\
%contiginfo);
250 $self->_store_read(\
%readinfo, $contigobj);
252 if ( $readinfo{'posn'} <= $contiginfo{end
} ) {
253 # Add read to existing contig
254 $contiginfo{'seqnum'}++;
255 $self->_store_read(\
%readinfo, $contigobj);
257 # Read belongs in a new contig
258 if ($contiginfo{'seqnum'} > 1) {
259 $self->_store_contig(\
%contiginfo, $contigobj);
262 # Create a new singlet object from the read info
263 $contigobj = $self->_store_singlet(\
%contiginfo, $contigobj);
266 $self->_pushback($_);
275 =head2 _init_contig()
278 Usage : my $contigobj; $contigobj = $self->_init_contig(
279 \%contiginfo, $scaffoldobj);
280 Function: store information of a contig belonging to a scaffold in the
282 Returns : Bio::Assembly::Contig object
283 Args : hash, Bio::Assembly::Scaffold
288 my ($self, $contiginfo) = @_;
289 # Create a contig and attach it to scaffold
290 my $contigobj = Bio
::Assembly
::Contig
->new(
291 -id
=> $$contiginfo{'asmbl_id'},
292 -source
=> $progname,
298 =head2 _store_contig()
300 Title : _store_contig
301 Usage : my $contigobj; $contigobj = $self->_store_contig(
302 \%contiginfo, $contigobj);
303 Function: store information of a contig belonging to a scaffold
304 in the appropriate object
305 Returns : Bio::Assembly::Contig object
306 Args : hash, Bio::Assembly::Contig
311 my ($self, $contiginfo, $contigobj) = @_;
313 $self->throw("Contig object must be defined") unless $contigobj;
315 my $consensus_seq = Bio
::LocatableSeq
->new(
316 -id
=> $$contiginfo{'asmbl_id'},
317 -seq
=> $$contiginfo{'qualobj'}->seq,
320 $contigobj->set_consensus_sequence($consensus_seq);
321 my $consensus_qual = Bio
::Seq
::PrimaryQual
->new(
322 -id
=> $$contiginfo{'asmbl_id'},
323 -qual
=> $$contiginfo{'qualobj'}->qual,
326 $contigobj->set_consensus_quality($consensus_qual);
328 # Add other misc contig information as features of the contig
329 # Add other misc read information as subsequence feature
330 my @other = grep !/asmbl_id|end|qualobj|start/, keys %$contiginfo;
332 @other{@other} = @
$contiginfo{@other};
333 my $contigtags = Bio
::SeqFeature
::Generic
->new(
334 -primary
=> '_main_contig_feature',
335 -source
=> $$contiginfo{'asmbl_id'},
337 -end
=> $contigobj->get_consensus_length(),
342 $contigobj->add_features([ $contigtags ], 1);
347 =head2 _parse_cns_file()
349 Title : _parse_cns_file
350 Usage : $self->_parse_cns_file
351 Function: parse the .cns.fastq (consensus) file
352 associated with the present map;
353 set the objects cns attribute
354 Returns : true on success; nil if file dne
359 sub _parse_cns_file
{
363 $self->{'_cns_parsed'} = 1;
364 my $file = $self->file;
365 $file =~ s/^[<>+]*//; # byebye parasitic mode chars
366 my ($fname, $dir, $suf) = fileparse
($file, ".maq");
367 my $cnsf = File
::Spec
->catdir($dir, "$fname.cns.fastq");
368 return unless (-e
$cnsf );
369 my $fqio = Bio
::SeqIO
->new( -file
=> $cnsf );
370 my $cns = $fqio->next_seq;
371 # now, infer the contigs on the basis of quality values
372 # - assuming quality of zero => no coverage
373 my $qual = $cns->qual;
375 my @sites = grep { $$qual[$_] > 0 } (0..$#$qual);
376 my @ranges = ($sites[0]+1);
377 for my $i (1..$#sites) {
378 if ($sites[$i]-$sites[$i-1]>1) {
379 push @ranges, $sites[$i-1]+1, $sites[$i]+1;
382 push @ranges, $sites[-1];
383 for (my $i = 0; $i<$#ranges; $i+=2) {
384 push @cons, Bio
::Seq
::Quality
->new(
385 -display_id
=> "${fname}/".$ranges[$i]."-".$ranges[$i+1],
386 -start
=> $ranges[$i],
387 -end
=> $ranges[$i+1],
388 -seq
=> $cns->subseq($ranges[$i], $ranges[$i+1]),
389 -qual
=> [@
{$cns->qual}[$ranges[$i]-1..$ranges[$i+1]-1]]
393 $self->{'_cons'} = \
@cons;
401 Usage : @cons = $self->_cons
402 Function: get the array of consensus fastq Bio::Seq::Quality objects
403 Returns : array of Bio::Seq::Quality objects
411 if (defined $self->{'_cons'}) {
412 $cons = $self->{'_cons'};
422 sub _next_cons
{ shift(@
{shift->{'_cons'}}) }
428 Usage : my $readobj = $self->_store_read(\%readinfo, $contigobj);
429 Function: store information of a read belonging to a contig
430 in the appropriate object
431 Returns : a Bio::LocatableSeq object
432 Args : hash, Bio::Assembly::Contig
436 # @readinfo{ qw(read_name chr posn strand insert_size,
437 # paired_flag map_qual se_map_qual alt_map_qual,
438 # num_mm_best_hit sum_qual_mm_best_hit zero_mm_hits,
439 # one_mm_hits seqstr qualstr) } = split(/\s+/);
442 my ($self, $readinfo, $contigobj) = @_;
444 # Create an aligned read object
446 $$readinfo{'strand'} = ($$readinfo{strand
} eq '+' ?
1 : -1);
448 my $readobj = Bio
::LocatableSeq
->new(
449 -display_id
=> $$readinfo{'read_name'},
450 -primary_id
=> $$readinfo{'read_name'},
451 -seq
=> $$readinfo{'seqstr'},
453 -strand
=> $$readinfo{'strand'},
457 # Add read location and sequence to contig (in 'gapped consensus' coordinates)
458 $$readinfo{'aln_start'} = $$readinfo{'posn'};
459 $$readinfo{'aln_end'} = $$readinfo{'posn'} + length($$readinfo{'seqstr'})-1;
461 my $alncoord = Bio
::SeqFeature
::Generic
->new(
462 -primary
=> $readobj->id,
463 -start
=> $$readinfo{'aln_start'},
464 -end
=> $$readinfo{'aln_end'},
465 -strand
=> $$readinfo{'strand'},
466 -qual
=> join(' ', $$readinfo{seq
}->qual),
467 # check here, need to create contigs "by hand"...
468 -tag
=> { 'contig' => $contigobj->id() }
471 $contigobj->set_seq_coord($alncoord, $readobj);
473 # Add other misc read information as subsequence feature
474 my @other = grep !/aln_(?:end|start)|seq(?:str)?|strand/, keys %$readinfo;
476 @other{@other} = @
$readinfo{@other};
477 my $readtags = Bio
::SeqFeature
::Generic
->new(
478 -primary
=> '_main_read_feature',
479 -source
=> $readobj->id,
480 -start
=> $$readinfo{'aln_start'},
481 -end
=> $$readinfo{'aln_end'},
482 -strand
=> $$readinfo{'strand'},
486 $contigobj->get_features_collection->add_features([$readtags]);
487 $contigobj->get_features_collection->add_SeqFeature($alncoord, $readtags);
494 =head2 _store_singlet()
496 Title : _store_singlet
497 Usage : my $singletobj = $self->_store_read(\%readinfo, \%contiginfo);
498 Function: store information of a singlet belonging to a scaffold in a singlet object
499 Returns : Bio::Assembly::Singlet
505 my ($self, $contiginfo, $contigobj) = @_;
507 my $contigid = $$contiginfo{'asmbl_id'};
508 my $seqref = ($contigobj->each_seq())[0];
509 my $singletobj = Bio
::Assembly
::Singlet
->new( -id
=> $contigid,
510 -seqref
=> $seqref );
512 # Add other misc contig information as features of the contig
513 # Add other misc read information as subsequence feature
514 #my @other = grep !/_sfc|_assembly|_elem/, keys %$contiginfo; # remove the objects; _elem contains a code ref and can't be frozen. Just shooting blind here.
516 #@other{@other} = @$contiginfo{@other};
517 #my $contigtags = Bio::SeqFeature::Generic->new(
518 # -primary => '_main_contig_feature',
519 # -source => $$contiginfo{asmbl_id},
521 # -end => $singletobj->get_consensus_length(),
526 #$singletobj->add_features([ $contigtags ], 1);
528 #$$readinfo{'aln_start'} = $$readinfo{'start'};
529 #$$readinfo{'aln_end'} = $$readinfo{'end'};
530 #$$readinfo{'strand'} = ($$readinfo{strand} eq '+' ? 1 : -1);
531 #my $alncoord = Bio::SeqFeature::Generic->new(
532 # -primary => '_aligned_coord',
533 # -source => $$readinfo{read_name},
534 # -start => $$readinfo{'start'},
535 # -end => $$readinfo{'end'},
536 # -strand => $$readinfo{'strand'},
537 # -tag => { 'contig' => $$contiginfo{asmbl_id} }
539 #$alncoord->attach_seq($singletobj->seqref);
540 #$singletobj->add_features([ $alncoord ], 0);
542 # Add other misc read information as subsequence feature
543 #my @other = grep !/seqstr|strand/, keys %$readinfo;
545 #@other{@other} = @$readinfo{@other};
546 #my $readtags = Bio::SeqFeature::Generic->new(
547 # -primary => '_main_read_feature',
548 # -source => $$readinfo{read_name},
549 # -start => $$readinfo{'aln_start'},
550 # -end => $$readinfo{'aln_end'},
551 # -strand => $$readinfo{'strand'},
555 #$singletobj->get_features_collection->add_features([$readtags]);
556 #$singletobj->get_features_collection->add_SeqFeature($alncoord, $readtags);
561 ###### writes -- need them??
563 =head2 write_assembly()
565 Title : write_assembly
567 Function: not currently available for maq assemblies
574 my ($self,@args) = @_;
575 $self->throw("Writes not currently available for maq assemblies. Complain to author.")
583 Usage : $self->_basename
584 Function: return the basename of the associate IO file
585 Returns : scalar string
592 return (fileparse
($self->file, ".maq"))[0];