1 # $Id: ace.pm 16969 2010-05-09 15:26:53Z fangly $
3 ## BioPerl module for Bio::Assembly::IO::ace
5 # Copyright by Robson F. de Souza (the reading part) and Florent Angly (the
6 # writing and ACE variants part)
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
14 Bio::Assembly::IO::ace - module to load ACE files from various assembly programs
18 # Building an input stream
19 use Bio::Assembly::IO;
21 # Load a reference ACE assembly
22 my $in_io = Bio::Assembly::IO->new( -file => 'results.ace',
25 # Read the entire scaffold
26 my $scaffold = $in_io->next_assembly;
28 # Or read one contig at a time to save resources
29 while ( my $contig = $in_io->next_contig ) {
33 # Assembly writing methods
34 my $out_io = Bio::Assembly::IO->new( -file => ">output.ace",
36 $out_io->write_assembly( -scaffold => $scaffold,
39 # Read the '454' Newbler variant of ACE instead of the default 'consed'
40 # reference ACE variant
41 my $in_io = Bio::Assembly::IO->new( -file => 'results.ace',
42 -format => 'ace-454' );
44 my $in_io = Bio::Assembly::IO->new( -file => 'results.ace',
50 This package loads the standard ACE files generated by various assembly programs
51 (Phrap, CAP3, Newbler, Arachne, ...). It was written to be used as a driver
52 module for Bio::Assembly::IO input/output.
56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
57 Bio::Assembly::Contig and Bio::Assembly::Singlet objects. Only the ACE file is
58 used, so if you need singlets, make sure that they are present in the ACE file.
60 A brief description of the ACE format is available at
61 http://www.cbcb.umd.edu/research/contig_representation.shtml#ACE
62 Read the full format description from
63 http://bozeman.mbt.washington.edu/consed/distributions/README.14.0.txt
65 In addition to default "_aligned_coord:$seqID" feature class from
66 Bio::Assembly::Contig, contig objects loaded by this module will have the
67 following special feature classes in their feature collection:
69 "_align_clipping:$seqID" (AF)
70 Location of subsequence in read $seqID which is aligned to the contig. The
71 coordinates are relative to the contig. If no feature containing this tag is
72 present the read is considered low quality by Consed.
74 "_quality_clipping:$seqID" (AF)
75 The location of high quality subsequence in read $seqID (relative to contig)
78 Location of read subsequences used to build the consensus
80 "_read_tags:$readID" (RT)
81 Sequence features stored as sub_SeqFeatures of the sequence's coordinate
82 feature (the corresponding "_aligned_coord:$seqID" feature, easily accessed
83 through get_seq_coord() method).
85 "_read_desc:$readID" (DS)
86 Sequence features stored as sub_SeqFeatures of the read's coordinate feature
89 Equivalent to a bioperl sequence feature and, therefore, are added to the
90 feature collection using their type field (see Consed's README.txt file) as
93 "whole assembly tags" (WA)
94 They have no start and end, as they are not associated to any particular
95 sequence in the assembly, and are added to the assembly's annotation
96 collection using "whole assembly" as tag.
100 The default ACE variant is called 'consed' and corresponds to the reference ACE
103 The ACE files produced by the 454 GS Assembler (Newbler) do not conform to the
104 reference ACE format. In 454 ACE, the consensus sequence reported covers only
105 its clear range and the start of the clear range consensus is defined as position
106 1. Consequently, aligned reads in the contig can have negative positions. Be sure
107 to use the '454' variant to have positive alignment positions. No attempt is made
108 to construct the missing part of the consensus sequence (beyond the clear range)
109 based on the underlying reads in the contig. Instead the ends of the consensus
110 are simply padded with the gap character '-'.
116 User feedback is an integral part of the evolution of this and other
117 Bioperl modules. Send your comments and suggestions preferably to the
118 Bioperl mailing lists Your participation is much appreciated.
120 bioperl-l@bioperl.org - General discussion
121 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
125 Please direct usage questions or support issues to the mailing list:
127 I<bioperl-l@bioperl.org>
129 rather than to the module maintainer directly. Many experienced and
130 reponsive experts will be able look at the problem and quickly
131 address it. Please include a thorough description of the problem
132 with code and data examples if at all possible.
134 =head2 Reporting Bugs
136 Report bugs to the Bioperl bug tracking system to help us keep track
137 the bugs and their resolution. Bug reports can be submitted via the web:
139 https://redmine.open-bio.org/projects/bioperl/
141 =head1 AUTHOR - Robson Francisco de Souza
143 Email rfsouza@citri.iq.usp.br
147 The rest of the documentation details each of the object
148 methods. Internal methods are usually preceded with a _
152 package Bio
::Assembly
::IO
::ace
;
156 use Bio
::Assembly
::Scaffold
;
157 use Bio
::Assembly
::Contig
;
158 use Bio
::Assembly
::Singlet
;
159 use Bio
::LocatableSeq
;
160 use Bio
::Seq
::PrimaryQual
;
161 use Bio
::Annotation
::SimpleValue
;
163 use Bio
::SeqFeature
::Generic
;
165 use base
qw(Bio::Assembly::IO);
167 our $line_width = 50;
168 our $qual_value = 20;
169 our %variant = ( 'consed' => undef, # default
172 =head1 Parser methods
176 Title : next_assembly
177 Usage : $scaffold = $stream->next_assembly()
178 Function: returns the next assembly in the stream
179 Returns : a Bio::Assembly::Scaffold object
187 my $assembly = Bio
::Assembly
::Scaffold
->new();
189 # Load contigs and singlets in the scaffold
190 while ( my $obj = $self->next_contig() ) {
191 # Add contig /singlet to assembly
192 if ($obj->isa('Bio::Assembly::Singlet')) { # a singlet
193 $assembly->add_singlet($obj);
195 $assembly->add_contig($obj);
199 # Load annotations of assembly and contigs
200 $self->scaffold_annotations($assembly);
209 Usage : $scaffold = $stream->next_contig()
210 Function: Returns the next contig or singlet in the ACE stream.
211 Returns : a Bio::Assembly::Contig or Bio::Assembly::Single object
222 my $read_data = {}; # Temporary holder for read data
224 # Keep reading the ACE stream starting at where we stopped
225 while ( $_ = $self->_readline) {
228 # Loading contig sequence (COntig sequence field)
229 if (/^CO\s(\S+)\s(\d+)\s(\d+)\s(\d+)\s(\w+)/xms) { # New contig starts!
231 if (not $contigOBJ) {
232 # Start a new contig object
233 my $contigID = $1; # Contig ID
234 #my $nof_bases = $2; # Contig length in base pairs
235 my $nof_reads = $3; # Number of reads in this contig
236 #my $nof_segments = $4; # Number of read segments selected for consensus assembly
237 my $ori = $5; # 'C' if contig was complemented or U if not (default)
238 $ori = $ori eq 'U' ?
1 : -1;
240 # Create a singlet or contig
241 if ($nof_reads == 1) { # This is a singlet
242 $contigOBJ = Bio
::Assembly
::Singlet
->new( );
243 } elsif ( $nof_reads > 1 ) { # This is a contig
244 $contigOBJ = Bio
::Assembly
::Contig
->new( );
247 $contigOBJ->id($contigID);
248 $contigOBJ->strand($ori);
250 my $consensus_sequence;
251 while ($_ = $self->_readline) { # Looping over contig lines
252 chomp; # Drop <ENTER> (\n) on current line
253 last if (/^$/); # Stop if empty line (contig end) is found
254 s/\*/-/g; # Forcing '-' as gap symbol
255 $consensus_sequence .= $_;
257 $consensus_sequence = Bio
::LocatableSeq
->new(
258 -seq
=> $consensus_sequence,
262 $consensus_sequence->id($contigID);
263 $contigOBJ->set_consensus_sequence($consensus_sequence);
265 # A second contig is about to start. Backtrack one line and go
266 # to the return statement
267 $self->_pushback($_);
272 # Loading contig qualities... (Base Quality field)
274 my $qual_string = '';
275 while ($_ = $self->_readline) {
278 $qual_string .= "$_ ";
280 my @qual_arr = $self->_input_qual($qual_string, $contigOBJ->get_consensus_sequence->seq);
281 my $qual = Bio
::Seq
::PrimaryQual
->new(-qual
=> join(" ", @qual_arr),
282 -id
=> $contigOBJ->id() );
283 $contigOBJ->set_consensus_quality($qual);
286 # Loading read info... (Assembled From field)
287 elsif (/^AF (\S+) (C|U) (-*\d+)/) {
288 $read_name = $1; # read ID
289 my $ori = $2; # strand
290 my $start = $3; # aligned start
292 $ori = $ori eq 'U' ?
1 : -1;
293 $read_data->{$read_name}{'strand'} = $ori;
294 $read_data->{$read_name}{'padded_start'} = $start;
296 if ( $self->variant eq '454' ) {
297 if ( (not defined $min_start) || ($start < $min_start) ) {
304 # Base segments definitions (Base Segment field)
305 # They indicate which read segments were used to calculate the consensus
306 # Coordinates are relative to the contig
307 elsif (/^BS (\d+) (\d+) (\S+)/) {
308 my ($start, $end, $contig_id) = ($1, $2, $3);
309 if ($self->variant eq '454') {
310 $start += abs($min_start) + 1;
311 $end += abs($min_start) + 1;
313 my $bs_feat = Bio
::SeqFeature
::Generic
->new(
318 -primary
=> '_base_segments',
319 -tag
=> { 'contig_id' => $contig_id}
321 $contigOBJ->add_features([ $bs_feat ], 0);
324 # Loading reads... (ReaD sequence field)
325 # They define the reads in each contig
326 elsif (/^RD (\S+) (-*\d+) (\d+) (\d+)/) {
328 $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases
329 $read_data->{$read_name}{'contig'} = $contigOBJ;
330 # $read_data->{$read_name}{'number_of_read_info_items'} = $3;
331 # $read_data->{$read_name}{'number_of_tags'} = $4;
333 # Add a read to a contig
335 while ($_ = $self->_readline) {
338 s/\*/-/g; # Forcing '-' as gap symbol
339 $read_sequence .= $_; # aligned read sequence
341 my $read = Bio
::LocatableSeq
->new(
342 -seq
=> $read_sequence,
344 -strand
=> $read_data->{$read_name}{'strand'},
346 -primary_id
=> $read_name,
349 # Adding read location and sequence to contig ("gapped consensus" coordinates)
350 my $padded_start = $read_data->{$read_name}{'padded_start'};
351 if ($self->variant eq '454') {
352 $padded_start += abs($min_start) + 1;
354 my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1;
355 my $coord = Bio
::SeqFeature
::Generic
->new(
356 -start
=> $padded_start,
359 -strand
=> $read_data->{$read_name}{'strand'},
360 -tag
=> { 'contig' => $contigOBJ->id }
362 if ($contigOBJ->isa('Bio::Assembly::Singlet')) {
363 # Set the the sequence in the singlet
364 $contigOBJ->seqref($read);
366 # this sets the "_aligned_coord:$seqID" feature
367 $contigOBJ->set_seq_coord($coord,$read);
372 # Loading read trimming and alignment ranges...
373 elsif (/^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/) {
374 my ($qual_start, $qual_end, $aln_start, $aln_end) =
377 # Regions of the read that were aligned to the consensus (see BS)
378 unless ($aln_start == -1 && $aln_end == -1) {
379 $aln_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$aln_start);
380 $aln_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$aln_end);
381 my $aln_feat = Bio
::SeqFeature
::Generic
->new(
382 -start
=> $aln_start,
384 -strand
=> $read_data->{$read_name}{'strand'},
385 -primary
=> '_align_clipping',
386 -source
=> $read_name,
388 $aln_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
389 $contigOBJ->add_features([ $aln_feat ], 0);
392 # Regions of the read with high quality score
393 unless ($qual_start == -1 && $qual_end == -1) {
394 $qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start);
395 $qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end);
396 my $qual_feat = Bio
::SeqFeature
::Generic
->new(
397 -start
=> $qual_start,
399 -strand
=> $read_data->{$read_name}{'strand'},
400 -primary
=> '_quality_clipping',
401 -source
=> $read_name || '',
403 $qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
404 $contigOBJ->add_features([ $qual_feat ], 0);
409 # Loading read DeScription (DS)
410 elsif (/^DS\s+(.*)/) {
413 # Expected tags are CHROMAT_FILE, PHD_FILE, TIME and to a lesser
414 # extent DYE, TEMPLATE, CHEM and DIRECTION, but any other tag is
416 my (undef, %tags) = split /\s?(\S+):\s+/, $desc;
418 my $coord = $contigOBJ->get_seq_coord( $contigOBJ->get_seq_by_name($read_name) );
419 my $start = $coord->start;
420 my $end = $coord->end;
422 my $read_desc = Bio
::SeqFeature
::Generic
->new(
425 -primary
=> '_read_desc', # primary_tag
426 -source
=> $read_name || '',
429 $contigOBJ->get_features_collection->add_features([$read_desc]);
430 $contigOBJ->get_features_collection->add_SeqFeature($coord, $read_desc);
435 my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
436 my $extra_info = undef;
437 while ($_ = $self->_readline) {
441 $start = $contigOBJ->change_coord("aligned $readID",'gapped consensus',$start);
442 $end = $contigOBJ->change_coord("aligned $readID",'gapped consensus',$end);
443 my $read_tag = Bio
::SeqFeature
::Generic
->new(
446 -primary
=> '_read_tags',
447 -source
=> $readID || '',
448 -tag
=> { 'type' => $type,
450 'creation_date' => $date}
452 $read_tag->add_tag_value('extra_info', $extra_info) if defined $extra_info;
453 my $contig = $read_data->{$readID}{'contig'};
454 my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) );
455 $contig->get_features_collection->add_features([$read_tag]);
456 $contig->get_features_collection->add_SeqFeature($coord, $read_tag);
461 # Adjust consensus sequence of 454 variant by padding its start and end
462 if (($self->variant eq '454') && (defined $contigOBJ)) {
465 # Find maximum coordinate
467 for my $readid ($contigOBJ->get_seq_ids) {
468 my ($alncoord) = $contigOBJ->get_features_collection->get_features_by_type("_aligned_coord:$readid");
469 my $end = $alncoord->location->end;
470 if ( (not defined $max_end) || ($end > $max_end) ) {
475 # Pad consensus sequence
476 my $cons_seq = $contigOBJ->get_consensus_sequence;
477 my $cons_string = $cons_seq->seq;
478 my $l_pad_len = abs($min_start) + 1;
479 my $r_pad_len = $max_end - length($cons_string) - $l_pad_len;
480 $cons_string = $pad_char x
$l_pad_len . $cons_string . $pad_char x
$r_pad_len;
481 $cons_seq = Bio
::LocatableSeq
->new(
482 -seq
=> $cons_string,
483 -id
=> $cons_seq->id,
484 -start
=> $cons_seq->start,
485 -strand
=> $cons_seq->strand,
487 $contigOBJ->set_consensus_sequence($cons_seq);
489 # Pad consensus quality
490 my $cons_qual = $contigOBJ->get_consensus_quality;
491 if (defined $cons_qual) {
492 my $cons_score = [ ($pad_score) x
$l_pad_len,
494 ($pad_score) x
$r_pad_len ];
495 $cons_qual = Bio
::Seq
::PrimaryQual
->new(
496 -qual
=> join(' ', @
$cons_score),
497 -id
=> $cons_qual->id
499 $contigOBJ->set_consensus_quality($cons_qual);
507 =head2 scaffold_annotations
509 Title : scaffold_annotations
510 Usage : $stream->scaffold_annotations($scaffold)
511 Function: Add assembly and contig annotations to a scaffold. In the ACE format,
512 annotations are the WA and CT tags.
513 Returns : 1 for success
514 Args : a Bio::Assembly::Scaffold object to attach the annotations to
518 sub scaffold_annotations
{
519 my ($self, $assembly) = @_;
521 # Read the ACE stream from the beginning again
522 seek($self->_fh, 0, 0);
523 while ($_ = $self->_readline) {
526 # Assembly information (ASsembly field)
528 #(/^AS\s+(\d+)\s+(\d+)/) && do {
529 # my $nof_contigs = $1;
530 # my $nof_seq_in_contigs = $2;
533 # Loading Whole Assembly tags
535 my ($type,$source,$date) = split(' ',$self->_readline);
536 my $extra_info = undef;
537 while ($_ = $self->_readline) {
542 my $assembly_tags = join(" ","TYPE:",$type,"PROGRAM:",$source,
543 "DATE:",$date,"DATA:",$extra_info);
544 $assembly_tags = Bio
::Annotation
::SimpleValue
->new(-value
=>$assembly_tags);
545 $assembly->annotation->add_Annotation('whole assembly',$assembly_tags);
548 # Loading Contig Tags (a.k.a. Bioperl features)
550 my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
551 my %tags = ('source' => $source, 'creation_date' => $date);
552 my $tag_type = 'extra_info';
553 while ($_ = $self->_readline) {
554 if (/COMMENT\s*\{/) {
555 $tag_type = 'comment';
557 $tag_type = 'extra_info';
561 $tags{$tag_type} .= "$_";
564 my $contig_tag = Bio
::SeqFeature
::Generic
->new( -start
=> $start,
569 my $contig = $assembly->get_contig_by_id($contigID) ||
570 $assembly->get_singlet_by_id($contigID);
571 $self->throw("Cannot add feature to unknown contig '$contigID'")
572 unless defined $contig;
574 $contig->add_features([ $contig_tag ],1);
582 =head2 write_assembly
584 Title : write_assembly
585 Usage : $ass_io->write_assembly($assembly)
586 Function: Write the assembly object in ACE compatible format. The contig IDs
587 are sorted naturally if the Sort::Naturally module is present, or
588 lexically otherwise. Internally, write_assembly use the
589 write_contig, write_footer and write_header methods. Use these
590 methods if you want more control on the writing process.
591 Returns : 1 on success, 0 for error
592 Args : A Bio::Assembly::Scaffold object
600 Usage : $ass_io->write_contig($contig)
601 Function: Write a contig or singlet object in ACE compatible format. Quality
602 scores are automatically generated if the contig does not contain
604 Returns : 1 on success, 0 for error
605 Args : A Bio::Assembly::Contig or Singlet object
610 my ($self, @args) = @_;
611 my ($contig) = $self->_rearrange([qw(CONTIG)], @args);
614 if ( !$contig || !$contig->isa('Bio::Assembly::Contig') ) {
615 $self->throw("Must provide a Bio::Assembly::Contig or Singlet object when calling write_contig");
618 # Contig consensus sequence
619 my $contig_id = $contig->id;
620 my $cons = $contig->get_consensus_sequence;
621 my $cons_seq = $cons->seq;
622 my $cons_len = $cons->length;
623 my $contig_num_reads = $contig->num_sequences;
624 my $cons_strand = ($contig->strand == -1) ?
'C' : 'U';
625 my @bs_feats = $contig->get_features_collection->get_features_by_type('_base_segments');
626 my $nof_segments = scalar @bs_feats;
629 "CO $contig_id $cons_len $contig_num_reads $nof_segments $cons_strand\n".
630 _formatted_seq
($cons_seq, $line_width).
634 # Consensus quality scores
635 $cons = $contig->get_consensus_quality;
636 my $cons_qual = $cons->qual if defined $cons;
639 _formatted_qual
($cons_qual, $cons_seq, $line_width, $qual_value).
644 my @reads = $contig->each_seq;
645 for my $read (@reads) {
646 my $read_id = $read->id;
647 my $read_strand = ($read->strand == -1) ?
'C' : 'U';
648 my $read_start = $contig->change_coord("aligned $read_id",'gapped consensus',1);
649 $self->_print( "AF $read_id $read_strand $read_start\n" );
651 $self->_print( "\n" );
653 # Deal with base segments (BS)
655 # sort segments by increasing start position
656 @bs_feats = sort { $a->start <=> $b->start } @bs_feats;
658 for my $bs_feat ( @bs_feats ) {
659 my $start = $bs_feat->start;
660 my $end = $bs_feat->end;
661 my $id = ($bs_feat->get_tag_values('contig_id'))[0];
662 $self->_print( "BS $start $end $id\n" );
664 $self->_print( "\n" );
667 for my $read (@reads) {
668 $self->_write_read($read, $contig);
678 Usage : $ass_io->write_header($scaffold)
680 $ass_io->write_header(\@contigs);
682 $ass_io->write_header();
683 Function: Write ACE header (AS tags). You can call this function at any time,
684 i.e. not necessarily at the start of the stream - this is useful
685 if you have an undetermined number of contigs to write to ACE, e.g:
686 for my $contig (@list_of_contigs) {
687 $ass_io->_write_contig($contig);
689 $ass_io->_write_header();
690 Returns : 1 on success, 0 for error
691 Args : A Bio::Assembly::Scaffold
693 an arrayref of Bio::Assembly::Contig
695 nothing (the header is dynamically written based on the ACE file
701 my ($self, $input) = @_;
705 my $err_msg = "If an input is given to write_header, it must be a single ".
706 "Bio::Assembly::Scaffold object or an arrayref of Bio::Assembly::Contig".
707 " or Singlet objects";
708 my $ref = ref $input;
709 if ( $ref eq 'ARRAY' ) {
710 for my $obj ( @
$input ) {
711 $self->throw($err_msg) if not $obj->isa('Bio::Assembly::Contig');
714 } elsif ( $ref =~ m/Bio::Assembly::Scaffold/ ) {
715 @contigs = ($input->all_contigs, $input->all_singlets);
718 # Count number of contigs and reads
721 if ( scalar @contigs > 0 ) {
722 # the contigs were provided
723 $num_contigs = scalar @contigs;
724 for my $contig ( @contigs ) {
725 $num_reads += $contig->num_sequences;
728 # need to read the contigs from file
730 my $file = $self->file(); # e.g. '+>output.ace'
731 $file =~ s/^\+?[><]?//; # e.g. 'output.ace'
732 my $read_io = Bio
::Assembly
::IO
->new( -file
=> $file, -format
=> 'ace' );
733 while ( my $contig = $read_io->next_contig ) {
735 $num_reads += $contig->num_sequences;
740 # Write ASsembly tag at the start of the file
741 my $header = "AS $num_contigs $num_reads\n\n";
742 $self->_insert($header, 1);
751 Usage : $ass_io->write_footer($scaffold)
752 Function: Write ACE footer (WA and CT tags).
753 Returns : 1 on success, 0 for error
754 Args : A Bio::Assembly::Scaffold object (optional)
759 my ($self, $scaf) = @_;
760 # Nothing to write if scaffold was not provided
761 return 1 if not defined $scaf;
762 # Verify that provided object is a scaffold
763 if ($scaf->isa('Bio::Assembly:ScaffoldI')) {
764 $self->throw("Must provide a Bio::Assembly::Scaffold object when calling write_footer");
766 # Whole Assembly tags (WA)
767 my $asm_anno = ($scaf->annotation->get_Annotations('whole assembly'))[0];
769 my $asm_tags = $asm_anno->value;
770 if ($asm_tags =~ m/^TYPE: (\S+) PROGRAM: (\S+) DATE: (\S+) DATA: (.*)$/ms) {
771 my ($type, $program, $date, $data) = ($1, $2, $3, $4);
775 "$type $program $date\n".
783 for my $contig_id ( Bio
::Assembly
::IO
::_sort
( $scaf->get_contig_ids ) ) {
784 my $contig = $scaf->get_contig_by_id($contig_id) ||
785 $scaf->get_singlet_by_id($contig_id);
786 # Is there a better way of doing this? Grepping is not very efficient...
788 { not $_->primary_tag =~ m/^_/ }
789 $contig->get_features_collection->features
791 for my $feat (@feats) {
792 my $type = $feat->primary_tag;
793 my $start = $feat->start;
794 my $end = $feat->end;
795 my $source = ($feat->get_tag_values('source') )[0];
796 my $date = ($feat->get_tag_values('creation_date'))[0];
798 if ($feat->has_tag('extra_info')) {
799 $extra = ($feat->get_tag_values('extra_info') )[0];
803 "$contig_id $type $source $start $end $date\n".
817 Usage : $format = $obj->variant();
818 Function: Get and set method for the assembly variant. This is important since
819 not all assemblers respect the reference ACE format.
821 Args : string: 'consed' (default) or '454'
826 my ($self, $enc) = @_;
829 if (not exists $variant{$enc}) {
830 $self->throw('Not a valid ACE variant format');
832 $self->{variant
} = $enc;
834 return $self->{variant
};
841 Usage : $ass_io->_write_read($read, $contig)
842 Function: Write a read object in ACE compatible format
843 Returns : 1 on success, 0 for error
844 Args : a Bio::LocatableSeq read
845 the Contig or Singlet object that this read belongs to
850 my ($self, @args) = @_;
851 my ($read, $contig) = $self->_rearrange([qw(READ CONTIG)], @args);
854 if ( !$read || !$read->isa('Bio::LocatableSeq') ) {
855 $self->throw("Must provide a Bio::LocatableSeq when calling write_read");
857 if ( !$contig || !$contig->isa('Bio::Assembly::Contig') ) {
858 $self->throw("Must provide a Bio::Assembly::Contig or Singlet object when calling write_read");
862 my $read_id = $read->id;
863 my $read_len = $read->length; # aligned length
864 my $read_seq = $read->seq;
865 my $nof_info = 0; # fea: could not find exactly the meaning of this?
866 my @read_tags = $contig->get_features_collection->get_SeqFeatures(
867 $contig->get_seq_coord($read), "_read_tags:$read_id");
868 my $nof_tags = scalar @read_tags;
870 "RD $read_id $read_len $nof_info $nof_tags\n".
871 _formatted_seq
($read_seq, $line_width).
875 # Aligned "align clipping" and quality coordinates if read object has them
876 my $qual_clip_start = 1;
877 my $qual_clip_end = length($read->seq);
878 my ($qual_clip) = $contig->get_features_collection->get_features_by_type("_quality_clipping:$read_id");
879 if ( defined $qual_clip ) {
880 $qual_clip_start = $qual_clip->location->start;
881 $qual_clip_end = $qual_clip->location->end;
882 $qual_clip_start = $contig->change_coord('gapped consensus',"aligned $read_id",$qual_clip_start);
883 $qual_clip_end = $contig->change_coord('gapped consensus',"aligned $read_id",$qual_clip_end );
886 my $aln_clip_start = 1;
887 my $aln_clip_end = length($read->seq);
888 my ($aln_clip) = $contig->get_features_collection->get_features_by_type("_align_clipping:$read_id");
890 if ( defined $aln_clip ) {
891 $aln_clip_start = $aln_clip->location->start;
892 $aln_clip_end = $aln_clip->location->end;
893 $aln_clip_start = $contig->change_coord('gapped consensus',"aligned $read_id",$aln_clip_start );
894 $aln_clip_end = $contig->change_coord('gapped consensus',"aligned $read_id",$aln_clip_end );
898 "QA $qual_clip_start $qual_clip_end $aln_clip_start $aln_clip_end\n".
902 # Read description, if read object has them
903 my $read_desc = ( $contig->get_features_collection->get_SeqFeatures(
904 $contig->get_seq_coord($read), "_read_desc:$read_id") )[0];
907 for my $tag_name ( $read_desc->get_all_tags ) {
908 my $tag_value = ($read_desc->get_tag_values($tag_name))[0];
909 $self->_print(" $tag_name: $tag_value");
911 $self->_print("\n\n");
914 # Read tags, if read object has them
915 for my $read_tag (@read_tags) {
916 #my $type = $read_tag->primary_tag;
917 my $start = $read_tag->start;
918 my $end = $read_tag->end;
919 my $type = ($read_tag->get_tag_values('type') )[0];
920 my $source = ($read_tag->get_tag_values('source') )[0];
921 my $date = ($read_tag->get_tag_values('creation_date'))[0];
922 my $extra = $read_tag->has_tag('extra_info') ?
923 ($read_tag->get_tag_values('extra_info') )[0] : '';
926 "$read_id $type $source $start $end $date\n".
937 =head2 _formatted_seq
939 Title : _formatted_seq
940 Usage : Bio::Assembly::IO::ace::_formatted_seq($sequence, $line_width)
941 Function: Format a sequence for ACE output:
942 i ) replace gaps in the sequence by the '*' char
943 ii) split the sequence on multiple lines as needed
944 Returns : new sequence string
945 Args : sequence string on one line
951 my ($seq_str, $line_width) = @_;
953 # In the ACE format, gaps are '*'
955 # Split sequences on several lines
956 while ( my $chunk = substr $seq_str, 0, $line_width, '' ) {
957 $new_str .= "$chunk\n";
963 =head2 _formatted_qual
965 Title : _formatted_qual
966 Usage : Bio::Assembly::IO::ace::_formatted_qual($qual_arr, $sequence, $line_width, $qual_default)
967 Function: Format quality scores for ACE output:
968 i ) use the default quality values when they are missing
969 ii ) remove gaps (they get no score in ACE)
970 iii) split the quality scores on several lines as needed
971 Returns : new quality score string
972 Args : quality score array reference
973 corresponding sequence string
975 default quality score
979 sub _formatted_qual
{
980 my ($qual_arr, $seq, $line_width, $qual_default) = @_;
983 if (defined $qual_arr) {
985 @qual_arr = @
$qual_arr;
988 @qual_arr = map( $qual_default, (1 .. length $seq) );
990 # Gaps get no quality score in ACE format
993 $gap_pos = index($seq, '-', $gap_pos + 1);
994 last if $gap_pos == -1;
995 substr $seq, $gap_pos, 1, '';
996 splice @qual_arr, $gap_pos, 1;
999 # Split quality scores on several lines
1000 while ( my @chunks = splice @qual_arr, 0, $line_width ) {
1001 $qual_str .= "@chunks\n";
1010 Usage : Bio::Assembly::IO::ace::_input_qual($qual_string, $sequence)
1011 Function: Reads input quality string and converts it to an array of quality
1012 scores. Gaps get a quality score equals to the average of the
1013 quality score of its neighbours.
1014 Returns : new quality score array
1015 Args : quality score string
1016 corresponding sequence string
1021 my ($self, $qual_string, $sequence) = @_;
1023 # Remove whitespaces in front of qual string and split quality values
1024 $qual_string =~ s/^\s+//;
1025 my @tmp = split(/\s+/, $qual_string);
1027 my $i = 0; # position in quality
1028 my $j = 0; # position in sequence
1031 for $j (0 .. length($sequence)-1) {
1032 my $nt = substr($sequence, $j, 1);
1044 push @qual_arr, int(($prev+$next)/2);
1046 push @qual_arr, $tmp[$i];
1057 Usage : $ass_io->_initialize(@args)
1058 Function: Initialize the Bio::Assembly::IO object with the proper ACE variant
1065 my($self, @args) = @_;
1066 $self->SUPER::_initialize
(@args);
1067 my ($variant) = $self->_rearrange([qw(VARIANT)], @args);
1068 $variant ||= 'consed';
1069 $self->variant($variant);