3 # BioPerl driver for phrap.out file
5 # Copyright by Robson F. de Souza
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
13 Bio::Assembly::IO::phrap - driver to load phrap.out files.
17 # Building an input stream
18 use Bio::Assembly::IO;
20 # Assembly loading methods
21 $io = Bio::Assembly::IO->new(-file=>"SGC0-424.phrap.out",
24 $assembly = $io->next_assembly;
28 This package was developed to load the phrap.out files from the
29 (phred/phrap/consed) package by Phill Green. This files contain just
30 the messages printed to standard out by phrap when building an
31 assembly. This output is redirected by phredPhrap perl-script to a
32 file in the project's directory and hold some bit of information
33 regarding assembly quality, connections between contigs and clone's
34 position inside contigs. It should be noted that such files have no
35 data about the sequence. neither for contig consensus nor for any
36 aligned sequence. Anyway, such information may be loaded from Fasta
37 files in the projects directory and added to the assembly object
40 Note that, because no sequence is loaded for the contig consensus and
41 locations for aligned sequences are only given in "ungapped consensus"
42 coordinates in a phrap.out file, you can't make coordinate changes in
43 assemblies loaded by pharp.pm, unless you add an aligned
44 coordinates for each sequence to each contig's features collection
45 yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and
46 L<Bio::Assembly::Contig::Feature_collection>..
48 This driver also loads singlets into the assembly contigs as
49 Bio::Assembly::Singlet objects, altough without their sequence strings.
50 It also adds a feature for the entire sequence, thus storing the singlet
51 length in its end position, and adds a tag '_nof_trimmed_nonX' to the
52 feature, which stores the number of non-vector bases in the singlet.
56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
57 Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig
58 "_aligned_coord:$seqID" feature class, therefore you can't make
59 coordinate changes in contigs loaded by this module. Contig objects
60 created by this module will have the following special feature
61 classes, identified by their primary tags, in their features
64 "_main_contig_feature:$ID" : main feature for contig $ID. This
65 feature is used to store information
66 about the entire consensus
67 sequence. This feature always start at
68 base 1 and its end position is the
69 consensus sequence length. A tag,
70 'trimmed_length' holds the length of the
71 trimmed good quality region inside the
74 "_covered_region:$index" : coordinates for valid clones inside the
75 contig. $index is the covered region
76 number, starting at 1 for the covered
77 region closest to the consensus sequence
80 "_unalign_coord:$seqID" : location of a sequence in "ungapped
81 consensus" coordinates (consensus
82 sequence without gaps). Primary and
83 secondary scores, indel and
84 substitutions statistics are stored as
87 "_internal_clones:$cloneID" : clones inside contigs $cloneID should be
88 used as the unique id for each
89 clone. These features have six tags:
90 '_1st_name', which is the id of the
91 upstream (5') aligned sequence
92 delimiting the clone; '_1st_strand', the
93 upstream sequence strand in the
94 alignment; '_2nd_name', downstream (3')
95 sequence id; '_2nd_strand', the
96 downstream sequence strand in the
97 alignment; '_length', unaligned clone
98 length; '_rejected', a boolean flag,
99 which is false if the clone is valid and
100 true if it was rejected.
102 All coordinates for the features above are expressed as "ungapped
103 consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>..
105 =head2 Feature collection
113 User feedback is an integral part of the evolution of this and other
114 Bioperl modules. Send your comments and suggestions preferably to the
115 Bioperl mailing lists Your participation is much appreciated.
117 bioperl-l@bioperl.org - General discussion
118 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
122 Please direct usage questions or support issues to the mailing list:
124 L<bioperl-l@bioperl.org>
126 rather than to the module maintainer directly. Many experienced and
127 reponsive experts will be able look at the problem and quickly
128 address it. Please include a thorough description of the problem
129 with code and data examples if at all possible.
131 =head2 Reporting Bugs
133 Report bugs to the Bioperl bug tracking system to help us keep track
134 the bugs and their resolution. Bug reports can be submitted via the
137 http://bugzilla.open-bio.org/
140 =head1 AUTHOR - Robson Francisco de Souza
142 Email rfsouza@citri.iq.usp.br
146 The rest of the documentation details each of the object
147 methods. Internal methods are usually preceded with a _
151 package Bio
::Assembly
::IO
::phrap
;
155 use Bio
::Assembly
::Scaffold
;
156 use Bio
::Assembly
::Singlet
;
157 use Bio
::Assembly
::Contig
;
158 use Bio
::LocatableSeq
;
160 use Bio
::SeqFeature
::Generic
;
162 use base
qw(Bio::Assembly::IO);
166 Title : next_assembly
167 Usage : $unigene = $stream->next_assembly()
168 Function: returns the next assembly in the stream
169 Returns : Bio::Assembly::Scaffold object
175 my $self = shift; # Package reference
177 # Resetting assembly data structure
178 my $Assembly = Bio
::Assembly
::Scaffold
->new(-source
=>'phrap');
180 # Looping over all phrap out file lines
182 while ($_ = $self->_readline) {
185 # Loading exact dupicated reads list
186 # /Exact duplicate reads:/ && do {
190 # /(\S+)\s+(\S+)/ && do {
191 # push(@exact_dupl,[$1,$2]);
193 # $self->{'assembly'}{'exact_dupl_reads'} =
194 # new Data::Table(\@exact_dupl,['included','excluded'],0);
198 # Loading singlets reads data
199 /^(\d+) isolated singlet/ && do { # should it match 'singlets' and 'singletons'?
200 while ($_ = $self->_readline) {
203 if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) {
204 my ($singletID, $length, $nof_trimmed_nonX) = ($1, $2, $3);
205 # Create singlet object, and add it to scaffold
206 my $seq = Bio
::LocatableSeq
->new(
212 -primary_id
=> $singletID,
214 my $singletOBJ = Bio
::Assembly
::Singlet
->new(-seqref
=>$seq,
215 -verbose
=> $self->verbose);
216 my $feat = Bio
::SeqFeature
::Generic
->new(
219 -primary
=> "_main_contig_feature:".$singletOBJ->id(),
220 -tag
=> { '_nof_trimmed_nonX' => $nof_trimmed_nonX }
222 $singletOBJ->add_features([ $feat ],1);
223 $Assembly->add_singlet($singletOBJ);
228 # Loading contig information
229 /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do {
230 my ($contigID, $nof_reads, $length, $trimmed_length) = ($1, $2, $3, $4);
231 $contigOBJ = Bio
::Assembly
::Contig
->new( -id
=> $contigID,
232 -verbose
=> $self->verbose,
233 -source
=> 'phrap' );
234 my $feat = Bio
::SeqFeature
::Generic
->new(
237 -primary
=> "_main_contig_feature:".$contigOBJ->id(),
238 -tag
=> { '_trimmed_length' => $trimmed_length }
240 $contigOBJ->add_features([ $feat ],1);
241 $Assembly->add_contig($contigOBJ);
244 # Loading read information
245 /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do {
246 my ($strand, $start, $end, $readID, $primary_score, $secondary_score,
247 $substitutions, $deletions, $insertions) = ($1, $2, $3, $4, $5, $6, $7,
249 $strand = ($strand eq 'C' ?
-1 : 1);
250 my $seq = Bio
::LocatableSeq
->new(
256 -primary_id
=> $readID,
258 my $unalign_coord = Bio
::SeqFeature
::Generic
->new(
261 -primary
=> "_unalign_coord:$readID",
262 -tag
=> {'_primary_score'=>$primary_score,
263 '_secondary_score'=>$secondary_score,
264 '_substitutions'=>$substitutions,
265 '_insertions'=>,$insertions,
266 '_deletions'=>$deletions }
268 $unalign_coord->attach_seq($seq);
269 $contigOBJ->add_seq($seq);
270 $contigOBJ->add_features([ $unalign_coord ]);
273 # Loading INTERNAL clones description
274 /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do {
276 my $contig = $Assembly->get_contig_by_id($contigID);
277 while ($_ = $self->_readline) {
278 my (@data,$rejected,$c1_strand,$c2_strand);
280 (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do {
281 if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 }
282 $c1_strand = ($data[1] eq 'C' ?
-1 : 1);
283 $c2_strand = ($data[3] eq 'C' ?
-1 : 1);
284 (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/;
285 my $clone = Bio
::SeqFeature
::Generic
->new(
289 -primary
=> "_internal_clone:$clone_name",
290 -tag
=> {'_1st_strand'=>,$c1_strand,
291 '_2nd_strand'=>,$c2_strand,
292 '_1st_name'=>$data[2],
293 '_2nd_name'=>$data[4],
295 '_rejected'=>$rejected}
297 $contig->add_features([ $clone ]);
300 /Covered regions:/ && do {
301 my %coord = /(\d+)/g; my $i = 0;
302 foreach my $start (sort { $a <=> $b } keys %coord) {
303 my $cov = Bio
::SeqFeature
::Generic
->new(
305 -end
=> $coord{$start},
306 -primary
=> '_covered_region:'.++$i
308 # 1: attach feature to contig consensus, if any
309 $contig->add_features([ $cov ],1);
311 last; # exit while loop
312 }; # /Covered regions:/
314 } # while ($_ = $self->_readline)
315 }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/
316 } # while ($_ = $self->_readline)
321 =head2 write_assembly (NOT IMPLEMENTED)
323 Title : write_assembly
324 Usage : $ass_io->write_assembly($assembly)
325 Function: Write the assembly object in Phrap compatible ACE format
326 Returns : 1 on success, 0 for error
327 Args : A Bio::Assembly::Scaffold object
333 $self->throw_not_implemented();