sync w/ main trunk
[bioperl-live.git] / Bio / Assembly / IO / phrap.pm
blob2956c5a74edf39d4cac62e0f9dfa1d533cf73188
1 # $Id$
3 # BioPerl driver for phrap.out file
5 # Copyright by Robson F. de Souza
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Assembly::IO::phrap - driver to load phrap.out files.
15 =head1 SYNOPSIS
17 # Building an input stream
18 use Bio::Assembly::IO;
20 # Assembly loading methods
21 $io = Bio::Assembly::IO->new(-file=>"SGC0-424.phrap.out",
22 -format=>"phrap");
24 $assembly = $io->next_assembly;
26 =head1 DESCRIPTION
28 This package was developed to load the phrap.out files from the
29 (phred/phrap/consed) package by Phill Green. This files contain just
30 the messages printed to standard out by phrap when building an
31 assembly. This output is redirected by phredPhrap perl-script to a
32 file in the project's directory and hold some bit of information
33 regarding assembly quality, connections between contigs and clone's
34 position inside contigs. It should be noted that such files have no
35 data about the sequence. neither for contig consensus nor for any
36 aligned sequence. Anyway, such information may be loaded from Fasta
37 files in the projects directory and added to the assembly object
38 later.
40 Note that, because no sequence is loaded for the contig consensus and
41 locations for aligned sequences are only given in "ungapped consensus"
42 coordinates in a phrap.out file, you can't make coordinate changes in
43 assemblies loaded by pharp.pm, unless you add an aligned
44 coordinates for each sequence to each contig's features collection
45 yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and
46 L<Bio::Assembly::Contig::Feature_collection>..
48 This driver also loads singlets into the assembly contigs as
49 Bio::Assembly::Singlet objects, altough without their sequence strings.
50 It also adds a feature for the entire sequence, thus storing the singlet
51 length in its end position, and adds a tag '_nof_trimmed_nonX' to the
52 feature, which stores the number of non-vector bases in the singlet.
54 =head2 Implementation
56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
57 Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig
58 "_aligned_coord:$seqID" feature class, therefore you can't make
59 coordinate changes in contigs loaded by this module. Contig objects
60 created by this module will have the following special feature
61 classes, identified by their primary tags, in their features
62 collection:
64 "_main_contig_feature:$ID" : main feature for contig $ID. This
65 feature is used to store information
66 about the entire consensus
67 sequence. This feature always start at
68 base 1 and its end position is the
69 consensus sequence length. A tag,
70 'trimmed_length' holds the length of the
71 trimmed good quality region inside the
72 consensus sequence.
74 "_covered_region:$index" : coordinates for valid clones inside the
75 contig. $index is the covered region
76 number, starting at 1 for the covered
77 region closest to the consensus sequence
78 first base.
80 "_unalign_coord:$seqID" : location of a sequence in "ungapped
81 consensus" coordinates (consensus
82 sequence without gaps). Primary and
83 secondary scores, indel and
84 substitutions statistics are stored as
85 feature tags.
87 "_internal_clones:$cloneID" : clones inside contigs $cloneID should be
88 used as the unique id for each
89 clone. These features have six tags:
90 '_1st_name', which is the id of the
91 upstream (5') aligned sequence
92 delimiting the clone; '_1st_strand', the
93 upstream sequence strand in the
94 alignment; '_2nd_name', downstream (3')
95 sequence id; '_2nd_strand', the
96 downstream sequence strand in the
97 alignment; '_length', unaligned clone
98 length; '_rejected', a boolean flag,
99 which is false if the clone is valid and
100 true if it was rejected.
102 All coordinates for the features above are expressed as "ungapped
103 consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>..
105 =head2 Feature collection
109 =head1 FEEDBACK
111 =head2 Mailing Lists
113 User feedback is an integral part of the evolution of this and other
114 Bioperl modules. Send your comments and suggestions preferably to the
115 Bioperl mailing lists Your participation is much appreciated.
117 bioperl-l@bioperl.org - General discussion
118 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
120 =head2 Support
122 Please direct usage questions or support issues to the mailing list:
124 L<bioperl-l@bioperl.org>
126 rather than to the module maintainer directly. Many experienced and
127 reponsive experts will be able look at the problem and quickly
128 address it. Please include a thorough description of the problem
129 with code and data examples if at all possible.
131 =head2 Reporting Bugs
133 Report bugs to the Bioperl bug tracking system to help us keep track
134 the bugs and their resolution. Bug reports can be submitted via the
135 web:
137 http://bugzilla.open-bio.org/
140 =head1 AUTHOR - Robson Francisco de Souza
142 Email rfsouza@citri.iq.usp.br
144 head1 APPENDIX
146 The rest of the documentation details each of the object
147 methods. Internal methods are usually preceded with a _
149 =cut
151 package Bio::Assembly::IO::phrap;
153 use strict;
155 use Bio::Assembly::Scaffold;
156 use Bio::Assembly::Singlet;
157 use Bio::Assembly::Contig;
158 use Bio::LocatableSeq;
159 use Bio::Seq;
160 use Bio::SeqFeature::Generic;
162 use base qw(Bio::Assembly::IO);
164 =head2 next_assembly
166 Title : next_assembly
167 Usage : $unigene = $stream->next_assembly()
168 Function: returns the next assembly in the stream
169 Returns : Bio::Assembly::Scaffold object
170 Args : NONE
172 =cut
174 sub next_assembly {
175 my $self = shift; # Package reference
177 # Resetting assembly data structure
178 my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
180 # Looping over all phrap out file lines
181 my ($contigOBJ);
182 while ($_ = $self->_readline) {
183 chomp;
185 # Loading exact dupicated reads list
186 # /Exact duplicate reads:/ && do {
187 # my @exact_dupl;
188 # while (<FILE>) {
189 # last if (/^\s*$/);
190 # /(\S+)\s+(\S+)/ && do {
191 # push(@exact_dupl,[$1,$2]);
192 # };
193 # $self->{'assembly'}{'exact_dupl_reads'} =
194 # new Data::Table(\@exact_dupl,['included','excluded'],0);
196 # };
198 # Loading singlets reads data
199 /^(\d+) isolated singlet/ && do { # should it match 'singlets' and 'singletons'?
200 while ($_ = $self->_readline) {
201 chomp;
202 last if (/^$/);
203 if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) {
204 my ($singletID, $length, $nof_trimmed_nonX) = ($1, $2, $3);
205 # Create singlet object, and add it to scaffold
206 my $seq = Bio::LocatableSeq->new(
207 -start => 1,
208 -end => $length,
209 -strand => 1,
210 -nowarnonempty => 1,
211 -id => $singletID,
212 -primary_id => $singletID,
213 -alphabet => 'dna');
214 my $singletOBJ = Bio::Assembly::Singlet->new(-seqref=>$seq,
215 -verbose => $self->verbose);
216 my $feat = Bio::SeqFeature::Generic->new(
217 -start => 1,
218 -end => $length,
219 -primary => "_main_contig_feature:".$singletOBJ->id(),
220 -tag => { '_nof_trimmed_nonX' => $nof_trimmed_nonX }
222 $singletOBJ->add_features([ $feat ],1);
223 $Assembly->add_singlet($singletOBJ);
228 # Loading contig information
229 /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do {
230 my ($contigID, $nof_reads, $length, $trimmed_length) = ($1, $2, $3, $4);
231 $contigOBJ = Bio::Assembly::Contig->new( -id => $contigID,
232 -verbose => $self->verbose,
233 -source => 'phrap' );
234 my $feat = Bio::SeqFeature::Generic->new(
235 -start => 1,
236 -end => $length,
237 -primary => "_main_contig_feature:".$contigOBJ->id(),
238 -tag => { '_trimmed_length' => $trimmed_length }
240 $contigOBJ->add_features([ $feat ],1);
241 $Assembly->add_contig($contigOBJ);
244 # Loading read information
245 /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do {
246 my ($strand, $start, $end, $readID, $primary_score, $secondary_score,
247 $substitutions, $deletions, $insertions) = ($1, $2, $3, $4, $5, $6, $7,
248 $8, $9);
249 $strand = ($strand eq 'C' ? -1 : 1);
250 my $seq = Bio::LocatableSeq->new(
251 -start => $start,
252 -end => $end,
253 -nowarnonempty => 1,
254 -strand => $strand,
255 -id => $readID,
256 -primary_id => $readID,
257 -alphabet => 'dna');
258 my $unalign_coord = Bio::SeqFeature::Generic->new(
259 -start => $start,
260 -end => $end,
261 -primary => "_unalign_coord:$readID",
262 -tag => {'_primary_score'=>$primary_score,
263 '_secondary_score'=>$secondary_score,
264 '_substitutions'=>$substitutions,
265 '_insertions'=>,$insertions,
266 '_deletions'=>$deletions }
268 $unalign_coord->attach_seq($seq);
269 $contigOBJ->add_seq($seq);
270 $contigOBJ->add_features([ $unalign_coord ]);
273 # Loading INTERNAL clones description
274 /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do {
275 my $contigID = $1;
276 my $contig = $Assembly->get_contig_by_id($contigID);
277 while ($_ = $self->_readline) {
278 my (@data,$rejected,$c1_strand,$c2_strand);
280 (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do {
281 if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 }
282 $c1_strand = ($data[1] eq 'C' ? -1 : 1);
283 $c2_strand = ($data[3] eq 'C' ? -1 : 1);
284 (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/;
285 my $clone = Bio::SeqFeature::Generic->new(
286 -start => $data[6],
287 -end => $data[7],
288 -strand => 0,
289 -primary => "_internal_clone:$clone_name",
290 -tag => {'_1st_strand'=>,$c1_strand,
291 '_2nd_strand'=>,$c2_strand,
292 '_1st_name'=>$data[2],
293 '_2nd_name'=>$data[4],
294 '_length'=>$data[5],
295 '_rejected'=>$rejected}
297 $contig->add_features([ $clone ]);
300 /Covered regions:/ && do {
301 my %coord = /(\d+)/g; my $i = 0;
302 foreach my $start (sort { $a <=> $b } keys %coord) {
303 my $cov = Bio::SeqFeature::Generic->new(
304 -start => $start,
305 -end => $coord{$start},
306 -primary => '_covered_region:'.++$i
308 # 1: attach feature to contig consensus, if any
309 $contig->add_features([ $cov ],1);
311 last; # exit while loop
312 }; # /Covered regions:/
314 } # while ($_ = $self->_readline)
315 }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/
316 } # while ($_ = $self->_readline)
318 return $Assembly;
321 =head2 write_assembly (NOT IMPLEMENTED)
323 Title : write_assembly
324 Usage : $ass_io->write_assembly($assembly)
325 Function: Write the assembly object in Phrap compatible ACE format
326 Returns : 1 on success, 0 for error
327 Args : A Bio::Assembly::Scaffold object
329 =cut
331 sub write_assembly {
332 my $self = shift;
333 $self->throw_not_implemented();
338 __END__