sync with main trunk
[bioperl-live.git] / Bio / Assembly / IO / phrap.pm
blob4013fbf224263f06f29be004bc63eeeefcbf2bf2
1 # $Id$
3 # BioPerl driver for phrap.out file
5 # Copyright by Robson F. de Souza
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Assembly::IO::phrap - driver to load phrap.out files.
15 =head1 SYNOPSIS
17 # Building an input stream
18 use Bio::Assembly::IO;
20 # Assembly loading methods
21 $io = Bio::Assembly::IO->new(-file=>"SGC0-424.phrap.out",
22 -format=>"phrap");
24 $assembly = $io->next_assembly;
26 =head1 DESCRIPTION
28 This package was developed to load the phrap.out files from the
29 (phred/phrap/consed) package by Phill Green. This files contain just
30 the messages printed to standard out by phrap when building an
31 assembly. This output is redirected by phredPhrap perl-script to a
32 file in the project's directory and hold some bit of information
33 regarding assembly quality, connections between contigs and clone's
34 position inside contigs. It should be noted that such files have no
35 data about the sequence. neither for contig consensus nor for any
36 aligned sequence. Anyway, such information may be loaded from Fasta
37 files in the projects directory and added to the assembly object
38 later.
40 Note that, because no sequence is loaded for the contig consensus and
41 locations for aligned sequences are only given in "ungapped consensus"
42 coordinates in a phrap.out file, you can't make coordinate changes in
43 assemblies loaded by pharp.pm, unless you add an aligned
44 coordinates for each sequence to each contig's features collection
45 yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and
46 L<Bio::Assembly::Contig::Feature_collection>..
48 This driver also loads singlets into the assembly contigs as
49 Bio::Assembly::Singlet objects, altough without their sequence strings.
50 It also adds a feature for the entire sequence, thus storing the singlet
51 length in its end position, and adds a tag '_nof_trimmed_nonX' to the
52 feature, which stores the number of non-vector bases in the singlet.
54 =head2 Implementation
56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
57 Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig
58 "_aligned_coord:$seqID" feature class, therefore you can't make
59 coordinate changes in contigs loaded by this module. Contig objects
60 created by this module will have the following special feature
61 classes, identified by their primary tags, in their features
62 collection:
64 "_main_contig_feature:$ID" : main feature for contig $ID. This
65 feature is used to store information
66 about the entire consensus
67 sequence. This feature always start at
68 base 1 and its end position is the
69 consensus sequence length. A tag,
70 'trimmed_length' holds the length of the
71 trimmed good quality region inside the
72 consensus sequence.
74 "_covered_region:$index" : coordinates for valid clones inside the
75 contig. $index is the covered region
76 number, starting at 1 for the covered
77 region closest to the consensus sequence
78 first base.
80 "_unalign_coord:$seqID" : location of a sequence in "ungapped
81 consensus" coordinates (consensus
82 sequence without gaps). Primary and
83 secondary scores, indel and
84 substitutions statistics are stored as
85 feature tags.
87 "_internal_clones:$cloneID" : clones inside contigs $cloneID should be
88 used as the unique id for each
89 clone. These features have six tags:
90 '_1st_name', which is the id of the
91 upstream (5') aligned sequence
92 delimiting the clone; '_1st_strand', the
93 upstream sequence strand in the
94 alignment; '_2nd_name', downstream (3')
95 sequence id; '_2nd_strand', the
96 downstream sequence strand in the
97 alignment; '_length', unaligned clone
98 length; '_rejected', a boolean flag,
99 which is false if the clone is valid and
100 true if it was rejected.
102 All coordinates for the features above are expressed as "ungapped
103 consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>..
105 =head2 Feature collection
109 =head1 FEEDBACK
111 =head2 Mailing Lists
113 User feedback is an integral part of the evolution of this and other
114 Bioperl modules. Send your comments and suggestions preferably to the
115 Bioperl mailing lists Your participation is much appreciated.
117 bioperl-l@bioperl.org - General discussion
118 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
120 =head2 Reporting Bugs
122 Report bugs to the Bioperl bug tracking system to help us keep track
123 the bugs and their resolution. Bug reports can be submitted via the
124 web:
126 http://bugzilla.open-bio.org/
129 =head1 AUTHOR - Robson Francisco de Souza
131 Email rfsouza@citri.iq.usp.br
133 head1 APPENDIX
135 The rest of the documentation details each of the object
136 methods. Internal methods are usually preceded with a _
138 =cut
140 package Bio::Assembly::IO::phrap;
142 use strict;
144 use Bio::Assembly::Scaffold;
145 use Bio::Assembly::Singlet;
146 use Bio::Assembly::Contig;
147 use Bio::LocatableSeq;
148 use Bio::Seq;
149 use Bio::SeqFeature::Generic;
151 use base qw(Bio::Assembly::IO);
153 =head2 next_assembly
155 Title : next_assembly
156 Usage : $unigene = $stream->next_assembly()
157 Function: returns the next assembly in the stream
158 Returns : Bio::Assembly::Scaffold object
159 Args : NONE
161 =cut
163 sub next_assembly {
164 my $self = shift; # Package reference
166 # Resetting assembly data structure
167 my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
169 # Looping over all phrap out file lines
170 my ($contigOBJ);
171 while ($_ = $self->_readline) {
172 chomp;
174 # Loading exact dupicated reads list
175 # /Exact duplicate reads:/ && do {
176 # my @exact_dupl;
177 # while (<FILE>) {
178 # last if (/^\s*$/);
179 # /(\S+)\s+(\S+)/ && do {
180 # push(@exact_dupl,[$1,$2]);
181 # };
182 # $self->{'assembly'}{'exact_dupl_reads'} =
183 # new Data::Table(\@exact_dupl,['included','excluded'],0);
185 # };
187 # Loading singlets reads data
188 /^(\d+) isolated singlet/ && do { # should it match 'singlets' and 'singletons'?
189 while ($_ = $self->_readline) {
190 chomp;
191 last if (/^$/);
192 if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) {
193 my ($singletID, $length, $nof_trimmed_nonX) = ($1, $2, $3);
194 # Create singlet object, and add it to scaffold
195 my $seq = Bio::LocatableSeq->new(
196 -start => 1,
197 -end => $length,
198 -strand => 1,
199 -nowarnonempty => 1,
200 -id => $singletID,
201 -primary_id => $singletID,
202 -alphabet => 'dna');
203 my $singletOBJ = Bio::Assembly::Singlet->new(-seqref=>$seq,
204 -verbose => $self->verbose);
205 my $feat = Bio::SeqFeature::Generic->new(
206 -start => 1,
207 -end => $length,
208 -primary => "_main_contig_feature:".$singletOBJ->id(),
209 -tag => { '_nof_trimmed_nonX' => $nof_trimmed_nonX }
211 $singletOBJ->add_features([ $feat ],1);
212 $Assembly->add_singlet($singletOBJ);
217 # Loading contig information
218 /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do {
219 my ($contigID, $nof_reads, $length, $trimmed_length) = ($1, $2, $3, $4);
220 $contigOBJ = Bio::Assembly::Contig->new( -id => $contigID,
221 -verbose => $self->verbose,
222 -source => 'phrap' );
223 my $feat = Bio::SeqFeature::Generic->new(
224 -start => 1,
225 -end => $length,
226 -primary => "_main_contig_feature:".$contigOBJ->id(),
227 -tag => { '_trimmed_length' => $trimmed_length }
229 $contigOBJ->add_features([ $feat ],1);
230 $Assembly->add_contig($contigOBJ);
233 # Loading read information
234 /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do {
235 my ($strand, $start, $end, $readID, $primary_score, $secondary_score,
236 $substitutions, $deletions, $insertions) = ($1, $2, $3, $4, $5, $6, $7,
237 $8, $9);
238 $strand = ($strand eq 'C' ? -1 : 1);
239 my $seq = Bio::LocatableSeq->new(
240 -start => $start,
241 -end => $end,
242 -nowarnonempty => 1,
243 -strand => $strand,
244 -id => $readID,
245 -primary_id => $readID,
246 -alphabet => 'dna');
247 my $unalign_coord = Bio::SeqFeature::Generic->new(
248 -start => $start,
249 -end => $end,
250 -primary => "_unalign_coord:$readID",
251 -tag => {'_primary_score'=>$primary_score,
252 '_secondary_score'=>$secondary_score,
253 '_substitutions'=>$substitutions,
254 '_insertions'=>,$insertions,
255 '_deletions'=>$deletions }
257 $unalign_coord->attach_seq($seq);
258 $contigOBJ->add_seq($seq);
259 $contigOBJ->add_features([ $unalign_coord ]);
262 # Loading INTERNAL clones description
263 /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do {
264 my $contigID = $1;
265 my $contig = $Assembly->get_contig_by_id($contigID);
266 while ($_ = $self->_readline) {
267 my (@data,$rejected,$c1_strand,$c2_strand);
269 (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do {
270 if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 }
271 $c1_strand = ($data[1] eq 'C' ? -1 : 1);
272 $c2_strand = ($data[3] eq 'C' ? -1 : 1);
273 (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/;
274 my $clone = Bio::SeqFeature::Generic->new(
275 -start => $data[6],
276 -end => $data[7],
277 -strand => 0,
278 -primary => "_internal_clone:$clone_name",
279 -tag => {'_1st_strand'=>,$c1_strand,
280 '_2nd_strand'=>,$c2_strand,
281 '_1st_name'=>$data[2],
282 '_2nd_name'=>$data[4],
283 '_length'=>$data[5],
284 '_rejected'=>$rejected}
286 $contig->add_features([ $clone ]);
289 /Covered regions:/ && do {
290 my %coord = /(\d+)/g; my $i = 0;
291 foreach my $start (sort { $a <=> $b } keys %coord) {
292 my $cov = Bio::SeqFeature::Generic->new(
293 -start => $start,
294 -end => $coord{$start},
295 -primary => '_covered_region:'.++$i
297 # 1: attach feature to contig consensus, if any
298 $contig->add_features([ $cov ],1);
300 last; # exit while loop
301 }; # /Covered regions:/
303 } # while ($_ = $self->_readline)
304 }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/
305 } # while ($_ = $self->_readline)
307 return $Assembly;
310 =head2 write_assembly (NOT IMPLEMENTED)
312 Title : write_assembly
313 Usage : $ass_io->write_assembly($assembly)
314 Function: Write the assembly object in Phrap compatible ACE format
315 Returns : 1 on success, 0 for error
316 Args : A Bio::Assembly::Scaffold object
318 =cut
320 sub write_assembly {
321 my $self = shift;
322 $self->throw_not_implemented();
327 __END__