9 test_begin( -tests => 892,
10 -requires_module => 'DB_File' );
13 use_ok 'Bio::LocatableSeq';
14 use_ok 'Bio::Seq::Quality';
15 use_ok 'Bio::Assembly::IO';
16 use_ok 'Bio::Assembly::Singlet';
23 my ($seq_id, $seq_str, $qual, $start, $end, $strand) = ('seq1', 'CAGT-GGT',
24 '0 1 2 3 4 5 6 7', 1, 7, -1, );
26 my $seq = Bio::PrimarySeq->new(-seq => $seq_str, -id => $seq_id);
27 my $singlet_id = 'singlet1';
28 ok my $singlet = Bio::Assembly::Singlet->new( -id => $singlet_id, -seqref =>
29 $seq), 'singlet from Bio::PrimarySeq';
30 isa_ok $singlet, 'Bio::Assembly::Contig';
31 isa_ok $singlet, 'Bio::Assembly::Singlet';
32 is $singlet->id, $singlet_id;
33 ok my $consensus = $singlet->get_consensus_sequence;
34 isa_ok $consensus, 'Bio::LocatableSeq';
35 is $consensus->seq, $seq_str;
36 is $consensus->start, 1;
37 is $consensus->end, 7;
38 is $consensus->strand, 1;
39 is $singlet->get_consensus_length, 8;
40 is $singlet->get_consensus_quality, undef;
41 ok my $seqref = $singlet->seqref;
42 is $seqref->id, $seq_id;
43 is $seqref->seq, $seq_str;
46 is $seqref->length, 8;
48 $seq = Bio::Seq::Quality->new(-seq => $seq_str, -id => $seq_id, -qual => $qual);
49 ok $singlet = Bio::Assembly::Singlet->new( -id => $singlet_id, -seqref => $seq),
50 'singlet from Bio::Seq::Quality';
51 is $singlet->get_consensus_length, 8;
52 ok $consensus = $singlet->get_consensus_quality;
53 isa_ok $consensus, 'Bio::Seq::QualI';
54 is join(' ', @{$consensus->qual}), $qual;
56 $seq = Bio::LocatableSeq->new( -seq => $seq_str, -id => $seq_id, -start =>
57 $start, -end => $end, -strand => $strand );
58 ok $singlet = Bio::Assembly::Singlet->new( -id => $singlet_id, -seqref => $seq),
59 'singlet from LocatableSeq';
60 ok $consensus = $singlet->get_consensus_sequence;
61 is $consensus->start, 1;
62 is $consensus->end, 7;
63 is $consensus->strand, -1;
64 ok $seqref = $singlet->seqref;
67 is $seqref->strand, -1;
69 ($start, $end) = (20, 26);
70 $seq = Bio::LocatableSeq->new( -seq => $seq_str, -id => $seq_id, -start =>
71 $start, -end => $end, -strand => $strand );
72 ok $singlet = Bio::Assembly::Singlet->new( -id => $singlet_id, -seqref => $seq),
73 'singlet from LocatableSeq with set coordinates';
74 ok $consensus = $singlet->get_consensus_sequence;
75 is $consensus->start, 1;
76 is $consensus->end, 7;
77 is $consensus->strand, -1;
78 ok $seqref = $singlet->seqref;
81 is $seqref->strand, -1;
92 ok my $aio = Bio::Assembly::IO->new(
93 -file => test_input_file('assembly_with_singlets.ace'),
94 -format => 'ace-consed',
96 is $aio->variant, 'consed', 'consed';
97 ok $aio = Bio::Assembly::IO->new(
98 -file => test_input_file('assembly_with_singlets.ace'),
101 is $aio->variant, 'consed';
102 ok $aio->variant('454');
103 is $aio->variant, '454';
109 my $in = Bio::Assembly::IO->new(
110 -file => test_input_file('consed_project','edit_dir','test_project.phrap.out'),
113 isa_ok $in, 'Bio::Assembly::IO';
114 while (my $contig = $in->next_contig) {
115 isa_ok $contig, 'Bio::Assembly::Contig';
118 $in = Bio::Assembly::IO->new(
119 -file => test_input_file('consed_project','edit_dir','test_project.phrap.out'),
122 isa_ok $in, 'Bio::Assembly::IO';
125 local $TODO = "phrap parser doesn't include the sequence string in the sequence objects.";
127 eval {$sc = $in->next_assembly};
132 $in = Bio::Assembly::IO->new(
133 -file => test_input_file('consed_project','edit_dir','test_project.phrap.out'),
136 ok $sc = $in->next_assembly;
137 isa_ok $sc, 'Bio::Assembly::Scaffold';
144 is $sc->id, "NoName";
145 is $sc->id('test'), "test";
147 isa_ok $sc->annotation, 'Bio::AnnotationCollectionI';
148 is $sc->annotation->get_all_annotation_keys, 0,"no annotations in Annotation collection?";
149 is $sc->get_nof_contigs, 1;
150 is $sc->get_nof_sequences_in_contigs, 2;
151 is $sc->get_nof_singlets, 2, "get_nof_singlets";
152 is $sc->get_contig_seq_ids, 2, "get_contig_seq_ids";
153 is $sc->get_contig_ids, 1, "get_contig_ids";
154 is $sc->get_singlet_ids, 2, "get_singlet_ids";
156 my @phrap_contigs = $sc->all_contigs();
157 isa_ok $phrap_contigs[0], "Bio::Assembly::Contig",'the contig is a Bio::Assembly::Contig';
158 my @singlets = $sc->all_singlets();
159 isa_ok $singlets[0], "Bio::Assembly::Contig", 'the singlet is a Bio::Assembly::Contig';
160 isa_ok $singlets[0], "Bio::Assembly::Singlet", 'the singlet is a Bio::Assembly::Singlet';
163 ok @contig_seq_ids = $sc->get_contig_seq_ids, "get_contig_seq_ids";
164 is @contig_seq_ids, 2;
165 for my $contig_seq_id (@contig_seq_ids) {
166 ok not $contig_seq_id =~ m/contig/i;
169 ok @contig_ids = $sc->get_contig_ids, "get_contig_ids";
171 for my $contig_id (@contig_ids) {
172 ok not $contig_id =~ m/contig/i;
175 ok @singlet_ids = $sc->get_singlet_ids, "get_singlet_ids";
177 for my $singlet_id (@singlet_ids) {
178 ok not $singlet_id =~ m/contig/i;
181 ok @all_seq_ids = $sc->get_all_seq_ids, "get_all_seq_ids";
182 for my $seq_id (@all_seq_ids) {
183 ok not $seq_id =~ m/contig/i;
188 # Testing ContigAnalysis
195 # ACE Consed variant (default)
196 $aio = Bio::Assembly::IO->new(
197 -file => test_input_file('consed_project','edit_dir','test_project.fasta.screen.ace.2'),
201 my $assembly = $aio->next_assembly();
203 my @contigs = $assembly->all_contigs();
205 my $direction = $contigs[0]->strand;
208 my $features = $contigs[0]->get_features_collection;
210 my @contig_features = $features->features;
211 is @contig_features, 61, 'contig features'; # 59 contig features + 2 seqfeatures
213 my @annotations = $features->get_features_by_type('Annotation');
217 for my $an (@annotations) {
218 if ($an->has_tag('extra_info')) {
220 is (($an->get_tag_values('extra_info'))[0], "contig extra\ninfo\n");
222 elsif ($an->has_tag('comment')){
224 is (($an->get_tag_values('comment'))[0], "contig tag\ncomment\n");
229 is $assembly->get_nof_contigs, 1;
230 is $assembly->get_nof_sequences_in_contigs, 2;
231 is $assembly->get_nof_singlets, 0, "get_nof_singlets";
232 is $assembly->get_contig_seq_ids, 2, "get_contig_seq_ids";
233 is $assembly->get_contig_ids, 1, "get_contig_ids";
234 is $assembly->get_singlet_ids, 0, "get_singlet_ids";
236 $aio = Bio::Assembly::IO->new(
237 -file => test_input_file('assembly_with_singlets.ace'),
241 while (my $obj = $aio->next_contig) {
242 isa_ok $obj, 'Bio::Assembly::Contig'; # Singlets are contigs too
245 $aio = Bio::Assembly::IO->new(
246 -file => test_input_file('assembly_with_singlets.ace'),
250 $assembly = $aio->next_assembly();
251 is $assembly->get_nof_contigs, 3;
252 my @ace_contigs = $assembly->all_contigs();
253 my $ace_contig = $ace_contigs[0];
254 isa_ok $ace_contig, "Bio::Assembly::Contig",'the contig is a Bio::Assembly::Contig';
256 ok my @test_reads = $ace_contig->get_seq_ids();
257 is scalar @test_reads, 2;
258 is $test_reads[0], '5704073';
259 is $test_reads[1], '5762101';
261 is $assembly->get_nof_sequences_in_contigs, 6;
262 is $assembly->get_nof_singlets, 33, "get_nof_singlets";
263 @singlets = $assembly->all_singlets();
264 isa_ok $singlets[0], "Bio::Assembly::Contig", 'the singlet is a Bio::Assembly::Contig';
265 isa_ok $singlets[0], "Bio::Assembly::Singlet", 'the singlet is a Bio::Assembly::Singlet';
266 ok @contig_seq_ids = $assembly->get_contig_seq_ids, "get_contig_seq_ids";
267 is @contig_seq_ids, 6;
268 for my $contig_seq_id (@contig_seq_ids) {
269 ok not $contig_seq_id =~ m/contig/i;
271 ok @contig_ids = $assembly->get_contig_ids, "get_contig_ids";
273 for my $contig_id (@contig_ids) {
274 ok $contig_id =~ m/contig/i;
276 ok @singlet_ids = $assembly->get_singlet_ids, "get_singlet_ids";
278 for my $singlet_id (@singlet_ids) {
279 ok $singlet_id =~ m/contig/i;
281 ok @all_seq_ids = $assembly->get_all_seq_ids, "get_all_seq_ids";
282 for my $seq_id (@all_seq_ids) {
283 ok not $seq_id =~ m/contig/i;
288 ok $aio = Bio::Assembly::IO->new(
289 -file => test_input_file('singlet_w_CT.ace'),
294 $aio = Bio::Assembly::IO->new(
295 -file => test_input_file('27-contig_Newbler.ace'),
296 -format => 'ace-454',
298 $assembly = $aio->next_assembly();
299 @contigs = $assembly->all_contigs();
300 # All read positions should be >0
301 my $contig = $contigs[0];
302 my $min_aln_coord = undef;
303 for my $read ($contig->each_seq) {
304 my ($feat) = $contig->get_features_collection->get_features_by_type("_aligned_coord:".$read->id);
305 my $aln_coord_start = $feat->location->start;
306 if ( (not defined $min_aln_coord) or ($aln_coord_start < $min_aln_coord) ) {
307 $min_aln_coord = $aln_coord_start;
310 is $min_aln_coord, 1, '454 ACE variant coordinates check';
311 # The ends of the consensus should be padded
312 my $left_pad_length = 29;
313 my $sequence_length = 203;
314 my $right_pad_length = 81;
315 my $consensus_length = $left_pad_length + $sequence_length + $right_pad_length;
316 my $cons_seq = $contig->get_consensus_sequence->seq;
317 is length $cons_seq, $consensus_length;
318 $cons_seq =~ m/^(-*).*?(-*)$/;
319 is length $1, $left_pad_length, '454 ACE variant consensus check';
320 is length $2, $right_pad_length;
321 my $cons_qual = $contig->get_consensus_quality->qual;
322 is scalar @$cons_qual, $consensus_length;
323 $cons_qual = join ' ', @{$contig->get_consensus_quality->qual};
324 my $lpad = $left_pad_length x '0 ';
325 my $rpad = $right_pad_length x '0 ';
326 $cons_qual =~ m/^($lpad).*($rpad)$/;
331 my $asm_infile = '27-contig_Newbler.ace';
332 my $asm_outfile = test_output_file();
333 my $asm_out = Bio::Assembly::IO->new(
334 -file => ">$asm_outfile",
338 ok $asm_in = Bio::Assembly::IO->new(
339 -file => test_input_file($asm_infile),
342 )->next_assembly, 'writing in the ACE format';
343 ok $asm_out->write_assembly( -scaffold => $asm_in, -singlets => 1 );
345 $asm_infile = 'assembly_with_singlets.ace';
346 ok $asm_in = Bio::Assembly::IO->new(
347 -file => test_input_file($asm_infile),
350 ok $asm_out->write_assembly( -scaffold => $asm_in, -singlets => 1 );
352 $asm_infile = 'reference_ace.ace';
353 ok $asm_in = Bio::Assembly::IO->new(
354 -file => test_input_file($asm_infile),
357 ok $asm_out->write_assembly( -scaffold => $asm_in, -singlets => 1 );
361 # Testing TIGR format
364 # Importing an assembly
366 $asm_in = Bio::Assembly::IO->new(
367 -file => test_input_file("sample_dataset.tigr"),
370 while (my $obj = $asm_in->next_contig) {
371 isa_ok $obj, 'Bio::Assembly::Contig'; # Singlets are contigs too
374 $asm_in = Bio::Assembly::IO->new(
375 -file => test_input_file("sample_dataset.tigr"),
379 my $scaf_in = $asm_in->next_assembly;
380 isa_ok $scaf_in, 'Bio::Assembly::Scaffold';
381 is $scaf_in->id, 'NoName';
382 is $scaf_in->get_nof_contigs, 13;
383 is $scaf_in->get_nof_sequences_in_contigs, 36;
384 is $scaf_in->get_nof_singlets, 1;
385 my @contigseqids = sort qw(sdsu|SDSU1_RFPERU_001_A09.x01.phd.1
386 sdsu|SDSU1_RFPERU_001_B03.x01.phd.1 sdsu|SDSU1_RFPERU_001_B04.x01.phd.1
387 sdsu|SDSU1_RFPERU_001_E04.x01.phd.1 sdsu|SDSU_RFPERU_002_A01.x01.phd.1
388 sdsu|SDSU_RFPERU_002_B07.x01.phd.1 sdsu|SDSU_RFPERU_002_C12.x01.phd.1
389 sdsu|SDSU_RFPERU_002_D08.x01.phd.1 sdsu|SDSU_RFPERU_002_H12.x01.phd.1
390 sdsu|SDSU_RFPERU_003_G09.x01.phd.1 sdsu|SDSU_RFPERU_004_H12.x01.phd.1
391 sdsu|SDSU_RFPERU_005_F02.x01.phd.1 sdsu|SDSU_RFPERU_006_D03.x01.phd.1
392 sdsu|SDSU_RFPERU_006_E04.x01.phd.1 sdsu|SDSU_RFPERU_006_E05.x01.phd.1
393 sdsu|SDSU_RFPERU_006_H08.x01.phd.1 sdsu|SDSU_RFPERU_007_E09.x01.phd.1
394 sdsu|SDSU_RFPERU_007_F06.x01.phd.1 sdsu|SDSU_RFPERU_008_B02.x01.phd.1
395 sdsu|SDSU_RFPERU_009_E07.x01.phd.1 sdsu|SDSU_RFPERU_010_B05.x01.phd.1
396 sdsu|SDSU_RFPERU_010_B06.x01.phd.1 sdsu|SDSU_RFPERU_010_C09.x01.phd.1
397 sdsu|SDSU_RFPERU_010_D10.x01.phd.1 sdsu|SDSU_RFPERU_012_H02.x01.phd.1
398 sdsu|SDSU_RFPERU_013_B05.x01.phd.1 sdsu|SDSU_RFPERU_013_C07.x01.phd.1
399 sdsu|SDSU_RFPERU_013_C08.x01.phd.1 sdsu|SDSU_RFPERU_013_G10.x01.phd.1
400 sdsu|SDSU_RFPERU_013_H05.x01.phd.1 sdsu|SDSU_RFPERU_014_H06.x01.phd.1
401 sdsu|SDSU_RFPERU_015_A05.x01.phd.1 sdsu|SDSU_RFPERU_015_C06.x01.phd.1
402 sdsu|SDSU_RFPERU_015_E04.x01.phd.1 sdsu|SDSU_RFPERU_015_G04.x01.phd.1
403 sdsu|SDSU_RFPERU_015_H03.x01.phd.1);
404 my @contigids = sort qw(106 144 148 17 185 2 210 36 453 500 613 668 93);
405 my @singletids = sort qw(123);
406 my @singletseqids = sort qw(asdf);
407 is_deeply [sort $scaf_in->get_contig_seq_ids], \@contigseqids;
408 is_deeply [sort $scaf_in->get_contig_ids], \@contigids ;
409 is_deeply [sort $scaf_in->get_singlet_ids], \@singletids ;
410 isa_ok $scaf_in->get_seq_by_id('sdsu|SDSU1_RFPERU_001_A09.x01.phd.1'),'Bio::LocatableSeq';
411 $contig = $scaf_in->get_contig_by_id('106');
412 isa_ok $contig,'Bio::Assembly::Contig';
414 # check Contig object SeqFeature::Collection
415 # should add more specific Contig tests...
416 my @sfs = $contig->get_features_collection->features; # 5 contig features + 2 seqfeatures
418 is $sfs[1]->seq_id(), undef; # should this be undef?
419 ok $contig->get_features_collection->get_features_by_type('_aligned_coord:sdsu|SDSU_RFPERU_006_E04.x01.phd.1');
420 isa_ok $scaf_in->annotation, 'Bio::AnnotationCollectionI';
421 is $scaf_in->annotation->get_all_annotation_keys, 0, "no annotations in Annotation collection?";
424 # Exporting an assembly
425 $asm_outfile = test_output_file();
426 $asm_out = Bio::Assembly::IO->new(
427 -file => ">$asm_outfile",
430 ok $asm_out->write_assembly( -scaffold => $scaf_in), 'writing in the TIGR format';
437 my $file = 'test.maq';
438 ok $aio = Bio::Assembly::IO->new(
439 -file => test_input_file($file),
441 ), "init maq IO object";
442 ok $assembly = $aio->next_assembly, "get maq assy";
443 is $assembly->get_nof_contigs, 11, "got all contigs";
444 ok open(my $tf, test_input_file($file)), "read test file as text";
446 is $assembly->get_nof_contig_seqs, scalar @lines, "recorded all maq reads";
447 ok !$assembly->get_nof_singlets, "no singlets";
449 ok $aio = Bio::Assembly::IO->new( -file => test_input_file($file),
451 isa_ok $aio, 'Bio::Assembly::IO';
452 while (my $contig = $aio->next_contig) {
453 isa_ok $contig, 'Bio::Assembly::Contig';
457 # Testing maq with singlets
459 $file = 'test_singlets.maq';
460 ok $aio = Bio::Assembly::IO->new(
461 -file => test_input_file($file),
464 ok $assembly = $aio->next_assembly, "get maq assy";
465 isa_ok $aio, 'Bio::Assembly::IO';
468 ok @contig_seq_ids = $assembly->get_contig_seq_ids, "get_contig_seq_ids";
469 is @contig_seq_ids, 246;
470 for my $contig_seq_id (@contig_seq_ids) {
471 ok not $contig_seq_id =~ m/maq_assy/i;
474 ok @contig_ids = $assembly->get_contig_ids, "get_contig_ids";
476 for my $contig_id (@contig_ids) {
477 ok $contig_id =~ m/maq_assy/i;
480 ok @singlet_ids = $assembly->get_singlet_ids, "get_singlet_ids";
482 for my $singlet_id (@singlet_ids) {
483 ok $singlet_id =~ m/maq_assy/i;
486 ok @all_seq_ids = $assembly->get_all_seq_ids, "get_all_seq_ids";
487 for my $seq_id (@all_seq_ids) {
488 ok not $seq_id =~ m/maq_assy/i;
490 is @all_seq_ids, 250;
492 ok $aio = Bio::Assembly::IO->new(
493 -file => test_input_file($file),
496 while (my $contig = $aio->next_contig) {
497 isa_ok $contig, 'Bio::Assembly::Contig';
500 ##############################################
501 # test format() and variant() in Bio::RootIO
502 ##############################################
504 $in = Bio::Assembly::IO->new(
505 -file => test_input_file('assembly_with_singlets.ace'),
507 is $in->format, 'ace';
508 is $in->variant, 'consed';