1 # -*-Perl-*- Test Harness script for Bioperl
10 test_begin(-tests => 151);
12 use_ok('Bio::SimpleAlign');
13 use_ok('Bio::AlignIO');
14 use_ok('Bio::SeqFeature::Generic');
15 use_ok('Bio::Location::Simple');
16 use_ok('Bio::Location::Split');
19 my $DEBUG = test_debug();
21 my ($str, $aln, @seqs, $seq);
23 $str = Bio::AlignIO->new(-file=> test_input_file('testaln.pfam'));
24 isa_ok($str,'Bio::AlignIO');
25 $aln = $str->next_aln();
26 is $aln->get_seq_by_pos(1)->get_nse, '1433_LYCES/9-246',
29 my $aln1 = $aln->remove_columns(['mismatch']);
30 is($aln1->match_line, '::*::::*:**:*:*:***:**.***::*.*::**::**:***..**:'.
31 '*:*.::::*:.:*.*.**:***.**:*.:.**::**.*:***********:::*:.:*:**.*::*:'.
32 '.*.:*:**:****************::', 'match_line');
34 my $aln2 = $aln->select(1,3);
35 isa_ok($aln2, 'Bio::Align::AlignI');
36 is($aln2->no_sequences, 3, 'no_sequences');
38 # test select non continuous-sorted by default
39 $aln2 = $aln->select_noncont(6,7,8,9,10,1,2,3,4,5);
40 is($aln2->no_sequences, 10, 'no_sequences');
41 is($aln2->get_seq_by_pos(2)->id, $aln->get_seq_by_pos(2)->id, 'select_noncont');
42 is($aln2->get_seq_by_pos(8)->id, $aln->get_seq_by_pos(8)->id, 'select_noncont');
44 # test select non continuous-nosort option
45 $aln2 = $aln->select_noncont('nosort',6,7,8,9,10,1,2,3,4,5);
46 is($aln2->no_sequences, 10, 'no_sequences');
47 is($aln2->get_seq_by_pos(2)->id, $aln->get_seq_by_pos(7)->id, 'select_noncont');
48 is($aln2->get_seq_by_pos(8)->id, $aln->get_seq_by_pos(3)->id, 'select_noncont');
50 @seqs = $aln->each_seq();
51 is scalar @seqs, 16, 'each_seq';
52 is $seqs[0]->get_nse, '1433_LYCES/9-246', 'get_nse';
53 is $seqs[0]->id, '1433_LYCES', 'id';
54 is $seqs[0]->no_gaps, 3, 'no_gaps';
55 @seqs = $aln->each_alphabetically();
56 is scalar @seqs, 16, 'each_alphabetically';
58 is $aln->column_from_residue_number('1433_LYCES', 10), 2, 'column_from_residue_number';
59 is $aln->displayname('1433_LYCES/9-246', 'my_seq'), 'my_seq', 'display_name get/set';
60 is $aln->displayname('1433_LYCES/9-246'), 'my_seq', 'display_name get';
61 is substr ($aln->consensus_string(50), 0, 60),
62 "RE??VY?AKLAEQAERYEEMV??MK?VAE??????ELSVEERNLLSVAYKNVIGARRASW", 'consensus_string';
63 is substr ($aln->consensus_string(100), 0, 60),
64 "?????????L????E????M???M????????????L??E?RNL?SV?YKN??G??R??W", 'consensus_string';
65 is substr ($aln->consensus_string(0), 0, 60),
66 "REDLVYLAKLAEQAERYEEMVEFMKKVAELGAPAEELSVEERNLLSVAYKNVIGARRASW", 'consensus_string';
68 ok(@seqs = $aln->each_seq_with_id('143T_HUMAN'));
69 is scalar @seqs, 1, 'each_seq_with_id';
71 is $aln->is_flush, 1,'is_flush';
72 ok($aln->id('x') && $aln->id eq 'x','id get/set');
74 is $aln->length, 242, 'length';
75 is $aln->no_residues, 3769, 'no_residues';
76 is $aln->no_sequences, 16, 'no_sequences';
77 is (sprintf("%.2f",$aln->overall_percentage_identity()), 33.06, 'overall_percentage_identity');
78 is (sprintf("%.2f",$aln->overall_percentage_identity('align')), 33.06, 'overall_percentage_identity');
79 is (sprintf("%.2f",$aln->overall_percentage_identity('short')), 35.24, 'overall_percentage_identity');
80 is (sprintf("%.2f",$aln->overall_percentage_identity('long')), 33.47, 'overall_percentage_identity');
81 is (sprintf("%.2f",$aln->average_percentage_identity()), 66.91, 'average_percentage_identity');
83 ok $aln->set_displayname_count;
84 is $aln->displayname('1433_LYCES/9-246'), '1433_LYCES_1', 'set_displayname_count';
85 ok $aln->set_displayname_flat;
86 is $aln->displayname('1433_LYCES/9-246'), '1433_LYCES', 'set_displayname_flat';
87 ok $aln->set_displayname_normal;
88 is $aln->displayname('1433_LYCES/9-246'), '1433_LYCES/9-246', 'set_displayname_normal';
90 ok $aln->map_chars('\.','-');
91 @seqs = $aln->each_seq_with_id('143T_HUMAN');
92 is substr($seqs[0]->seq, 0, 60),
93 'KTELIQKAKLAEQAERYDDMATCMKAVTEQGA---ELSNEERNLLSVAYKNVVGGRRSAW', 'uppercase, map_chars';
95 is($aln->match_line, ' ::*::::* : * *: *: *:***:**.***::*.'.
96 ' *::**::**:*** . . ** :* :* . :: :: *: . :* .*. **:'.
97 '***.** :*. : .* * : : **.*:***********:::* : .: * :** .'.
98 '*::*: .*. : *: **:****************:: ', 'match_line');
99 ok $aln->remove_seq($seqs[0]),'remove_seqs';
100 is $aln->no_sequences, 15, 'remove_seqs';
101 ok $aln->add_seq($seqs[0]), 'add_seq';
102 is $aln->no_sequences, 16, 'add_seq';
103 ok $seq = $aln->get_seq_by_pos(1), 'get_seq_by_pos';
104 is( $seq->id, '1433_LYCES', 'get_seq_by_pos');
105 ok (($aln->missing_char(), 'P') and ($aln->missing_char('X'), 'X')) ;
106 ok (($aln->match_char(), '.') and ($aln->match_char('-'), '-')) ;
107 ok (($aln->gap_char(), '-') and ($aln->gap_char('.'), '.')) ;
109 is $aln->purge(0.7), 12, 'purge';
110 is $aln->no_sequences, 4, 'purge';
113 test_skip(-tests => 24, -requires_module => 'IO::String');
116 my $out = IO::String->new($string);
118 my $s1 = Bio::LocatableSeq->new(-id => 'AAA',
119 -seq => 'aawtat-tn-',
124 my $s2 = Bio::LocatableSeq->new(-id => 'BBB',
125 -seq => '-aaaat-tt-',
130 $a = Bio::SimpleAlign->new();
134 is ($a->consensus_iupac, "aAWWAT-TN-", 'IO::String consensus_iupac');
135 $s1->seq('aaaaattttt');
136 $s1->alphabet('dna');
138 $s2->seq('-aaaatttt-');
140 $a = Bio::SimpleAlign->new();
144 my $strout = Bio::AlignIO->new(-fh => $out, -format => 'pfam');
145 $strout->write_aln($a);
147 "AAA/1-10 aaaaattttt\n".
148 "BBB/1-8 -aaaatttt-\n",
149 'IO::String write_aln normal');
153 my $b = $a->slice(2,9);
154 $strout->write_aln($b);
156 "AAA/2-9 aaaatttt\n".
157 "BBB/1-8 aaaatttt\n",
158 'IO::String write_aln slice';
162 $b = $a->slice(9,10);
163 $strout->write_aln($b);
167 'IO::String write_aln slice';
173 $strout->write_aln($b);
177 'IO::String write_aln slice';
179 # not sure what coordinates this should return...
183 $b = $a->slice(1,1,1);
184 $strout->write_aln($b);
188 'IO::String write_aln slice';
194 $strout->write_aln($b);
198 'IO::String write_aln slice';
201 $b = $a->slice(11,13);
206 # remove_columns by position
209 $str = Bio::AlignIO->new(-file=> test_input_file('mini-align.aln'));
210 $aln1 = $str->next_aln;
211 $aln2 = $aln1->remove_columns([0,0]);
212 $strout->write_aln($aln2);
214 "P84139/2-33 NEGEHQIKLDELFEKLLRARLIFKNKDVLRRC\n".
215 "P814153/2-33 NEGMHQIKLDVLFEKLLRARLIFKNKDVLRRC\n".
216 "BAB68554/1-14 ------------------AMLIFKDKQLLQQC\n".
217 "gb|443893|124775/1-32 MRFRFQIKVPPAVEGARPALLIFKSRPELGGC\n",
218 'remove_columns by position';
220 # and when arguments are entered in "wrong order"?
223 my $aln3 = $aln1->remove_columns([1,1],[30,30],[5,6]);
224 $strout->write_aln($aln3);
226 "P84139/1-33 MEGEIKLDELFEKLLRARLIFKNKDVLRC\n".
227 "P814153/1-33 MEGMIKLDVLFEKLLRARLIFKNKDVLRC\n".
228 "BAB68554/1-14 ----------------AMLIFKDKQLLQC\n".
229 "gb|443893|124775/2-32 -RFRIKVPPAVEGARPALLIFKSRPELGC\n",
230 'remove_columns by position (wrong order)';
232 my %cigars = $aln1->cigar_line;
233 is $cigars{'gb|443893|124775/1-32'},'19,19:21,24:29,29:32,32','cigar_line';
234 is $cigars{'P814153/1-33'},'20,20:22,25:30,30:33,33','cigar_line';
235 is $cigars{'BAB68554/1-14'},'1,1:3,6:11,11:14,14','cigar_line';
236 is $cigars{'P84139/1-33'},'20,20:22,25:30,30:33,33','cigar_line';
239 # sort_alphabetically
240 my $s3 = Bio::LocatableSeq->new(-id => 'ABB',
241 -seq => '-attat-tt-',
248 is $a->get_seq_by_pos(2)->id,"BBB", 'sort_alphabetically - before';
249 ok $a->sort_alphabetically;
250 is $a->get_seq_by_pos(2)->id,"ABB", 'sort_alphabetically - after';
252 $b = $a->remove_gaps();
253 is $b->consensus_string, "aaaattt", 'remove_gaps';
255 $s1->seq('aaaaattt--');
257 $b = $a->remove_gaps(undef, 'all_gaps_only');
258 is $b->consensus_string, "aaaaatttt",'remove_gaps all_gaps_only';
260 # test set_new_reference:
261 $str = Bio::AlignIO->new(-file=> test_input_file('testaln.aln'));
262 $aln=$str->next_aln();
263 my $new_aln=$aln->set_new_reference(3);
264 $a=$new_aln->get_seq_by_pos(1)->display_id;
265 $new_aln=$aln->set_new_reference('P851414');
266 $b=$new_aln->get_seq_by_pos(1)->display_id;
267 is $a, 'P851414','set_new_reference';
268 is $b, 'P851414','set_new_reference';
271 $str = Bio::AlignIO->new(-verbose => $DEBUG,
272 -file=> test_input_file('testaln2.fasta'));
273 $aln=$str->next_aln();
274 $new_aln=$aln->uniq_seq();
275 $a=$new_aln->no_sequences;
276 is $a, 11,'uniq_seq';
278 # check if slice works well with a LocateableSeq in its negative strand
279 my $seq1 = Bio::LocatableSeq->new(
280 -SEQ => "ATGCTG-ATG",
287 my $seq2 = Bio::LocatableSeq->new(
288 -SEQ => "A-GCTGCATG",
296 my $aln_negative = Bio::SimpleAlign->new();
297 $aln_negative->add_seq($seq1);
298 $aln_negative->add_seq($seq2);
300 $aln_negative->column_from_residue_number($aln_negative->get_seq_by_pos(1)->display_id,2);
302 $aln_negative->column_from_residue_number($aln_negative->get_seq_by_pos(1)->display_id,5);
303 $aln_negative = $aln_negative->slice($end_column,$start_column);
304 my $seq_negative = $aln_negative->get_seq_by_pos(1);
305 is($seq_negative->start,2,"bug 2099");
306 is($seq_negative->end,5,"bug 2099");
309 # test for Bio::SimpleAlign annotation method and
310 # Bio::FeatureHolder stuff
312 $aln = Bio::SimpleAlign->new;
313 for my $seqset ( [qw(one AGAGGAT)], [qw(two AGACGAT) ], [qw(three AGAGGTT)]) {
314 $aln->add_seq(Bio::LocatableSeq->new(-id => $seqset->[0],
315 -seq => $seqset->[1]));
318 is $aln->no_sequences, 3, 'added 3 seqs';
320 $aln->add_SeqFeature(Bio::SeqFeature::Generic->new(-start => 1,
322 -primary_tag => 'charLabel',
324 $aln->add_SeqFeature(Bio::SeqFeature::Generic->new(-start => 3,
326 -primary_tag => 'charLabel',
329 is($aln->feature_count, 2, 'first 2 features added');
331 my $splitloc =Bio::Location::Split->new;
332 $splitloc->add_sub_Location(Bio::Location::Simple->new(-start => 2,
335 $splitloc->add_sub_Location(Bio::Location::Simple->new(-start => 5,
338 $aln->add_SeqFeature(Bio::SeqFeature::Generic->new(-location =>$splitloc,
339 -primary_tag => 'charLabel',
342 is($aln->feature_count, 3, '3rd feature added');
344 #get a slice as defined by the feature
346 my @slice_lens = qw(1 1 2 2);
347 for my $feature ( $aln->get_SeqFeatures ) {
348 for my $loc ( $feature->location->each_Location ) {
349 my $fslice = $aln->slice($loc->start, $loc->end);
350 is($fslice->length, $slice_lens[$i++], "slice $i len");
354 # test set_displayname_safe & restore_displayname:
355 $str = Bio::AlignIO->new(-file=> test_input_file('pep-266.aln'));
356 $aln=$str->next_aln();
357 is $aln->get_seq_by_pos(3)->display_id, 'Smik_Contig1103.1', 'initial display id ok';
358 my ($new_aln, $ref)=$aln->set_displayname_safe();
359 is $new_aln->get_seq_by_pos(3)->display_id, 'S000000003', 'safe display id ok';
360 my $restored_aln=$new_aln->restore_displayname($ref);
361 is $restored_aln->get_seq_by_pos(3)->display_id, 'Smik_Contig1103.1', 'restored display id ok';
364 $str = Bio::AlignIO->new(-file=> test_input_file('testaln.aln'));
365 my $list_file=test_input_file('testaln.list');
366 $aln=$str->next_aln();
367 $new_aln=$aln->sort_by_list($list_file);
368 $a=$new_aln->get_seq_by_pos(1)->display_id;
369 is $a, 'BAB68554', 'sort by list ok';
371 # test for Binary/Morphological/Mixed data
378 my $s1 = Bio::LocatableSeq->new(-id => 'AAA',
379 -seq => 'aawtat-tn-',
384 my $s2 = Bio::LocatableSeq->new(-id => 'BBB',
385 -seq => '-aaaat-tt-',
390 my $s3 = Bio::LocatableSeq->new(-id => 'BBB',
391 -seq => '-aaaat-tt-',
396 $a = Bio::SimpleAlign->new();
401 @seqs = $a->each_seq;
402 is($seqs[0]->start, 12);
403 is($seqs[1]->start, 1);
404 is($seqs[2]->start, 31);
407 @seqs = $a->each_seq;
409 is($seqs[0]->start, 1);
410 is($seqs[1]->start, 12);
411 is($seqs[2]->start, 31);
414 'allele1' => 'GGATCCATT[C/C]CTACT',
415 'allele2' => 'GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT',
416 'allele3' => 'G[G/C]ATCCATT[C/G]CTACT',
417 'allele4' => 'GGATCCATT[C/G]CTACT',
418 'allele5' => 'GGATCCATT[C/G]CTAC[T/A]',
419 'allele6' => 'GGATCCATT[C/G]CTA[C/G][T/A]',
420 'testseq' => 'GGATCCATT[C/G]CTACT'
423 my $alnin = Bio::AlignIO->new(-format => 'fasta',
424 -file => test_input_file('alleles.fas'));
426 $aln = $alnin->next_aln;
429 # compare all to test seq
431 for my $ls (sort keys %testdata) {
433 my $str = $aln->bracket_string(-refseq => 'testseq',
434 -allele1 => 'allele1',
437 is($str, $testdata{$ls}, "BIC:$str");
441 'allele1' => 'GGATCCATT{C.C}CTACT',
442 'allele2' => 'GGAT{C.-}{C.-}ATT{C.C}CT{A.C}CT',
443 'allele3' => 'G{G.C}ATCCATT{C.G}CTACT',
444 'allele4' => 'GGATCCATT{C.G}CTACT',
445 'allele5' => 'GGATCCATT{C.G}CTAC{T.A}',
446 'allele6' => 'GGATCCATT{C.G}CTA{C.G}{T.A}',
447 'testseq' => 'GGATCCATT{C.G}CTACT'
450 for my $ls (sort keys %testdata) {
452 my $str = $aln->bracket_string(-refseq => 'testseq',
453 -allele1 => 'allele1',
458 is($str, $testdata{$ls},"BIC:$str");
462 # is _remove_col really working correctly?
463 my $a = Bio::LocatableSeq->new(-id => 'a', -seq => 'atcgatcgatcgatcg', -start => 5, -end => 20);
464 my $b = Bio::LocatableSeq->new(-id => 'b', -seq => '-tcgatc-atcgatcg', -start => 30, -end => 43);
465 my $c = Bio::LocatableSeq->new(-id => 'c', -seq => 'atcgatcgatc-atc-', -start => 50, -end => 63);
466 my $d = Bio::LocatableSeq->new(-id => 'd', -seq => '--cgatcgatcgat--', -start => 80, -end => 91);
467 my $e = Bio::LocatableSeq->new(-id => 'e', -seq => '-t-gatcgatcga-c-', -start => 100, -end => 111);
468 $aln = Bio::SimpleAlign->new();
473 my $gapless = $aln->remove_gaps();
474 foreach my $seq ($gapless->each_seq) {
475 if ($seq->id eq 'a') {
478 is $seq->seq, 'tcgatcatcatc';
480 elsif ($seq->id eq 'b') {
483 is $seq->seq, 'tcgatcatcatc';
485 elsif ($seq->id eq 'c') {
488 is $seq->seq, 'tcgatcatcatc';
494 $gapless = $aln->remove_gaps();
495 foreach my $seq ($gapless->each_seq) {
496 if ($seq->id eq 'a') {
499 is $seq->seq, 'gatcatca';
501 elsif ($seq->id eq 'b') {
504 is $seq->seq, 'gatcatca';
506 elsif ($seq->id eq 'c') {
509 is $seq->seq, 'gatcatca';
511 elsif ($seq->id eq 'd') {
514 is $seq->seq, 'gatcatca';
516 elsif ($seq->id eq 'e') {
519 is $seq->seq, 'gatcatca';
523 my $f = Bio::LocatableSeq->new(-id => 'f', -seq => 'a-cgatcgatcgat-g', -start => 30, -end => 43);
524 $aln = Bio::SimpleAlign->new();
528 $gapless = $aln->remove_gaps();
529 foreach my $seq ($gapless->each_seq) {
530 if ($seq->id eq 'a') {
533 is $seq->seq, 'acgatcgatcgatg';
535 elsif ($seq->id eq 'f') {
538 is $seq->seq, 'acgatcgatcgatg';
542 my $g = Bio::LocatableSeq->new(-id => 'g', -seq => 'atgc', -start => 5, -end => 8);
543 my $h = Bio::LocatableSeq->new(-id => 'h', -seq => '-tcg', -start => 30, -end => 32);
544 $aln = Bio::SimpleAlign->new();
548 my $removed = $aln->remove_columns([1, 3]);
549 foreach my $seq ($removed->each_seq) {
550 if ($seq->id eq 'g') {
555 elsif ($seq->id eq 'h') {