t/Seq/PrimarySeq.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5 use Data::Dumper;
   6
   7 BEGIN {
   8     use lib '.';
   9     use Bio::Root::Test;
  10     test_begin( -tests => 178 );
  11
  12     use_ok('Bio::PrimarySeq');
  13     use_ok('Bio::Location::Simple');
  14     use_ok('Bio::Location::Fuzzy');
  15     use_ok('Bio::Location::Split');
  16 }
  17
  18
  19 # Bare object
  20 ok my $seq = Bio::PrimarySeq->new(), 'Bare object';
  21 isa_ok $seq, 'Bio::PrimarySeqI';
  22 is $seq->id, undef;
  23 is $seq->seq, undef;
  24 is $seq->length, 0;
  25 is $seq->alphabet, undef;
  26 is $seq->is_circular, undef;
  27
  28
  29 # Empty sequence
  30 ok $seq = Bio::PrimarySeq->new( -seq => '', -nowarnonempty => 1);
  31 is $seq->seq, '';
  32 is $seq->length, 0;
  33 is $seq->alphabet, undef;
  34
  35
  36 # Basic tests
  37 ok $seq = Bio::PrimarySeq->new(
  38     '-seq'              => 'TTGGTGGCGTCAACT',
  39     '-display_id'       => 'new-id',
  40     '-alphabet'         => 'dna',
  41     '-accession_number' => 'X677667',
  42     '-desc'             => 'Sample Bio::Seq object'
  43 );
  44 ok defined $seq;
  45 is $seq->accession_number(), 'X677667';
  46 is $seq->seq(),              'TTGGTGGCGTCAACT';
  47 is $seq->display_id(),       'new-id';
  48 is $seq->alphabet(),         'dna';
  49 is $seq->is_circular(),      undef;
  50 ok $seq->is_circular(1);
  51 is $seq->is_circular(0), 0;
  52
  53 # check IdentifiableI and DescribableI interfaces
  54 isa_ok $seq, 'Bio::IdentifiableI';
  55 isa_ok $seq, 'Bio::DescribableI';
  56
  57 # make sure all methods are implemented
  58 is $seq->authority("bioperl.org"), "bioperl.org";
  59 is $seq->authority, "bioperl.org";
  60 is $seq->namespace("t"), "t";
  61 is $seq->namespace, "t";
  62 is $seq->version(0), 0;
  63 is $seq->version, 0;
  64 is $seq->lsid_string(), "bioperl.org:t:X677667";
  65 is $seq->namespace_string, "t:X677667.0";
  66 is $seq->version(47), 47;
  67 is $seq->version, 47;
  68 is $seq->namespace_string, "t:X677667.47";
  69 is $seq->description, 'Sample Bio::Seq object';
  70 is $seq->display_name, "new-id";
  71
  72
  73 # Test subseq
  74 is $seq->subseq(2, 5), 'TGGT';
  75
  76 is $seq->subseq( -start => 1, -end => 15), 'TTGGTGGCGTCAACT';
  77
  78 my $location = Bio::Location::Simple->new(
  79     '-start'  => 2,
  80     '-end'    => 5,
  81     '-strand' => -1
  82 );
  83 is $seq->subseq($location), 'ACCA';
  84
  85 my $splitlocation = Bio::Location::Split->new();
  86 $splitlocation->add_sub_Location(
  87     Bio::Location::Simple->new(
  88         '-start'  => 1,
  89         '-end'    => 4,
  90         '-strand' => 1
  91     )
  92 );
  93
  94 $splitlocation->add_sub_Location(
  95     Bio::Location::Simple->new(
  96         '-start'  => 7,
  97         '-end'    => 12,
  98         '-strand' => -1
  99     )
 100 );
 101
 102 is $seq->subseq($splitlocation), 'TTGGTGACGC';
 103
 104 my $fuzzy = Bio::Location::Fuzzy->new(
 105     -start  => '<3',
 106     -end    => '8',
 107     -strand => 1
 108 );
 109
 110 is $seq->subseq($fuzzy), 'GGTGGC';
 111
 112 {
 113     ok my $seq = Bio::PrimarySeq->new( -seq => 'TT-GTGGCGTCAACT' );
 114     is $seq->subseq(2, 5, 'nogap'), 'TGT';
 115     is $seq->subseq( -start => 2, -end => 5, -nogap => 1 ), 'TGT';
 116     my $location = Bio::Location::Simple->new(
 117        '-start'  => 2,
 118        '-end'    => 5,
 119        '-strand' => 1
 120     );
 121     is $seq->subseq( $location, -nogap => 1), 'TGT';
 122
 123     is $seq->subseq(-start=>2, -end=>5, -replace_with=>'aa'), 'T-GT';
 124     is $seq->seq, 'TaaGGCGTCAACT';
 125
 126     throws_ok { $seq->subseq(-start=>2, -end=>5, -replace_with=>'?!'); } qr/.+/;
 127 }
 128
 129 {
 130     ok my $seq = Bio::PrimarySeq->new( -seq => 'AACCGGTT', -is_circular => 1 );
 131     is $seq->subseq( -start => 7, -end => 10 ), 'TTAA';
 132 }
 133
 134 # Test trunc
 135 my $trunc = $seq->trunc( 1, 4 );
 136 isa_ok $trunc, 'Bio::PrimarySeqI';
 137 is $trunc->seq(), 'TTGG' or diag( "Expecting TTGG. Got " . $trunc->seq() );
 138
 139 $trunc = $seq->trunc($splitlocation);
 140 isa_ok $trunc, 'Bio::PrimarySeqI' ;
 141 is $trunc->seq(), 'TTGGTGACGC';
 142
 143 $trunc = $seq->trunc($fuzzy);
 144 isa_ok $trunc, 'Bio::PrimarySeqI';
 145 is $trunc->seq(), 'GGTGGC';
 146
 147 my $rev = $seq->revcom();
 148 isa_ok $rev, 'Bio::PrimarySeqI';
 149
 150 is $rev->seq(), 'AGTTGACGCCACCAA'
 151   or diag( 'revcom() failed, was ' . $rev->seq() );
 152
 153 is $rev->display_id, 'new-id';
 154 is $rev->display_name(), 'new-id';
 155 is $rev->accession_number(), 'X677667';
 156 is $rev->alphabet, 'dna';
 157 is $rev->description, 'Sample Bio::Seq object';
 158
 159
 160 TODO: {
 161     local $TODO =
 162       'all attributes of primaryseqs are not currently copied through revcom()';
 163     # Probably also not copied through trunc(), transcribe() and rev_transcribe()
 164     is $rev->is_circular(), 0,           'is_circular copied through revcom';
 165     is $rev->version, 47,                'version copied through revcom';
 166     is $rev->authority, 'bioperl.org',   'authority copied through revcom';
 167     is $rev->namespace, 't',             'namespace copied through revcom';
 168     is $rev->namespace_string(),
 169         "t:X677667.47", 'namespace_string copied through revcom';
 170 }
 171
 172 #
 173 # Translate
 174 #
 175
 176 my $aa = $seq->translate();    # TTG GTG GCG TCA ACT
 177 is $aa->seq, 'LVAST', "Translation: " . $aa->seq;
 178
 179 # tests for non-standard initiator codon coding for
 180 # M by making translate() look for an initiator codon and
 181 # terminator codon ("complete", the 5th argument below)
 182 $seq->seq('TTGGTGGCGTCAACTTAA');    # TTG GTG GCG TCA ACT TAA
 183 $aa = $seq->translate( undef, undef, undef, undef, 1 );
 184 is $aa->seq, 'MVAST', "Translation: " . $aa->seq;
 185
 186 # same test as previous, but using named parameter
 187 $aa = $seq->translate( -complete => 1 );
 188 is $aa->seq, 'MVAST', "Translation: " . $aa->seq;
 189
 190 # find ORF, ignore codons outside the ORF or CDS
 191 $seq->seq('TTTTATGGTGGCGTCAACTTAATTT');    # ATG GTG GCG TCA ACT
 192 $aa = $seq->translate( -orf => 1 );
 193 is $aa->seq, 'MVAST*', "Translation: " . $aa->seq;
 194
 195 # smallest possible ORF
 196 $seq->seq("ggggggatgtagcccc");             # atg tga
 197 $aa = $seq->translate( -orf => 1 );
 198 is $aa->seq, 'M*', "Translation: " . $aa->seq;
 199
 200 # same as previous but complete, so * is removed
 201 $aa = $seq->translate(
 202     -orf      => 1,
 203     -complete => 1
 204 );
 205 is $aa->seq, 'M', "Translation: " . $aa->seq;
 206
 207 # ORF without termination codon
 208 # should warn, let's change it into throw for testing
 209 $seq->verbose(2);
 210 $seq->seq("ggggggatgtggcccc");    # atg tgg ccc
 211 eval { $seq->translate( -orf => 1 ); };
 212 like( $@, qr/\batgtggccc\b/i );
 213 $seq->verbose(-1);
 214 $aa = $seq->translate( -orf => 1 );
 215 is $aa->seq, 'MWP', "Translation: MWP";
 216 $seq->verbose(0);
 217
 218 # use non-standard codon table where terminator is read as Q
 219 $seq->seq('ATGGTGGCGTCAACTTAG');    # ATG GTG GCG TCA ACT TAG
 220 $aa = $seq->translate( -codontable_id => 6 );
 221 is $aa->seq, 'MVASTQ' or diag( "Translation: " . $aa->seq );
 222
 223 # insert an odd character instead of terminating with *
 224 $aa = $seq->translate( -terminator => 'X' );
 225 is $aa->seq, 'MVASTX' or diag( "Translation: " . $aa->seq );
 226
 227 # change frame from default
 228 $aa = $seq->translate( -frame => 1 );    # TGG TGG CGT CAA CTT AG
 229 is $aa->seq, 'WWRQL' or diag( "Translation: " . $aa->seq );
 230
 231 $aa = $seq->translate( -frame => 2 );    # GGT GGC GTC AAC TTA G
 232 is $aa->seq, 'GGVNL' or diag( "Translation: " . $aa->seq );
 233
 234 # TTG is initiator in Standard codon table? Afraid so.
 235 $seq->seq("ggggggttgtagcccc");           # ttg tag
 236 $aa = $seq->translate( -orf => 1 );
 237 is $aa->seq, 'L*' or diag( "Translation: " . $aa->seq );
 238
 239 # Replace L at 1st position with M by setting complete to 1
 240 $seq->seq("ggggggttgtagcccc");           # ttg tag
 241 $aa = $seq->translate(
 242     -orf      => 1,
 243     -complete => 1
 244 );
 245 is $aa->seq, 'M' or diag( "Translation: " . $aa->seq );
 246
 247 # Ignore non-ATG initiators (e.g. TTG) in codon table
 248 $seq->seq("ggggggttgatgtagcccc");        # atg tag
 249 $aa = $seq->translate(
 250     -orf      => 1,
 251     -start    => "atg",
 252     -complete => 1
 253 );
 254 is $aa->seq, 'M' or diag( "Translation: " . $aa->seq );
 255
 256 # test for character '?' in the sequence string
 257 is $seq->seq('TTGGTGGCG?CAACT'), 'TTGGTGGCG?CAACT';
 258
 259 # test for some aliases
 260 $seq = Bio::PrimarySeq->new(
 261     -id          => 'aliasid',
 262     -description => 'Alias desc'
 263 );
 264 is $seq->description, 'Alias desc';
 265 is $seq->display_id,  'aliasid';
 266
 267 # Test alphabet
 268
 269 ok $seq->seq('actgx');
 270 is $seq->alphabet, 'protein', 'Alphabet';
 271 ok $seq->seq('actge');
 272 is $seq->alphabet, 'protein';
 273 ok $seq->seq('actgf');
 274 is $seq->alphabet, 'protein';
 275 ok $seq->seq('actgi');
 276 is $seq->alphabet, 'protein';
 277 ok $seq->seq('actgj');
 278 is $seq->alphabet, 'protein';
 279 ok $seq->seq('actgl');
 280 is $seq->alphabet, 'protein';
 281 ok $seq->seq('actgo');
 282 is $seq->alphabet, 'protein';
 283 ok $seq->seq('actgp');
 284 is $seq->alphabet, 'protein';
 285 ok $seq->seq('actgq');
 286 is $seq->alphabet, 'protein';
 287 ok $seq->seq('actgz');
 288 is $seq->alphabet, 'protein';
 289 ok $seq->seq('actgn');
 290 is $seq->alphabet, 'dna';
 291 ok $seq->seq('acugn');
 292 is $seq->alphabet, 'rna';
 293 ok $seq->seq('bdhkm');
 294 is $seq->alphabet, 'protein';
 295 ok $seq->seq('rsvwx');
 296 is $seq->alphabet, 'protein';
 297 ok $seq->seq('AAACTYAAAAGAATTGRCGG'); # valid degenerate DNA PCR primer sequence (90% ACGTN)
 298 is $seq->alphabet, 'dna';
 299 ok $seq->seq('AAACTYAAAKGAATTGRCGG'); # another primer previously detected as protein (85% ACGTN)
 300 is $seq->alphabet, 'dna';
 301 ok $seq->seq('YWACTYAAAKGARTTGRCGG'); # 70% ACGTN. Everything <= 70% is considered a protein
 302 is $seq->alphabet, 'protein';
 303 ok $seq->seq('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'); # Bug 2438
 304 is $seq->alphabet, 'protein', 'Bug 2438';
 305 ok $seq->seq('CAGTCXXXXXXXXXXXXXXXXXXXXXXXXXXXCAGCG');
 306 is $seq->alphabet, 'protein';
 307
 308 ok $seq->seq('actgn', 'protein'); # accept specified alphabet, no matter what
 309 is $seq->alphabet, 'protein';
 310 ok $seq->seq('bdhkm', 'dna');
 311 is $seq->alphabet, 'dna';
 312
 313
 314 # Bug #2864:
 315
 316 $seq = Bio::PrimarySeq->new( -display_id => 0, -seq => 'GATC' );
 317
 318 is $seq->display_id, 0, "Bug #2864";
 319
 320 # Test that the check for terminators inside the translated protein
 321 # works when the terminator isn't '*':
 322
 323 $seq = Bio::PrimarySeq->new(-seq=>'ATGCTCTAAGCAGGGTAA'); # ML*AG*
 324 eval { $aa = $seq->translate(-complete=>1, -throw=>1, -terminator=>'#') };
 325 my $error = $@;
 326 ok $error =~ /\QTerminator codon inside CDS!\E/, 'Terminator + inside sequence';
 327
 328 $seq = Bio::PrimarySeq->new(-seq=>'ATGCTCGCAGGGTAA'); # MLAG*
 329 $aa = $seq->translate(-complete=>1, -throw=>1, -terminator=>'#');
 330 is $aa->seq, 'MLAG';
 331
 332
 333 # Test length method
 334 ok $seq = Bio::PrimarySeq->new(), 'Length method';
 335 is $seq->length, 0;
 336 ok $seq->length(123);
 337 is $seq->length, 123;
 338
 339 ok $seq = Bio::PrimarySeq->new( -seq => 'ATGCTCTAAGCAGGGTAA' );
 340 is $seq->length, 18;
 341 ok $seq->seq('ATGCTCTAAG');
 342 is $seq->length, 10;
 343 is $seq->seq(undef), undef;
 344 is $seq->length, 0;
 345
 346 ok $seq = Bio::PrimarySeq->new( -length => 123 );
 347 is $seq->length, 123;
 348
 349 ok $seq = Bio::PrimarySeq->new( -seq => 'ATGCTCTAAGCAGGGTAA' );
 350 is $seq->length, 18;
 351 ok $seq->length( $seq->length ); # save memory by removing seq
 352 is $seq->seq( undef ), undef;    # ... but keeping a record of length
 353 is $seq->length, 18;
 354 is $seq->seq, undef;
 355 ok $seq->seq('ACGT');
 356 is $seq->length, 4; # manually-specified length changed when sequence is changed
 357
 358 throws_ok { $seq->length(666); } qr/.+/; # Cannot lie about length
 359
 360
 361 # Sequence validation method
 362 is $seq->validate_seq( undef    ), 1;
 363 is $seq->validate_seq( ''       ), 1;
 364 is $seq->validate_seq( 'acgt'   ), 1;
 365 is $seq->validate_seq( 'ACGT'   ), 1;
 366 is $seq->validate_seq( 'XFRH'   ), 1;
 367 is $seq->validate_seq( '-~'     ), 1; # gap symbols
 368 is $seq->validate_seq( '-.*?=~' ), 1; # other valid symbols
 369 is $seq->validate_seq( '0'      ), 0;
 370 is $seq->validate_seq( '   '    ), 0;
 371 is $seq->validate_seq( 'AAAA$'  ), 0;
 372 is $seq->validate_seq( 'tt&t!'  ), 0;
 373
 374 throws_ok { $seq->validate_seq('tt&t!', 1); } qr/.+/;
 375
 376
 377 # Test direct option (no sequence validation)
 378 throws_ok { $seq = Bio::PrimarySeq->new(-seq => 'A\T$AGQ+T'); } qr/.+/, 'Validation';
 379 ok $seq = Bio::PrimarySeq->new( -seq => 'A\T$AGQ+T', -direct => 1 );
 380 is $seq->seq, 'A\T$AGQ+T';
 381 throws_ok { $seq->seq('NT@/') } qr/.+/;
 382
 383 # Set a sequence by reference
 384 my $string = 'AAAACCCCGGGGTTTT';
 385 ok $seq = Bio::PrimarySeq->new( -ref_to_seq => \$string );
 386 is $seq->seq, 'AAAACCCCGGGGTTTT';
 387
 388
 389 # Test internal PrimarySeqI _find_orfs function and translate( -orf => 'longest' )
 390 {
 391     my @tests = (
 392         #tiny test
 393         ['TTTTATGGTGGCGTCAACTTAATTT',
 394          [[4,22,18,1]],
 395         ],
 396
 397         #bigger test (this is a tomato unigene)
 398         ['GAAGGCTGGTTCTGAGTTGGATCTATGTTTGATGAAGGGAAGTAGACCGGAGGTCTTGCATCAGCAATATTAGTACCAAATCCAGGTGGAGGCGCATCCTGTCTCCGTTGCATTTCAACTTTCATTTCAGCAATCTGTTGCATCAGTTGCATGATCAATTCATTCTGTTCCACTACAGTGGGCTGAGCGACCACAACGTCAGTAAGACGCCCTTCGTCATTGTTGTCTCCCATAACTGTTTTTCCTTTATCTGAATTTGATCGAGGGAAGGAATCTGTAGGACCTTTCGATCTGGTGAAGTAAGGATGATCTGCCAGCTTTATTGACACAGATCAGTAAAAAGGTACCTGAAAGGTAAAAACAACTCAAAGGCAAATTTGTTAGTGCATATCCAGAGTACAAAATGCTTAATATCGCACATAAAACCGATAAACACACAAGTCGTTTTGTTTGAGGATATCTTAACCCACGAATAAGGACGGATATATATTTTGAACAAACAGGAATTTGTTTGTTTGGCGTTATCTTGGGAAATCTG',
 399          [[98,254,156,2],[347,476,129,2],[219,303,84,0],[16,73,57,1],[403,454,51,1],[310,358,48,1],[235,280,45,1],[491,536,45,2],[150,186,36,0],[507,537,30,0],[5,32,27,2],[511,538,27,1],[24,45,21,0],[305,326,21,2],[450,465,15,0]],
 400         ],
 401
 402
 403        );
 404     foreach my $test (@tests) {
 405         my ($test_seq, $orfs) = @$test;
 406         my @orfs = Bio::PrimarySeqI::_find_orfs_nucleotide(
 407             undef,
 408             $test_seq,
 409             Bio::Tools::CodonTable->new,
 410             undef,
 411            ); # ATG GTG GCG TCA ACT
 412         is_deeply( \@orfs, $orfs, '_find_orfs 1')
 413             or diag "for $test_seq, _find_orfs returned:\n"
 414                     .Dumper([map [@$_], @orfs]);
 415
 416         is_deeply( $orfs->[0],
 417                    (sort {$b->[2] <=> $a->[2]} @$orfs)[0],
 418                    'orfs are sorted by descending length'
 419                   );
 420
 421         # make sure we get the same sequence by taking the longest orf
 422         # nucleotide from the test data and translating it, as by
 423         # calling translate with -orf => 'longest'
 424         is(
 425             Bio::PrimarySeq
 426               ->new( -seq => $test_seq, -id => 'fake_id' )
 427               ->translate( -orf => 'longest' )
 428               ->seq,
 429
 430             Bio::PrimarySeq
 431               ->new( -seq => substr( $test_seq, $orfs->[0][0], $orfs->[0][2] ),
 432                      -id => 'foo'
 433                     )
 434               ->translate
 435               ->seq,
 436             'got correct -orf => "longest" seq',
 437            );
 438     }
 439 }