Bio/PrimarySeq.pm

   1 #
   2 # bioperl module for Bio::PrimarySeq
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::PrimarySeq - Bioperl lightweight sequence object
  17
  18 =head1 SYNOPSIS
  19
  20   # Bio::SeqIO for file reading, Bio::DB::GenBank for
  21   # database reading
  22
  23   use Bio::Seq;
  24   use Bio::SeqIO;
  25   use Bio::DB::GenBank;
  26
  27   # make from memory
  28
  29   $seqobj = Bio::PrimarySeq->new (
  30       -seq              => 'ATGGGGTGGGCGGTGGGTGGTTTG',
  31       -id               => 'GeneFragment-12',
  32       -accession_number => 'X78121',
  33       -alphabet         => 'dna',
  34       -is_circular      => 1,
  35   );
  36   print "Sequence ", $seqobj->id(), " with accession ",
  37     $seqobj->accession_number, "\n";
  38
  39   # read from file
  40
  41   $inputstream = Bio::SeqIO->new(
  42       -file   => "myseq.fa",
  43       -format => 'Fasta',
  44   );
  45   $seqobj = $inputstream->next_seq();
  46   print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
  47
  48   # to get out parts of the sequence.
  49
  50   print "Sequence ", $seqobj->id(), " with accession ",
  51     $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
  52
  53   $string  = $seqobj->seq();
  54   $string2 = $seqobj->subseq(1,40);
  55
  56 =head1 DESCRIPTION
  57
  58 PrimarySeq is a lightweight sequence object, storing the sequence, its
  59 name, a computer-useful unique name, and other fundamental attributes.
  60 It does not contain sequence features or other information.  To have a
  61 sequence with sequence features you should use the Seq object which uses
  62 this object.
  63
  64 Although new users will use Bio::PrimarySeq a lot, in general you will
  65 be using it from the Bio::Seq object. For more information on Bio::Seq
  66 see L<Bio::Seq>. For interest you might like to know that
  67 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
  68 to do with sequence to it (the has-a relationship lets us get out of a
  69 otherwise nasty cyclical reference in Perl which would leak memory).
  70
  71 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
  72 object is a pure Perl implementation of the interface. If that's
  73 gibberish to you, don't worry. The take home message is that this
  74 object is the bioperl default sequence object, but other people can
  75 use their own objects as sequences if they so wish. If you are
  76 interested in wrapping your own objects as compliant Bioperl sequence
  77 objects, then you should read the Bio::PrimarySeqI documentation
  78
  79 The documentation of this object is a merge of the Bio::PrimarySeq and
  80 Bio::PrimarySeqI documentation.  This allows all the methods which you can
  81 call on sequence objects here.
  82
  83 =head1 FEEDBACK
  84
  85 =head2 Mailing Lists
  86
  87 User feedback is an integral part of the evolution of this and other
  88 Bioperl modules. Send your comments and suggestions preferably to one
  89 of the Bioperl mailing lists.  Your participation is much appreciated.
  90
  91   bioperl-l@bioperl.org                  - General discussion
  92   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  93
  94 =head2 Support
  95
  96 Please direct usage questions or support issues to the mailing list:
  97
  98 I<bioperl-l@bioperl.org>
  99
 100 rather than to the module maintainer directly. Many experienced and
 101 reponsive experts will be able look at the problem and quickly
 102 address it. Please include a thorough description of the problem
 103 with code and data examples if at all possible.
 104
 105 =head2 Reporting Bugs
 106
 107 Report bugs to the Bioperl bug tracking system to help us keep track
 108 the bugs and their resolution.  Bug reports can be submitted via the
 109 web:
 110
 111   https://github.com/bioperl/bioperl-live/issues
 112
 113 =head1 AUTHOR - Ewan Birney
 114
 115 Email birney@ebi.ac.uk
 116
 117 =head1 APPENDIX
 118
 119 The rest of the documentation details each of the object
 120 methods. Internal methods are usually preceded with a _
 121
 122 =cut
 123
 124
 125
 126 package Bio::PrimarySeq;
 127
 128 use strict;
 129
 130 our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
 131 our $GAP_SYMBOLS  = '-~';
 132
 133 use base qw(Bio::Root::Root Bio::PrimarySeqI
 134             Bio::IdentifiableI Bio::DescribableI);
 135
 136
 137 # Setup the allowed values for alphabet()
 138 my %valid_type = map {$_, 1} qw( dna rna protein );
 139
 140
 141 =head2 new
 142
 143  Title   : new
 144  Usage   : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
 145                                            -id  => 'human_id',
 146                                            -accession_number => 'AL000012',
 147                                            );
 148  Function: Returns a new primary seq object from
 149            basic constructors, being a string for the sequence
 150            and strings for id and accession_number.
 151
 152            Note that you can provide an empty sequence string. However, in
 153            this case you MUST specify the type of sequence you wish to
 154            initialize by the parameter -alphabet. See alphabet() for possible
 155            values.
 156  Returns : a new Bio::PrimarySeq object
 157  Args    : -seq              => sequence string
 158            -ref_to_seq       => ... or reference to a sequence string
 159            -display_id       => display id of the sequence (locus name)
 160            -accession_number => accession number
 161            -primary_id       => primary id (Genbank id)
 162            -version          => version number
 163            -namespace        => the namespace for the accession
 164            -authority        => the authority for the namespace
 165            -description      => description text
 166            -desc             => alias for description
 167            -alphabet         => skip alphabet guess and set it to dna, rna or protein
 168            -id               => alias for display id
 169            -is_circular      => boolean to indicate that sequence is circular
 170            -direct           => boolean to directly set sequences. The next time -seq,
 171                                 seq() or -ref_to_seq is use, the sequence will not be
 172                                 validated. Be careful with this...
 173            -nowarnonempty    => boolean to avoid warning when sequence is empty
 174
 175 =cut
 176
 177 sub new {
 178     my ($class, @args) = @_;
 179     my $self = $class->SUPER::new(@args);
 180     my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
 181         $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
 182         $nowarnonempty) =
 183       $self->_rearrange([qw(SEQ
 184                             DISPLAY_ID
 185                             ACCESSION_NUMBER
 186                             PRIMARY_ID
 187                             NAMESPACE
 188                             AUTHORITY
 189                             VERSION
 190                             OBJECT_ID
 191                             DESC
 192                             DESCRIPTION
 193                             ALPHABET
 194                             ID
 195                             IS_CIRCULAR
 196                             DIRECT
 197                             REF_TO_SEQ
 198                             LENGTH
 199                             NOWARNONEMPTY
 200                             )],
 201                             @args);
 202
 203     # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
 204     $self->{'_nowarnonempty'} = $nowarnonempty;
 205     $self->{'_direct'} = $direct;
 206
 207     if( defined $id && defined $given_id ) {
 208         if( $id ne $given_id ) {
 209           $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
 210         }
 211     }
 212     if( defined $given_id ) { $id = $given_id; }
 213
 214     # Bernd's idea: set ids now for more informative invalid sequence messages
 215     defined $id  && $self->display_id($id);
 216     $acc         && $self->accession_number($acc);
 217     defined $pid && $self->primary_id($pid);
 218
 219     # Set alphabet now to avoid guessing it later, when sequence is set
 220     $alphabet && $self->alphabet($alphabet);
 221
 222     # Set the length before the seq. If there is a seq, length will be updated later
 223     $self->{'length'} = $len || 0;
 224
 225     # Set the sequence (but also alphabet and length)
 226     if ($ref_to_seq) {
 227         $self->_set_seq_by_ref($ref_to_seq, $alphabet);
 228     } else {
 229         if (defined $seq) {
 230             # Note: the sequence string may be empty
 231             $self->seq($seq);
 232         }
 233     }
 234
 235     $desc                  && $self->desc($desc);
 236     $description           && $self->description($description);
 237     $ns                    && $self->namespace($ns);
 238     $auth                  && $self->authority($auth);
 239     # Any variable that can have a value "0" must be tested with defined
 240     # or it will fail to be added to the new object
 241     defined($v)            && $self->version($v);
 242     defined($oid)          && $self->object_id($oid);
 243     defined($is_circular)  && $self->is_circular($is_circular);
 244
 245     return $self;
 246 }
 247
 248
 249 =head2 seq
 250
 251  Title   : seq
 252  Usage   : $string = $seqobj->seq();
 253  Function: Get or set  the sequence as a string of letters. The case of
 254            the letters is left up to the implementer. Suggested cases are
 255            upper case for proteins and lower case for DNA sequence (IUPAC
 256            standard), but you should not rely on this. An error is thrown if
 257            the sequence contains invalid characters: see validate_seq().
 258  Returns : A scalar
 259  Args    : - Optional new sequence value (a string) to set
 260            - Optional alphabet (it is guessed by default)
 261
 262 =cut
 263
 264 sub seq {
 265     my ($self, @args) = @_;
 266
 267     if( scalar @args == 0 ) {
 268         return $self->{'seq'};
 269     }
 270
 271     my ($seq_str, $alphabet) = @args;
 272     if (@args) {
 273         $self->_set_seq_by_ref(\$seq_str, $alphabet);
 274     }
 275
 276     return $self->{'seq'};
 277 }
 278
 279
 280 sub _set_seq_by_ref {
 281     # Set a sequence by reference. A reference is used to avoid the cost of
 282     # copying the sequence (which can be very large) between functions.
 283     my ($self, $seq_str_ref, $alphabet) = @_;
 284
 285     # Validate sequence if sequence is not empty and we are not in direct mode
 286     if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
 287         $self->validate_seq($$seq_str_ref, 1);
 288     }
 289     delete $self->{'_direct'}; # next sequence will have to be validated
 290
 291     # Record sequence length
 292     my $len = CORE::length($$seq_str_ref || '');
 293     my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
 294     # Note: if the new seq is empty or undef, this is not considered a change
 295     delete $self->{'_freeze_length'} if $is_changed_seq;
 296     $self->{'length'} = $len if not exists $self->{'_freeze_length'};
 297
 298     # Set sequence
 299     $self->{'seq'} = $$seq_str_ref;
 300
 301     # Set or guess alphabet
 302     if ($alphabet) {
 303         # Alphabet specified, set it no matter what
 304         $self->alphabet($alphabet);
 305     } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
 306         # If we changed a previous sequence to a new one or if there is no
 307         # alphabet yet at all, we need to guess the (possibly new) alphabet
 308         $self->_guess_alphabet();
 309     } # else (seq not changed and alphabet was defined) do nothing
 310
 311     return 1;
 312 }
 313
 314
 315 =head2 validate_seq
 316
 317  Title   : validate_seq
 318  Usage   : if(! $seqobj->validate_seq($seq_str) ) {
 319                 print "sequence $seq_str is not valid for an object of
 320                 alphabet ",$seqobj->alphabet, "\n";
 321            }
 322  Function: Test that the given sequence is valid, i.e. contains only valid
 323            characters. The allowed characters are all letters (A-Z) and '-','.',
 324            '*','?','=' and '~'. Spaces are not valid. Note that this
 325            implementation does not take alphabet() into account and that empty
 326            sequences are considered valid.
 327  Returns : 1 if the supplied sequence string is valid, 0 otherwise.
 328  Args    : - Sequence string to be validated
 329            - Boolean to optionally throw an error if the sequence is invalid
 330
 331 =cut
 332
 333 sub validate_seq {
 334     my ($self, $seqstr, $throw) = @_;
 335     if ( (defined $seqstr                ) &&
 336          ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
 337         if ($throw) {
 338             $self->throw("Failed validation of sequence '".(defined($self->id) ||
 339                 '[unidentified sequence]')."'. Invalid characters were: " .
 340                 join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
 341         }
 342         return 0;
 343     }
 344     return 1;
 345 }
 346
 347
 348 =head2 subseq
 349
 350  Title   : subseq
 351  Usage   : $substring = $seqobj->subseq(10,40);
 352            $substring = $seqobj->subseq(10,40,'nogap');
 353            $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
 354            $substring = $seqobj->subseq($location_obj);
 355            $substring = $seqobj->subseq($location_obj, -nogap => 1);
 356  Function: Return the subseq from start to end, where the first sequence
 357            character has coordinate 1 number is inclusive, ie 1-2 are the
 358            first two characters of the sequence. The given start coordinate
 359            has to be larger than the end, even if the sequence is circular.
 360  Returns : a string
 361  Args    : integer for start position
 362            integer for end position
 363                  OR
 364            Bio::LocationI location for subseq (strand honored)
 365            Specify -NOGAP=>1 to return subseq with gap characters removed
 366            Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
 367            with $new_subseq in the sequence object
 368
 369 =cut
 370
 371 sub subseq {
 372     my $self = shift;
 373     my @args = @_;
 374     my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
 375                                                              END
 376                                                              NOGAP
 377                                                              REPLACE_WITH)], @args);
 378
 379     # If -replace_with is specified, validate the replacement sequence
 380     if (defined $replace) {
 381         $self->validate_seq( $replace ) ||
 382             $self->throw("Replacement sequence does not look valid");
 383     }
 384
 385     if( ref($start) && $start->isa('Bio::LocationI') ) {
 386         my $loc = $start;
 387         my $seq = '';
 388
 389         # For Split objects if Guide Strand is negative,
 390         # pass the sublocations in reverse
 391         my $order = 0;
 392         if ($loc->isa('Bio::Location::SplitLocationI')) {
 393             # guide_strand can return undef, so don't compare directly
 394             # to avoid 'uninitialized value' warning
 395             my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0;
 396             $order = ($guide_strand == -1) ? -1 : 0;
 397         }
 398         # Reversing order using ->each_Location(-1) does not work well for
 399         # cut by origin-splits (like "complement(join(1900..END,START..50))"),
 400         # so use "reverse" instead
 401         my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location;
 402         foreach my $subloc (@sublocs) {
 403             my $piece = $self->subseq(-start        => $subloc->start(),
 404                                       -end          => $subloc->end(),
 405                                       -replace_with => $replace,
 406                                       -nogap        => $nogap);
 407             $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
 408
 409             # strand can return undef, so don't compare directly
 410             # to avoid 'uninitialized value' warning
 411             my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0;
 412             if ($strand < 0) {
 413                 $piece = $self->_revcom_from_string($piece, $self->alphabet);
 414             }
 415             $seq .= $piece;
 416         }
 417         return $seq;
 418     } elsif( defined $start && defined $end ) {
 419         if( $start > $end ){
 420             $self->throw("Bad start,end parameters. Start [$start] has to be ".
 421               "less than end [$end]");
 422         }
 423         if( $start <= 0 ) {
 424             $self->throw("Bad start parameter ($start). Start must be positive.");
 425         }
 426
 427         # Remove one from start, and then length is end-start
 428         $start--;
 429
 430         my $seqstr;
 431         if (defined $replace) {
 432             $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
 433         } else {
 434             $seqstr = substr $self->{seq}, $start, $end-$start;
 435         }
 436
 437
 438         if ($end > $self->length) {
 439             if ($self->is_circular) {
 440                 my $start = 0;
 441                 my $end = $end - $self->length;
 442
 443                 my $appendstr;
 444                 if (defined $replace) {
 445                     $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
 446                 } else {
 447                     $appendstr = substr $self->{seq}, $start, $end-$start;
 448                 }
 449
 450                 $seqstr .= $appendstr;
 451             } else {
 452                 $self->throw("Bad end parameter ($end). End must be less than ".
 453                     "the total length of sequence (total=".$self->length.")")
 454             }
 455         }
 456
 457         $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
 458         return $seqstr;
 459
 460     } else {
 461         $self->warn("Incorrect parameters to subseq - must be two integers or ".
 462             "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
 463         return;
 464     }
 465 }
 466
 467
 468 =head2 length
 469
 470  Title   : length
 471  Usage   : $len = $seqobj->length();
 472  Function: Get the stored length of the sequence in number of symbols (bases
 473            or amino acids). In some circumstances, you can also set this attribute:
 474
 475            1. For empty sequences, you can set the length to anything you want:
 476               my $seqobj = Bio::PrimarySeq->new( -length => 123 );
 477               my $len = $seqobj->len; # 123
 478            2. To save memory when using very long sequences, you can set the
 479               length of the sequence to the length of the sequence (and nothing
 480               else):
 481               my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
 482               # process $seqobj... then after you're done with it
 483               $seqobj->length($seqobj->length);
 484               $seqobj->seq(undef); # free memory!
 485               my $len = $seqobj->len; # 1 Mbp
 486
 487            Note that if you set seq() to a value other than undef at any time,
 488            the length attribute will be reset.
 489  Returns : integer representing the length of the sequence.
 490  Args    : Optionally, the value on set
 491
 492 =cut
 493
 494 sub length {
 495     my ($self, $val) = @_;
 496     if (defined $val) {
 497         my $len = $self->{'length'};
 498         if ($len && ($len != $val)) {
 499             $self->throw("Can not set the length to $val, current length value is $len");
 500         }
 501         $self->{'length'} = $val;
 502         $self->{'_freeze_length'} = undef;
 503     }
 504     return $self->{'length'};
 505 }
 506
 507
 508 =head2 display_id
 509
 510  Title   : display_id or display_name
 511  Usage   : $id_string = $seqobj->display_id();
 512  Function: Get or set the display id, aka the common name of the sequence object.
 513
 514            The semantics of this is that it is the most likely string to
 515            be used as an identifier of the sequence, and likely to have
 516            "human" readability.  The id is equivalent to the ID field of
 517            the GenBank/EMBL databanks and the id field of the
 518            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 519            presumed to be the id, though some people overload the id to
 520            embed other information. Bioperl does not use any embedded
 521            information in the ID field, and people are encouraged to use
 522            other mechanisms (accession field for example, or extending
 523            the sequence object) to solve this.
 524
 525            With the new Bio::DescribeableI interface, display_name aliases
 526            to this method.
 527  Returns : A string for the display ID
 528  Args    : Optional string for the display ID to set
 529
 530 =cut
 531
 532 sub display_id {
 533     my ($self, $value) = @_;
 534     if( defined $value) {
 535         $self->{'display_id'} = $value;
 536     }
 537     return $self->{'display_id'};
 538 }
 539
 540
 541 =head2 accession_number
 542
 543  Title   : accession_number or object_id
 544  Usage   : $unique_key = $seqobj->accession_number;
 545  Function: Returns the unique biological id for a sequence, commonly
 546            called the accession_number. For sequences from established
 547            databases, the implementors should try to use the correct
 548            accession number. Notice that primary_id() provides the
 549            unique id for the implemetation, allowing multiple objects
 550            to have the same accession number in a particular implementation.
 551
 552            For sequences with no accession number, this method should
 553            return "unknown".
 554
 555            [Note this method name is likely to change in 1.3]
 556
 557            With the new Bio::IdentifiableI interface, this is aliased
 558            to object_id
 559  Returns : A string
 560  Args    : A string (optional) for setting
 561
 562 =cut
 563
 564 sub accession_number {
 565     my( $self, $acc ) = @_;
 566     if (defined $acc) {
 567         $self->{'accession_number'} = $acc;
 568     } else {
 569         $acc = $self->{'accession_number'};
 570         $acc = 'unknown' unless defined $acc;
 571     }
 572     return $acc;
 573 }
 574
 575
 576 =head2 primary_id
 577
 578  Title   : primary_id
 579  Usage   : $unique_key = $seqobj->primary_id;
 580  Function: Returns the unique id for this object in this
 581            implementation. This allows implementations to manage their
 582            own object ids in a way the implementaiton can control
 583            clients can expect one id to map to one object.
 584
 585            For sequences with no natural primary id, this method
 586            should return a stringified memory location.
 587  Returns : A string
 588  Args    : A string (optional, for setting)
 589
 590 =cut
 591
 592 sub primary_id {
 593     my $self = shift;
 594
 595     if(@_) {
 596         $self->{'primary_id'} = shift;
 597     }
 598     if( ! defined($self->{'primary_id'}) ) {
 599         return "$self";
 600     }
 601     return $self->{'primary_id'};
 602 }
 603
 604
 605 =head2 alphabet
 606
 607  Title   : alphabet
 608  Usage   : if( $seqobj->alphabet eq 'dna' ) { # Do something }
 609  Function: Get/set the alphabet of sequence, one of
 610            'dna', 'rna' or 'protein'. This is case sensitive.
 611
 612            This is not called <type> because this would cause
 613            upgrade problems from the 0.5 and earlier Seq objects.
 614  Returns : a string either 'dna','rna','protein'. NB - the object must
 615            make a call of the type - if there is no alphabet specified it
 616            has to guess.
 617  Args    : optional string to set : 'dna' | 'rna' | 'protein'
 618
 619
 620 =cut
 621
 622 sub alphabet {
 623     my ($self,$value) = @_;
 624     if (defined $value) {
 625         $value = lc $value;
 626         unless ( $valid_type{$value} ) {
 627         $self->throw("Alphabet '$value' is not a valid alphabet (".
 628             join(',', map "'$_'", sort keys %valid_type) .") lowercase");
 629         }
 630         $self->{'alphabet'} = $value;
 631     }
 632     return $self->{'alphabet'};
 633 }
 634
 635
 636 =head2 desc
 637
 638  Title   : desc or description
 639  Usage   : $seqobj->desc($newval);
 640  Function: Get/set description of the sequence.
 641
 642            'description' is an alias for this for compliance with the
 643            Bio::DescribeableI interface.
 644  Returns : value of desc (a string)
 645  Args    : newvalue (a string or undef, optional)
 646
 647
 648 =cut
 649
 650 sub desc{
 651     my $self = shift;
 652
 653     return $self->{'desc'} = shift if @_;
 654     return $self->{'desc'};
 655 }
 656
 657
 658 =head2 can_call_new
 659
 660  Title   : can_call_new
 661  Usage   :
 662  Function:
 663  Example :
 664  Returns : true
 665  Args    :
 666
 667 =cut
 668
 669 sub can_call_new {
 670    my ($self) = @_;
 671
 672    return 1;
 673 }
 674
 675
 676 =head2 id
 677
 678  Title   : id
 679  Usage   : $id = $seqobj->id();
 680  Function: This is mapped on display_id
 681  Example :
 682  Returns :
 683  Args    :
 684
 685 =cut
 686
 687 sub  id {
 688    return shift->display_id(@_);
 689 }
 690
 691
 692 =head2 is_circular
 693
 694  Title   : is_circular
 695  Usage   : if( $seqobj->is_circular) { # Do something }
 696  Function: Returns true if the molecule is circular
 697  Returns : Boolean value
 698  Args    : none
 699
 700 =cut
 701
 702 sub is_circular{
 703     my $self = shift;
 704     return $self->{'is_circular'} = shift if @_;
 705     return $self->{'is_circular'};
 706 }
 707
 708
 709 =head1 Methods for Bio::IdentifiableI compliance
 710
 711 =head2 object_id
 712
 713  Title   : object_id
 714  Usage   : $string = $seqobj->object_id();
 715  Function: Get or set a string which represents the stable primary identifier
 716            in this namespace of this object. For DNA sequences this
 717            is its accession_number, similarly for protein sequences.
 718
 719            This is aliased to accession_number().
 720  Returns : A scalar
 721  Args    : Optional object ID to set.
 722
 723 =cut
 724
 725 sub object_id {
 726     return shift->accession_number(@_);
 727 }
 728
 729
 730 =head2 version
 731
 732  Title   : version
 733  Usage   : $version = $seqobj->version();
 734  Function: Get or set a number which differentiates between versions of
 735            the same object. Higher numbers are considered to be
 736            later and more relevant, but a single object described
 737            the same identifier should represent the same concept.
 738  Returns : A number
 739  Args    : Optional version to set.
 740
 741 =cut
 742
 743 sub version{
 744     my ($self,$value) = @_;
 745     if( defined $value) {
 746         $self->{'_version'} = $value;
 747     }
 748     return $self->{'_version'};
 749 }
 750
 751
 752 =head2 authority
 753
 754  Title   : authority
 755  Usage   : $authority = $seqobj->authority();
 756  Function: Get or set a string which represents the organisation which
 757            granted the namespace, written as the DNS name of the
 758            organisation (eg, wormbase.org).
 759  Returns : A scalar
 760  Args    : Optional authority to set.
 761
 762 =cut
 763
 764 sub authority {
 765     my ($self, $value) = @_;
 766     if( defined $value) {
 767         $self->{'authority'} = $value;
 768     }
 769     return $self->{'authority'};
 770 }
 771
 772
 773 =head2 namespace
 774
 775  Title   : namespace
 776  Usage   : $string = $seqobj->namespace();
 777  Function: Get or set a string representing the name space this identifier
 778            is valid in, often the database name or the name describing the
 779            collection.
 780  Returns : A scalar
 781  Args    : Optional namespace to set.
 782
 783 =cut
 784
 785 sub namespace{
 786     my ($self,$value) = @_;
 787     if( defined $value) {
 788         $self->{'namespace'} = $value;
 789     }
 790     return $self->{'namespace'} || "";
 791 }
 792
 793
 794 =head1 Methods for Bio::DescribableI compliance
 795
 796 This comprises of display_name and description.
 797
 798 =head2 display_name
 799
 800  Title   : display_name
 801  Usage   : $string = $seqobj->display_name();
 802  Function: Get or set a string which is what should be displayed to the user.
 803            The string should have no spaces (ideally, though a cautious
 804            user of this interface would not assumme this) and should be
 805            less than thirty characters (though again, double checking
 806            this is a good idea).
 807
 808            This is aliased to display_id().
 809  Returns : A string for the display name
 810  Args    : Optional string for the display name to set.
 811
 812 =cut
 813
 814 sub display_name {
 815     return shift->display_id(@_);
 816 }
 817
 818
 819 =head2 description
 820
 821  Title   : description
 822  Usage   : $string = $seqobj->description();
 823  Function: Get or set a text string suitable for displaying to the user a
 824            description. This string is likely to have spaces, but
 825            should not have any newlines or formatting - just plain
 826            text. The string should not be greater than 255 characters
 827            and clients can feel justified at truncating strings at 255
 828            characters for the purposes of display.
 829
 830            This is aliased to desc().
 831  Returns : A string for the description
 832  Args    : Optional string for the description to set.
 833
 834 =cut
 835
 836 sub description {
 837     return shift->desc(@_);
 838 }
 839
 840
 841 =head1 Methods Inherited from Bio::PrimarySeqI
 842
 843 These methods are available on Bio::PrimarySeq, although they are
 844 actually implemented on Bio::PrimarySeqI
 845
 846 =head2 revcom
 847
 848  Title   : revcom
 849  Usage   : $rev = $seqobj->revcom();
 850  Function: Produces a new Bio::SeqI implementing object which
 851            is the reversed complement of the sequence. For protein
 852            sequences this throws an exception of
 853            "Sequence is a protein. Cannot revcom".
 854
 855            The id is the same id as the orginal sequence, and the
 856            accession number is also indentical. If someone wants to
 857            track that this sequence has be reversed, it needs to
 858            define its own extensions.
 859
 860            To do an inplace edit of an object you can go:
 861
 862            $seqobj = $seqobj->revcom();
 863
 864            This of course, causes Perl to handle the garbage
 865            collection of the old object, but it is roughly speaking as
 866            efficient as an inplace edit.
 867  Returns : A new (fresh) Bio::SeqI object
 868  Args    : none
 869
 870 =head2 trunc
 871
 872  Title   : trunc
 873  Usage   : $subseq = $myseq->trunc(10,100);
 874  Function: Provides a truncation of a sequence,
 875  Returns : A fresh Bio::SeqI implementing object.
 876  Args    : Numbers for the start and end positions
 877
 878 =head1 Internal methods
 879
 880 These are internal methods to PrimarySeq
 881
 882 =head2 _guess_alphabet
 883
 884  Title   : _guess_alphabet
 885  Usage   :
 886  Function: Automatically guess and set the type of sequence: dna, rna, protein
 887            or '' if the sequence was empty. This method first removes dots (.),
 888            dashes (-) and question marks (?) before guessing the alphabet
 889            using the IUPAC conventions for ambiguous residues. Since the DNA and
 890            RNA characters are also valid characters for proteins, there is
 891            no foolproof way of determining the right alphabet. This is our best
 892            guess only!
 893  Returns : string 'dna', 'rna', 'protein' or ''.
 894  Args    : none
 895
 896 =cut
 897
 898 sub _guess_alphabet {
 899     my ($self) = @_;
 900     # Guess alphabet
 901     my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
 902     # Set alphabet unless it is unknown
 903     $self->alphabet($alphabet) if $alphabet;
 904     return $alphabet;
 905 }
 906
 907
 908 sub _guess_alphabet_from_string {
 909     # Get the alphabet from a sequence string
 910     my ($self, $str, $nowarnonempty) = @_;
 911
 912     $nowarnonempty = 0 if not defined $nowarnonempty;
 913
 914     # Remove chars that clearly don't denote nucleic or amino acids
 915     $str =~ s/[-.?]//gi;
 916
 917     # Check for sequences without valid letters
 918     my $alphabet;
 919     my $total = CORE::length($str);
 920     if( $total == 0 ) {
 921         if (not $nowarnonempty) {
 922             $self->warn("Got a sequence without letters. Could not guess alphabet");
 923         }
 924         $alphabet = '';
 925     }
 926
 927     # Determine alphabet now
 928     if (not defined $alphabet) {
 929         if ($str =~ m/[EFIJLOPQXZ]/i) {
 930             # Start with a safe method to find proteins.
 931             # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
 932             $alphabet = 'protein';
 933         } else {
 934             # Alphabet is unsure, could still be DNA, RNA or protein
 935             # DNA and RNA contain mostly A, T, U, G, C and N, but the other
 936             # letters they use are also among the 15 valid letters that a
 937             # protein sequence can contain at this stage. Make our best guess
 938             # based on sequence composition. If it contains over 70% of ACGTUN,
 939             # it is likely nucleic.
 940             if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
 941                 if ( $str =~ m/U/i ) {
 942                     $alphabet = 'rna';
 943                 } else {
 944                     $alphabet = 'dna';
 945                 }
 946             } else {
 947                 $alphabet = 'protein';
 948             }
 949         }
 950     }
 951
 952     return $alphabet;
 953 }
 954
 955
 956 ############################################################################
 957 # aliases due to name changes or to compensate for our lack of consistency #
 958 ############################################################################
 959
 960 sub accession {
 961     my $self = shift;
 962
 963     $self->warn(ref($self)."::accession is deprecated, ".
 964         "use accession_number() instead");
 965     return $self->accession_number(@_);
 966 }
 967
 968 1;