Bio/PrimarySeq.pm

   1 #
   2 # bioperl module for Bio::PrimarySeq
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::PrimarySeq - Bioperl lightweight sequence object
  17
  18 =head1 SYNOPSIS
  19
  20   # Bio::SeqIO for file reading, Bio::DB::GenBank for
  21   # database reading
  22
  23   use Bio::Seq;
  24   use Bio::SeqIO;
  25   use Bio::DB::GenBank;
  26
  27   # make from memory
  28
  29   $seqobj = Bio::PrimarySeq->new (
  30       -seq              => 'ATGGGGTGGGCGGTGGGTGGTTTG',
  31       -id               => 'GeneFragment-12',
  32       -accession_number => 'X78121',
  33       -alphabet         => 'dna',
  34       -is_circular      => 1,
  35   );
  36   print "Sequence ", $seqobj->id(), " with accession ",
  37     $seqobj->accession_number, "\n";
  38
  39   # read from file
  40
  41   $inputstream = Bio::SeqIO->new(
  42       -file   => "myseq.fa",
  43       -format => 'Fasta',
  44   );
  45   $seqobj = $inputstream->next_seq();
  46   print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
  47
  48   # to get out parts of the sequence.
  49
  50   print "Sequence ", $seqobj->id(), " with accession ",
  51     $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
  52
  53   $string  = $seqobj->seq();
  54   $string2 = $seqobj->subseq(1,40);
  55
  56 =head1 DESCRIPTION
  57
  58 PrimarySeq is a lightweight sequence object, storing the sequence, its
  59 name, a computer-useful unique name, and other fundamental attributes.
  60 It does not contain sequence features or other information.  To have a
  61 sequence with sequence features you should use the Seq object which uses
  62 this object.
  63
  64 Although new users will use Bio::PrimarySeq a lot, in general you will
  65 be using it from the Bio::Seq object. For more information on Bio::Seq
  66 see L<Bio::Seq>. For interest you might like to know that
  67 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
  68 to do with sequence to it (the has-a relationship lets us get out of a
  69 otherwise nasty cyclical reference in Perl which would leak memory).
  70
  71 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
  72 object is a pure Perl implementation of the interface. If that's
  73 gibberish to you, don't worry. The take home message is that this
  74 object is the bioperl default sequence object, but other people can
  75 use their own objects as sequences if they so wish. If you are
  76 interested in wrapping your own objects as compliant Bioperl sequence
  77 objects, then you should read the Bio::PrimarySeqI documentation
  78
  79 The documentation of this object is a merge of the Bio::PrimarySeq and
  80 Bio::PrimarySeqI documentation.  This allows all the methods which you can
  81 call on sequence objects here.
  82
  83 =head1 FEEDBACK
  84
  85 =head2 Mailing Lists
  86
  87 User feedback is an integral part of the evolution of this and other
  88 Bioperl modules. Send your comments and suggestions preferably to one
  89 of the Bioperl mailing lists.  Your participation is much appreciated.
  90
  91   bioperl-l@bioperl.org                  - General discussion
  92   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  93
  94 =head2 Support
  95
  96 Please direct usage questions or support issues to the mailing list:
  97
  98 I<bioperl-l@bioperl.org>
  99
 100 rather than to the module maintainer directly. Many experienced and
 101 reponsive experts will be able look at the problem and quickly
 102 address it. Please include a thorough description of the problem
 103 with code and data examples if at all possible.
 104
 105 =head2 Reporting Bugs
 106
 107 Report bugs to the Bioperl bug tracking system to help us keep track
 108 the bugs and their resolution.  Bug reports can be submitted via the
 109 web:
 110
 111   https://redmine.open-bio.org/projects/bioperl/
 112
 113 =head1 AUTHOR - Ewan Birney
 114
 115 Email birney@ebi.ac.uk
 116
 117 =head1 APPENDIX
 118
 119 The rest of the documentation details each of the object
 120 methods. Internal methods are usually preceded with a _
 121
 122 =cut
 123
 124
 125
 126 package Bio::PrimarySeq;
 127
 128 use strict;
 129
 130 our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
 131 our $GAP_SYMBOLS  = '-~';
 132
 133 use base qw(Bio::Root::Root Bio::PrimarySeqI
 134             Bio::IdentifiableI Bio::DescribableI);
 135
 136
 137 # Setup the allowed values for alphabet()
 138 my %valid_type = map {$_, 1} qw( dna rna protein );
 139
 140
 141 =head2 new
 142
 143  Title   : new
 144  Usage   : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
 145                                            -id  => 'human_id',
 146                                            -accession_number => 'AL000012',
 147                                            );
 148  Function: Returns a new primary seq object from
 149            basic constructors, being a string for the sequence
 150            and strings for id and accession_number.
 151
 152            Note that you can provide an empty sequence string. However, in
 153            this case you MUST specify the type of sequence you wish to
 154            initialize by the parameter -alphabet. See alphabet() for possible
 155            values.
 156  Returns : a new Bio::PrimarySeq object
 157  Args    : -seq              => sequence string
 158            -ref_to_seq       => ... or reference to a sequence string
 159            -display_id       => display id of the sequence (locus name)
 160            -accession_number => accession number
 161            -primary_id       => primary id (Genbank id)
 162            -version          => version number
 163            -namespace        => the namespace for the accession
 164            -authority        => the authority for the namespace
 165            -description      => description text
 166            -desc             => alias for description
 167            -alphabet         => skip alphabet guess and set it to dna, rna or protein
 168            -id               => alias for display id
 169            -is_circular      => boolean to indicate that sequence is circular
 170            -direct           => boolean to directly set sequences. The next time -seq,
 171                                 seq() or -ref_to_seq is use, the sequence will not be
 172                                 validated. Be careful with this...
 173            -nowarnonempty    => boolean to avoid warning when sequence is empty
 174
 175 =cut
 176
 177 sub new {
 178     my ($class, @args) = @_;
 179     my $self = $class->SUPER::new(@args);
 180     my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
 181         $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
 182         $nowarnonempty) =
 183       $self->_rearrange([qw(SEQ
 184                             DISPLAY_ID
 185                             ACCESSION_NUMBER
 186                             PRIMARY_ID
 187                             NAMESPACE
 188                             AUTHORITY
 189                             VERSION
 190                             OBJECT_ID
 191                             DESC
 192                             DESCRIPTION
 193                             ALPHABET
 194                             ID
 195                             IS_CIRCULAR
 196                             DIRECT
 197                             REF_TO_SEQ
 198                             LENGTH
 199                             NOWARNONEMPTY
 200                             )],
 201                             @args);
 202
 203     # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
 204     $self->{'_nowarnonempty'} = $nowarnonempty;
 205     $self->{'_direct'} = $direct;
 206
 207     if( defined $id && defined $given_id ) {
 208         if( $id ne $given_id ) {
 209           $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
 210         }
 211     }
 212     if( defined $given_id ) { $id = $given_id; }
 213
 214     # Bernd's idea: set ids now for more informative invalid sequence messages
 215     defined $id  && $self->display_id($id);
 216     $acc         && $self->accession_number($acc);
 217     defined $pid && $self->primary_id($pid);
 218
 219     # Set alphabet now to avoid guessing it later, when sequence is set
 220     $alphabet && $self->alphabet($alphabet);
 221
 222     # Set the length before the seq. If there is a seq, length will be updated later
 223     $self->{'length'} = $len || 0;
 224
 225     # Set the sequence (but also alphabet and length)
 226     if ($ref_to_seq) {
 227         $self->_set_seq_by_ref($ref_to_seq, $alphabet);
 228     } else {
 229         if (defined $seq) {
 230             # Note: the sequence string may be empty
 231             $self->seq($seq);
 232         }
 233     }
 234
 235     $desc         && $self->desc($desc);
 236     $description  && $self->description($description);
 237     $is_circular  && $self->is_circular($is_circular);
 238     $ns           && $self->namespace($ns);
 239     $auth         && $self->authority($auth);
 240     defined($v)   && $self->version($v);
 241     defined($oid) && $self->object_id($oid);
 242
 243     return $self;
 244 }
 245
 246
 247 =head2 seq
 248
 249  Title   : seq
 250  Usage   : $string = $seqobj->seq();
 251  Function: Get or set  the sequence as a string of letters. The case of
 252            the letters is left up to the implementer. Suggested cases are
 253            upper case for proteins and lower case for DNA sequence (IUPAC
 254            standard), but you should not rely on this. An error is thrown if
 255            the sequence contains invalid characters: see validate_seq().
 256  Returns : A scalar
 257  Args    : - Optional new sequence value (a string) to set
 258            - Optional alphabet (it is guessed by default)
 259
 260 =cut
 261
 262 sub seq {
 263     my ($self, @args) = @_;
 264
 265     if( scalar @args == 0 ) {
 266         return $self->{'seq'};
 267     }
 268
 269     my ($seq_str, $alphabet) = @args;
 270     if (@args) {
 271         $self->_set_seq_by_ref(\$seq_str, $alphabet);
 272     }
 273
 274     return $self->{'seq'};
 275 }
 276
 277
 278 sub _set_seq_by_ref {
 279     # Set a sequence by reference. A reference is used to avoid the cost of
 280     # copying the sequence (which can be very large) between functions.
 281     my ($self, $seq_str_ref, $alphabet) = @_;
 282
 283     # Validate sequence if sequence is not empty and we are not in direct mode
 284     if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
 285         $self->validate_seq($$seq_str_ref, 1);
 286     }
 287     delete $self->{'_direct'}; # next sequence will have to be validated
 288
 289     # Record sequence length
 290     my $len = CORE::length($$seq_str_ref || '');
 291     my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
 292     # Note: if the new seq is empty or undef, this is not considered a change
 293     delete $self->{'_freeze_length'} if $is_changed_seq;
 294     $self->{'length'} = $len if not exists $self->{'_freeze_length'};
 295
 296     # Set sequence
 297     $self->{'seq'} = $$seq_str_ref;
 298
 299     # Set or guess alphabet
 300     if ($alphabet) {
 301         # Alphabet specified, set it no matter what
 302         $self->alphabet($alphabet);
 303     } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
 304         # If we changed a previous sequence to a new one or if there is no
 305         # alphabet yet at all, we need to guess the (possibly new) alphabet
 306         $self->_guess_alphabet();
 307     } # else (seq not changed and alphabet was defined) do nothing
 308
 309     return 1;
 310 }
 311
 312
 313 =head2 validate_seq
 314
 315  Title   : validate_seq
 316  Usage   : if(! $seqobj->validate_seq($seq_str) ) {
 317                 print "sequence $seq_str is not valid for an object of
 318                 alphabet ",$seqobj->alphabet, "\n";
 319            }
 320  Function: Test that the given sequence is valid, i.e. contains only valid
 321            characters. The allowed characters are all letters (A-Z) and '-','.',
 322            '*','?','=' and '~'. Spaces are not valid. Note that this
 323            implementation does not take alphabet() into account and that empty
 324            sequences are considered valid.
 325  Returns : 1 if the supplied sequence string is valid, 0 otherwise.
 326  Args    : - Sequence string to be validated
 327            - Boolean to optionally throw an error if the sequence is invalid
 328
 329 =cut
 330
 331 sub validate_seq {
 332     my ($self, $seqstr, $throw) = @_;
 333     if ( (defined $seqstr                ) &&
 334          ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
 335         if ($throw) {
 336             $self->throw("Failed validation of sequence '".(defined($self->id) ||
 337                 '[unidentified sequence]')."'. Invalid characters were: " .
 338                 join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
 339         }
 340         return 0;
 341     }
 342     return 1;
 343 }
 344
 345
 346 =head2 subseq
 347
 348  Title   : subseq
 349  Usage   : $substring = $seqobj->subseq(10,40);
 350            $substring = $seqobj->subseq(10,40,'nogap');
 351            $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
 352            $substring = $seqobj->subseq($location_obj);
 353            $substring = $seqobj->subseq($location_obj, -nogap => 1);
 354  Function: Return the subseq from start to end, where the first sequence
 355            character has coordinate 1 number is inclusive, ie 1-2 are the
 356            first two characters of the sequence. The given start coordinate
 357            has to be larger than the end, even if the sequence is circular.
 358  Returns : a string
 359  Args    : integer for start position
 360            integer for end position
 361                  OR
 362            Bio::LocationI location for subseq (strand honored)
 363            Specify -NOGAP=>1 to return subseq with gap characters removed
 364            Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
 365            with $new_subseq in the sequence object
 366
 367 =cut
 368
 369 sub subseq {
 370     my $self = shift;
 371     my @args = @_;
 372     my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
 373                                                              END
 374                                                              NOGAP
 375                                                              REPLACE_WITH)], @args);
 376
 377     # If -replace_with is specified, validate the replacement sequence
 378     if (defined $replace) {
 379         $self->validate_seq( $replace ) ||
 380             $self->throw("Replacement sequence does not look valid");
 381     }
 382
 383     if( ref($start) && $start->isa('Bio::LocationI') ) {
 384         my $loc = $start;
 385         my $seq = '';
 386         foreach my $subloc ($loc->each_Location()) {
 387             my $piece = $self->subseq(-start        => $subloc->start(),
 388                                       -end          => $subloc->end(),
 389                                       -replace_with => $replace,
 390                                       -nogap        => $nogap);
 391             $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
 392             if ($subloc->strand() < 0) {
 393                 $piece = $self->_revcom_from_string($piece, $self->alphabet);
 394             }
 395             $seq .= $piece;
 396         }
 397         return $seq;
 398     } elsif( defined $start && defined $end ) {
 399         if( $start > $end ){
 400             $self->throw("Bad start,end parameters. Start [$start] has to be ".
 401               "less than end [$end]");
 402         }
 403         if( $start <= 0 ) {
 404             $self->throw("Bad start parameter ($start). Start must be positive.");
 405         }
 406
 407         # Remove one from start, and then length is end-start
 408         $start--;
 409
 410         my $seqstr;
 411         if (defined $replace) {
 412             $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
 413         } else {
 414             $seqstr = substr $self->{seq}, $start, $end-$start;
 415         }
 416
 417
 418         if ($end > $self->length) {
 419             if ($self->is_circular) {
 420                 my $start = 0;
 421                 my $end = $end - $self->length;
 422
 423                 my $appendstr;
 424                 if (defined $replace) {
 425                     $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
 426                 } else {
 427                     $appendstr = substr $self->{seq}, $start, $end-$start;
 428                 }
 429
 430                 $seqstr .= $appendstr;
 431             } else {
 432                 $self->throw("Bad end parameter ($end). End must be less than ".
 433                     "the total length of sequence (total=".$self->length.")")
 434             }
 435         }
 436
 437         $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
 438         return $seqstr;
 439
 440     } else {
 441         $self->warn("Incorrect parameters to subseq - must be two integers or ".
 442             "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
 443         return;
 444     }
 445 }
 446
 447
 448 =head2 length
 449
 450  Title   : length
 451  Usage   : $len = $seqobj->length();
 452  Function: Get the stored length of the sequence in number of symbols (bases
 453            or amino acids).
 454
 455            In some circumstances, you can also set this attribute:
 456            1/ For empty sequences, you can set the length to anything you want:
 457               my $seqobj = Bio::PrimarySeq->new( -length => 123 );
 458               my $len = $seqobj->len; # 123
 459            2/ To save memory when using very long sequences, you can set the
 460               length of the sequence to the length of the sequence (and nothing
 461               else):
 462               my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
 463               # process $seqobj... then after you're done with it
 464               $seqobj->length($seqobj->length);
 465               $seqobj->seq(undef); # free memory!
 466               my $len = $seqobj->len; # 1 Mbp
 467
 468            Note that if you set seq() to a value other than undef at any time,
 469            the length attribute will be reset.
 470  Returns : integer representing the length of the sequence.
 471  Args    : Optionally, the value on set
 472
 473 =cut
 474
 475 sub length {
 476     my ($self, $val) = @_;
 477     if (defined $val) {
 478         my $len = $self->{'length'};
 479         if ($len && ($len != $val)) {
 480             $self->throw("You're trying to lie about the length: ".
 481                 "is $len but you say ".$val);
 482         }
 483         $self->{'length'} = $val;
 484         $self->{'_freeze_length'} = undef;
 485     }
 486     return $self->{'length'};
 487 }
 488
 489
 490 =head2 display_id
 491
 492  Title   : display_id or display_name
 493  Usage   : $id_string = $seqobj->display_id();
 494  Function: Get or set the display id, aka the common name of the sequence object.
 495
 496            The semantics of this is that it is the most likely string to
 497            be used as an identifier of the sequence, and likely to have
 498            "human" readability.  The id is equivalent to the ID field of
 499            the GenBank/EMBL databanks and the id field of the
 500            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 501            presumed to be the id, though some people overload the id to
 502            embed other information. Bioperl does not use any embedded
 503            information in the ID field, and people are encouraged to use
 504            other mechanisms (accession field for example, or extending
 505            the sequence object) to solve this.
 506
 507            With the new Bio::DescribeableI interface, display_name aliases
 508            to this method.
 509  Returns : A string for the display ID
 510  Args    : Optional string for the display ID to set
 511
 512 =cut
 513
 514 sub display_id {
 515     my ($self, $value) = @_;
 516     if( defined $value) {
 517         $self->{'display_id'} = $value;
 518     }
 519     return $self->{'display_id'};
 520 }
 521
 522
 523 =head2 accession_number
 524
 525  Title   : accession_number or object_id
 526  Usage   : $unique_key = $seqobj->accession_number;
 527  Function: Returns the unique biological id for a sequence, commonly
 528            called the accession_number. For sequences from established
 529            databases, the implementors should try to use the correct
 530            accession number. Notice that primary_id() provides the
 531            unique id for the implemetation, allowing multiple objects
 532            to have the same accession number in a particular implementation.
 533
 534            For sequences with no accession number, this method should
 535            return "unknown".
 536
 537            [Note this method name is likely to change in 1.3]
 538
 539            With the new Bio::IdentifiableI interface, this is aliased
 540            to object_id
 541  Returns : A string
 542  Args    : A string (optional) for setting
 543
 544 =cut
 545
 546 sub accession_number {
 547     my( $self, $acc ) = @_;
 548     if (defined $acc) {
 549         $self->{'accession_number'} = $acc;
 550     } else {
 551         $acc = $self->{'accession_number'};
 552         $acc = 'unknown' unless defined $acc;
 553     }
 554     return $acc;
 555 }
 556
 557
 558 =head2 primary_id
 559
 560  Title   : primary_id
 561  Usage   : $unique_key = $seqobj->primary_id;
 562  Function: Returns the unique id for this object in this
 563            implementation. This allows implementations to manage their
 564            own object ids in a way the implementaiton can control
 565            clients can expect one id to map to one object.
 566
 567            For sequences with no natural primary id, this method
 568            should return a stringified memory location.
 569  Returns : A string
 570  Args    : A string (optional, for setting)
 571
 572 =cut
 573
 574 sub primary_id {
 575     my $self = shift;
 576
 577     if(@_) {
 578         $self->{'primary_id'} = shift;
 579     }
 580     if( ! defined($self->{'primary_id'}) ) {
 581         return "$self";
 582     }
 583     return $self->{'primary_id'};
 584 }
 585
 586
 587 =head2 alphabet
 588
 589  Title   : alphabet
 590  Usage   : if( $seqobj->alphabet eq 'dna' ) { # Do something }
 591  Function: Get/set the alphabet of sequence, one of
 592            'dna', 'rna' or 'protein'. This is case sensitive.
 593
 594            This is not called <type> because this would cause
 595            upgrade problems from the 0.5 and earlier Seq objects.
 596  Returns : a string either 'dna','rna','protein'. NB - the object must
 597            make a call of the type - if there is no alphabet specified it
 598            has to guess.
 599  Args    : optional string to set : 'dna' | 'rna' | 'protein'
 600
 601
 602 =cut
 603
 604 sub alphabet {
 605     my ($self,$value) = @_;
 606     if (defined $value) {
 607         $value = lc $value;
 608         unless ( $valid_type{$value} ) {
 609         $self->throw("Alphabet '$value' is not a valid alphabet (".
 610             join(',', map "'$_'", sort keys %valid_type) .") lowercase");
 611         }
 612         $self->{'alphabet'} = $value;
 613     }
 614     return $self->{'alphabet'};
 615 }
 616
 617
 618 =head2 desc
 619
 620  Title   : desc or description
 621  Usage   : $seqobj->desc($newval);
 622  Function: Get/set description of the sequence.
 623
 624            'description' is an alias for this for compliance with the
 625            Bio::DescribeableI interface.
 626  Returns : value of desc (a string)
 627  Args    : newvalue (a string or undef, optional)
 628
 629
 630 =cut
 631
 632 sub desc{
 633     my $self = shift;
 634
 635     return $self->{'desc'} = shift if @_;
 636     return $self->{'desc'};
 637 }
 638
 639
 640 =head2 can_call_new
 641
 642  Title   : can_call_new
 643  Usage   :
 644  Function:
 645  Example :
 646  Returns : true
 647  Args    :
 648
 649 =cut
 650
 651 sub can_call_new {
 652    my ($self) = @_;
 653
 654    return 1;
 655 }
 656
 657
 658 =head2 id
 659
 660  Title   : id
 661  Usage   : $id = $seqobj->id();
 662  Function: This is mapped on display_id
 663  Example :
 664  Returns :
 665  Args    :
 666
 667 =cut
 668
 669 sub  id {
 670    return shift->display_id(@_);
 671 }
 672
 673
 674 =head2 is_circular
 675
 676  Title   : is_circular
 677  Usage   : if( $seqobj->is_circular) { # Do something }
 678  Function: Returns true if the molecule is circular
 679  Returns : Boolean value
 680  Args    : none
 681
 682 =cut
 683
 684 sub is_circular{
 685     my $self = shift;
 686     return $self->{'is_circular'} = shift if @_;
 687     return $self->{'is_circular'};
 688 }
 689
 690
 691 =head1 Methods for Bio::IdentifiableI compliance
 692
 693 =head2 object_id
 694
 695  Title   : object_id
 696  Usage   : $string = $seqobj->object_id();
 697  Function: Get or set a string which represents the stable primary identifier
 698            in this namespace of this object. For DNA sequences this
 699            is its accession_number, similarly for protein sequences.
 700
 701            This is aliased to accession_number().
 702  Returns : A scalar
 703  Args    : Optional object ID to set.
 704
 705 =cut
 706
 707 sub object_id {
 708     return shift->accession_number(@_);
 709 }
 710
 711
 712 =head2 version
 713
 714  Title   : version
 715  Usage   : $version = $seqobj->version();
 716  Function: Get or set a number which differentiates between versions of
 717            the same object. Higher numbers are considered to be
 718            later and more relevant, but a single object described
 719            the same identifier should represent the same concept.
 720  Returns : A number
 721  Args    : Optional version to set.
 722
 723 =cut
 724
 725 sub version{
 726     my ($self,$value) = @_;
 727     if( defined $value) {
 728         $self->{'_version'} = $value;
 729     }
 730     return $self->{'_version'};
 731 }
 732
 733
 734 =head2 authority
 735
 736  Title   : authority
 737  Usage   : $authority = $seqobj->authority();
 738  Function: Get or set a string which represents the organisation which
 739            granted the namespace, written as the DNS name of the
 740            organisation (eg, wormbase.org).
 741  Returns : A scalar
 742  Args    : Optional authority to set.
 743
 744 =cut
 745
 746 sub authority {
 747     my ($self, $value) = @_;
 748     if( defined $value) {
 749         $self->{'authority'} = $value;
 750     }
 751     return $self->{'authority'};
 752 }
 753
 754
 755 =head2 namespace
 756
 757  Title   : namespace
 758  Usage   : $string = $seqobj->namespace();
 759  Function: Get or set a string representing the name space this identifier
 760            is valid in, often the database name or the name describing the
 761            collection.
 762  Returns : A scalar
 763  Args    : Optional namespace to set.
 764
 765 =cut
 766
 767 sub namespace{
 768     my ($self,$value) = @_;
 769     if( defined $value) {
 770         $self->{'namespace'} = $value;
 771     }
 772     return $self->{'namespace'} || "";
 773 }
 774
 775
 776 =head1 Methods for Bio::DescribableI compliance
 777
 778 This comprises of display_name and description.
 779
 780 =head2 display_name
 781
 782  Title   : display_name
 783  Usage   : $string = $seqobj->display_name();
 784  Function: Get or set a string which is what should be displayed to the user.
 785            The string should have no spaces (ideally, though a cautious
 786            user of this interface would not assumme this) and should be
 787            less than thirty characters (though again, double checking
 788            this is a good idea).
 789
 790            This is aliased to display_id().
 791  Returns : A string for the display name
 792  Args    : Optional string for the display name to set.
 793
 794 =cut
 795
 796 sub display_name {
 797     return shift->display_id(@_);
 798 }
 799
 800
 801 =head2 description
 802
 803  Title   : description
 804  Usage   : $string = $seqobj->description();
 805  Function: Get or set a text string suitable for displaying to the user a
 806            description. This string is likely to have spaces, but
 807            should not have any newlines or formatting - just plain
 808            text. The string should not be greater than 255 characters
 809            and clients can feel justified at truncating strings at 255
 810            characters for the purposes of display.
 811
 812            This is aliased to desc().
 813  Returns : A string for the description
 814  Args    : Optional string for the description to set.
 815
 816 =cut
 817
 818 sub description {
 819     return shift->desc(@_);
 820 }
 821
 822
 823 =head1 Methods Inherited from Bio::PrimarySeqI
 824
 825 These methods are available on Bio::PrimarySeq, although they are
 826 actually implemented on Bio::PrimarySeqI
 827
 828 =head2 revcom
 829
 830  Title   : revcom
 831  Usage   : $rev = $seqobj->revcom();
 832  Function: Produces a new Bio::SeqI implementing object which
 833            is the reversed complement of the sequence. For protein
 834            sequences this throws an exception of
 835            "Sequence is a protein. Cannot revcom".
 836
 837            The id is the same id as the orginal sequence, and the
 838            accession number is also indentical. If someone wants to
 839            track that this sequence has be reversed, it needs to
 840            define its own extensions.
 841
 842            To do an inplace edit of an object you can go:
 843
 844            $seqobj = $seqobj->revcom();
 845
 846            This of course, causes Perl to handle the garbage
 847            collection of the old object, but it is roughly speaking as
 848            efficient as an inplace edit.
 849  Returns : A new (fresh) Bio::SeqI object
 850  Args    : none
 851
 852 =head2 trunc
 853
 854  Title   : trunc
 855  Usage   : $subseq = $myseq->trunc(10,100);
 856  Function: Provides a truncation of a sequence,
 857  Returns : A fresh Bio::SeqI implementing object.
 858  Args    : Numbers for the start and end positions
 859
 860 =head1 Internal methods
 861
 862 These are internal methods to PrimarySeq
 863
 864 =head2 _guess_alphabet
 865
 866  Title   : _guess_alphabet
 867  Usage   :
 868  Function: Automatically guess and set the type of sequence: dna, rna, protein
 869            or '' if the sequence was empty. This method first removes dots (.),
 870            dashes (-) and question marks (?) before guessing the alphabet
 871            using the IUPAC conventions for ambiguous residues. Since the DNA and
 872            RNA characters are also valid characters for proteins, there is
 873            no foolproof way of determining the right alphabet. This is our best
 874            guess only!
 875  Returns : string 'dna', 'rna', 'protein' or ''.
 876  Args    : none
 877
 878 =cut
 879
 880 sub _guess_alphabet {
 881     my ($self) = @_;
 882     # Guess alphabet
 883     my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
 884     # Set alphabet unless it is unknown
 885     $self->alphabet($alphabet) if $alphabet;
 886     return $alphabet;
 887 }
 888
 889
 890 sub _guess_alphabet_from_string {
 891     # Get the alphabet from a sequence string
 892     my ($self, $str, $nowarnonempty) = @_;
 893
 894     $nowarnonempty = 0 if not defined $nowarnonempty;
 895
 896     # Remove chars that clearly don't denote nucleic or amino acids
 897     $str =~ s/[-.?]//gi;
 898
 899     # Check for sequences without valid letters
 900     my $alphabet;
 901     my $total = CORE::length($str);
 902     if( $total == 0 ) {
 903         if (not $nowarnonempty) {
 904             $self->warn("Got a sequence without letters. Could not guess alphabet");
 905         }
 906         $alphabet = '';
 907     }
 908
 909     # Determine alphabet now
 910     if (not defined $alphabet) {
 911         if ($str =~ m/[EFIJLOPQXZ]/i) {
 912             # Start with a safe method to find proteins.
 913             # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
 914             $alphabet = 'protein';
 915         } else {
 916             # Alphabet is unsure, could still be DNA, RNA or protein
 917             # DNA and RNA contain mostly A, T, U, G, C and N, but the other
 918             # letters they use are also among the 15 valid letters that a
 919             # protein sequence can contain at this stage. Make our best guess
 920             # based on sequence composition. If it contains over 70% of ACGTUN,
 921             # it is likely nucleic.
 922             if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
 923                 if ( $str =~ m/U/i ) {
 924                     $alphabet = 'rna';
 925                 } else {
 926                     $alphabet = 'dna';
 927                 }
 928             } else {
 929                 $alphabet = 'protein';
 930             }
 931         }
 932     }
 933
 934     return $alphabet;
 935 }
 936
 937
 938 ############################################################################
 939 # aliases due to name changes or to compensate for our lack of consistency #
 940 ############################################################################
 941
 942 sub accession {
 943     my $self = shift;
 944
 945     $self->warn(ref($self)."::accession is deprecated, ".
 946         "use accession_number() instead");
 947     return $self->accession_number(@_);
 948 }
 949
 950 1;