Bio/PrimarySeq.pm

   1 # $Id$
   2 #
   3 # bioperl module for Bio::PrimarySeq
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   8 #
   9 # Copyright Ewan Birney
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::PrimarySeq - Bioperl lightweight Sequence Object
  18
  19 =head1 SYNOPSIS
  20
  21   # Bio::SeqIO for file reading, Bio::DB::GenBank for
  22   # database reading
  23
  24   use Bio::Seq;
  25   use Bio::SeqIO;
  26   use Bio::DB::GenBank;
  27
  28   # make from memory
  29
  30   $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
  31                                    -id  => 'GeneFragment-12',
  32                                    -accession_number => 'X78121',
  33                                    -alphabet => 'dna',
  34                                    -is_circular => 1 );
  35   print "Sequence ", $seqobj->id(), " with accession ",
  36     $seqobj->accession_number, "\n";
  37
  38   # read from file
  39
  40   $inputstream = Bio::SeqIO->new(-file => "myseq.fa",
  41                                  -format => 'Fasta');
  42   $seqobj = $inputstream->next_seq();
  43   print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
  44
  45   # to get out parts of the sequence.
  46
  47   print "Sequence ", $seqobj->id(), " with accession ",
  48     $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
  49
  50   $string  = $seqobj->seq();
  51   $string2 = $seqobj->subseq(1,40);
  52
  53 =head1 DESCRIPTION
  54
  55 PrimarySeq is a lightweight Sequence object, storing the sequence, its
  56 name, a computer-useful unique name, and other fundamental attributes.
  57 It does not contain sequence features or other information.  To have a
  58 sequence with sequence features you should use the Seq object which uses
  59 this object.
  60
  61 Although new users will use Bio::PrimarySeq a lot, in general you will
  62 be using it from the Bio::Seq object. For more information on Bio::Seq
  63 see L<Bio::Seq>. For interest you might like to know that
  64 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
   65 to do with sequence to it (the has-a relationship lets us get out of an
  66 otherwise nasty cyclical reference in Perl which would leak memory).
  67
  68 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
  69 object is a pure Perl implementation of the interface. If that's
  70 gibberish to you, don't worry. The take home message is that this
  71 object is the bioperl default sequence object, but other people can
  72 use their own objects as sequences if they so wish. If you are
  73 interested in wrapping your own objects as compliant Bioperl sequence
   74 objects, then you should read the Bio::PrimarySeqI documentation.
  75
   76 The documentation of this object is a merge of the Bio::PrimarySeq and
   77 Bio::PrimarySeqI documentation, so that all the methods which you can
   78 call on sequence objects are described here.
  79
  80 =head1 FEEDBACK
  81
  82 =head2 Mailing Lists
  83
  84 User feedback is an integral part of the evolution of this and other
  85 Bioperl modules. Send your comments and suggestions preferably to one
  86 of the Bioperl mailing lists.  Your participation is much appreciated.
  87
  88   bioperl-l@bioperl.org                  - General discussion
  89   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  90
  91 =head2 Support
  92
  93 Please direct usage questions or support issues to the mailing list:
  94
  95 I<bioperl-l@bioperl.org>
  96
  97 rather than to the module maintainer directly. Many experienced and
   98 responsive experts will be able to look at the problem and quickly
  99 address it. Please include a thorough description of the problem
 100 with code and data examples if at all possible.
 101
 102 =head2 Reporting Bugs
 103
 104 Report bugs to the Bioperl bug tracking system to help us keep track
  105 of the bugs and their resolution.  Bug reports can be submitted via the
 106 web:
 107
 108   http://bugzilla.open-bio.org/
 109
 110 =head1 AUTHOR - Ewan Birney
 111
 112 Email birney@ebi.ac.uk
 113
 114 =head1 APPENDIX
 115
 116 The rest of the documentation details each of the object
 117 methods. Internal methods are usually preceded with a _
 118
 119 =cut
 120
 121
 122 # Let the code begin...
 123
 124
 125 package Bio::PrimarySeq;
 126 use vars qw($MATCHPATTERN $GAP_SYMBOLS);
 127 use strict;
 128
 129 $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
 130 $GAP_SYMBOLS = '-~';
 131
 132 use base qw(Bio::Root::Root Bio::PrimarySeqI
 133             Bio::IdentifiableI Bio::DescribableI);
 134
 135 #
 136 # setup the allowed values for alphabet()
 137 #
 138
 139 my %valid_type = map {$_, 1} qw( dna rna protein );
 140
 141 =head2 new
 142
 143  Title   : new
 144  Usage   : $seq    = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
 145                                            -id  => 'human_id',
 146                                            -accession_number => 'AL000012',
 147                                            );
 148
 149  Function: Returns a new primary seq object from
 150            basic constructors, being a string for the sequence
 151            and strings for id and accession_number.
 152
 153            Note that you can provide an empty sequence string. However, in
 154            this case you MUST specify the type of sequence you wish to
 155            initialize by the parameter -alphabet. See alphabet() for possible
 156            values.
 157  Returns : a new Bio::PrimarySeq object
 158  Args    : -seq         => sequence string
 159            -display_id  => display id of the sequence (locus name)
 160            -accession_number => accession number
 161            -primary_id  => primary id (Genbank id)
 162            -version     => version number
 163            -namespace   => the namespace for the accession
 164            -authority   => the authority for the namespace
 165            -description => description text
 166            -desc        => alias for description
 167            -alphabet    => sequence type (alphabet) (dna|rna|protein)
 168            -id          => alias for display id
 169            -is_circular => boolean field for whether or not sequence is circular
 170            -direct      => boolean field for directly setting sequence (requires alphabet also set)
  171            -ref_to_seq  => reference to a scalar holding the sequence string (only used together with -direct)
 172            -nowarnonempty => boolean field for whether or not to warn when sequence is empty
 173
 174 =cut
 175
 176
 177 sub new {
 178     my ($class, @args) = @_;
 179     my $self = $class->SUPER::new(@args);
 180
 181     my($seq,$id,$acc,$pid,$ns,$auth,$v,$oid,
 182        $desc,$description,
 183        $alphabet,$given_id,$is_circular,$direct,$ref_to_seq,$len,$nowarnonempty) =
 184         $self->_rearrange([qw(SEQ
 185                               DISPLAY_ID
 186                               ACCESSION_NUMBER
 187                               PRIMARY_ID
 188                               NAMESPACE
 189                               AUTHORITY
 190                               VERSION
 191                               OBJECT_ID
 192                               DESC
 193                               DESCRIPTION
 194                               ALPHABET
 195                               ID
 196                               IS_CIRCULAR
 197                               DIRECT
 198                               REF_TO_SEQ
 199                               LENGTH
 200             NOWARNONEMPTY
 201                               )],
 202                           @args);
 203
 204     # private var _nowarnonempty, need to be set before calling _guess_alphabet
 205     $self->{'_nowarnonempty'} = $nowarnonempty;
 206
 207     if( defined $id && defined $given_id ) {
 208       if( $id ne $given_id ) {
 209         $self->throw("Provided both id and display_id constructor ".
 210             "functions. [$id] [$given_id]");
 211       }
 212     }
 213     if( defined $given_id ) { $id = $given_id; }
 214
 215     # let's set the length before the seq -- if there is one, this length is
 216     # going to be invalidated
 217     defined $len && $self->length($len);
 218
 219     # if alphabet is provided we set it first, so that it won't be guessed
 220     # when the sequence is set
 221     $alphabet && $self->alphabet($alphabet);
 222
 223     # bernd's idea: define ids so that invalid sequence messages
 224     # can be more informative...
 225     defined $id  && $self->display_id($id);
 226     $acc         && $self->accession_number($acc);
 227     defined $pid && $self->primary_id($pid);
 228
 229     # if there is an alphabet, and direct is passed in, assume the alphabet
 230     # and sequence is ok
 231
 232     if( $direct && $ref_to_seq) {
 233       $self->{'seq'} = $$ref_to_seq;
 234         if( ! $alphabet ) {
 235           $self->_guess_alphabet();
 236         } # else it has been set already above
 237     } else {
 238                  #      print STDERR "DEBUG: setting sequence to [$seq]\n";
 239                  # note: the sequence string may be empty
 240                  $self->seq($seq) if defined($seq);
 241          }
 242
 243     $desc        && $self->desc($desc);
 244     $description && $self->description($description);
 245     $is_circular && $self->is_circular($is_circular);
 246     $ns          && $self->namespace($ns);
 247     $auth        && $self->authority($auth);
 248     defined($v)  && $self->version($v);
 249     defined($oid) && $self->object_id($oid);
 250
 251
 252     return $self;
 253 }
 254
 255 sub direct_seq_set {
 256     my $obj = shift;
 257     return $obj->{'seq'} = shift if @_;
 258     return;
 259 }
 260
 261
 262 =head2 seq
 263
 264  Title   : seq
 265  Usage   : $string    = $obj->seq()
 266  Function: Returns the sequence as a string of letters. The
 267            case of the letters is left up to the implementer.
 268            Suggested cases are upper case for proteins and lower case for
 269            DNA sequence (IUPAC standard), but you should not rely on this.
 270  Returns : A scalar
 271  Args    : Optionally on set the new value (a string). An optional second
 272            argument presets the alphabet (otherwise it will be guessed).
 273
 274 =cut
 275
 276 sub seq {
 277    my ($obj,@args) = @_;
 278
 279    if( scalar(@args) == 0 ) {
 280        return $obj->{'seq'};
 281    }
 282
 283    my ($value,$alphabet) = @args;
 284
 285    if(@args) {
 286        if(defined($value) && (! $obj->validate_seq($value))) {
 287            $obj->throw("Attempting to set the sequence '".(defined($obj->id) || "[unidentified sequence]")."' to [$value] ".
 288                                                         "which does not look healthy");
 289                 }
 290        # if a sequence was already set we make sure that we re-adjust the
 291        # alphabet, otherwise we skip guessing if alphabet is already set
 292        # note: if the new seq is empty or undef, we don't consider that a
 293        # change (we wouldn't have anything to guess on anyway)
 294                 my $is_changed_seq =
 295                   exists($obj->{'seq'}) && (CORE::length($value || '') > 0);
 296                 $obj->{'seq'} = $value;
 297        # new alphabet overridden by arguments?
 298                 if($alphabet) {
 299            # yes, set it no matter what
 300                         $obj->alphabet($alphabet);
 301                 } elsif( # if we changed a previous sequence to a new one
 302                                   $is_changed_seq ||
 303                                   # or if there is no alphabet yet at all
 304                                   (! defined($obj->alphabet()))) {
 305                         # we need to guess the (possibly new) alphabet
 306                         $obj->_guess_alphabet();
 307                 } # else (seq not changed and alphabet was defined) do nothing
 308                 # if the seq is changed, make sure we unset a possibly set length
 309                 $obj->length(undef) if $is_changed_seq || $obj->{'seq'};
 310    }
 311    return $obj->{'seq'};
 312 }
 313
 314 =head2 validate_seq
 315
 316  Title   : validate_seq
 317  Usage   : if(! $seq->validate_seq($seq_str) ) {
 318                 print "sequence $seq_str is not valid for an object of
 319                 alphabet ",$seq->alphabet, "\n";
 320            }
 321  Function: Validates a given sequence string. A validating sequence string
 322            must be accepted by seq(). A string that does not validate will
 323            lead to an exception if passed to seq().
 324
 325            The implementation provided here does not take alphabet() into
 326            account. Allowed are all letters (A-Z) and '-','.','*','?','=',
 327            and '~'.
 328
 329  Example :
 330  Returns : 1 if the supplied sequence string is valid for the object, and
 331            0 otherwise.
 332  Args    : The sequence string to be validated.
 333
 334
 335 =cut
 336
 337 sub validate_seq {
 338         my ($self,$seqstr) = @_;
 339         if( ! defined $seqstr ){ $seqstr = $self->seq(); }
 340         return 0 unless( defined $seqstr);
 341         if((CORE::length($seqstr) > 0) &&
 342            ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
 343             $self->warn("sequence '".(defined($self->id) || "[unidentified sequence]")."' doesn't validate, mismatch is " .
 344                         join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
 345                 return 0;
 346         }
 347         return 1;
 348 }
 349
 350 =head2 subseq
 351
 352  Title   : subseq
 353  Usage   : $substring = $obj->subseq(10,40);
 354            $substring = $obj->subseq(10,40,NOGAP)
 355            $substring = $obj->subseq(-START=>10,-END=>40,-REPLACE_WITH=>'tga')
 356  Function: returns the subseq from start to end, where the first sequence
  357            character has coordinate 1. Numbering is inclusive, ie 1-2 are the
 358            first two characters of the sequence
 359  Returns : a string
 360  Args    : integer for start position
 361            integer for end position
 362                  OR
 363            Bio::LocationI location for subseq (strand honored)
 364            Specify -NOGAP=>1 to return subseq with gap characters removed
 365            Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
 366            with $new_subseq in the sequence object
 367
 368 =cut
 369
 370 sub subseq {
 371    my $self = shift;
 372    my @args = @_;
 373    my ($start,$end,$nogap,$replace) = $self->_rearrange([qw(START
 374                                                             END
 375                                                             NOGAP
 376                                                             REPLACE_WITH)],@args);
 377
 378    # if $replace is specified, have the constructor validate it as seq
 379    my $dummy = new Bio::PrimarySeq(-seq=>$replace, -alphabet=>$self->alphabet) if defined($replace);
 380
 381    if( ref($start) && $start->isa('Bio::LocationI') ) {
 382        my $loc = $start;
 383        my $seq = "";
 384        foreach my $subloc ($loc->each_Location()) {
 385            my $piece = $self->subseq(-START=>$subloc->start(),
 386                                      '-END'=>$subloc->end(),
 387                                      -REPLACE_WITH=>$replace,
 388                                      -NOGAP=>$nogap);
 389            $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
 390            if($subloc->strand() < 0) {
 391                $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq();
 392            }
 393            $seq .= $piece;
 394        }
 395        return $seq;
 396    } elsif(  defined  $start && defined $end ) {
 397        if( $start > $end ){
 398            $self->throw("Bad start,end parameters. Start [$start] has to be ".
 399                         "less than end [$end]");
 400        }
 401        if( $start <= 0 ) {
 402            $self->throw("Bad start parameter ($start). Start must be positive.");
 403        }
 404
 405        # remove one from start, and then length is end-start
 406        $start--;
 407        my @ss_args = map { eval "defined $_"  ? $_ : () } qw( $self->{seq} $start $end-$start $replace);
 408        my $seqstr = eval join( '', "substr(", join(',',@ss_args), ")");
 409
 410        if( $end > $self->length) {
 411            if ($self->is_circular) {
 412                my $start = 0;
 413                my $end = $end - $self->length;
 414                my @ss_args = map { eval "defined $_"  ? $_ : () } qw( $self->{seq} $start $end-$start $replace);
 415                my $appendstr = eval join( '', "substr(", join(',',@ss_args), ")");
 416                $seqstr .= $appendstr;
 417            } else {
 418                $self->throw("Bad end parameter ($end). End must be less than the total length of sequence (total=".$self->length.")")
 419            }
 420        }
 421
 422        $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
 423        return $seqstr;
 424
 425    } else {
 426        $self->warn("Incorrect parameters to subseq - must be two integers or a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
 427        return;
 428    }
 429 }
 430
 431 =head2 length
 432
 433  Title   : length
 434  Usage   : $len = $seq->length();
 435  Function: Get the length of the sequence in number of symbols (bases
 436            or amino acids).
 437
 438            You can also set this attribute, even to a number that does
 439            not match the length of the sequence string. This is useful
  440            if you don't want to set the sequence too, or if you want
 441            to free up memory by unsetting the sequence. In the latter
 442            case you could do e.g.
 443
 444                $seq->length($seq->length);
 445                $seq->seq(undef);
 446
 447            Note that if you set the sequence to a value other than
 448            undef at any time, the length attribute will be
 449            invalidated, and the length of the sequence string will be
  450            reported again. Also, we won't let you lie about the length.
 451
 452  Example :
 453  Returns : integer representing the length of the sequence.
 454  Args    : Optionally, the value on set
 455
 456 =cut
 457
 458 sub length {
 459     my $self = shift;
 460     my $len = CORE::length($self->seq() || '');
 461
 462     if(@_) {
 463                  my $val = shift;
 464                  if(defined($val) && $len && ($len != $val)) {
 465                          $self->throw("You're trying to lie about the length: ".
 466                                                           "is $len but you say ".$val);
 467                  }
 468                  $self->{'_seq_length'} = $val;
 469     } elsif(defined($self->{'_seq_length'})) {
 470                  return $self->{'_seq_length'};
 471     }
 472     return $len;
 473 }
 474
 475 =head2 display_id
 476
 477  Title   : display_id or display_name
 478  Usage   : $id_string = $obj->display_id();
 479  Function: returns the display id, aka the common name of the Sequence object.
 480
 481            The semantics of this is that it is the most likely string to
 482            be used as an identifier of the sequence, and likely to have
 483            "human" readability.  The id is equivalent to the ID field of
 484            the GenBank/EMBL databanks and the id field of the
 485            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 486            presumed to be the id, though some people overload the id to
 487            embed other information. Bioperl does not use any embedded
 488            information in the ID field, and people are encouraged to use
 489            other mechanisms (accession field for example, or extending
 490            the sequence object) to solve this.
 491
  492            With the new Bio::DescribableI interface, display_name aliases
 493            to this method.
 494
 495  Returns : A string
 496  Args    : None
 497
 498
 499 =cut
 500
 501 sub display_id {
 502    my ($obj,$value) = @_;
 503    if( defined $value) {
 504       $obj->{'display_id'} = $value;
 505         }
 506         return $obj->{'display_id'};
 507 }
 508
 509 =head2 accession_number
 510
 511  Title   : accession_number or object_id
 512  Usage   : $unique_key = $obj->accession_number;
 513  Function: Returns the unique biological id for a sequence, commonly
 514            called the accession_number. For sequences from established
 515            databases, the implementors should try to use the correct
 516            accession number. Notice that primary_id() provides the
  517            unique id for the implementation, allowing multiple objects
 518            to have the same accession number in a particular implementation.
 519
 520            For sequences with no accession number, this method should
 521            return "unknown".
 522
 523            [Note this method name is likely to change in 1.3]
 524
 525            With the new Bio::IdentifiableI interface, this is aliased
 526            to object_id
 527
 528  Returns : A string
 529  Args    : A string (optional) for setting
 530
 531 =cut
 532
 533 sub accession_number {
 534     my( $obj, $acc ) = @_;
 535
 536     if (defined $acc) {
 537                  $obj->{'accession_number'} = $acc;
 538     } else {
 539                  $acc = $obj->{'accession_number'};
 540                  $acc = 'unknown' unless defined $acc;
 541     }
 542     return $acc;
 543 }
 544
 545 =head2 primary_id
 546
 547  Title   : primary_id
 548  Usage   : $unique_key = $obj->primary_id;
 549  Function: Returns the unique id for this object in this
 550            implementation. This allows implementations to manage their
  551            own object ids in a way the implementation can control;
  552            clients can expect one id to map to one object.
 553
 554            For sequences with no natural primary id, this method
 555            should return a stringified memory location.
 556
 557  Returns : A string
 558  Args    : A string (optional, for setting)
 559
 560 =cut
 561
 562 sub primary_id {
 563     my $obj = shift;
 564
 565     if(@_) {
 566                  $obj->{'primary_id'} = shift;
 567     }
 568     if( ! defined($obj->{'primary_id'}) ) {
 569                  return "$obj";
 570     }
 571     return $obj->{'primary_id'};
 572 }
 573
 574
 575 =head2 alphabet
 576
 577  Title   : alphabet
 578  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 579  Function: Get/Set the alphabet of sequence, one of
  580            'dna', 'rna' or 'protein'. The value is lower-cased when set.
 581
 582            This is not called <type> because this would cause
 583            upgrade problems from the 0.5 and earlier Seq objects.
 584
 585  Returns : a string either 'dna','rna','protein'. NB - the object must
 586            make a call of the type - if there is no alphabet specified it
 587            has to guess.
 588  Args    : optional string to set : 'dna' | 'rna' | 'protein'
 589
 590
 591 =cut
 592
 593 sub alphabet {
 594     my ($obj,$value) = @_;
 595     if (defined $value) {
 596                  $value = lc $value;
 597                  unless ( $valid_type{$value} ) {
 598                          $obj->throw("Alphabet '$value' is not a valid alphabet (".
 599                                                          join(',', map "'$_'", sort keys %valid_type) .
 600                                                          ") lowercase");
 601                  }
 602                  $obj->{'alphabet'} = $value;
 603     }
 604     return $obj->{'alphabet'};
 605 }
 606
 607 =head2 desc
 608
 609  Title   : desc or description
 610  Usage   : $obj->desc($newval)
 611  Function: Get/set description of the sequence.
 612
 613            'description' is an alias for this for compliance with the
  614            Bio::DescribableI interface.
 615
 616  Example :
 617  Returns : value of desc (a string)
 618  Args    : newvalue (a string or undef, optional)
 619
 620
 621 =cut
 622
 623 sub desc{
 624     my $self = shift;
 625
 626     return $self->{'desc'} = shift if @_;
 627     return $self->{'desc'};
 628 }
 629
 630 =head2 can_call_new
 631
 632  Title   : can_call_new
 633  Usage   :
 634  Function:
 635  Example :
 636  Returns : true
 637  Args    :
 638
 639
 640 =cut
 641
 642 sub can_call_new {
 643    my ($self) = @_;
 644
 645    return 1;
 646 }
 647
 648 =head2 id
 649
 650  Title   : id
 651  Usage   : $id = $seq->id()
 652  Function: This is mapped on display_id
 653  Example :
 654  Returns :
 655  Args    :
 656
 657
 658 =cut
 659
 660 sub  id {
 661    return shift->display_id(@_);
 662 }
 663
 664 =head2 is_circular
 665
 666  Title   : is_circular
 667  Usage   : if( $obj->is_circular) { /Do Something/ }
 668  Function: Returns true if the molecule is circular
 669  Returns : Boolean value
 670  Args    : none
 671
 672 =cut
 673
 674 sub is_circular{
 675     my $self = shift;
 676
 677     return $self->{'is_circular'} = shift if @_;
 678     return $self->{'is_circular'};
 679 }
 680
 681
 682 =head1 Methods for Bio::IdentifiableI compliance
 683
 684 =cut
 685
 686 =head2 object_id
 687
 688  Title   : object_id
 689  Usage   : $string    = $obj->object_id()
 690  Function: A string which represents the stable primary identifier
 691            in this namespace of this object. For DNA sequences this
 692            is its accession_number, similarly for protein sequences.
 693
 694            This is aliased to accession_number().
 695  Returns : A scalar
 696
 697
 698 =cut
 699
 700 sub object_id {
 701     return shift->accession_number(@_);
 702 }
 703
 704 =head2 version
 705
 706  Title   : version
 707  Usage   : $version    = $obj->version()
 708  Function: A number which differentiates between versions of
 709            the same object. Higher numbers are considered to be
  710            later and more relevant, but a single object described by
 711            the same identifier should represent the same concept.
 712
 713  Returns : A number
 714
 715 =cut
 716
 717 sub version{
 718     my ($self,$value) = @_;
 719     if( defined $value) {
 720                  $self->{'_version'} = $value;
 721     }
 722     return $self->{'_version'};
 723 }
 724
 725
 726 =head2 authority
 727
 728  Title   : authority
 729  Usage   : $authority    = $obj->authority()
 730  Function: A string which represents the organisation which
 731            granted the namespace, written as the DNS name for
 732            organisation (eg, wormbase.org).
 733
 734  Returns : A scalar
 735
 736 =cut
 737
 738 sub authority {
 739     my ($obj,$value) = @_;
 740     if( defined $value) {
 741                  $obj->{'authority'} = $value;
 742     }
 743     return $obj->{'authority'};
 744 }
 745
 746 =head2 namespace
 747
 748  Title   : namespace
 749  Usage   : $string    = $obj->namespace()
 750  Function: A string representing the name space this identifier
 751            is valid in, often the database name or the name
 752            describing the collection.
 753
 754  Returns : A scalar
 755
 756
 757 =cut
 758
 759 sub namespace{
 760     my ($self,$value) = @_;
 761     if( defined $value) {
 762                  $self->{'namespace'} = $value;
 763     }
 764     return $self->{'namespace'} || "";
 765 }
 766
 767 =head1 Methods for Bio::DescribableI compliance
 768
 769 This comprises of display_name and description.
 770
 771 =cut
 772
 773 =head2 display_name
 774
 775  Title   : display_name
 776  Usage   : $string    = $obj->display_name()
 777  Function: A string which is what should be displayed to the user.
 778            The string should have no spaces (ideally, though a cautious
  779            user of this interface would not assume this) and should be
 780            less than thirty characters (though again, double checking
 781            this is a good idea).
 782
 783            This is aliased to display_id().
 784  Returns : A scalar
 785
 786 =cut
 787
 788 sub display_name {
 789     return shift->display_id(@_);
 790 }
 791
 792 =head2 description
 793
 794  Title   : description
 795  Usage   : $string    = $obj->description()
 796  Function: A text string suitable for displaying to the user a
 797            description. This string is likely to have spaces, but
 798            should not have any newlines or formatting - just plain
 799            text. The string should not be greater than 255 characters
 800            and clients can feel justified at truncating strings at 255
 801            characters for the purposes of display.
 802
 803            This is aliased to desc().
 804  Returns : A scalar
 805
 806 =cut
 807
 808 sub description {
 809     return shift->desc(@_);
 810 }
 811
 812 =head1 Methods Inherited from Bio::PrimarySeqI
 813
 814 These methods are available on Bio::PrimarySeq, although they are
 815 actually implemented on Bio::PrimarySeqI
 816
 817 =head2 revcom
 818
 819  Title   : revcom
 820  Usage   : $rev = $seq->revcom()
 821  Function: Produces a new Bio::SeqI implementing object which
 822            is the reversed complement of the sequence. For protein
 823            sequences this throws an exception of
 824            "Sequence is a protein. Cannot revcom".
 825
  826            The id is the same id as the original sequence, and the
  827            accession number is also identical. If someone wants to
  828            track that this sequence has been reversed, it needs to
 829            define its own extensions.
 830
 831            To do an inplace edit of an object you can go:
 832
 833            $seqobj = $seqobj->revcom();
 834
 835            This of course, causes Perl to handle the garbage
 836            collection of the old object, but it is roughly speaking as
 837            efficient as an inplace edit.
 838
 839  Returns : A new (fresh) Bio::SeqI object
 840  Args    : none
 841
 842 =cut
 843
 844 =head2 trunc
 845
 846  Title   : trunc
 847  Usage   : $subseq = $myseq->trunc(10,100);
 848  Function: Provides a truncation of a sequence,
 849
 850  Example :
 851  Returns : A fresh Bio::SeqI implementing object.
 852  Args    :
 853
 854
 855 =cut
 856
 857 =head1 Internal methods
 858
 859 These are internal methods to PrimarySeq
 860
 861 =cut
 862
 863 =head2 _guess_alphabet
 864
 865  Title   : _guess_alphabet
 866  Usage   :
 867  Function: Determines (and sets) the type of sequence: dna, rna, protein
 868  Example :
 869  Returns : one of strings 'dna', 'rna' or 'protein'.
 870  Args    : none
 871
 872
 873 =cut
 874
 875 sub _guess_alphabet {
 876    my ($self) = @_;
 877    my $type;
 878
 879         #return if $self->alphabet;
 880
 881    my $str = $self->seq();
 882         # Remove char's that clearly denote ambiguity
 883    $str =~ s/[-.?]//gi;
 884
 885    my $total = CORE::length($str);
 886    if( $total == 0 ) {
 887      if (!$self->{'_nowarnonempty'}) {
 888        $self->warn("Got a sequence with no letters in it ".
 889            "cannot guess alphabet");
 890      }
 891      return '';
 892    }
 893
 894    my $u = ($str =~ tr/Uu//);
 895         # The assumption here is that most of sequences comprised of mainly
 896    # ATGC, with some N, will be 'dna' despite the fact that N could
 897         # also be Asparagine
 898    my $atgc = ($str =~ tr/ATGCNatgcn//);
 899
 900    if( ($atgc / $total) > 0.85 ) {
 901        $type = 'dna';
 902    } elsif( (($atgc + $u) / $total) > 0.85 ) {
 903        $type = 'rna';
 904    } else {
 905        $type = 'protein';
 906    }
 907
 908    $self->alphabet($type);
 909    return $type;
 910 }
 911
 912 ############################################################################
 913 # aliases due to name changes or to compensate for our lack of consistency #
 914 ############################################################################
 915
 916 sub accession {
 917     my $self = shift;
 918
 919     $self->warn(ref($self)."::accession is deprecated, ".
 920                 "use accession_number() instead");
 921     return $self->accession_number(@_);
 922 }
 923
 924 1;
 925