3 # bioperl module for Bio::Structure::SecStr::STRIDE::Res.pm
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ed Green <ed@compbio.berkeley.edu>
9 # Copyright Univ. of California
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Structure::SecStr::STRIDE::Res - Module for parsing/accessing stride output
21 my $stride_obj = Bio::Structure::SecStr::STRIDE::Res->new( '-file' => 'filename.stride' );
25 my $stride_obj = Bio::Structure::SecStr::STRIDE::Res->new( '-fh' => \*STDOUT );
27 # Get secondary structure assignment for PDB residue 20 of chain A
28 $sec_str = $stride_obj->resSecStr( '20:A' );
31 $sec_str = $stride_obj->resSecStr( 20, 'A' )
35 STRIDE::Res is a module for objectifying STRIDE output. STRIDE is a
36 program (similar to DSSP) for assigning secondary structure to
37 individual residues of a pdb structure file.
39 ( Knowledge-Based Protein Secondary Structure Assignment,
40 PROTEINS: Structure, Function, and Genetics 23:566-579 (1995) )
42 STRIDE is available here:
43 http://webclu.bio.wzw.tum.de/stride/
45 Methods are then available for extracting all of the infomation
46 present within the output or convenient subsets of it.
48 Although they are very similar in function, DSSP and STRIDE differ
49 somewhat in output format. Thes differences are reflected in the
50 return value of some methods of these modules. For example, both
51 the STRIDE and DSSP parsers have resSecStr() methods for returning
52 the secondary structure of a given residue. However, the range of
53 return values for DSSP is ( H, B, E, G, I, T, and S ) whereas the
54 range of values for STRIDE is ( H, G, I, E, B, b, T, and C ). See
55 individual methods for details.
57 The methods are roughly divided into 3 sections:
59 1. Global features of this structure (PDB ID, total surface area,
60 etc.). These methods do not require an argument.
61 2. Residue specific features ( amino acid, secondary structure,
62 solvent exposed surface area, etc. ). These methods do require an
63 arguement. The argument is supposed to uniquely identify a
64 residue described within the structure. It can be of any of the
66 ('#A:B') or ( #, 'A', 'B' )
68 || - Chain ID (blank for single chain)
69 |--- Insertion code for this residue. Blank for most residues.
70 |--- Numeric portion of residue ID.
74 --- Numeric portion of residue ID. If there is only one chain and
75 it has no ID AND there is no residue with an insertion code at this
76 number, then this can uniquely specify a residue.
81 ---Numeric portion of residue ID.
83 If a residue is incompletely specified then the first residue that
84 fits the arguments is returned. For example, if 19 is the argument
85 and there are three chains, A, B, and C with a residue whose number
86 is 19, then 19:A will be returned (assuming its listed first).
88 Since neither DSSP nor STRIDE correctly handle alt-loc codes, they
89 are not supported by these modules.
91 3. Value-added methods. Return values are not verbatem strings
92 parsed from DSSP or STRIDE output.
98 UsUser feedback is an integral part of the evolution of this and other
99 Bioperl modules. Send your comments and suggestions preferably to one
100 of the Bioperl mailing lists. Your participation is much appreciated.
102 bioperl-l@bioperl.org - General discussion
103 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
107 Please direct usage questions or support issues to the mailing list:
109 I<bioperl-l@bioperl.org>
111 rather than to the module maintainer directly. Many experienced and
112 reponsive experts will be able look at the problem and quickly
113 address it. Please include a thorough description of the problem
114 with code and data examples if at all possible.
116 =head2 Reporting Bugs
118 Report bugs to the Bioperl bug tracking system to help us keep track
119 the bugs and their resolution. Bug reports can be submitted via the
122 http://bugzilla.open-bio.org/
124 =head1 AUTHOR - Ed Green
126 Email ed@compbio.berkeley.edu
131 The Rest of the documentation details each method.
132 Internal methods are preceded with a _.
137 package Bio
::Structure
::SecStr
::STRIDE
::Res
;
142 use base
qw(Bio::Root::Root);
144 our %ASGTable = ( 'aa' => 0,
152 our %AATable = ( 'ALA' => 'A', 'ARG' => 'R', 'ASN' => 'N',
153 'ASP' => 'D', 'CYS' => 'C', 'GLN' => 'Q',
154 'GLU' => 'E', 'GLY' => 'G', 'HIS' => 'H',
155 'ILE' => 'I', 'LEU' => 'L', 'LYS' => 'K',
156 'MET' => 'M', 'PHE' => 'F', 'PRO' => 'P',
157 'SER' => 'S', 'THR' => 'T', 'TRP' => 'W',
158 'TYR' => 'Y', 'VAL' => 'V' );
163 Usage : makes new object of this class
164 Function : Constructor
165 Example : $stride_obj = Bio::Structure::SecStr::STRIDE:Res->new( '-file' => filename
167 '-fh' => FILEHANDLE )
168 Returns : object (ref)
169 Args : filename or filehandle( must be proper STRIDE output )
174 my ( $class, @args ) = @_;
175 my $self = $class->SUPER::new
( @args );
176 my $io = Bio
::Root
::IO
->new( @args );
177 $self->_parse( $io ); # not passing filehandle !
182 # GLOBAL FEATURES / INFO / STATS
187 Usage : returns sum of surface areas of all residues of all
188 chains considered. Result is memoized.
190 Example : $tot_SA = $stride_obj->totSurfArea();
202 if ( $self->{ 'SurfArea' } ) {
203 return $self->{ 'SurfArea' };
206 foreach $chain ( keys %{$self->{ 'ASG' }} ) {
207 for ( my $i = 1; $i <= $#{$self->{'ASG'}->{$chain}}; $i++ ) {
209 $self->{'ASG'}->{$chain}->[$i]->[$ASGTable{'surfArea'}];
214 $self->{ 'SurfArea' } = $total;
215 return $self->{ 'SurfArea' };
222 Usage : returns total number of residues in all chains or
223 just the specified chain
225 Example : $tot_res = $stride_obj->numResidues();
227 Args : none or chain id
237 foreach $key ( keys %{$self->{ 'ASG' }} ) {
239 if ( $key eq $chain ) {
240 $total += $#{$self->{ 'ASG' }{ $key }};
244 $total += $#{$self->{ 'ASG' }{ $key }};
250 # STRAIGHT FROM THE PDB ENTRY
255 Usage : returns pdb identifier ( 1FJM, e.g. )
257 Example : $pdb_id = $stride_obj->pdbID();
258 Returns : scalar string
266 return $self->{ 'PDB' };
271 Usage : returns author of this PDB entry
273 Example : $auth = $stride_obj->pdbAuthor()
274 Returns : scalar string
282 return join( ' ', @
{ $self->{ 'HEAD' }->{ 'AUT' } } );
288 Usage : returns string of what was found on the
291 Example : $cmp = $stride_obj->pdbCompound();
300 return join( ' ', @
{ $self->{ 'HEAD' }->{ 'CMP' } } );
306 Usage : returns date given in PDB file
308 Example : $pdb_date = $stride_obj->pdbDate();
317 return $self->{ 'DATE' };
323 Usage : returns string of characters found on the PDB header line
325 Example : $head = $stride_obj->pdbHeader();
334 return $self->{ 'HEAD' }->{ 'HEADER' };
340 Usage : returns string of what was found on SRC lines
342 Example : $src = $stride_obj->pdbSource();
351 return join( ' ', @
{ $self->{ 'HEAD' }->{ 'SRC' } } );
354 # RESIDUE SPECIFIC ACCESSORS
359 Usage : returns 1 letter abbr. of the amino acid specified by
362 Examples : $aa = $stride_obj->resAA( RESIDUE_ID );
363 Returns : scalar character
372 my ( $ord, $chain ) = $self->_toOrdChain( @args );
373 return ( $AATable{$self->{'ASG'}->{$chain}->[$ord]->[$ASGTable{'aa'}]} );
379 Usage : returns phi angle of specified residue
381 Example : $phi = $stride_obj->resPhi( RESIDUE_ID );
391 my ( $ord, $chain ) = $self->_toOrdChain( @args );
392 return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'phi' } ];
398 Usage : returns psi angle of specified residue
400 Example : $psi = $stride_obj->resPsi( RESIDUE_ID );
410 my ( $ord, $chain ) = $self->_toOrdChain( @args );
411 return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'psi' } ];
417 Usage : returns stride calculated surface area of specified residue
419 Example : $sa = $stride_obj->resSolvAcc( RESIDUE_ID );
429 my ( $ord, $chain ) = $self->_toOrdChain( @args );
430 return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'surfArea' } ];
436 Usage : returns stride calculated surface area of specified residue
438 Example : $sa = $stride_obj->resSurfArea( RESIDUE_ID );
448 my ( $ord, $chain ) = $self->_toOrdChain( @args );
449 return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'surfArea' } ];
455 Usage : gives one letter abbr. of stride determined secondary
456 structure of specified residue
458 Example : $ss = $stride_obj->resSecStr( RESIDUE_ID );
459 Returns : one of: 'H' => Alpha Helix
462 'E' => Extended conformation
463 'B' or 'b' => Isolated bridge
467 # NOTE: This range is slightly DIFFERENT from the
468 # DSSP method of the same name
477 my ( $ord, $chain ) = $self->_toOrdChain( @args );
478 return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'ssAbbr' } ];
484 Usage : gives one letter summary of secondary structure of
485 specified residue. More general than secStruc()
487 Example : $ss_sum = $stride_obj->resSecStrSum( RESIDUE_ID );
488 Returns : one of: 'H' (helix), 'B' (beta), 'T' (turn), or 'C' (coil)
489 Args : residue identifier(s) ( SEE INTRO NOTE )
497 my $ss_char = $self->resSecStr( @args );
499 if ( $ss_char eq 'H' || $ss_char eq 'G' || $ss_char eq 'I' ) {
502 if ( $ss_char eq 'E' || $ss_char eq 'B' || $ss_char eq 'b' ) {
505 if ( $ss_char eq 'T' ) {
517 Title : resSecStrName
518 Usage : gives full name of the secondary structural element
519 classification of the specified residue
521 Example : $ss_name = $stride_obj->resSecStrName( RESIDUE_ID );
522 Returns : scalar string
531 my ( $ord, $chain ) = $self->_toOrdChain( @args );
532 return $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'ssName' } ];
538 Usage : returns stride determined contiguous secondary
539 structural elements as specified on the LOC lines
541 Example : $loc_pnt = $stride_obj->strideLocs();
542 Returns : pointer to array of 5 element arrays.
543 0 => stride name of structural element
544 1 => first residue pdb key (including insertion code, if app.)
545 2 => first residue chain id
546 3 => last residue pdb key (including insertion code, if app.)
547 4 => last residue chain id
548 NOTE the differences between this range and the range of SecBounds()
556 return $self->{ 'LOC' };
559 # VALUE ADDED METHODS (NOT JUST PARSE/REPORT)
564 Usage : gets residue ids of boundary residues in each
565 contiguous secondary structural element of specified
568 Example : $ss_bound_pnt = $stride_obj->secBounds( 'A' );
569 Returns : pointer to array of 3 element arrays. First two elements
570 are the PDB IDs of the start and end points, respectively
571 and inclusively. The last element is the STRIDE secondary
572 structural element code (same range as resSecStr).
573 Args : chain identifier ( one character ). If none, '-' is assumed
579 # Requires a chain name. If left blank, we assume ' ' which equals '-'
584 $chain = '-' if ( !( $chain ) || $chain eq ' ' || $chain eq '-' );
586 # if we've memoized this one, use that
587 if ( $self->{ 'SecBounds' }->{ $chain } ) {
588 return $self->{ 'SecBounds' }->{ $chain };
591 #check to make sure chain is valid
592 if ( !( $self->{ 'ASG' }->{ $chain } ) ) {
593 $self->throw( "No such chain: $chain\n" );
596 my $cur_element = $self->{ 'ASG' }->{ $chain }->[ 1 ]->
597 [ $ASGTable{ 'ssAbbr' } ];
601 for ( $i = 2; $i <= $#{$self->{'ASG'}->{$chain}}; $i++ ) {
602 if ( $self->{ 'ASG' }->{ $chain }->[ $i ]->[ $ASGTable{ 'ssAbbr' } ]
604 push( @SecBounds, [ $beg, $i -1 , $cur_element ] );
606 $cur_element = $self->{ 'ASG' }->{ $chain }->[ $i ]->
607 [ $ASGTable{ 'ssAbbr' } ];
611 if ( $self->{ 'ASG' }->{ $chain }->[ $i ]->[ $ASGTable{ 'ssAbbr' } ]
613 push( @SecBounds, [ $beg, $i, $cur_element ] );
616 push( @SecBounds, [ $beg, $i - 1, $cur_element ],
617 [ $i, $i, $self->{ 'ASG' }->{ $chain }->[ $i ]->
618 [ $ASGTable{ 'ssAbbr' } ] ] );
621 $self->{ 'SecBounds' }->{ $chain } = \
@SecBounds;
622 return $self->{ 'SecBounds' }->{ $chain };
628 Usage : gives array chain I.D.s (characters)
630 Example : @chains = $stride_obj->chains();
631 Returns : array of characters
639 my @chains = keys ( %{ $self->{ 'ASG' } } );
646 Usage : returns a Bio::PrimarySeq object which represents an
647 approximation at the sequence of the specified chain.
648 Function : For most chain of most entries, the sequence returned by
649 this method will be very good. However, it it inherently
650 unsafe to rely on STRIDE to extract sequence information about
651 a PDB entry. More reliable information can be obtained from
652 the PDB entry itself. If a second option is given
653 (and evaluates to true), the sequence generated will
654 have 'X' in spaces where the pdb residue numbers are
655 discontinuous. In some cases this results in a
656 better sequence object (when the discontinuity is
657 due to regions which were present, but could not be
658 resolved). In other cases, it will result in a WORSE
659 sequence object (when the discontinuity is due to
660 historical sequence numbering and all sequence is
662 Example : $pso = $dssp_obj->getSeq( 'A' );
663 Returns : (pointer to) a PrimarySeq object
664 Args : Chain identifier. If none given, '-' is assumed.
678 if ( $self->{ 'Seq' }->{ $chain } ) {
679 return $self->{ 'Seq' }->{ $chain };
692 $num_res = $self->numResidues( $chain );
693 $last_res_num = $self->_pdbNum( 1, $chain );
694 for ( $i = 1; $i <= $num_res; $i++ ) {
696 $cur_res_num = $self->_pdbNum( $i, $chain );
697 $step = $cur_res_num - $last_res_num;
699 $seq .= 'X' x
( $step - 1 );
702 $seq .= $self->_resAA( $i, $chain );
703 $last_res_num = $cur_res_num;
706 $id = $self->pdbID();
709 $self->{ 'Seq' }->{ $chain } = Bio
::PrimarySeq
->new( -seq
=> $seq,
711 -moltype
=> 'protein'
714 return $self->{ 'Seq' }->{ $chain };
717 =head1 INTERNAL METHODS
722 Usage : fetches the numeric portion of the identifier for a given
723 residue as reported by the pdb entry. Note, this DOES NOT
724 uniquely specify a residue. There may be an insertion code
725 and/or chain identifier differences.
727 Example : $pdbNum = $self->pdbNum( 3, 'A' );
729 Args : valid ordinal num / chain combination
738 if ( !( $self->{ 'ASG' }->{ $chain }->[ $ord ] ) ) {
739 $self->throw( "No such ordinal $ord in chain $chain.\n" );
741 my $pdb_junk = $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'resNum' } ];
743 ( $num_part ) = ( $pdb_junk =~ /(-*\d+).*/ );
750 Usage : returns 1 letter abbr. of the amino acid specified by
753 Examples : $aa = $stride_obj->_resAA( 3, '-' );
754 Returns : scalar character
755 Args : ( ord. num, chain )
764 if ( !( $self->{ 'ASG' }->{ $chain }->[ $ord ] ) ) {
765 $self->throw( "No such ordinal $ord in chain $chain.\n" );
767 return ( $AATable{$self->{'ASG'}->{$chain}->[$ord]->[$ASGTable{'aa'}]} );
773 Usage : fetches the Insertion code for this residue.
775 Example : $pdb_ins_co = $self->_pdb_ins_co( 15, 'B' );
777 Args : ordinal number and chain
786 if ( !( $self->{ 'ASG' }->{ $chain }->[ $ord ] ) ) {
787 $self->throw( "No such ordinal $ord in chain $chain.\n" );
789 my $pdb_junk = $self->{ 'ASG' }->{ $chain }->[ $ord ]->[ $ASGTable{ 'resNum' } ];
791 ( $letter_part ) = ( $pdb_junk =~ /\d+(\D+)/ ); # insertion code can be any
792 # non-word character(s)
799 Usage : takes any set of residue identifying parameters and
800 wrestles them into a two element array: the chain and the ordinal
801 number of this residue. This two element array can then be
802 efficiently used as keys in many of the above accessor methods
803 ('#A:B') or ( #, 'A', 'B' )
805 || - Chain ID (blank for single chain)
806 |--- Insertion code for this residue. Blank for most residues.
807 |--- Numeric portion of residue ID.
811 --- Numeric portion of residue ID. If there is only one chain and
812 it has no ID AND there is no residue with an insertion code at this
813 number, then this can uniquely specify a residue.
815 # ('#:C) or ( #, 'C' )
818 ---Numeric portion of residue ID.
820 If a residue is incompletely specified then the first residue that
821 fits the arguments is returned. For example, if 19 is the argument
822 and there are three chains, A, B, and C with a residue whose number
823 is 19, then 19:A will be returned (assuming its listed first).
826 Example : my ( $ord, $chain ) = $self->_toOrdChain( @args );
827 Returns : two element array
828 Args : valid set of residue identifier(s) ( SEE NOTE ABOVE )
837 my ( $key_num, $chain_id, $ins_code, $key, $i );
839 # check to see how many args are given
840 if ( $#_ >= 1 ) { # multiple args
842 if ( $#_ >= 1 ) { # still multiple args => ins. code, too
846 else { # just one more arg. => chain_id
850 else { # only single arg. Might be number or string
852 if ( $arg_str =~ /:/ ) {
853 # a chain is specified
854 ( $chain_id ) = ( $arg_str =~ /:(.)/);
857 if ( $arg_str =~ /[A-Z]|[a-z]/ ) {
858 # an insertion code is specified
859 ( $ins_code ) = ( $arg_str =~ /([A-Z]|[a-z])/ );
860 $arg_str =~ s/[A-Z]|[a-z]//g;
862 #now, get the number bit-> everything still around
866 $key = "$key_num$ins_code";
867 if ( !( $chain_id ) || $chain_id eq ' ' ) {
871 if ( !( $self->{ 'ASG' }->{ $chain_id } ) ) {
872 $self->throw( "No such chain: $chain_id" );
875 for ( $i = 1; $i <= $#{$self->{ 'ASG' }->{ $chain_id }}; $i++ ) {
876 if ( $self->{ 'ASG' }->{ $chain_id }->[ $i ]->[ $ASGTable{ 'resNum' } ] eq
878 return ( $i, $chain_id );
882 $self->throw( "No such key: $key" );
889 Usage : as name suggests, parses stride output, creating object
891 Example : $self->_parse( $io );
893 Args : valid Bio::Root::IO object
901 my $file = $io->_fh();
904 if ( $self->_parseTop( $io ) ) {
905 $self->throw( "Not stride output" );
908 # Parse the HDR, CMP, SCR, and AUT lines
909 $self->_parseHead( $io );
911 # Parse the CHN, SEQ, STR, and LOC lines
912 $self->_parseSummary( $io ); # we're ignoring this
914 # Parse the ASG lines
915 $self->_parseASG( $io );
921 Usage : makes sure this looks like stride output
933 my $file = $io->_fh();
935 if ( $cur =~ /^REM ---/ ) {
945 Function : HDR, CMP, SRC, and AUT lines
956 my $file = $io->_fh();
959 my ( @elements, @cmp, @src, @aut );
964 while ( $cur =~ /^REM / ) {
968 if ( $cur =~ /^HDR / ) {
969 @elements = split( /\s+/, $cur );
972 $self->{ 'PDB' } = pop( @elements );
973 $self->{ 'DATE' } = pop( @elements );
974 # now, everything else is "header" except for the word
976 $element = join( ' ', @elements );
977 $head{ 'HEADER' } = $element;
981 while ( $cur =~ /^CMP / ) {
982 ( $cur ) = ( $cur =~ /^CMP\s+(.+?)\s*\w{4}$/ );
987 while ( $cur =~ /^SRC / ) {
988 ( $cur ) = ( $cur =~ /^SRC\s+(.+?)\s*\w{4}$/ );
993 while ( $cur =~ /^AUT / ) {
994 ( $cur ) = ( $cur =~ /^AUT\s+(.+?)\s*\w{4}$/ );
999 $head{ 'CMP' } = \
@cmp;
1000 $head{ 'SRC' } = \
@src;
1001 $head{ 'AUT' } = \
@aut;
1002 $self->{ 'HEAD' } = \
%head;
1005 =head2 _parseSummary
1007 Title : _parseSummary
1008 Usage : parses LOC lines
1020 my $file = $io->_fh();
1024 my ( @elements, @cur );
1025 my @LOC_lookup = ( [ 5, 12 ], # Element name
1026 # reduntdant [ 18, 3 ], # First residue name
1027 [ 22, 5 ], # First residue PDB number
1028 [ 28, 1 ], # First residue Chain ID
1029 # redundant [ 35, 3 ], # Last residue name
1030 [ 40, 5 ], # Last residue PDB number
1031 [ 46, 1 ] ); # Last residue Chain ID
1034 while ( $cur =~ /^REM |^STR |^SEQ |^CHN / ) {
1038 while ( $cur =~ /^LOC / ) {
1039 foreach $bound_set ( @LOC_lookup ) {
1040 $element = substr( $cur, $bound_set->[ 0 ], $bound_set->[ 1 ] );
1041 $element =~ s/\s//g;
1042 push( @cur, $element );
1044 push( @elements, [ @cur ] );
1048 $self->{ 'LOC' } = \
@elements;
1055 Usage : parses ASG lines
1067 my $file = $io->_fh();
1071 my ( $chain, $last_chain );
1074 my ( @cur, @elements );
1075 my @ASG_lookup = ( [ 5, 3 ], # Residue name
1076 # [ 9, 1 ], # Chain ID
1077 [ 10, 5 ], # PDB residue number (w/ins.code)
1078 # [ 16, 4 ], # ordinal stride number
1079 [ 24, 1 ], # one letter sec. stru. abbr.
1080 [ 26, 13], # full sec. stru. name
1081 [ 42, 7 ], # phi angle
1082 [ 52, 7 ], # psi angle
1083 [ 64, 5 ] );# residue solv. acc.
1085 while ( $cur =~ /^REM / ) {
1089 while ( $cur =~ /^ASG / ) {
1090 # get ordinal number for array key
1091 $ord_num = substr( $cur, 16, 4 );
1092 $ord_num =~ s/\s//g;
1095 $chain = substr( $cur, 9, 1 );
1097 if ( $last_chain && ( $chain ne $last_chain ) ) {
1098 $ASG{ $last_chain } = [ @elements ];
1102 # now get the rest of the info on this line
1103 foreach $bound_set ( @ASG_lookup ) {
1104 $element = substr( $cur, $bound_set->[ 0 ],
1105 $bound_set->[ 1 ] );
1106 $element =~ s/\s//g;
1107 push( @cur, $element );
1109 $elements[ $ord_num ] = [ @cur ];
1112 $last_chain = $chain;
1115 $ASG{ $chain } = [ @elements ];
1117 $self->{ 'ASG' } = \
%ASG;