scripts/seqstats/aacomp.PLS

   1 #!perl -w
   2 use strict;
   3 use Carp;
   4 # $Id$
   5
   6 use Bio::SeqIO;
   7 use Getopt::Long;
   8 use Bio::SeqUtils;
   9 use Bio::Tools::IUPAC;
  10
  11 my $table = new Bio::SeqUtils;
  12 my @BASES = $table->valid_aa(0);
  13 my %all = $table->valid_aa(2);
  14 my ($file,$format,$help) = ( undef, 'fasta');
  15 GetOptions(
  16            'i|in:s'  => \$file,
  17            'f|format:s' => \$format,
  18            'h|help|?'  => \$help,
  19            );
  20
  21 my $USAGE = "usage: aacomp [-f format] filename\n\tdefault format is fasta\n";
  22 $file = shift unless $file;
  23
  24 die $USAGE if $help;
  25
  26 my $seqin;
  27 if( defined $file ) {
  28     print "Could not open file [$file]\n$USAGE" and exit unless -e $file;
  29     $seqin = new Bio::SeqIO(-format => $format,
  30                             -file   => $file);
  31 } else {
  32     $seqin = new Bio::SeqIO(-format => $format,
  33                             -fh     => \*STDIN);
  34 }
  35
  36 my %composition;
  37 my $total;
  38 foreach my $base ( @BASES ) {
  39     $composition{$base} = 0;
  40 }
  41 while ( my $seq = $seqin->next_seq ) {
  42     if( $seq->alphabet ne 'protein' ) {
  43         confess("Must only provide amino acid sequences to aacomp...skipping this seq");
  44         next;
  45     }
  46     foreach my $base ( split(//,$seq->seq()) ) {
  47         $composition{uc $base}++;
  48         $total++;
  49     }
  50 }
  51
  52 printf("%d aa\n",$total);
  53 printf("%5s %4s\n", 'aa', '#' );
  54 my $ct = 0;
  55 foreach my $base ( @BASES ) {
  56     printf(" %s %s %3d\n", $base, $all{$base}, $composition{$base} );
  57     $ct += $composition{$base};
  58 }
  59 printf( "%6s %s\n", '','-'x5);
  60 printf( "%6s %3d\n", '',$ct);
  61
  62
  63 __END__
  64
  65
  66 =head1 NAME
  67
  68 aacomp - amino acid composition of protein sequences
  69
  70 =head1 SYNOPSIS
  71
  72   aacomp [-f/--format FORMAT] [-h/--help] filename
  73   or
  74   aacomp [-f/--format FORMAT] < filename
  75   or
  76   aacomp [-f/--format FORMAT] -i filename
  77
  78 =head1 DESCRIPTION
  79
  80 This scripts prints out the count of amino acids over all protein
  81 sequences from the input file.
  82
  83 =head1 OPTIONS
  84
  85 The default sequence format is fasta.
  86
  87 The sequence input can be provided using any of the three methods:
  88
  89 =over 3
  90
  91 =item unnamed argument
  92
  93   aacomp filename
  94
  95 =item named argument
  96
  97   aacomp -i filename
  98
  99 =item standard input
 100
 101   aacomp < filename
 102
 103 =back
 104
 105 =head1 FEEDBACK
 106
 107 =head2 Mailing Lists
 108
 109 User feedback is an integral part of the evolution of this and other
 110 Bioperl modules. Send your comments and suggestions preferably to
 111 the Bioperl mailing list.  Your participation is much appreciated.
 112
 113   bioperl-l@bioperl.org                  - General discussion
 114   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 115
 116 =head2 Reporting Bugs
 117
 118 Report bugs to the Bioperl bug tracking system to help us keep track
 119 of the bugs and their resolution. Bug reports can be submitted via the
 120 web:
 121
 122   http://bugzilla.open-bio.org/
 123
 124 =head1 AUTHOR - Jason Stajich
 125
 126 Email jason@bioperl.org
 127
 128 =head1 HISTORY
 129
 130 Based on aacomp.c from an old version of EMBOSS
 131
 132 =cut