scripts/utilities/bp_nrdb.pl

   1 #!perl
   2
   3 # Author Jason Stajich <jason-at-bioperl-dot-org>
   4 #
   5 # Make a non-redundant database based on sequence (not on ID!)
   6 # This script is still in progress but is intended to mimic what
   7 # Warren Gish's nrdb does
   8
   9 # It requires that Digest::MD5 is installed (for now)
  10
  11 =head1 NAME
  12
  13 bp_nrdb.PLS - a script to emulate Warren Gish's nrdb, make a unique sequence database from a set of input databases
  14
  15 =head1 SYNOPSIS
  16
  17
  18 Usage:
  19   bp_nrdb.PLS [options] file1 file2 file3
  20
  21 Alternative usage
  22   bp_nrdb.PLS -p [options] file1 id1 file2 id2 file3 id3
  23
  24 =head1 DESCRIPTION
  25
  26 This script will create a unique database of sequences
  27 (quasi-nonredundant).  The options are:
  28
  29    -o filename          - the filename the db is written (STDOUT by default)
  30    -a filename          - the filename to append the db to
  31    -l#                  - minimum required sequence length
  32    -i                   - do not check for duplicates
  33    -n#                  - max number of descriptions to report per seq
  34    -d#                  - delimiter to use between consecutive descriptions
  35    -p                   - use database id prefixes from command line
  36
  37 =head1 AUTHOR
  38
  39 Jason Stajich, jason-at-bioperl-dot-org
  40
  41 =cut
  42
  43 use strict;
  44 use warnings;
  45 use Bio::SeqIO;
  46 use Getopt::Long;
  47
  48 use Digest::MD5 qw(md5_hex);
  49 my ($output,$append,$min_len,
  50     $no_duplicate_check,$desc_count,
  51     $delimiter, $expect_prefixes,$help);
  52 $delimiter = ';';
  53
  54 GetOptions(
  55            'o|output:s'    => \$output,
  56            'a|append:s'    => \$append,
  57            'n:s'           => \$desc_count,
  58            'l:s'           => \$min_len,
  59            'd:s'           => \$delimiter,
  60            'p'             => \$expect_prefixes,
  61            'i'             => \$no_duplicate_check,
  62            'h'             => \$help,
  63            );
  64
  65 die("must supply a positive integer for -d") if ( defined $desc_count &&
  66                                                   ( $desc_count !~ /^\d+$/ ||
  67                                                     $desc_count < 1) );
  68 die("must supply a positive integer for -l") if ( defined $min_len &&
  69                                                   ( $min_len !~ /^\d+$/ ||
  70                                                     $min_len < 1) );
  71 my @files;
  72
  73 if( $help || ! @ARGV ) {
  74     exec('perldoc',$0);
  75     exit(0);
  76 }
  77 while( @ARGV ) {
  78
  79     my ($file, $id) = (undef,'');
  80     if( $expect_prefixes ) {
  81         ($file,$id) = (shift @ARGV, shift @ARGV);
  82         if( ! $id ) {
  83             die("Must provide 'name id' pairing of dbfile and id");
  84         }
  85     } else {
  86         $file = shift @ARGV;
  87     }
  88     push @files, [ $file,$id];
  89 }
  90
  91
  92 my $out;
  93 if( $append ) {
  94     $out = new Bio::SeqIO(-file => ">>$append");
  95 } elsif( $output ) {
  96     $out = new Bio::SeqIO(-file => ">$output");
  97 } else {
  98     $out = new Bio::SeqIO(); # use STDOUT
  99 }
 100
 101 my %unique;
 102 my %seqcount;
 103 my $counter = 0;
 104 foreach my $pair ( @files ) {
 105     my ($file,$id) = @$pair;
 106     my $in = new Bio::SeqIO(-file => $file);
 107     while( my $seq = $in->next_seq ) {
 108         next if defined $min_len && $seq->length < $min_len;
 109         if( $id ) {
 110             $seq->display_id("$id:".$seq->display_id);
 111         }
 112         my $s = lc($seq->seq());
 113         my $md5sum = md5_hex($s);
 114         if( $no_duplicate_check ) {
 115             $md5sum = $counter++;
 116         }
 117
 118         if( defined $unique{$md5sum} ) {
 119             $seqcount{$md5sum}++;
 120             next if defined $desc_count && $seqcount{$md5sum++} > $desc_count;
 121             my $desc = $unique{$md5sum}->description;
 122             my $id2 = sprintf("%s %s:%s %s",$delimiter,
 123                               $id,$seq->display_id,$seq->description);
 124             $unique{$md5sum}->desc($desc . $id2);
 125         } else {
 126             $unique{$md5sum} = $seq;
 127         }
 128     }
 129 }
 130
 131 foreach my $seq ( values %unique ) {
 132     $out->write_seq($seq);
 133 }
 134
 135 __END__