Add tsibley's work to Changes file
[bioperl-live.git] / scripts / utilities / bp_nrdb.pl
blob08183142d0f0aa69bb48ee24da6d0c6b85013e7d
1 #!perl
3 # Author Jason Stajich <jason-at-bioperl-dot-org>
4 #
5 # Make a non-redundant database based on sequence (not on ID!)
6 # This script is still in progress but is intended to mimic what
7 # Warren Gish's nrdb does
9 # It requires that Digest::MD5 is installed (for now)
11 =head1 NAME
13 bp_nrdb.PLS - a script to emulate Warren Gish's nrdb, make a unique sequence database from a set of input databases
15 =head1 SYNOPSIS
18 Usage:
19 bp_nrdb.PLS [options] file1 file2 file3
21 Alternative usage
22 bp_nrdb.PLS -p [options] file1 id1 file2 id2 file3 id3
24 =head1 DESCRIPTION
26 This script will create a unique database of sequences
27 (quasi-nonredundant). The options are:
29 -o filename - the filename the db is written (STDOUT by default)
30 -a filename - the filename to append the db to
31 -l# - minimum required sequence length
32 -i - do not check for duplicates
33 -n# - max number of descriptions to report per seq
34 -d# - delimiter to use between consecutive descriptions
35 -p - use database id prefixes from command line
37 =head1 AUTHOR
39 Jason Stajich, jason-at-bioperl-dot-org
41 =cut
43 use strict;
44 use warnings;
45 use Bio::SeqIO;
46 use Getopt::Long;
48 use Digest::MD5 qw(md5_hex);
49 my ($output,$append,$min_len,
50 $no_duplicate_check,$desc_count,
51 $delimiter, $expect_prefixes,$help);
52 $delimiter = ';';
54 GetOptions(
55 'o|output:s' => \$output,
56 'a|append:s' => \$append,
57 'n:s' => \$desc_count,
58 'l:s' => \$min_len,
59 'd:s' => \$delimiter,
60 'p' => \$expect_prefixes,
61 'i' => \$no_duplicate_check,
62 'h' => \$help,
65 die("must supply a positive integer for -d") if ( defined $desc_count &&
66 ( $desc_count !~ /^\d+$/ ||
67 $desc_count < 1) );
68 die("must supply a positive integer for -l") if ( defined $min_len &&
69 ( $min_len !~ /^\d+$/ ||
70 $min_len < 1) );
71 my @files;
73 if( $help || ! @ARGV ) {
74 exec('perldoc',$0);
75 exit(0);
77 while( @ARGV ) {
79 my ($file, $id) = (undef,'');
80 if( $expect_prefixes ) {
81 ($file,$id) = (shift @ARGV, shift @ARGV);
82 if( ! $id ) {
83 die("Must provide 'name id' pairing of dbfile and id");
85 } else {
86 $file = shift @ARGV;
88 push @files, [ $file,$id];
92 my $out;
93 if( $append ) {
94 $out = new Bio::SeqIO(-file => ">>$append");
95 } elsif( $output ) {
96 $out = new Bio::SeqIO(-file => ">$output");
97 } else {
98 $out = new Bio::SeqIO(); # use STDOUT
101 my %unique;
102 my %seqcount;
103 my $counter = 0;
104 foreach my $pair ( @files ) {
105 my ($file,$id) = @$pair;
106 my $in = new Bio::SeqIO(-file => $file);
107 while( my $seq = $in->next_seq ) {
108 next if defined $min_len && $seq->length < $min_len;
109 if( $id ) {
110 $seq->display_id("$id:".$seq->display_id);
112 my $s = lc($seq->seq());
113 my $md5sum = md5_hex($s);
114 if( $no_duplicate_check ) {
115 $md5sum = $counter++;
118 if( defined $unique{$md5sum} ) {
119 $seqcount{$md5sum}++;
120 next if defined $desc_count && $seqcount{$md5sum++} > $desc_count;
121 my $desc = $unique{$md5sum}->description;
122 my $id2 = sprintf("%s %s:%s %s",$delimiter,
123 $id,$seq->display_id,$seq->description);
124 $unique{$md5sum}->desc($desc . $id2);
125 } else {
126 $unique{$md5sum} = $seq;
131 foreach my $seq ( values %unique ) {
132 $out->write_seq($seq);
135 __END__