Add tsibley's work to Changes file
[bioperl-live.git] / scripts / DB / bp_flanks.pl
blobda92d1de09719ab1a07659266867c1e2dbedf9e3
1 #!perl
2 # -*-Perl-*-
4 # Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
5 # Finding flanking sequences for a variant.
8 # v. 1 16 Mar 2001
9 # v. 1.1 9 Aug 2001 interface change, more info in fasta header
10 # v. 2.0 23 Nov 2001 new code from the flanks CGI program
11 # support for EMBL-like positions
12 # v. 3.0 21 Feb 2003 new command line interface
15 use Bio::PrimarySeq;
16 use Bio::SeqIO;
17 use Bio::DB::EMBL;
18 use Bio::DB::GenBank;
19 use Getopt::Long;
20 use strict;
21 use warnings;
23 use constant VERSION => '3.0';
25 my $help = '';
26 my $flank = 100; # flank length on both sides of the region
27 my $in_format = 'EMBL'; # format of the file to read in
28 my @pos; # position(s) in the sequence
31 GetOptions ("help" => \$help, "flanklength=i" => \$flank,
32 "position=s" => \@pos );
34 @pos = split(/,/,join(',',@pos));
36 system("perldoc $0") if $help;
37 system("perldoc $0") unless @ARGV;
38 print STDERR "\nYou need to provide --position option\n" and system("perldoc $0")
39 unless @pos;
41 my $file = shift;
42 $file || system("perldoc $0");
44 my $seq = get_seq($file);
45 exit 0 unless $seq;
47 &extract($seq, \@pos, $flank);
50 ## end main
53 sub get_seq {
54 my ($file) = @_;
55 my $IN_FORMAT = 'EMBL'; # format of the local file on disk
57 if (-e $file ) { # local file
58 my $in = Bio::SeqIO->new('-file' => $file,
59 '-format' => $IN_FORMAT);
60 $seq = $in->next_seq();
62 elsif ($file =~ /\./) { # sequence version from GenBank
63 eval {
64 my $gb = new Bio::DB::GenBank;
65 $seq = $gb->get_Seq_by_version($file);
67 } else { # plain accession mumber from more reliable EMBL
68 eval {
69 my $gb = new Bio::DB::EMBL;
70 $seq = $gb->get_Seq_by_acc($file);
74 print STDERR "Could not find sequence [$file]" && return unless $seq;
75 return $seq;
78 sub extract {
79 my ($seq, $pos, $flank) = @_;
80 my ($out_seq);
81 my $OUT_FORMAT = 'FASTA'; # output format, going into STDOUT
82 my $strand = 1; # default for the forward strand
84 my $out = Bio::SeqIO->new('-format' => $OUT_FORMAT);
86 my $count = 1;
87 foreach my $idpos (@$pos) {
89 my ($id, $pos_range, $start, $end, $allele_len);
90 my $inbetween = 0; # handle 23^24 notation as well as plain integer (24)
91 # but set flag and make corrections when needed
93 if ($idpos =~ /:/ ) { # id and position separator
94 ($id, $pos_range) = split /:/, $idpos;
95 } else { # no id
96 $id = $count;
97 $count++;
98 $pos_range = $idpos;
100 $strand = -1 if $pos_range =~ /-$/; # opposite strand
101 $pos_range = $1 if $pos_range =~ /(.+)-/; # remove trailing '-'
103 if ($pos_range =~ /\^/) { # notation 23^24 used
104 ($start, $end) = split /\^/, $pos_range;
105 print STDERR $id, ": Give adjacent nucleotides flanking '^' character, not [",
106 $start, "] and [", $end, "]\n" and next
107 unless $end == $start + 1;
108 $end = $start;
109 $inbetween = 1;
110 } else { # notation 23..24 used
111 ($start, $end) = split /\.\./, $pos_range;
113 $end ||= $start; # notation 23 used
114 print STDERR $id, ": Start can not be larger than end. Not [",
115 $start, "] and [", $end, "]\n" and next
116 if $start > $end;
117 $allele_len = $end - $start;
119 # sanity checks
120 next unless defined $start && $start =~ /\d+/ && $start != 0;
121 print STDERR "Position '$start' not in sequence '$file'\n", and next
122 if $start < 1 or $start > $seq->length;
123 print STDERR "End position '$end' not in sequence '$file'\n", and next
124 if $end < 1 or $end > $seq->length;
126 # determine nucleotide positions
127 # left edge
128 my $five_start = $start - $flank;
129 $five_start = 1 if $five_start < 1; # not outside the sequence
130 # right edge
131 my $three_end = $start + $allele_len + $flank;
132 $three_end = $seq->length if $start + $allele_len + $flank > $seq->length;
133 $three_end-- if $inbetween;
135 # extract the sequences
136 my $five_prime = lc $seq->subseq($five_start , $start - 1); # left flank
137 my $snp = uc $seq->subseq($start, $end); # allele (always > 0 length)
138 $snp = lc $snp if $inbetween;
140 my $three_prime; # right flank
141 if ($end < $seq->length) { # make sure we are not beyond reference sequece
142 $three_prime = lc $seq->subseq($end + 1, $three_end);
143 } else {
144 $three_prime = '';
147 # allele positions in local, extracted coordinates
148 my $locpos = length($five_prime) + 1;
149 my $loc_end;
150 if ($allele_len) {
151 $loc_end = "..". ($locpos+$allele_len);
152 } else {
153 $loc_end = '';
154 $loc_end = '^'. ($locpos+1) if $inbetween;
156 # build FASTA id and description line
157 my $fastaid = uc($id). "_". uc($file).
158 " oripos=$pos_range strand=$strand allelepos=$locpos$loc_end";
160 #build BioPerl sequence objects
161 if ($strand == -1) {
162 my $five_prime_seq = new Bio::PrimarySeq(-alphabet=>'dna',-seq=>$five_prime);
163 my $snp_seq = new Bio::PrimarySeq(-alphabet=>'dna',-seq=>$snp);
164 my $three_prime_seq = new Bio::PrimarySeq(-alphabet=>'dna',-seq=>$three_prime);
166 my $str = $three_prime_seq->revcom->seq. " ".
167 $snp_seq->revcom->seq. " ". $five_prime_seq->revcom->seq;
168 $str =~ s/ //g;
169 $out_seq = new Bio::PrimarySeq (-id => $fastaid,
170 -alphabet=>'dna',
171 -seq => $str );
172 } else {
173 my $str = $five_prime. " ". $snp. " ". $three_prime;
174 $str =~ s/ //g;
175 $out_seq = new Bio::PrimarySeq (-id => $fastaid,
176 -alphabet=>'dna',
177 -seq => $str );
179 $out->write_seq($out_seq); # print sequence out
185 =head1 NAME
187 bp_flanks - finding flanking sequences for a variant in a sequence position
189 =head1 SYNOPSIS
191 bp_flanks --position POS [-p POS ...] [--flanklen INT]
192 accession | filename
194 =head1 DESCRIPTION
196 This script allows you to extract a subsequence around a region of
197 interest from an existing sequence. The output if fasta formatted
198 sequence entry where the header line contains additional information
199 about the location.
201 =head1 OPTIONS
203 The script takes one unnamed argument which be either a file name in
204 the local file system or a nucleotide sequence accession number.
207 -p Position uses simple nucleotide sequence feature table
208 --position notation to define the region of interest, typically a
209 SNP or microsatellite repeat around which the flanks are
210 defined.
212 There can be more than one position option or you can
213 give a comma separated list to one position option.
215 The format of a position is:
217 [id:] int | range | in-between [-]
219 The optional id is the name you want to call the new
220 sequence. If it not given in joins running number to the
221 entry name with an underscore.
223 The position is either a point (e.g. 234), a range (e.g
224 250..300) or insertion point between nucleotides
225 (e.g. 234^235)
227 If the position is not completely within the source
228 sequence the output sequence will be truncated and it
229 will print a warning.
231 The optional hyphen [-] at the end of the position
232 indicates that that you want the retrieved sequence to be
233 in the opposite strand.
236 -f Defaults to 100. This is the length of the nucleotides
237 --flanklen sequence retrieved on both sides of the given position.
239 If the source file does not contain
241 =head1 OUTPUT FORMAT
243 The output is a fasta formatted entry where the description file
244 contains tag=value pairs for information about where in the original
245 sequence the subsequence was taken.
247 The ID of the fasta entry is the name given at the command line joined
248 by hyphen to the filename or accesion of the source sequence. If no id
249 is given a series of consequtive integers is used.
251 The tag=value pairs are:
253 =over 3
255 =item oripos=int
257 position in the source file
259 =item strand=1|-1
261 strand of the sequence compared to the source sequence
263 =item allelepos=int
265 position of the region of interest in the current entry.
266 The tag is the same as used by dbSNP@NCBI
268 =back
270 The sequence highlights the allele variant position by showing it in
271 upper case and rest of the sequence in lower case characters.
273 =head1 EXAMPLE
275 % bp_flanks ~/seq/ar.embl
277 >1_/HOME/HEIKKI/SEQ/AR.EMBL oripos=100 strand=1 allelepos=100
278 taataactcagttcttatttgcacctacttcagtggacactgaatttggaaggtggagga
279 ttttgtttttttcttttaagatctgggcatcttttgaatCtacccttcaagtattaagag
280 acagactgtgagcctagcagggcagatcttgtccaccgtgtgtcttcttctgcacgagac
281 tttgaggctgtcagagcgct
284 =head1 TODO
286 The input files are assumed to be in EMBL format and the sequences are
287 retrieved only from the EMB database. Make this more generic and use
288 the registry.
291 head1 FEEDBACK
293 =head2 Mailing Lists
295 User feedback is an integral part of the evolution of this and other
296 Bioperl modules. Send your comments and suggestions preferably to the
297 Bioperl mailing lists Your participation is much appreciated.
299 bioperl-l@bioperl.org - General discussion
300 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
302 =head2 Reporting Bugs
304 Report bugs to the Bioperl bug tracking system to help us keep track
305 the bugs and their resolution. Bug reports can be submitted via the
306 web:
308 https://github.com/bioperl/bioperl-live/issues
310 =head1 AUTHOR - Heikki Lehvaslaiho
312 Email: E<lt>heikki-at-bioperl-dot-orgE<gt>
314 =cut