sync w/ main trunk
[bioperl-live.git] / Bio / Tools / SiRNA.pm
blob2267154cc475b1b27a9e68c91e9c7912937cf3f3
1 # $Id$
3 # BioPerl module for Bio::Tools::SiRNA
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Donald Jackson, donald.jackson@bms.com
9 # Copyright Bristol-Myers Squibb
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 SiRNA - Perl object for designing small inhibitory RNAs.
19 =head1 SYNOPSIS
21 use Bio::Tools::SiRNA;
23 my $sirna_designer = Bio::Tools::SiRNA->new( -target => $bio_seq,
24 -rules => 'saigo'
26 my @pairs = $sirna_designer->design;
28 foreach $pair (@pairs) {
29 my $sense_oligo_sequence = $pair->sense->seq;
30 my $antisense_oligo_sequence = $pair->antisense->seq;
32 # print out results
33 print join ("\t", $pair->start, $pair->end, $pair->rank,
34 $sense_oligo_sequence, $antisense_oligo_sequence), "\n";
37 =head1 DESCRIPTION
39 Package for designing siRNA reagents.
41 Input is a L<Bio::SeqI>-compliant object (the target).
43 Output is a list of Bio::SeqFeature::SiRNA::Pair objects, which are
44 added to the feature table of the target sequence. Each
45 Bio::SeqFeature::SiRNA::Pair contains two subfeatures
46 (Bio::SeqFeature::Oligo objects) which correspond to the individual
47 oligos. These objects provide accessors for the information on the
48 individual reagent pairs.
50 This verion of Bio::Tools::SiRNA represents a major change in architecture.
51 Specific 'rulesets' for siRNA selection as developed by various groups are
52 implemented as Bio::Tools::SiRNA::Ruleset objects, which inherit from
53 Bio::Tools::SiRNA. This will make it easier to add new rule sets or modify
54 existing approaches. Currently the Tuschl and Ui-Tei (2004) rules are
55 implemented. For consistency, the Tuschl rules are implemented by default.
57 In addition, this module provides three 'extra' rules which can be added
58 above and beyond any ruleset.
60 =over 3
62 =item 1.
64 SiRNAs that overlap known SNPs (identified as SeqFeatures with
65 primary tag = variation) can be avoided.
67 =item 2.
69 Other regions (with primary tag = 'Excluded') can also be skipped. I
70 use this with Bio::Tools::Run::Mdust to avoid low-complexity regions
71 (must be run separately), but other programs could also be used.
73 =item 3.
75 SiRNAs may also be selected in the 3 prime UTR of a gene by setting
76 $sirna_designer-E<gt>include_3pr() to true.
78 =back
80 =head2 EXPORT
82 None.
84 =head1 SEE ALSO
86 L<Bio::Tools::Run::Mdust>, L<Bio::SeqFeature::SiRNA::Pair>,
87 L<Bio::SeqFeature::SiRNA::Oligo>..
89 =head1 FEEDBACK
91 =head2 Mailing Lists
93 User feedback is an integral part of the evolution of this and other
94 Bioperl modules. Send your comments and suggestions preferably to
95 the Bioperl mailing list. Your participation is much appreciated.
97 bioperl-l@bioperl.org - General discussion
98 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
100 =head2 Support
102 Please direct usage questions or support issues to the mailing list:
104 L<bioperl-l@bioperl.org>
106 rather than to the module maintainer directly. Many experienced and
107 reponsive experts will be able look at the problem and quickly
108 address it. Please include a thorough description of the problem
109 with code and data examples if at all possible.
111 =head2 Reporting Bugs
113 Report bugs to the Bioperl bug tracking system to help us keep track
114 of the bugs and their resolution. Bug reports can be submitted via
115 the web:
117 http://bugzilla.open-bio.org/
119 =head1 AUTHOR
121 Donald Jackson (donald.jackson@bms.com)
123 =head1 APPENDIX
125 The rest of the documentation details each of the object methods.
126 Internal methods are usually preceded with a _
128 =cut
130 package Bio::Tools::SiRNA;
132 use strict;
133 use warnings;
135 use vars qw($AUTOLOAD);
137 use Bio::Seq::RichSeq;
138 use Bio::SeqFeature::Generic;
139 use Bio::SeqFeature::SiRNA::Oligo;
140 use Bio::SeqFeature::SiRNA::Pair;
143 use base qw(Bio::Root::Root);
146 our %COMP = ( A => 'T',
147 T => 'A',
148 C => 'G',
149 G => 'C',
150 N => 'N',
153 our @ARGNAMES = qw(RULES START_PAD END_PAD MIN_GC CUTOFF OLIGOS AVOID_SNPS
154 GSTRING TMPDIR TARGET DEBUG);
157 =head2 new
159 Title : new
160 Usage : my $sirna_designer = Bio::Tools::SiRNA->new();
161 Function : Constructor for designer object
162 Returns : Bio::Tools::SiRNA object
163 Args : target - the target sequence for the SiRNAs as a Bio::Seq::RichSeq
164 start_pad - distance from the CDS start to skip (default 75)
165 end_pad - distance from the CDS end to skip (default 50)
166 include_3pr - set to true to include SiRNAs in the 3prime UTR (default false)
167 rules - rules for selecting siRNAs, currently supporting saigo and tuschl
168 min_gc - minimum GC fraction (NOT percent) (default 0.4)
169 max_gc - maximum GC fraction (NOT percent) (default 0.6)
170 cutoff - worst 'rank' accepted(default 3)
171 avoid_snps - boolean - reject oligos that overlap a variation
172 SeqFeature in the target (default true)
173 gstring - maximum allowed consecutive Gs.
174 Too many can cause problems in synthesis (default 4)
175 Note : All arguments can also be changed/accessed using autoloaded
176 methods such as:
178 my $start_pad = $sirna_designer->start_pad().
180 =cut
182 sub new {
183 my ($proto, @args) = @_;
184 my $pkg = ref($proto) || $proto;
186 my $self = {};
187 bless ($self, $pkg);
189 my %args;
191 @args{@ARGNAMES} = $self->_rearrange(\@ARGNAMES, @args);
193 if ($args{'RULES'}) {
194 $self->rules($args{'RULES'});
197 $self->{'start_pad'} = $args{'START_PAD'} || 75; # nt from start to mask
198 $self->{'end_pad'} = $args{'END_PAD'} || 50; # nt from end to mask
199 $self->{'include_3pr'} = $args{'INCLUDE_3PR'} || 0; # look for oligos in 3prime UTR
200 $self->{'min_gc'} = $args{'MIN_GC'} || 0.40;
201 $self->{'max_gc'} = $args{'MAX_GC'} || 0.60;
202 $self->{'cutoff'} = $args{'CUTOFF'} || 3; # highest (worst) rank wanted
203 $self->{'oligos'} = [];
204 defined($args{'AVOID_SNPS'}) ? $self->{'avoid_snps'} = $args{'AVOID_SNPS'} :
205 $self->{'avoid_snps'} = 1; # (t/f to avoid or include reagents that cover SNPs)
206 $self->{'gstring'} = $args{'GSTRING'} || 4; # maximum allowed consecutive Gs - too many can cause problems in oligo synthesis
207 $self->{'tmpdir'} = $args{'TMPDIR'} || $ENV{'TMPDIR'} || $ENV{'TMP'} || '';
208 $self->{'debug'} = $args{'DEBUG'} || 0;
210 $self->target($args{'TARGET'}) if ($args{'TARGET'});
212 return $self;
216 =head2 target
218 Title : target
219 Usage : my $target_seq = $sirna_designer->target(); # get the current target
221 $sirna_designer->target($new_target_seq); # set a new target
222 Function : Set/get the target as a Bio::SeqI-compliant object
223 Returns : a Bio::SeqI-compliant object
224 Args : a Bio::SeqI-compliant object (optional)
226 =cut
228 sub target {
229 my ($self, $target) = @_;
231 if ($target) {
232 unless ($target->isa('Bio::SeqI')) {
233 $self->throw( -class => 'Bio::Root::BadParameter',
234 -text => "Target must be passed as a Bio::Seq object" );
236 if ($target->can('molecule')) {
237 ( grep { uc($target->molecule) eq $_ } qw(DNA MRNA CDNA)) or
238 $self->throw( -class => 'Bio::Root::BadParameter',
239 -text => "Sequences of type ". $target->molecule. " are not supported"
242 else {
243 ($target->alphabet eq 'dna') or
244 $self->throw( -class => 'Bio::Root::BadParameter',
245 -text => "Sequences of alphabet ". $target->alphabet. " are not supported"
249 $self->{'target'} = $target;
250 return 1;
253 elsif ($self->{'target'}) {
254 return $self->{'target'};
256 else {
257 $self->throw("Target sequence not defined");
261 =head2 rules
263 Title : rules
264 Usage : $sirna->rules('ruleset')
265 Purpose : set/get ruleset to use for selecting SiRNA oligo pairs.
266 Returns : not sure yet
267 Args : a ruleset name (currently supported: Tuschl, Saigo)
268 or a Bio::Tools::SiRNA::RulesetI compliant object
270 =cut
272 sub rules {
273 my ($self, $rules) = @_;
275 if ($rules) {
276 $self->_load_ruleset($rules);
278 # default: use tuschl rules
279 unless ($self->{_rules}) {
280 $self->_load_ruleset('tuschl');
282 return $self->{_rules};
285 sub _load_ruleset {
286 my ($self, $ruleset) = @_;
288 my $rule_module = join('::', ref($self), 'Ruleset', lc($ruleset));
290 eval "require $rule_module";
292 if ($@) {
293 #warn join("\n", '@INC contains:', @INC, undef);
294 $self->throw("Unable to load $rule_module: $@");
295 return;
298 else {
299 $self->{_rules} = $rule_module;
300 bless($self, $rule_module); # recast as subclass
303 return 1;
306 =head2 design
308 Title : design
309 Usage : my @pairs = $sirna_designer->design();
310 Purpose : Design SiRNA oligo pairs.
311 Returns : A list of SiRNA pairs as Bio::SeqFeature::SiRNA::Pair objects
312 Args : none
314 =cut
316 sub design {
317 my ($self) = @_;
319 ($self->rules) or $self->throw('Unable to design siRNAs: no rule set specified');
321 # unless ( grep { $_->primary_tag eq 'Target' } $self->target->top_SeqFeatures ) {
322 # $self->_define_target();
325 my @oligos = $self->_get_oligos();
327 return ( grep { $_->isa('Bio::SeqFeature::SiRNA::Pair') } $self->target->top_SeqFeatures );
330 sub _define_target {
331 my ($self) = @_;
332 my ($feat, $cds, $left, $right);
334 my $target = $self->target or
335 $self->throw("Unable to design oligos - no target provided");
337 ($cds) = grep { $_->primary_tag eq 'CDS' } $target->top_SeqFeatures if ($target->can('top_SeqFeatures'));
339 if ($cds) {
340 $left = $cds->start + $self->start_pad;
341 if (!$self->include_3pr) {
342 $right = $cds->end - $self->end_pad;
344 else {
345 $right = $target->length - $self->end_pad;
348 else {
349 $left = 0 + $self->start_pad;
350 $right = $target->length - $self->end_pad;
354 # is there anything left?
355 if (($right - $left) < 20) {
356 $self->throw("There isn't enough sequence to design oligos. Please reduce start_pad and end_pad or supply more sequence");
358 # define target region
359 my $targregion = Bio::SeqFeature::Generic->new( -start => $left,
360 -end => $right,
361 -primary => 'Target' );
362 $self->target->add_SeqFeature($targregion);
364 # locate excluded regions
365 my @excluded = grep { $_->primary_tag eq 'Excluded' } $self->target->top_SeqFeatures;
367 if ($self->avoid_snps) {
368 my @snps = grep { $_->primary_tag eq 'variation' } $self->target->top_SeqFeatures;
369 push(@excluded, @snps);
372 $self->excluded(\@excluded);
374 return $targregion;
377 sub _get_targetregion {
378 my ($self) = @_;
380 my ($targregion) = grep { $_->primary_tag eq 'Target' } $self->target->top_SeqFeatures;
381 $targregion ||= $self->_define_target;
383 $self->throw("Target region for SiRNA design not defined") unless ($targregion);
385 my $seq = $targregion->seq->seq;
386 # but this way I loose start info
387 my $targstart = $targregion->start;
389 return ($seq, $targstart);
392 # MOVE to SiRNA::Ruleset::tuschl
393 # sub _regex {
394 # my ($self, $rank) = @_;
395 # return $PATTERNS{$rank};
398 # sub _get_oligos {
399 # # use regular expressions to pull out oligos
401 # my ($self, $rank) = @_;
402 # my $regex = $self->_regex($rank);
403 # my @exclude;
406 # my ($targregion) = grep { $_->primary_tag eq 'Target' } $self->target->top_SeqFeatures;
407 # my $seq = $targregion->seq->seq;
408 # # but this way I loose start info
409 # my $targstart = $targregion->start;
411 # # exclude masked region
412 # push(@exclude, grep { $_->primary_tag eq 'Excluded' } $self->target->top_SeqFeatures);
414 # # add SNP checking
415 # if ($self->avoid_snps) {
416 # my @snps = grep { $_->primary_tag eq 'variation' } $self->target->top_SeqFeatures;
417 # push(@exclude, @snps);
420 # while ( $seq =~ /$regex/gi ) {
421 # my $target = $1;
423 # # check for too many Gs (or Cs on the other strand)
424 # next if ( $target =~ /G{ $self->gstring,}/io );
425 # next if ( $target =~ /C{ $self->gstring,}/io );
426 # # skip Ns (for filtering)
427 # next if ( $target =~ /N/i);
429 # my $start = length($`) + $targstart;
430 # my $stop = $start + length($target) -1;
432 # my @gc = ( $target =~ /G|C/gi);
433 # my $fxGC = sprintf("%2.2f", (scalar(@gc) / length($target)));
434 # next if ($fxGC < $self->min_gc);
435 # next if ($fxGC > $self->max_gc);
437 # my $sense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $start,
438 # -end => $stop,
439 # -strand => 1,
440 # -seq => _get_sense($target),
441 # -source_tag => ref($self),
442 # );
444 # my $asense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $start,
445 # -end => $stop,
446 # -strand => -1,
447 # -seq => _get_anti($target),
448 # -source_tag => ref($self),
449 # );
451 # my $sirna = Bio::SeqFeature::SiRNA::Pair->new( -rank => $rank,
452 # -fxGC => $fxGC,
453 # -sense => $sense,
454 # -antisense => $asense,
455 # -source_tag => ref($self),
456 # );
458 # unless ($self->_has_overlap($sirna, \@exclude)) {
459 # $self->target->add_SeqFeature($sirna);
462 # }
464 =head2 add_oligos
466 Title : add_oligos
467 Usage : $sirna_designer->add_oligos($sequence, $start, $rank);
468 Purpose : Add SiRNA olgos to target Bio::Seq as Bio::SeqFeature::SiRNA::Pair objects
469 Args : Oligo sequence and start position (required), rank/score (optional)
471 =cut
473 sub add_oligos {
474 my ($self, $seq, $start, $rank) = @_;
476 ($seq) or throw ('No sequence supplied for add_oligos');
477 (defined $start) or throw ('No start position specified for add_oligos');
479 my ($end) = $start + length($seq);
481 my ($sseq) = $self->_get_sense($seq);
482 my $sense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $start,
483 -end => ($start + length($sseq)),
484 -strand => 1,
485 -seq => $sseq,
486 -source_tag => ref($self),
489 my $aseq = $self->_get_anti($seq);
490 my $asense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $end,
491 -end => ($end - length($aseq)),
492 -strand => -1,
493 -seq => $aseq,
494 -source_tag => ref($self),
497 my $sirna = Bio::SeqFeature::SiRNA::Pair->new( -rank => $rank,
498 # -fxGC => $fxGC,
499 -sense => $sense,
500 -antisense => $asense,
501 -source_tag => ref($self),
504 unless ($self->_has_overlap($sirna, $self->excluded)) {
505 $self->target->add_SeqFeature($sirna);
509 sub _has_overlap {
510 # flag any pairs that overlap an UNDESIRED feature (eg SNP)
511 # return true if there is overlap, false if not
513 my ($self, $test, $flist) = @_;
514 print STDERR "Checking oligo at ", $test->start, " to ",$test->end, "\n"
515 if ($self->debug);
517 foreach my $feat (@$flist) {
518 if (($test->start <= $feat->end) and ($test->end >= $feat->start)) {
519 print STDERR "Overlaps ", $feat->primary_tag, " at ",
520 $feat->start, " to ", $feat->end, "\n" if ($self->debug);
521 return 1;
524 return 0; # default - no overlap
527 # MOVE to SiRNA::Ruleset::tuschl
529 # sub _get_sense {
530 # my ($target) = @_;
531 # # trim off 1st 2 nt to get overhang
532 # $target =~ s/^..//;
533 # # convert T's to U's (transcribe)
534 # $target =~ s/T/U/gi;
535 # # force last 2 nt to be T's
536 # $target =~ s/..$/TT/;
538 # return $target;
541 # sub _get_anti {
542 # my ($target) = @_;
543 # my @target = split(//, $target);
544 # my ($nt,@antitarget);
546 # while ($nt = pop @target) {
547 # push(@antitarget, $COMP{$nt});
549 # my $anti = join('', @antitarget);
550 # # trim off 1st 2 nt to get overhang
551 # $anti =~ s/^..//;
552 # # convert T's to U's
553 # $anti =~ s/T/U/gi;
554 # # convert last 2 NT's to T
555 # $anti =~ s/..$/TT/;
557 # return $anti;
561 sub AUTOLOAD {
562 my ($self, $value) = @_;
563 my $name = $AUTOLOAD;
564 $name =~ s/.+:://;
566 return if ($name eq 'DESTROY');
569 if (defined $value) {
570 $self->{$name} = $value;
573 unless (exists $self->{$name}) {
574 $self->throw("Attribute $name not defined for ". ref($self));
577 return $self->{$name};
580 sub _comp {
581 my ($self, $char) = @_;
583 return unless ($char);
584 $char = uc($char);
585 return $COMP{ $char };