speelink fixes, patch courtesy Charles Plessy, fixes #3256
[bioperl-run.git] / lib / Bio / Tools / Run / MCS.pm
blob5924c499652f7f335845b613e8b3ca9033768731
1 # $Id: MCS.pm,v 1.3 2007/05/25 10:14:55 sendu Exp $
3 # BioPerl module for Bio::Tools::Run::MCS
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sendu Bala <bix@sendu.me.uk>
9 # Copyright Sendu Bala
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::Run::MCS - Wrapper for MCS
19 =head1 SYNOPSIS
21 use Bio::Tools::Run::MCS;
23 # Make a MCS factory
24 $factory = Bio::Tools::Run::MCS->new();
26 # Run MCS on an alignment
27 my @results = $factory->run($alignfilename);
29 # or with alignment object
30 @results = $factory->run($bio_simplalign);
32 # look at the results
33 foreach my $feat (@results) {
34 my $seq_id = $feat->seq_id;
35 my $start = $feat->start;
36 my $end = $feat->end;
37 my $score = $feat->score;
38 my ($pvalue) = $feat->get_tag_values('pvalue');
39 my ($kind) = $feat->get_tag_values('kind'); # 'all', 'exon' or 'nonexon'
42 =head1 DESCRIPTION
44 This is a wrapper for running the MCS (binCons) scripts by Elliott H Margulies.
45 You can get details here: http://zoo.nhgri.nih.gov/elliott/mcs_doc/. MCS is used
46 for the prediciton of transcription factor binding sites and other regions of
47 the genome conserved amongst different species.
49 Note that this wrapper assumes you already have alignments, so only uses MCS
50 for the latter stages (the stages involving align2binomial.pl,
51 generate_phyloMAX_score.pl and generate_mcs_beta.pl).
53 You can try supplying normal MCS command-line arguments to new(), eg.
55 $factory->new(-percentile => 95)
57 or calling arg-named methods (excluding the initial
58 hyphens, eg.
60 $factory->percentile(95)
62 to set the --percentile arg).
65 You will need to enable this MCS wrapper to find the MCS scripts.
66 This can be done in (at least) three ways:
68 1. Make sure the MCS scripts are in your path.
69 2. Define an environmental variable MCSDIR which is a
70 directory which contains the MCS scripts:
71 In bash:
73 export MCSDIR=/home/username/mcs/
75 In csh/tcsh:
77 setenv MCSDIR /home/username/mcs
79 3. Include a definition of an environmental variable MCSDIR in
80 every script that will use this MCS wrapper module, e.g.:
82 BEGIN { $ENV{MCSDIR} = '/home/username/mcs/' }
83 use Bio::Tools::Run::MCS;
85 =head1 FEEDBACK
87 =head2 Mailing Lists
89 User feedback is an integral part of the evolution of this and other
90 Bioperl modules. Send your comments and suggestions preferably to
91 the Bioperl mailing list. Your participation is much appreciated.
93 bioperl-l@bioperl.org - General discussion
94 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
96 =head2 Support
98 Please direct usage questions or support issues to the mailing list:
100 I<bioperl-l@bioperl.org>
102 rather than to the module maintainer directly. Many experienced and
103 reponsive experts will be able look at the problem and quickly
104 address it. Please include a thorough description of the problem
105 with code and data examples if at all possible.
107 =head2 Reporting Bugs
109 Report bugs to the Bioperl bug tracking system to help us keep track
110 of the bugs and their resolution. Bug reports can be submitted via
111 the web:
113 http://redmine.open-bio.org/projects/bioperl/
115 =head1 AUTHOR - Sendu Bala
117 Email bix@sendu.me.uk
119 =head1 APPENDIX
121 The rest of the documentation details each of the object methods.
122 Internal methods are usually preceded with a _
124 =cut
126 package Bio::Tools::Run::MCS;
127 use strict;
129 use Cwd;
130 use File::Spec;
131 use Bio::AlignIO;
132 use Bio::FeatureIO;
133 use Bio::Annotation::SimpleValue;
135 use base qw(Bio::Tools::Run::Phylo::PhyloBase);
137 our $PROGRAM_NAME = 'align2binomial.pl';
138 our $PROGRAM_DIR;
140 # methods for the mcs args we support
141 our @PARAMS = qw(neutral percentile mcs specificity sensitivity name);
142 our @SWITCHES = qw(neg-score);
144 # just to be explicit, args we don't support (yet) or we handle ourselves
145 our @UNSUPPORTED = qw(ucsc gtf neutral-only fourd-align align-only ar);
147 BEGIN {
148 # lets add all the mcs scripts to the path so that when we call
149 # align2binomial.pl it can find its siblings
150 $PROGRAM_DIR = $ENV{'MCSDIR'};
151 $ENV{PATH} = "$PROGRAM_DIR:$ENV{PATH}" if $PROGRAM_DIR;
154 =head2 program_name
156 Title : program_name
157 Usage : $factory>program_name()
158 Function: holds the program name
159 Returns : string
160 Args : None
162 =cut
164 sub program_name {
165 return $PROGRAM_NAME;
168 =head2 program_dir
170 Title : program_dir
171 Usage : $factory->program_dir(@params)
172 Function: returns the program directory, obtained from ENV variable.
173 Returns : string
174 Args : None
176 =cut
178 sub program_dir {
179 return $PROGRAM_DIR;
182 =head2 new
184 Title : new
185 Usage : $factory = Bio::Tools::Run::MCS->new()
186 Function: creates a new MCS factory
187 Returns : Bio::Tools::Run::MCS
188 Args : Many options understood by MCS can be supplied as key =>
189 value pairs.
191 These options can NOT be used with this wrapper:
192 ucsc gtf neutral-only fourd-align align-only ar
194 =cut
196 sub new {
197 my ($class, @args) = @_;
198 my $self = $class->SUPER::new(@args);
200 $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'],
201 -create => 1);
203 return $self;
206 =head2 run
208 Title : run
209 Usage : $result = $factory->run($align_file_or_object, Bio::Location::Atomic,
210 [Bio::SeqFeatureI]);
211 Function: Runs the MCS scripts on an alignment.
212 Returns : list of Bio::SeqFeatureI feature objects (with coordinates corrected
213 according to the supplied offset, if any)
214 Args : The first argument represents an alignment, the optional second
215 argument represents the chromosome, stand and end and the optional
216 third argument represents annotation of the exons in the alignment.
218 The alignment can be provided as a multi-fasta format alignment
219 filename, or a Bio::Align::AlignI compliant object (eg. a
220 Bio::SimpleAlign).
222 The position in the genome can be provided as a Bio::Location::Atomic
223 with start, end and seq_id set.
225 The annnotation can be provided as an array of Bio::SeqFeatureI
226 objects.
228 =cut
230 sub run {
231 my ($self, $aln, $offset, $exon_feats) = @_;
232 $self->_alignment($aln || $self->alignment || $self->throw("An alignment must be supplied"));
234 return $self->_run($offset, $exon_feats);
237 sub _run {
238 my ($self, $atomic, $exon_feats) = @_;
240 my $exe = $self->executable || return;
242 # cd to a temp dir
243 my $temp_dir = $self->tempdir;
244 my $cwd = Cwd->cwd();
245 chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'");
247 my $offset = '';
248 my $start_adjust = 0;
249 if ($atomic) {
250 $start_adjust = $atomic->start;
251 $offset = '--ucsc '.$atomic->seq_id.':'.$start_adjust.'-'.$atomic->end;
252 $start_adjust--;
255 my $gtf_file = 'exons.gtf';
256 if ($exon_feats) {
257 my $fout = Bio::FeatureIO->new(-file => ">$gtf_file", -format => 'gtf');
258 foreach my $feat (@{$exon_feats}) {
259 $fout->write_feature($feat);
262 my $gtf = $exon_feats ? "--gtf $gtf_file" : '';
264 # step '2' (http://zoo.nhgri.nih.gov/elliott/mcs_doc/node1.html) of MCS:
265 # run align2binomial.pl to calculate individual species binomial scores
266 my $aln_file = $self->_write_alignment;
267 my $error_file = 'stderr';
268 my $binomial_file = 'align_name.binomial';
269 my $cmd = "align2binomial.pl $offset $gtf $aln_file > $binomial_file 2> $error_file";
270 #system("rm -fr $cwd/mcs_dir; cp -R $temp_dir $cwd/mcs_dir");
271 my $throw = system($cmd);
272 open(my $efh, "<", $error_file) || $self->throw("Could not open error file '$error_file'");
273 my $error;
274 while (<$efh>) {
275 $error .= $_;
276 $throw = 1 if /not divisible by 3/;
278 close($efh);
279 $self->throw($error) if $throw;
281 # step '3': run generate_phyloMAX_score.pl to combine the individual
282 # binomial scores and generate the final Multi-species Conservation Score
283 my $phylo_file = 'align_name.phylo';
284 system("generate_phyloMAX_score.pl $binomial_file > $phylo_file") && $self->throw("generate_phyloMAX_score.pl call failed: $?, $!");
286 # step '4': Generate MCSs from the conservation score using
287 # generate_mcs_beta.pl
288 my $mcs_file = 'mcs_result.stdout';
289 my $bed_file = 'align_name.bed'; # hardcoded in generate_mcs_beta.pl
290 system("generate_mcs_beta.pl $offset $gtf $phylo_file > $mcs_file") && $self->throw("generate_mcs_beta.pl failed: $?, $!");
292 my @feats;
293 my $fin = Bio::FeatureIO->new(-file => $bed_file, -format => 'bed');
294 my $source = Bio::Annotation::SimpleValue->new(-value => 'MCS');
295 while (my $feat = $fin->next_feature()) {
296 # convert coords given offset
297 if ($start_adjust) {
298 $feat->start($feat->start + $start_adjust);
299 $feat->end($feat->end + $start_adjust);
301 $feat->source($source);
302 push(@feats, $feat);
305 # cd back again
306 chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'");
308 return @feats;
311 =head2 _setparams
313 Title : _setparams
314 Usage : Internal function, not to be called directly
315 Function: Creates a string of params to be used in the command string
316 Returns : string of params
317 Args : none
319 =cut
321 sub _setparams {
322 my $self = shift;
324 my $param_string = $self->SUPER::_setparams(-params => \@PARAMS,
325 -switches => \@SWITCHES,
326 -dash => 1);
328 $param_string .= ' 1>/dev/null' if $self->quiet;
330 return $param_string;