Add basic unit test suite, plus minor bugfixes
[greylag.git] / SIMR / greylag-process
#!/bin/bash

# This wrapper script will invoke greylag in parallel on our SGE cluster.
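#
# A typical invocation (filenames here are only illustrative), run from the
# directory holding the params and ms2 files:
#
#     greylag-process -l my-search.xml sample-1.ms2 sample-2.ms2.gz
#
# On success this leaves 'my-search.out.xml.gz' and touches 'my-search.done-ok'
# (or 'my-search.done-failed' on error); with '-l', output also goes to
# 'my-search.log'.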
# This directory is assumed to be shared across all cluster nodes
SHAREDTMPDIR=/clusterfs/home/proteomics/tmp

# The python image to use
PYTHON=/n/site/inst/Linux-i686/sys/bin/python2.4

# greylag lives here (use python2.4 explicitly)
greylag=/clusterfs/site/inst/Linux-i686/bioinfo/greylag/greylag.py

# options passed to all greylag invocations
#greylag_options="-v --quirks-mode"
greylag_options="-v"

# Divide this run into this many parts.  This should probably be at least 5x
# the number of cluster nodes available.  Currently, the fixed overhead cost
# for each part is pretty small (10-20 seconds?).
PARTS=1000
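# (Illustration: with 200 execution slots and PARTS=1000, each slot works
# through roughly 5 parts on average, so uneven part sizes or a slow node
# only delay the run by a fraction of one slot's share of the work.)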
p=`basename $0`

usage() {
    cat <<EOF 1>&2
usage: $p [-l] <greylag-params.xml> [ <ms2-file> ... ]

Process a set of ms2 files as specified by the given params file (which must
end in '.xml' and should probably be in the current directory).

If the '-l' flag is given, stdout and stderr will be redirected to a
corresponding '.log' file.

EOF
    exit 0
}

err() {
    echo 1>&2 "$p: $@"
}

die() {
    err "$@"
    touch "$jobname.done-failed" || err "touch failed"
    exit 1
}
if [ "$1" == "-l" ]; then
    shift
    logging=1
fi

if [ $# -lt 2 ]; then
    usage
fi

params="$1"

rm -f ${params%.xml}.done-*

case "$params" in
    *.xml) true;;
    *) die "parameter filename must end in '.xml'";;
esac

jobname=$(basename $params .xml)

shift
for f in "$@"; do
    if [ "$(dirname "$f")" != "." ]; then
        die "file '$f' must be in the current working directory"
    fi
    case "$f" in
        *.ms2 | *.ms2.gz | *.ms2.bz2) true;;
        *) die "argument file '$f' should end in '.ms2' (or '.ms2.{gz,bz2}')";;
    esac
done

[ -e $params ] || die "'$params' not found"

if ! ls -ld . | egrep -q '^drwxrws'; then
    chmod g+rwxs . || err "attempt to make this directory group writable failed"
fi
# Do some basic locking.  This tries to prevent simultaneous runs on the same
# parameter file, which would produce output to the same file, wasting
# resources and causing confusion.
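# (The lock is a symlink whose target is just the owning PID: symlink creation
# is atomic, and a second 'ln -s' on an existing name fails, so the original
# owner's PID stays readable via 'ls -ld' below.)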
lockfile="$PWD/$jobname.lock"

ln -s $$ $lockfile 2>/dev/null || true
lockpid=$(ls -ld $lockfile | sed -e 's/^.*> //')

if [ "$lockpid" != $$ ]; then
    die "this directory locked by another process (pid = $lockpid)?
remove $lockfile if not"
fi

# only arrange to remove the lockfile on exit once we know we own it
trap "rm -f $lockfile" EXIT
# point of no return

if [ "$logging" == "1" ]; then
    exec < /dev/null > "$jobname.log" 2>&1
fi

# priority?

# Be very careful with quoting, as these names may eventually come from
# Windows users...

shared_d=$SHAREDTMPDIR/greylag-$(date +%s)-$$    # unique
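# (e.g. /clusterfs/home/proteomics/tmp/greylag-1171234567-4321 -- epoch
#  seconds plus our PID; the values shown are only illustrative)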
# Could add removal of the shared directory to the EXIT trap, but probably we
# shouldn't because pdq (and maybe SGE) may react badly
# trap "rm -rf $lockfile $shared_d &" EXIT

mkdir $shared_d || die "'mkdir $shared_d' failed!"

cp -p "$params" $shared_d/ || die "params cp failed!"
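# Stage the ms2 files into the shared directory: decompress any .gz/.bz2
# inputs and symlink the plain ones, so the nodes see ordinary .ms2 files.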
for f in "$@"; do
    case "$f" in
        *.gz) zcat < "$f" > "$shared_d/${f%.gz}" || die "ms2 zcat failed";;
        *.bz2) bzcat < "$f" > "$shared_d/${f%.bz2}" || die "ms2 bzcat failed";;
        *) ln -s "$PWD/$f" $shared_d/ || die "ms2 ln -s failed";;
    esac
done
results="$jobname.out.xml.gz"

jobbasedir=$(basename $PWD)

#########################################################################
# submit the job in this shared directory so that the nodes can see it
pushd $shared_d > /dev/null || die "pushd failed!"

# use err

#cat <<EOF
#job name: $jobname
#shared dir: $shared_d
#params: $params
#ms2: $@
#EOF
#ls -l $shared_d/
err round 0 setup

# Use of *.ms2 implies that there are no such files here except for the ones
# we just created.  Fix?
# [NB: the ms2 arguments must be given in the same order each time!]
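# (Overall flow, as implemented below: '--part-split' divides the search into
# $PARTS pieces on the master, each SGE array task then searches one piece via
# '--part=NofM', and '--part-merge' combines the per-part results into
# "$results".)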
# split ms2 files (on master, for now)
$PYTHON $greylag $greylag_options \
    -o "$results" \
    --part-split=$PARTS \
    "$params" *.ms2 \
    || die "round 0 setup on master failed"
err round 0 processing

# The -pe option is currently a hack to try to stop multiple jobs from being
# run on the same node.

# first do parts on nodes
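# ('-sync y' makes qsub wait for the whole array job to finish, '-t 1-$PARTS'
# creates one task per part, '-b y' submits the python binary directly, '-V'
# exports our environment to the nodes, and '-e'/'-o' collect each task's
# stderr/stdout as round-1.<task>.{err,out} in this shared directory.)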
qsub -sync y -r y -b y -cwd -V -hard -l mem_free=250M \
    -q all.q \
    -t 1-$PARTS \
    -N "greylag-$jobbasedir-$jobname-$$" \
    -e 'round-1.$TASK_ID.err' \
    -o 'round-1.$TASK_ID.out' \
    $PYTHON $greylag $greylag_options \
    -o "$results" \
    --part='${SGE_TASK_ID}of${SGE_TASK_LAST}' \
    "$params" *.ms2
qsub_status=$?
# gather the per-task output/error files (in task order) into our own output
cat $(ls -1 round-1.*.{out,err} | sort -t . -k 2n,3) /dev/null
if [ "$qsub_status" != 0 ]; then
    die "round 0 on nodes failed"
fi
err round 0 merge

# now merge results (on master, for now)
$PYTHON $greylag $greylag_options \
    -o "$results" \
    --part-merge=$PARTS \
    "$params" *.ms2 \
    || die "round 0 merge on master failed"

popd > /dev/null || die "popd failed!"
#########################################################################

if ! [ -e $shared_d/"$results" ]; then
    die 'no results file present after processing--job failed?'
fi

cp -p $shared_d/"$results" . || die "cp failed!"

# for now, don't do this
#rm -fr "$shared_d" $lockfile

err complete
touch "$jobname.done-ok" || err "touch failed"
exit 0