# This wrapper script will invoke greylag in parallel on our SGE cluster.
# Parent directory for each run's working directory.  This directory is
# assumed to be shared across all cluster nodes.
SHAREDTMPDIR=/clusterfs/home/proteomics/tmp
# The python image to use; every greylag invocation in this script is run
# through this interpreter.
PYTHON=/n/site/inst/Linux-i686/sys/bin/python2.4
# greylag lives here (use python2.4 explicitly): the script is always run as
# "$PYTHON $greylag ..." rather than executed directly.
greylag=/clusterfs/site/inst/Linux-i686/bioinfo/greylag/greylag.py
# options passed to all greylag invocations
#greylag_options="-v --quirks-mode"
# Divide this run into this many parts.  This should probably be at least 5x
# the number of cluster nodes available.  Currently, the fixed overhead cost
# for each part is pretty small (10-20 seconds?).
28 usage: $p [-l] <greylag-params.xml> [ <ms2-file> ... ]
30 Process a set of ms2 files as specified by the given params file (which must
31 end in '.xml' and should probably be in the current directory).
33 If the '-l' flag is given, stdout and stderr will be redirected to a
34 corresponding '.log' file.
# Record the failure by creating the "<jobname>.done-failed" marker file; if
# even that cannot be created, report it through err.
touch "$jobname.done-failed" || err "touch failed"
50 if [ "$1" == "-l" ]; then
# Remove any existing "<params-without-.xml>.done-*" marker files before
# starting.  The prefix is quoted so a params name containing whitespace is
# handled; the glob itself is left unquoted on purpose.
rm -f -- "${params%.xml}".done-*
# Fallback arm: the parameter filename did not have the required suffix.
*) die "parameter filename must end in '*.xml'" ;;
# The job name is the parameter file's name with any directory part and the
# ".xml" suffix removed.
jobname=$(basename -- "$params" .xml)
# Each file must be named without a directory component (dirname of ".").
if [ "$(dirname -- "$f")" != "." ]; then
  die "file '$f' must be in the current working directory"
fi
# Accept plain, gzip-compressed or bzip2-compressed ms2 names; reject
# everything else.
*.ms2 | *.ms2.gz | *.ms2.bz2) true ;;
*) die "argument file '$f' should end in '.ms2' (or '.ms2.{gz,bz2}')" ;;
# The parameter file must exist.
[ -e "$params" ] || die "'$params' not found"
# Make sure the current directory is group read/write/searchable with the
# setgid bit set (ls mode string starting "drwxrws").  A failed chmod is
# reported through err.
if ! ls -ld . | grep -Eq '^drwxrws'; then
  chmod g+rwxs . || err "attempt to make this directory group writable failed"
fi
# Do some basic locking.  This tries to prevent simultaneous runs on the same
# parameter file, which would produce output to the same file, wasting
# resources and causing confusion.
#
# The lock is a symlink whose target is the pid of the process that created
# it.  Creating the symlink fails if the name already exists, so the link
# target tells us who holds the lock.
lockfile="$PWD/$jobname.lock"
ln -s "$$" "$lockfile" 2>/dev/null || true   # failure is checked via lockpid
lockpid=$(readlink -- "$lockfile")

if [ "$lockpid" != "$$" ]; then
  die "this directory locked by another process (pid = $lockpid)?
remove $lockfile if not"
fi

# Install the cleanup trap only once the lock is known to be ours; installing
# it before the check would remove another run's lockfile whenever this run
# exits after losing the lock.  Single quotes defer expansion to exit time so
# the path is quoted correctly.
trap 'rm -f -- "$lockfile"' EXIT
# When logging was requested, detach stdin and send all further stdout and
# stderr of this script to "<jobname>.log".
if [ "$logging" == "1" ]; then
  exec < /dev/null > "$jobname.log" 2>&1
fi
109 # Be very careful with quoting, as these names may eventually come from
# Per-run working directory under $SHAREDTMPDIR; the epoch timestamp plus
# this shell's pid keep the name unique.
shared_d="$SHAREDTMPDIR/greylag-$(date +%s)-$$"
# Could add removal of the shared directory to the EXIT trap, but probably we
# shouldn't because pdq (and maybe SGE) may react badly
# trap "rm -rf $lockfile $shared_d &" EXIT
# Create the per-run shared directory and place a copy of the parameter file
# in it (cp -p keeps the file's mode and timestamps).
mkdir -- "$shared_d" || die "'mkdir $shared_d' failed!"

cp -p -- "$params" "$shared_d/" || die "params cp failed!"
# Stage the ms2 file in the shared directory: compressed inputs are
# decompressed there, plain ones are symlinked back to the original.
# NOTE(review): the decompressed copies keep their '.gz'/'.bz2' suffix in the
# shared directory -- confirm that is what the later greylag steps expect.
*.gz) zcat < "$f" > "$shared_d/$f" || die "ms2 zcat failed" ;;
*.bz2) bzcat < "$f" > "$shared_d/$f" || die "ms2 bzcat failed" ;;
*) ln -s "$PWD/$f" "$shared_d/" || die "ms2 ln -s failed" ;;
# Name of the results file that is expected in the shared directory after
# processing.
results="$jobname.out.xml.gz"

# Last path component of the current directory; used in the SGE job name.
jobbasedir=$(basename -- "$PWD")
#########################################################################
# submit the job in this shared directory so that the nodes can see it
pushd "$shared_d" > /dev/null || die "pushd failed!"
142 #shared dir: $shared_d
# Use of *.ms2 implies that there are no such files here except for the ones
# we just created.  Fix?
# [NB: the ms2 arguments must be given in the same order each time!]

# split ms2 files (on master, for now)
155 $PYTHON $greylag $greylag_options \
157 --part-split=$PARTS \
159 || die
"round 0 setup on master failed"
# Announce that round 0 processing is starting.
err round 0 processing
# The -pe option is currently a hack to try to stop multiple jobs from being
# run on the same node.

# first do parts on nodes
167 qsub
-sync y
-r y
-b y
-cwd -V -hard -l mem_free
=250M \
170 -N "greylag-$jobbasedir-$jobname-$$" \
171 -e 'round-1.$TASK_ID.err' \
172 -o 'round-1.$TASK_ID.out' \
173 $PYTHON $greylag $greylag_options \
175 --part='${SGE_TASK_ID}of${SGE_TASK_LAST}' \
# Print every per-task round-1 .out/.err file, ordered numerically by the
# task-id field of its name.  The trailing /dev/null keeps cat from reading
# stdin when no such files exist.  The command substitution is left unquoted
# on purpose: it must split into one argument per file name.
cat $(ls -1 round-1.*.{out,err} | sort -t . -k 2n,3) /dev/null
# Give up if the saved qsub exit status is anything other than 0.
if [ "$qsub_status" != 0 ]; then
  die "round 0 on nodes failed"
fi
185 # now merge results (on master, for now)
186 $PYTHON $greylag $greylag_options \
188 --part-merge=$PARTS \
190 || die
"round 0 merge on master failed"
# Return to the directory we were in before entering the shared directory.
popd > /dev/null || die "popd failed!"
#########################################################################
# The run only counts as successful if the results file was produced; if so,
# copy it (preserving mode and timestamps) into the current directory.
if ! [ -e "$shared_d/$results" ]; then
  die 'no results file present after processing--job failed?'
fi

cp -p -- "$shared_d/$results" . || die "cp failed!"
# for now, don't do this
#rm -fr "$shared_d" $lockfile
# Record success by creating the "<jobname>.done-ok" marker file; a failure
# to create it is reported through err.
touch "$jobname.done-ok" || err "touch failed"