Bug fixes: deterministic order of H lines, greylag-python, etc.
[greylag.git] / SIMR / greylag-process
blob7335927bbe22964e956409814b9c14e968f5fadf
#!/bin/bash
#
# This wrapper script will invoke greylag in parallel on our SGE cluster.

# This directory is assumed to be shared across all cluster nodes
SHAREDTMPDIR=/clusterfs/home/proteomics/tmp

# options passed to all greylag-grind invocations
greylag_options="-v"

# Divide this run into this many parts. This should probably be at least 5x
# the number of cluster nodes available. At the moment, this must be an exact
# power of ten.
# NOTE(review): the default of 3000 is not a power of ten -- confirm whether
# the restriction stated above still applies.
# The value can be overridden by setting GREYLAG_PARTS in the environment.
PARTS=${GREYLAG_PARTS:-3000}

# Name this script was invoked as; used as a prefix in usage and log messages.
# Quoted so that an invocation path containing whitespace is handled.
p=$(basename -- "$0")
#######################################
# Print the command-line synopsis to stderr and terminate the script.
# Globals:   p (read) - program name shown in the synopsis
# Arguments: none
# Outputs:   usage text on stderr; nothing on stdout
# Returns:   never returns; exits with status 2, because this is only
#            called when the command line is invalid and callers must be
#            able to tell that nothing was processed.
#######################################
usage() {
  cat <<EOF 1>&2
usage: $p [-l] <greylag.conf> <ms2-file> [ <ms2-file>... ]

Process a set of ms2 files as specified by the given configuration file (which
must end in '.conf' and should probably be in the current directory).

If the '-l' flag is given, stdout and stderr will be redirected to a
corresponding '.log' file.

EOF
  exit 2
}
#######################################
# Write a timestamped diagnostic line to stderr.
# Globals:   p (read) - program name used as the message prefix
# Arguments: the message words; they are joined with single spaces
# Outputs:   "<Mon dd HH:MM:SS> <p>: <message>" on stderr
# Returns:   status of printf
#######################################
err() {
  # Pin IFS so "$*" always joins the words with a single space.
  local IFS=' '
  # printf with a fixed format prints the message verbatim, whatever
  # characters it contains.
  printf '%s %s: %s\n' "$(date '+%b %e %H:%M:%S')" "$p" "$*" 1>&2
}
#######################################
# Report a fatal error, leave a failure marker for the job, and exit.
# Globals:   jobname (read) - when set and non-empty, the marker file
#            "$jobname.done-failed" is created in the current directory
# Arguments: the error message words (passed through to err)
# Outputs:   the message on stderr via err
# Returns:   never returns; exits with status 1
#######################################
die() {
  err "$@"
  # die can be reached before jobname has been assigned; in that case there
  # is no job to mark, and touching ".done-failed" would only leave a stray
  # hidden file behind.
  if [[ -n "${jobname:-}" ]]; then
    touch -- "$jobname.done-failed" || err "touch failed"
  fi
  exit 1
}
# --- command-line handling ---------------------------------------------------

# An optional leading '-l' asks for all later output to go to a log file.
# Initialise the flag explicitly so that a stray 'logging' variable in the
# environment cannot switch logging on.
logging=0
if [[ "${1:-}" == "-l" ]]; then
  shift
  logging=1
fi

# Need at least a configuration file and one ms2 file.
if (( $# < 2 )); then
  usage
fi

config="$1"

case "$config" in
  *.conf) ;;
  *) die "parameter filename must end in '*.conf'" ;;
esac

# The job name is the configuration file name without directory or '.conf'.
jobname=$(basename -- "$config" .conf)

# Clear completion markers left by an earlier run. The markers are written
# as "$jobname.done-*" in the current directory, so remove them by that same
# name rather than by the (possibly directory-qualified) config path.
rm -f -- "$jobname".done-*

shift
for f in "$@"; do
  if [[ "$(dirname -- "$f")" != "." ]]; then
    die "file '$f' must be in the current working directory"
  fi
  case "$f" in
    *.ms2) ;;
    *) die "argument file '$f' should end in '.ms2'" ;;
  esac
done

[[ -e "$config" ]] || die "'$config' not found"

# Make sure the current directory is group-writable with the setgid bit set;
# a failure to change the mode is reported but is not fatal.
if ! ls -ld . | grep -q '^drwxrws'; then
  chmod g+rwxs . || err "attempt to make this directory group writable failed"
fi
# Do some basic locking. This tries to prevent simultaneous runs on the same
# parameter file, which would produce output to the same file, wasting
# resources and causing confusion.

lockfile="$jobname.lock"

# The lock is a symlink whose target is the owner's pid. 'ln -s' fails if the
# link already exists, so whichever process created it wins; the failure is
# deliberately ignored here and detected by reading the link back.
ln -s -- "$$" "$lockfile" 2>/dev/null || true
lockpid=$(readlink -- "$lockfile")

# Quoted comparison: an empty or unexpected value must count as "not ours".
if [[ "$lockpid" != "$$" ]]; then
  die "this directory locked by another process (pid = $lockpid)?
remove $lockfile if not"
fi

# Only install the cleanup once we know the lock is ours; installing it
# earlier would make a losing process delete the winner's lock on exit.
trap 'rm -f -- "$lockfile"' EXIT
# point of no return
# With '-l', detach stdin and send stdout and stderr to "$jobname.log".
if [[ "${logging:-}" == "1" ]]; then
  exec < /dev/null > "$jobname.log" 2>&1
fi

# priority?

# Be very careful with quoting, as these names may eventually come from
# Windows users...

shared_d="$SHAREDTMPDIR/greylag-$(date +%s)-$$" # unique

# Could add removal of the shared directory to the EXIT trap, but probably we
# shouldn't because pdq (and maybe SGE) may react badly
# trap "rm -rf $lockfile $shared_d &" EXIT

mkdir -- "$shared_d" || die "'mkdir $shared_d' failed!"

cp -p -- "$config" "$shared_d/" || die "config cp failed!"

err set up work directory, recreating indices if necessary
for f in "$@"; do
  idx="$f.idx"
  # Rebuild the index when it is missing or not newer than its ms2 file.
  if ! [[ -e "$idx" && "$idx" -nt "$f" ]]; then
    greylag-index-spectra "$f" \
      || die "greylag-index-spectra failed"
  fi
  cp -p -- "$f" "$idx" "$shared_d/" || die "ms2/idx cp failed"
done

# Last component of the submission directory; becomes part of the job name.
jobbasedir=$(basename -- "$PWD")
#########################################################################
# submit the job in this shared directory so that the nodes can see it
pushd "$shared_d" > /dev/null || die "pushd failed!"

# use err

#cat <<EOF
#job name: $jobname
#shared dir: $shared_d
#config: $config
#ms2: $@
#EOF
#ls -l $shared_d/

err search

# It's important that the same ms2 arguments be given (in the same order) each
# time.

# The single-quoted arguments are intentional: $TASK_ID and the
# greylag-sge-slice substitution must not be expanded by this shell.
# $greylag_options is left unquoted on purpose so it splits into options.
# NOTE(review): "$config" is passed as given, while only its basename was
# copied into the shared directory -- confirm for configs outside the cwd.
# shellcheck disable=SC2016,SC2086
qsub -sync y -r y -b y -S /bin/bash -cwd -V -hard -l virtual_free=250M \
  -q all.q \
  -t "1-$PARTS" \
  -N "greylag-$jobbasedir-$jobname-$$" \
  -e 'grind.$TASK_ID.err' \
  -o 'grind.$TASK_ID.out' \
  greylag-grind $greylag_options \
  --work-slice '$(greylag-sge-slice ${SGE_TASK_ID} ${SGE_TASK_LAST})' \
  id "$config" "$@"
qsub_status=$?

# Replay the per-task output in task order. The patterns match the names
# given to qsub above ("grind.<task>.out" / "grind.<task>.err"). find emits
# "./grind.<task>.<ext>", so the task number is the third '.'-separated
# field. The trailing /dev/null keeps cat from reading stdin when nothing
# matched. Word splitting of the file list is intended.
# shellcheck disable=SC2046
cat $(find . \( -name 'grind.*.out' -o -name 'grind.*.err' \) \
  | sort -t . -k 3,3n) /dev/null
if [[ "$qsub_status" != 0 ]]; then
  die "grind on nodes failed"
fi

# Every part must have produced exactly one result file. Compare numerically
# so that any padding in the wc output cannot cause a false mismatch.
gwr_count=$(find . -name '*.gwr' | wc -l)
if [[ "$gwr_count" -ne "$PARTS" ]]; then
  die "SGE/grind on nodes failed"
fi

popd > /dev/null || die "popd failed!"
#########################################################################
# now merge results (on master, for now)

err merge results

# Feed the list of per-part result files to greylag-merge on stdin; the
# merged output is written to "$jobname.gwr" in the current directory.
find "$shared_d" -name '*.gwr' \
  | greylag-merge --files-on-stdin "$jobname.gwr" \
  || die "merge failed"

err write sqt files

greylag-sqt "$jobname.gwr" || die "sqt write failed"

# for now, don't do this
#rm -fr "$shared_d" $lockfile

err complete
# Success marker; a failure to create it is reported but the exit status
# stays 0.
touch -- "$jobname.done-ok" || err "touch failed"
exit 0