*** empty log message ***
[charm.git] / src / arch / bluegenep / charmrun
blob160acab394de17edf45443a7b80ab349acc03f0e
#!/bin/sh
#
# Conv-host for MPI:
#  Translates +pN-style conv-host options into
#  mpirun -npN options.
#
# After option parsing the following globals are set:
#   pes          number of processors requested with +p (default 1)
#   args         every other argument, joined with single spaces, to be handed
#                to mpirun (deliberately a flat string: it is word-split again
#                at the point of use, so arguments containing whitespace are
#                not preserved)
#   machinefile  value given with -machinefile, or empty

# parse_args ARG...
#   Recognised options:
#     +ppn N | +ppnN    passed through unchanged inside $args
#     +p N   | +pN      sets $pes and is removed from $args
#     -machinefile F    sets $machinefile and is moved to the FRONT of $args
#   Anything else is appended to $args in order.
#   Exits with status 1 if an option that needs a value is the last argument.
parse_args() {
  args=""
  pes=1
  machinefile=""

  while [ $# -gt 0 ]; do
    # Options taking a separate value: refuse a missing value up front.
    # Without this check the extra `shift` below would run with nothing left
    # to shift, which aborts some shells with an unhelpful message.
    case "$1" in
      +ppn | +p | -machinefile)
        if [ $# -lt 2 ]; then
          echo "charmrun> option $1 requires a value" >&2
          exit 1
        fi
        ;;
    esac

    # Order matters: the +ppn patterns must be tried before the +p ones,
    # because "+p*" would also match "+ppn...".
    case "$1" in
      +ppn)
        args="$args +ppn $2"
        shift
        ;;
      +ppn*)
        args="$args $1"
        ;;
      +p)
        pes=$2
        shift
        ;;
      +p*)
        # Strip the leading "+p" to obtain the processor count.
        pes=${1#+p}
        ;;
      -machinefile)
        machinefile=$2
        args=" $1 $2 $args"
        shift
        ;;
      *)
        args="$args $1"
        ;;
    esac
    shift
  done
}

parse_args "$@"

# Variables are passed as printf arguments, never as part of the format, so
# a '%' or backslash in a program argument cannot corrupt the message.
printf '\nRunning on %s processors: %s\n' "$pes" "$args"
# ---------------------------------------------------------------------------
# Launch step: choose how to start the program from the batch environment.
#
#   PBS_NODEFILE set           already inside a PBS job shell: run mpirun
#   LSB_HOSTS set              run cmpirun with the LSF options directly
#   PBS_QUEUE or LSF_QUEUE set interactive mode: write a batch script, submit
#                              it, poll the queue until the job's output file
#                              appears, print it and exit with the job status
#   none of the above          plain mpirun (optionally with MPI_MACHINEFILE)
#
# Reads the globals pes, args and machinefile. $args is expanded unquoted on
# purpose wherever a command is run: it is a flat, space-separated option
# string that must be word-split.
# ---------------------------------------------------------------------------

# End STATUS
#   Interrupt handler for interactive mode: cancels the submitted job with
#   $queue_kill, removes the generated batch script and exits with STATUS.
#   Uses the globals queue_kill, jobid and script.
End() {
  echo "Charmrun> $queue_kill $jobid ..."
  "$queue_kill" "$jobid"
  rm -f "$script"
  exit "$1"
}

# job_final_status RESULT_FILE OUTPUT_FILE
#   Prints the exit status to report for a finished batch job:
#     - 1 if RESULT_FILE does not exist (the job script never got as far as
#       recording mpirun's status, e.g. the job hung) or holds no number;
#     - the recorded status if it is non-zero;
#     - otherwise the exit status of grepping OUTPUT_FILE for the text
#       'End of program' (0 when present, non-zero when absent).
#   Meant to be called through $(...), so its scratch variable does not leak.
job_final_status() {
  if [ -f "$1" ]; then
    jfs_status=$(cat "$1")
  else
    jfs_status=1
  fi
  # Treat an empty or non-numeric result file as a failure.
  case "$jfs_status" in
    '' | *[!0-9]*) jfs_status=1 ;;
  esac
  if [ "$jfs_status" -eq 0 ]; then
    grep 'End of program' "$2" >/dev/null 2>&1
    jfs_status=$?
  fi
  printf '%s\n' "$jfs_status"
}

if [ -n "$PBS_NODEFILE" ]; then
  # we are in a job shell
  mpirun_cmd=$(command -v mpirun)
  case "$mpirun_cmd" in
    *mvapich2*)
      # mpirun resolves to a path containing "mvapich2": bracket the run
      # with mvapich2-start-mpd / mpdallexit.
      mvapich2-start-mpd
      # shellcheck disable=SC2086  # $args must be word-split
      mpirun -np "$pes" $args
      mpdallexit
      ;;
    *)
      # normal case: unless the user gave -machinefile, use the PBS node file
      if [ -z "$machinefile" ]; then
        args="-machinefile $PBS_NODEFILE $args"
      fi
      # shellcheck disable=SC2086
      echo mpirun -np $pes $args
      # shellcheck disable=SC2086
      mpirun -np "$pes" $args
      ;;
  esac
elif [ -n "$LSB_HOSTS" ]; then
  # Tungsten
  # shellcheck disable=SC2086
  echo cmpirun -lsf -poll -no_smp -gm_long 200000 $args
  # shellcheck disable=SC2086
  cmpirun -lsf -poll -no_smp -gm_long 200000 $args
elif [ -n "$PBS_QUEUE" ] || [ -n "$LSF_QUEUE" ]; then
  # Interactive mode: create, and submit a batch job
  script="charmrun_script.$$.sh"
  indir=$(pwd)
  output="$indir/charmrun_script.$$.stdout"
  result="$indir/charmrun_script.$$.result"
  statfile="tmp.$$"
  rm -f "$result"

  # Start from a clean slate so that nothing leaks in from the caller's
  # environment into the generated script or the submission loop.
  ppn=
  ncpus=
  mem=
  extra=
  jobid=

  # Some machine specific
  USE_LSF=0
  # 10 minutes
  walllimit=10
  queue_stat=qstat
  queue_qsub=qsub
  queue_kill=qdel
  hostname=$(hostname)
  # Note: $extra is single-quoted on purpose. The literal text $PBS_NODEFILE
  # is written into the job script and only expanded when the job runs.
  case "$hostname" in
    turing*.turing.uiuc.edu)
      ppn='#PBS -l nodes='$pes':ppn=1'
      extra='-machinefile $PBS_NODEFILE'
      ;;
    tg-login* | honest*.ncsa.uiuc.edu | abe*)
      # two processes per node (one when a single processor is requested)
      nodes=$(( ($pes + 1) / 2 ))
      if [ "$pes" -eq 1 ]; then
        ppns=1
      else
        ppns=2
      fi
      ppn='#PBS -l nodes='$nodes':ppn='$ppns
      extra='-machinefile $PBS_NODEFILE'
      ;;
    co-login*.ncsa.uiuc.edu)
      mem='#PBS -l mem=500mb'
      ncpus="#PBS -l ncpus=$pes"
      ;;
    tun*)
      USE_LSF=1
      queue_stat=bjobs
      queue_qsub=bsub
      queue_kill=bkill
      ;;
    *)
      ncpus="#PBS -l ncpus=$pes"
      ;;
  esac

  # Write the batch script. The heredocs are unquoted: plain $var is filled
  # in now, while \$var is left for the job's own shell to expand.
  if [ "$USE_LSF" -eq 0 ]; then
    mpirun=$(command -v mpirun)
    cat > "$script" << EOF
#!/bin/sh
# This is a charmrun-generated PBS batch job script.
# The lines starting with #PBS are queuing system flags:
#
$ppn
#
$ncpus
#
#PBS -l walltime=$walllimit:00
#
$mem
#
#PBS -q $PBS_QUEUE
#
#PBS -N autobuild
#
#PBS -j oe
#
#PBS -o $output

cd $indir

cat \$PBS_NODEFILE
$mpirun -np $pes $extra $args

# Save mpirun exit status
status=\$?
echo \$status > $result
EOF
  else
    # use LSF
    mpirun="cmpirun -lsf -poll -no_smp -gm_long 200000"
    cat > "$script" << EOF
#!/bin/sh
# This is a charmrun-generated LSF batch job script.
# The lines starting with #BSUB are queuing system flags:
#
#BSUB -J autobuild
#BSUB -W 0:$walllimit
#BSUB -n $pes
#BSUB -o $output

cd $indir
echo \$LSB_MCPU_HOSTS
$mpirun $args
# Save mpirun exit status
status=\$?
echo \$status > $result
EOF
  fi

  echo "Submitting batch job for> $mpirun -np $pes $args"
  echo " using the command> $queue_qsub $script"
  chmod 755 "$script"

  # Keep submitting until the queueing system hands back a job ID.
  # PBS takes the script as an argument; LSF reads it on stdin and the ID is
  # extracted by keeping only the digits of the last output line.
  while [ -z "$jobid" ]; do
    if [ "$USE_LSF" -eq 0 ]; then
      jobid=$("$queue_qsub" "$script" | tail -n 1)
    else
      jobid=$("$queue_qsub" < "$script" | tail -n 1 | sed -e 's/[^0-9]*//g')
    fi
  done
  echo "Job enqueued under job ID $jobid"

  # kill job if interrupted
  trap 'End 1' INT QUIT

  retry=0
  # Wait for the job to complete, by checking its status
  while true; do
    "$queue_stat" "$jobid" > "$statfile"
    exitstatus=$?

    if [ -f "$output" ]; then
      # The job is done-- print its output
      rm "$statfile"
      status=$(job_final_status "$result" "$output")
      cat "$output"
      rm -f "$result"
      # Keep the script and output around for inspection when the job failed.
      if [ "$status" -eq 0 ]; then
        rm -f "$script" "$output"
      fi
      exit "$status"
    fi

    # The job is still queued or running-- print status and wait
    tail -n 1 "$statfile"
    rm "$statfile"

    # Job ID may not exist now
    if [ "$exitstatus" -ne 0 ]; then
      # retry a few times when error occurs
      retry=$((retry + 1))
      if [ "$retry" -gt 6 ]; then
        echo "Charmrun> too many errors, abort!"
        exit 1
      fi
      sleep 15
    else
      # job still in queue
      retry=0
      sleep 20
    fi
  done
else
  # No batch system detected: run mpirun directly, honouring MPI_MACHINEFILE.
  if [ -n "$MPI_MACHINEFILE" ]; then
    args=" -machinefile $MPI_MACHINEFILE $args"
  fi
  echo "charmrun> mpirun -np $pes $args"
  # shellcheck disable=SC2086
  mpirun -np "$pes" $args
fi