# NOTE(review): the listing this file was recovered from skips one line inside
# the import block; autotest scripts conventionally `import common` before any
# autotest_lib import to set up the package path -- confirm and restore if so.
import optparse
import pwd
import sys

from autotest_lib.cli import rpc, host
from autotest_lib.client.common_lib import host_queue_entry_states
# Command-line interface: a job id, an optional hostname, and an optional
# -w/--web flag naming the Autotest server to query.
parser = optparse.OptionParser(
    usage='Usage: %prog [options] <job id> [<hostname>]\n\n'
          'Describes why the given job on the given host has not started.')
parser.add_option('-w', '--web',
                  help='Autotest server to use (i.e. "autotest")')
# options carries the parsed flags; args carries the positional arguments.
options, args = parser.parse_args()
# NOTE(review): this section was recovered from a listing that skipped some
# source lines.  Statements tagged "(restored)" do not appear in that listing;
# they were inferred from how the surrounding code uses the names and must be
# confirmed against the canonical script.

# (restored) The usage string makes the job id the first positional argument,
# and it is formatted with %d further down, so it has to be an int.
if not args:
    parser.print_help()
    sys.exit(1)
job_id = int(args[0])

# Connect to the Autotest frontend selected by -w/--web.
autotest_host = rpc.get_autotest_server(options.web)
proxy = rpc.afe_comm(autotest_host)

# Does the job exist?
jobs = proxy.run('get_jobs', id=job_id)
if not jobs:  # (restored) guard around the message below
    print('No such job %s' % (job_id,))
    sys.exit(1)  # (restored) exit status is a guess
job = jobs[0]  # (restored) later code indexes `job` as a single record
# (restored) `owner` is used as an ACL login filter later on; presumably it
# comes from the job record -- the 'owner' key name is unverified.
owner = job['owner']

# Queue-entry statuses that mean the job has already started on a host.
RUNNING_HQE_STATUSES = host_queue_entry_states.ACTIVE_STATUSES

# any entry eligible for this host?
queue_entries = proxy.run('get_host_queue_entries', job__id=job_id)
### Divine why an atomic group job is or is not running.
# NOTE(review): this section was recovered from a listing that skipped some
# source lines.  Statements tagged "(restored)" do not appear in that listing;
# they were inferred from the surrounding code (a message without its guard,
# a name used without an initialiser, a format string without its arguments)
# and must be confirmed against the canonical script.  Exit statuses on
# restored sys.exit() calls are guesses.
if queue_entries and queue_entries[0]['atomic_group']:
    if queue_entries[0]['status'] in RUNNING_HQE_STATUSES:
        print('Job %d appears to have started (status: %s).' % (
                job_id, queue_entries[0]['status']))
        sys.exit(0)  # (restored)

    # Hosts in Repairing or Repair Failed will have Queued queue entries.
    # We shouldn't consider those queue entries as a multi-group job.
    repair_hostnames = []  # (restored) initialiser
    for queue_entry in queue_entries:
        entry_host = queue_entry['host']
        if (entry_host and entry_host['status']
                and entry_host['status'].startswith('Repair')):
            repair_hostnames.append(entry_host['hostname'])
        if queue_entry['status'] in ('Completed', 'Stopped'):
            print('This job has already finished.')
            sys.exit(0)  # (restored)

    all_queue_entries_have_hosts = all(
            queue_entry['host'] for queue_entry in queue_entries)
    if (not all_queue_entries_have_hosts and len(queue_entries) > 1 and
            not repair_hostnames):
        # We test repair_hostnames so that this message is not printed when
        # the script is run on an atomic group job which has hosts assigned
        # but is not running because too many of them are in Repairing or will
        # never run because hosts have exited Repairing into Repair Failed.
        print('This script does not support multi-group atomic group jobs.')
        print('Jobs scheduled in that state are typically unintentional.')
        print('Did you perhaps schedule the job via the web frontend and ask')
        print('that it run on more than 1 (atomic group) of hosts via the ')
        print('"Run on any" box? If so, always enter 1 there when scheduling')
        print('jobs on anything marked "(atomic group)".')
        print('%d non-started atomic group HostQueueEntries found for job %s'
              % (len(queue_entries), job_id))
        sys.exit(1)  # (restored) the script cannot analyse this case

    atomic_group_name = queue_entries[0]['atomic_group']['name']
    # Get the list of labels associated with this atomic group.
    atomic_labels = proxy.run('get_labels',
                              atomic_group__name=atomic_group_name)
    if not atomic_labels:
        print('Job requests atomic group %s but no labels' %
              (atomic_group_name,))
        print('(and thus no hosts) are associated with that atomic group.')

    job_sync_count = job['synch_count']
    # Ugh! This is returned as a comma separated str of label names.
    if job.get('dependencies'):
        job_dependency_label_names = job['dependencies'].split(',')
    else:  # (restored) branch
        job_dependency_label_names = []

    meta_host_name = queue_entries[0]['meta_host']
    # `meta_host` is only ever tested for truthiness below.
    # NOTE(review): the lookup filters on atomic_group__name although
    # meta_host_name looks like a label name -- confirm the intended filter.
    meta_host = None
    if meta_host_name:  # (restored) guard
        meta_host_labels = proxy.run('get_labels',
                                     atomic_group__name=meta_host_name)
        if meta_host_labels:
            meta_host = meta_host_labels[0]
        else:
            # Indexing [0] on an empty result used to raise IndexError.
            print('Could not look up meta_host label %s; ignoring it.' %
                  (meta_host_name,))

    # A mapping from label name -> a list of hostnames usable for this job.
    runnable_atomic_label_names = {}

    # A mapping from label name -> (summary message, host_exclude_reasons map
    # as built within the loop below).  Any atomic group labels in this map
    # are not ready to run the job for the reasons contained within.
    atomic_label_exclude_reasons = {}

    for label in atomic_labels:
        label_name = label['name']
        if meta_host and meta_host_name != label_name:
            print('Cannot run on atomic label %s due to meta_host %s.' % (
                    label_name, meta_host_name))
            continue  # (restored) this label cannot be used at all
        for dep_name in job_dependency_label_names:
            if dep_name != label_name:
                # (restored) format arguments, taken from the message text
                print('Not checking hosts in atomic label %s against' % (
                        label_name,))
                print('job dependency label %s. There may be less hosts' % (
                        dep_name,))
                print('than examined below available to run this job.')

        # Get the list of hosts associated with this atomic group label.
        atomic_hosts = proxy.run('get_hosts', multiple_labels=[label_name])

        # A map of hostname -> A list of reasons it can't be used.
        host_exclude_reasons = {}

        atomic_hostnames = [h['hostname'] for h in atomic_hosts]

        # Map hostnames to a list of ACL names on that host.
        acl_groups = proxy.run('get_acl_groups',
                               hosts__hostname__in=atomic_hostnames)
        hostname_to_acl_name_list = {}
        for acl in acl_groups:
            for acl_hostname in acl['hosts']:
                # (restored) appended value, per the comment above
                hostname_to_acl_name_list.setdefault(
                        acl_hostname, []).append(acl['name'])

        # Exclude any hosts that ACLs deny us access to.
        accessible_hosts = proxy.run('get_hosts',
                                     hostname__in=atomic_hostnames,
                                     aclgroup__users__login=owner)
        assert len(accessible_hosts) <= len(atomic_hosts)
        if len(accessible_hosts) != len(atomic_hosts):
            accessible_hostnames = set(h['hostname']
                                       for h in accessible_hosts)
            acl_excluded_hostnames = (set(atomic_hostnames) -
                                      accessible_hostnames)
            for excluded_hostname in acl_excluded_hostnames:
                # .get(): a host that appears in none of the returned ACL
                # groups must not raise KeyError here.
                acls = ','.join(
                        hostname_to_acl_name_list.get(excluded_hostname, []))
                # (restored) format arguments, taken from the message text
                host_exclude_reasons.setdefault(excluded_hostname, []).append(
                        'User %s does not have ACL access. ACLs: %s' % (
                                owner, acls))

        # Check for locked hosts.
        for locked_host in atomic_hosts:
            if not locked_host['locked']:
                continue
            locker = locked_host.get('locked_by') or 'UNKNOWN'
            msg = 'Locked by user %s on %s. No jobs will schedule on it.' % (
                    locker, locked_host.get('lock_time'))
            host_exclude_reasons.setdefault(
                    locked_host['hostname'], []).append(msg)

        # Exclude hosts that are not Ready.
        for atomic_host in atomic_hosts:
            hostname = atomic_host['hostname']
            if atomic_host['status'] == 'Ready':
                continue
            message = 'Status is %s' % (atomic_host['status'],)
            if atomic_host['status'] in ('Verifying', 'Pending', 'Running'):
                running_hqes = proxy.run(
                        'get_host_queue_entries', host__hostname=hostname,
                        status__in=RUNNING_HQE_STATUSES)
                if not running_hqes:  # (restored) guard
                    message += ' (unknown job)'
                else:
                    message += ' (job %d)' % running_hqes[0]['job']['id']
            host_exclude_reasons.setdefault(hostname, []).append(message)

        # If we don't have enough usable hosts, this group cannot run the job.
        usable_hostnames = [h['hostname'] for h in atomic_hosts
                            if h['hostname'] not in host_exclude_reasons]
        if len(usable_hostnames) < job_sync_count:
            message = ('%d hosts are required but only %d available.' %
                       (job_sync_count, len(usable_hostnames)))
            atomic_label_exclude_reasons[label_name] = (message,
                                                        host_exclude_reasons)
        else:  # (restored) branch
            runnable_atomic_label_names[label_name] = usable_hostnames

    # Report the labels that cannot run the job, with per-host reasons.
    for label_name, reason_tuple in atomic_label_exclude_reasons.items():
        job_reason, hosts_reasons = reason_tuple
        print('Atomic group "%s" via label "%s" CANNOT run job %d because:' % (
                atomic_group_name, label_name, job_id))
        # (restored) job_reason is unpacked above and otherwise unused.
        print(job_reason)
        for hostname in sorted(hosts_reasons):
            for reason in hosts_reasons[hostname]:
                print('%s\t%s' % (hostname, reason))

    # Report the labels that look able to run the job right now.
    for label_name, host_list in runnable_atomic_label_names.items():
        print('Atomic group "%s" via label "%s" is READY to run job %d on:' % (
                atomic_group_name, label_name, job_id))
        print(', '.join(host_list))
        print('Is the job scheduler healthy?')

    # (restored) Everything after this block handles non-atomic jobs only.
    sys.exit(0)
### Not an atomic group synchronous job:
# NOTE(review): this section was recovered from a listing that skipped some
# source lines.  Statements tagged "(restored)" do not appear in that listing;
# they were inferred from the surrounding code (a message without its guard,
# a name used without an initialiser, a format string without its arguments)
# and must be confirmed against the canonical script.  Exit statuses on
# restored sys.exit() calls are guesses.

# Work out which host to examine: the optional second positional argument,
# or the job's only queue entry when that entry already has a host.
if len(args) > 1:  # (restored) per the "[<hostname>]" usage string
    hostname = args[1]
elif len(queue_entries) == 1 and queue_entries[0]['host']:
    hostname = queue_entries[0]['host']['hostname']
else:
    parser.print_help()  # (restored)
    print('\nERROR: A hostname associated with the job is required.')
    sys.exit(1)  # (restored)

# Does the host exist?
hosts = proxy.run('get_hosts', hostname=hostname)
if not hosts:  # (restored) guard
    print('No such host %s' % (hostname,))
    sys.exit(1)  # (restored)
host = hosts[0]  # (restored) later code indexes `host` as a single record

# Boolean to track our findings.  We want to list all reasons it won't run,
# not just the first.
job_eligible = True  # (restored)

entries_for_this_host = [entry for entry in queue_entries
                         if entry['host']  # (restored) first condition
                         and entry['host']['hostname'] == hostname]
host_label_names = set(host['labels'])
eligible_metahost_entries = [entry for entry in queue_entries
                             if entry['meta_host'] and not entry['host']
                             and entry['meta_host'] in host_label_names
                             and not entry['complete']]

if entries_for_this_host:
    assert len(entries_for_this_host) == 1, (
        'Multiple entries for this job assigned to this host!')
    entry = entries_for_this_host[0]
    if entry['active'] or entry['complete']:
        print('Job already ran or is running on this host! (status: %s)' %
              (entry['full_status'],))
        sys.exit(0)  # (restored)
    is_metahost = False  # (restored)
else:
    # no entry for this host -- maybe an eligible metahost entry?
    if not eligible_metahost_entries:
        # (restored) tail of the message; exact wording unverified
        print("Host isn't scheduled for this job, and no eligible metahost "
              "entry exists")
        sys.exit(0)  # (restored)
    is_metahost = True  # (restored)

# meets atomic group requirements?
host_labels = proxy.run('get_labels', name__in=list(host_label_names))
host_atomic_group_labels = [label for label in host_labels
                            if label['atomic_group']]
host_atomic_group_name = None
# Bound up front so the mismatch message below can never hit an unbound name.
host_atomic_group_label = None
if host_atomic_group_labels:
    atomic_groups = set(label['atomic_group']['name']
                        for label in host_atomic_group_labels)
    if len(atomic_groups) != 1:
        print('Host has more than one atomic group!')
        print(list(atomic_groups))
        sys.exit(1)  # (restored)
    host_atomic_group_label = host_atomic_group_labels[0]
    host_atomic_group_name = host_atomic_group_label['atomic_group']['name']

job_atomic_groups = set(entry['atomic_group'] for entry in queue_entries)
assert len(job_atomic_groups) == 1, 'Job has more than one atomic group value!'
job_atomic_group = job_atomic_groups.pop()  # might be None
job_atomic_group_name = None
if job_atomic_group:  # (restored) guard
    job_atomic_group_name = job_atomic_group['name']

if host_atomic_group_name != job_atomic_group_name:
    if host_atomic_group_label:
        host_atomic_label_name = host_atomic_group_label['name']
    else:
        host_atomic_label_name = None
    # (restored) the third placeholder; exact wording unverified
    print('Job is for atomic group %s, but host is in atomic group %s '
          '(label %s)' %
          (job_atomic_group_name, host_atomic_group_name,
           host_atomic_label_name))
    job_eligible = False  # (restored)

# host locked?
if host['locked']:  # (restored) guard
    print('Host is locked by %s no jobs will schedule on it.' %
          (host['locked_by'],))
    job_eligible = False  # (restored)

# acl accessible?
accessible = proxy.run('get_hosts', hostname=hostname,
                       aclgroup__users__login=owner)
if not accessible:  # (restored) guard
    host_acls = ', '.join(
            group['name'] for group in
            proxy.run('get_acl_groups', hosts__hostname=hostname))
    owner_acls = ', '.join(
            group['name'] for group in
            proxy.run('get_acl_groups', users__login=owner))
    print('Host not ACL-accessible to job owner %s' % (owner,))
    print(' Host ACLs: %s' % (host_acls,))
    print(' Owner Acls: %s' % (owner_acls,))
    job_eligible = False  # (restored)

# meets dependencies?
job_deps_list = job['dependencies'].split(',')
# Bound up front: the only-if-needed check below reads job_deps even when
# the job has no dependencies.
job_deps = set()
if job_deps_list != ['']:
    job_deps = set(job_deps_list)
    unmet = job_deps - host_label_names
    if unmet:  # (restored) guard
        print("Host labels (%s) don't satisfy job dependencies: %s" %
              (', '.join(host_label_names), ', '.join(unmet)))
        job_eligible = False  # (restored)

# at this point, if the job is for an unassigned atomic group, things are too
# complicated to proceed
unassigned_atomic_group_entries = [entry for entry in queue_entries
                                   if entry['atomic_group']
                                   and not entry['host']]
if unassigned_atomic_group_entries:
    print("Job is for an unassigned atomic group. That's too complicated, I "
          "can't give you any definite answers. Sorry.")
    sys.exit(1)  # (restored)

# meets only_if_needed labels?
if is_metahost:  # (restored) guard -- confirm
    metahost_names = set(entry['meta_host']
                         for entry in eligible_metahost_entries)
    job_deps_and_metahosts = job_deps.union(metahost_names)
    for label in host_labels:
        unmet_exclusive_label = (label['only_if_needed'] and
                                 label['name'] not in job_deps_and_metahosts)
        if unmet_exclusive_label:
            print('Host contains "only if needed" label %s, unused by job '
                  'dependencies and metahosts' % (label['name'],))
            job_eligible = False  # (restored)

# host ready?
if host['status'] != 'Ready':
    if host['status'] == 'Pending':
        active = proxy.run('get_host_queue_entries',
                           host=host['id'], active=True)
        if not active:  # (restored) guard
            print('Host %s seems to be in "Pending" state incorrectly; please '
                  'report this to the Autotest team' % (hostname,))
    print('Host not in "Ready" status (status="%s")' % (host['status'],))
    job_eligible = False  # (restored)

# Final verdict.
if job_eligible:  # (restored) branch on the findings flag
    # (restored) format arguments, taken from the message text
    print("Job %s should run on host %s; if you've already waited about ten "
          "minutes or longer, it's probably a server issue or a bug." %
          (job_id, hostname))
    sys.exit(1)  # (restored)
else:
    print("All of the reasons this job is not running are listed above.")
    sys.exit(0)  # (restored)