virt.virt_test_utils: run_autotest - 'tar' needs relative paths to strip the leading '/'
[autotest-zwu.git] / client / bin / cpuset.py
blob68fe50a37e92ec0441e3f46fe891d607591f1008
1 # Copyright 2007-2010 Google Inc. Released under the GPL v2
2 __author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"
4 # A basic cpuset/cgroup container manager for limiting memory use during tests
5 # for use on kernels not running some site-specific container manager
7 import os, sys, re, glob, fcntl, logging
8 from autotest_lib.client.bin import utils
9 from autotest_lib.client.common_lib import error
11 SUPER_ROOT = '' # root of all containers or cgroups
12 NO_LIMIT = (1 << 63) - 1 # containername/memory.limit_in_bytes if no limit
14 # propio service classes:
15 PROPIO_PRIO = 1
16 PROPIO_NORMAL = 2
17 PROPIO_IDLE = 3
19 super_root_path = '' # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
20 cpuset_prefix = None # usually 'cpuset.'; '' on 2.6.18
21 fake_numa_containers = False # container mem via numa=fake mem nodes, else pages
22 mem_isolation_on = False
23 node_mbytes = 0 # mbytes in one typical mem node
24 root_container_bytes = 0 # squishy limit on effective size of root container
def discover_container_style():
    """Probe the running kernel once and cache which container style is active.

    Sets module globals:
      super_root_path      -- mount point of the cpuset/cgroup fs, or None
      cpuset_prefix        -- attr-file prefix ('cpuset.' on cgroup kernels,
                              '' on 2.6.18 cpuset kernels)
      mem_isolation_on     -- True if any memory isolation is available
      fake_numa_containers -- True if memory is carved up via numa=fake nodes
      node_mbytes          -- mbytes per fake numa node (fake-numa style only)
      root_container_bytes -- squishy root-size estimate (memcg style only)
    """
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes
    if super_root_path != '':
        return  # already looked up
    if os.path.exists('/dev/cgroup/tasks'):
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        if get_boot_numa():
            mem_isolation_on = fake_numa_containers = True
        else:  # memcg containers IFF compiled-in & mounted & non-fakenuma boot
            fake_numa_containers = False
            mem_isolation_on = os.path.exists(
                    '/dev/cgroup/memory.limit_in_bytes')
            # TODO: handle possibility of where memcg is mounted as its own
            # cgroup hierarchy, separate from cpuset??
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        mem_isolation_on = fake_numa_containers = get_boot_numa() != ''
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = fake_numa_containers = False
    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)
    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
    elif mem_isolation_on:  # memcg-style containers
        # For now, limit total of all containers to using just 98% of system's
        # visible total ram, to avoid oom events at system level, and avoid
        # page reclaim overhead from going above kswapd highwater mark.
        system_visible_pages = utils.memtotal() >> 2
        usable_pages = int(system_visible_pages * 0.98)
        root_container_bytes = usable_pages << 12
        logging.debug('root_container_bytes: %s',
                      utils.human_format(root_container_bytes))
def need_mem_containers():
    """Raise AutotestError unless memory-isolation containers are enabled."""
    discover_container_style()
    if mem_isolation_on:
        return
    raise error.AutotestError('Mem-isolation containers not enabled '
                              'by latest reboot')
def need_fake_numa():
    """Raise AutotestError unless fake-numa mem nodes are enabled."""
    discover_container_style()
    if fake_numa_containers:
        return
    raise error.AutotestError('fake=numa not enabled by latest reboot')
def full_path(container_name):
    """Map a container name to its absolute path under the cgroup/cpuset root."""
    discover_container_style()
    path = os.path.join(super_root_path, container_name)
    return path
def unpath(container_path):
    """Inverse of full_path(): strip the super-root prefix and its slash."""
    prefix_len = len(super_root_path) + 1
    return container_path[prefix_len:]
def cpuset_attr(container_name, attr):
    """Path of a cpuset attribute file, applying the kernel-style prefix."""
    discover_container_style()
    attr_file = cpuset_prefix + attr
    return os.path.join(super_root_path, container_name, attr_file)
def io_attr(container_name, attr):
    """Path of a propio 'io.' attribute file for the container."""
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    attr_file = 'io.' + attr
    return os.path.join(super_root_path, container_name, attr_file)
def tasks_path(container_name):
    """Path of the container's 'tasks' pid-list file."""
    return os.path.join(full_path(container_name), 'tasks')
def mems_path(container_name):
    """Path of the container's memory-nodes attribute file."""
    return cpuset_attr(container_name, 'mems')
def memory_path(container_name):
    """Path prefix for the container's memcg 'memory.*' attribute files."""
    return os.path.join(super_root_path, container_name, 'memory')
def cpus_path(container_name):
    """Path of the container's cpus attribute file."""
    return cpuset_attr(container_name, 'cpus')
def container_exists(name):
    """True if name is non-None and its tasks file exists on disk."""
    if name is None:
        return False
    return os.path.exists(tasks_path(name))
def move_tasks_into_container(name, tasks):
    """Move each pid in tasks into the named container, ignoring dead pids."""
    task_file = tasks_path(name)
    for task in tasks:
        try:
            logging.debug('moving task %s into container "%s"', task, name)
            utils.write_one_line(task_file, task)
        except Exception:
            if not utils.pid_is_alive(task):
                # task is gone or zombie so ignore this exception
                continue
            raise  # task exists but couldn't move it
def move_self_into_container(name):
    """Place the current process into the named container."""
    my_pid = str(os.getpid())
    move_tasks_into_container(name, [my_pid])
    logging.debug('running self (pid %s) in container "%s"', my_pid, name)
def _avail_mbytes_via_nodes(parent):
    """Total mbytes of mem nodes available for new containers in parent."""
    free_nodes = available_exclusive_mem_nodes(parent)
    total = nodes_avail_mbytes(free_nodes)
    # don't have exact model for how container mgr measures mem space
    # better here to underestimate than overestimate
    total -= node_mbytes // 2
    if total < 0:
        total = 0
    return total
def _avail_bytes_via_pages(parent):
    """Memory bytes of parent allocatable exclusively to new child containers.

    Excludes mem previously allocated to existing children.
    """
    pattern = os.path.join(full_path(parent), '*', 'memory.limit_in_bytes')
    remaining = container_bytes(parent)
    for child_limit_file in glob.glob(pattern):
        child = unpath(os.path.dirname(child_limit_file))
        remaining -= container_bytes(child)
    return remaining
def avail_mbytes(parent=SUPER_ROOT):
    """Total mbytes available in parent, for exclusive use in new containers."""
    if not fake_numa_containers:
        return _avail_bytes_via_pages(parent) >> 20
    return _avail_mbytes_via_nodes(parent)
def delete_leftover_test_containers():
    """Recover mems and cores tied up by containers of prior failed tests."""
    for leftover in inner_containers_of(SUPER_ROOT):
        _release_container_nest(leftover)
def my_lock(lockname):
    """Take an exclusive flock for lockname (e.g. 'inner'); return the open file.

    Pass the returned file to my_unlock() to release.
    """
    autodir = os.environ['AUTODIR']
    lock_path = os.path.join(autodir, '.cpuset.lock.' + lockname)
    lockfile = open(lock_path, 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    return lockfile
def my_unlock(lockfile):
    """Release and close a lock file obtained from my_lock()."""
    fcntl.flock(lockfile, fcntl.LOCK_UN)
    lockfile.close()
# Convert '1-3,7,9-12' to set(1,2,3,7,9,10,11,12)
def rangelist_to_set(rangelist):
    """Parse a kernel-style range-list string into a set of ints.

    Raises ValueError on any piece that is neither N nor N-M.
    """
    result = set()
    if not rangelist:
        return result
    for piece in rangelist.split(','):
        m = re.match(r'^(\d+)-(\d+)$', piece)
        if m:
            lo, hi = int(m.group(1)), int(m.group(2))
            result.update(range(lo, hi + 1))
        elif re.match(r'^(\d+)$', piece):
            result.add(int(piece))
        else:
            raise ValueError('Cannot understand data input: %s %s'
                             % (piece, rangelist))
    return result
def my_container_name():
    """Current process's inherited or self-built container name.

    Read from /proc within /dev/cpuset or /dev/cgroup; '' for root container.
    """
    cpuset_line = utils.read_one_line('/proc/%i/cpuset' % os.getpid())
    return cpuset_line[1:]  # strip leading /
def get_mem_nodes(container_name):
    """All mem nodes now available to a container, both exclusive & shared."""
    file_name = mems_path(container_name)
    if not os.path.exists(file_name):
        return set()
    return rangelist_to_set(utils.read_one_line(file_name))
def _busy_mem_nodes(parent_container):
    """Set of numa memory nodes now used (exclusively or shared)
    by existing children of parent container."""
    busy = set()
    mem_files_pattern = os.path.join(full_path(parent_container),
                                     '*', cpuset_prefix+'mems')
    for mem_file in glob.glob(mem_files_pattern):
        # Convert the glob's absolute path back to a container name, as
        # get_mem_nodes() expects (matches _avail_bytes_via_pages).  The
        # original passed the absolute path, which only worked because
        # os.path.join discards its first argument when the second one
        # is absolute.
        child_container = unpath(os.path.dirname(mem_file))
        busy |= get_mem_nodes(child_container)
    return busy
def available_exclusive_mem_nodes(parent_container):
    """Numa memory nodes of parent allocatable exclusively to new children.

    Excludes nodes now allocated to existing children.
    """
    need_fake_numa()
    free_nodes = get_mem_nodes(parent_container)
    free_nodes -= _busy_mem_nodes(parent_container)
    return free_nodes
def my_mem_nodes():
    """Set of numa memory nodes owned by current process's container."""
    discover_container_style()
    if mem_isolation_on:
        return get_mem_nodes(my_container_name())
    return set()  # as expected by vmstress
def my_available_exclusive_mem_nodes():
    """Subset of my container's numa mem nodes allocatable to new children.

    Excludes any nodes now allocated to existing children.
    """
    return available_exclusive_mem_nodes(my_container_name())
def node_avail_kbytes(node):
    """Kbytes of one mem node; crude — assumes a fixed numa node size."""
    return node_mbytes << 10
def nodes_avail_mbytes(nodes):
    """Combined user+avail size of the given nodes, in Mbytes."""
    total_kbytes = sum(node_avail_kbytes(n) for n in nodes)
    return total_kbytes // 1024
def container_bytes(name):
    """Memory limit of the named container, in bytes.

    Fake-numa style: sum of the container's mem node sizes.  Memcg style:
    the nearest enforced memory.limit_in_bytes walking up toward the root;
    an unlimited root falls back to the squishy root_container_bytes
    estimate.
    """
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20
    else:
        while True:
            # renamed from 'file', which shadowed the builtin
            limit_file = memory_path(name) + '.limit_in_bytes'
            limit = int(utils.read_one_line(limit_file))
            if limit < NO_LIMIT:
                return limit
            if name == SUPER_ROOT:
                return root_container_bytes
            name = os.path.dirname(name)
def container_mbytes(name):
    """Memory limit of the named container, in Mbytes."""
    return container_bytes(name) >> 20
def mbytes_per_mem_node():
    """Mbyte size of standard numa mem node, as float.

    (Some nodes are bigger than this.)  Replaces utils.node_size().
    """
    numa = get_boot_numa()
    if numa.endswith('M'):
        return float(numa[:-1])  # mbyte size of fake nodes
    if numa:
        nodecnt = int(numa)  # fake numa mem nodes for container isolation
    else:
        nodecnt = len(utils.numa_nodes())  # phys mem-controller nodes
    # Use guessed total physical mem size, not kernel's
    # lesser 'available memory' after various system tables.
    return utils.rounded_memtotal() / (nodecnt * 1024.0)
def get_cpus(container_name):
    """Set of cpu indices assigned to the container; empty set if absent."""
    file_name = cpus_path(container_name)
    if not os.path.exists(file_name):
        return set()
    return rangelist_to_set(utils.read_one_line(file_name))
def get_tasks(container_name):
    """List of pids (as strings) now in the container; [] if it's gone."""
    file_name = tasks_path(container_name)
    try:
        f = open(file_name)
        try:
            tasks = [x.rstrip() for x in f.readlines()]
        finally:
            f.close()  # the original leaked this file handle
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []  # container doesn't exist anymore
    return tasks
def inner_containers_of(parent):
    """Names of all first-level child containers of parent."""
    task_files = glob.glob(os.path.join(full_path(parent), '*/tasks'))
    return [unpath(os.path.dirname(tf)) for tf in task_files]
def _release_container_nest(nest):
    """Destroy container `nest` and any nested sub-containers, bottom-up.

    Surviving tasks (e.g. the caller itself) are moved to nest's parent
    before the now-empty directory is removed.
    """
    nest_path = full_path(nest)
    if os.path.exists(nest_path):

        # bottom-up walk of tree, releasing all nested sub-containers
        for child in inner_containers_of(nest):
            _release_container_nest(child)

        logging.debug("releasing container %s", nest)

        # Transfer any survivor tasks (e.g. self) to parent container
        parent = os.path.dirname(nest)
        move_tasks_into_container(parent, get_tasks(nest))

        # remove the now-empty outermost container of this nest
        if os.path.exists(nest_path):
            os.rmdir(nest_path)  # nested, or dead manager
def release_container(container_name=None):
    """Destroy a container (defaults to the current process's own)."""
    my_container = my_container_name()
    if container_name is None:
        container_name = my_container
    _release_container_nest(container_name)
    displaced = my_container_name()
    if displaced == my_container:
        return
    logging.debug('now running self (pid %d) in container "%s"',
                  os.getpid(), displaced)
def remove_empty_prio_classes(prios):
    """Remove prio classes whose set of allowed priorities is empty.

    e.g 'no:3;rt:;be:3;id:' --> 'no:3;be:3'
    """
    kept = [p for p in prios.split(';') if p.split(':')[1]]
    return ';'.join(kept)
def all_drive_names():
    """List of all disk drive names sda,sdb,... (falls back to hda,hdb,...)."""
    paths = glob.glob('/sys/block/sd*') or glob.glob('/sys/block/hd*')
    return [os.path.basename(path) for path in paths]
def set_io_controls(container_name, disks=None, ioprio_classes=None,
                    io_shares=None, io_limits=None):
    """Set the propio controls for one container, for selected disks.

    Writes directly to /dev/cgroup/container_name/io.io_service_level
    without using containerd or container.py.
    See wiki ProportionalIOScheduler for definitions.

    ioprio_classes: list of service classes, one per disk,
        using numeric propio service classes as used by kernel API, namely
          1: RT, Real Time, aka PROPIO_PRIO
          2: BE, Best Effort, aka PROPIO_NORMAL  (the default)
          3: PROPIO_IDLE
    io_shares: list of disk-time-fractions, one per disk,
        as percentage integer 0..100  (default [95])
    io_limits: list of limit on/off, one per disk  (default: no limit)
          0: no limit, shares use of other containers' unused disk time
          1: limited, container's use of disk time is capped to given DTF
    disks defaults to all drives, with the three control lists broadcast
    from their first element to every disk.
    """
    # None sentinels replace the original mutable-list default arguments
    # ([], [PROPIO_NORMAL], [95], [0]); effective defaults are unchanged.
    if ioprio_classes is None:
        ioprio_classes = [PROPIO_NORMAL]
    if io_shares is None:
        io_shares = [95]
    if io_limits is None:
        io_limits = [0]
    if not disks:  # defaults to all drives
        disks = all_drive_names()
        io_shares = [io_shares[0]] * len(disks)
        ioprio_classes = [ioprio_classes[0]] * len(disks)
        io_limits = [io_limits[0]] * len(disks)
    if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares)
            and len(disks) == len(io_limits)):
        raise error.AutotestError('Unequal number of values for io controls')
    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        return  # kernel predates propio features
                # or io cgroup is mounted separately from cpusets
    disk_infos = []
    for disk, ioclass, limit, share in zip(disks, ioprio_classes,
                                           io_limits, io_shares):
        disk_info = ' '.join((disk, str(ioclass), str(limit), str(share)))
        utils.write_one_line(service_level, disk_info)
        disk_infos.append(disk_info)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(disk_infos))
def abbrev_list(vals):
    """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'."""
    pieces = []
    lo, hi = 0, -2
    # the -1 sentinel cannot extend any run of unsigned values,
    # so it always flushes the final run
    for v in sorted(vals) + [-1]:
        if v != hi + 1:
            # current run lo..hi has ended; emit it (if one was started)
            if lo == hi:
                pieces.append('%d' % lo)
            elif lo <= hi:
                pieces.append('%d-%d' % (lo, hi))
            lo = v
        hi = v
    return ','.join(pieces)
def create_container_with_specific_mems_cpus(name, mems, cpus):
    """Make a fake-numa cpuset container holding exactly mems and cpus."""
    need_fake_numa()
    os.mkdir(full_path(name))
    utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
    mems_spec = ','.join(str(m) for m in mems)
    cpus_spec = ','.join(str(c) for c in cpus)
    utils.write_one_line(mems_path(name), mems_spec)
    utils.write_one_line(cpus_path(name), cpus_spec)
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, len(cpus), len(get_mem_nodes(name)),
                  utils.human_format(container_bytes(name)) )
def create_container_via_memcg(name, parent, bytes, cpus):
    """Create container via direct memcg cgroup writes."""
    os.mkdir(full_path(name))
    # inherit parent's nodes
    parent_nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), parent_nodes)
    utils.write_one_line(memory_path(name) + '.limit_in_bytes', str(bytes))
    utils.write_one_line(cpus_path(name), ','.join(str(c) for c in cpus))
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, len(cpus), utils.human_format(container_bytes(name)))
def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    """Create a fake-numa container, choosing its mem nodes under a lock.

    Picks the highest-numbered free nodes of `parent` until they cover
    mbytes.  Raises AutotestError if the request exceeds either parent's
    total size or the memory still free among parent's existing children.
    """
    need_fake_numa()
    lockfile = my_lock('inner')  # serialize race between parallel tests
    try:
        # Pick specific mem nodes for new cpuset's exclusive use
        # For now, arbitrarily pick highest available node numbers
        needed_kbytes = mbytes * 1024
        nodes = sorted(list(available_exclusive_mem_nodes(parent)))
        kbytes = 0
        nodecnt = 0
        while kbytes < needed_kbytes and nodecnt < len(nodes):
            nodecnt += 1
            kbytes += node_avail_kbytes(nodes[-nodecnt])
        if kbytes < needed_kbytes:
            # not enough free nodes; report which constraint was violated
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                raise error.AutotestError(
                      "New container's %d Mbytes exceeds "
                      "parent container's %d Mbyte size"
                      % (mbytes, parent_mbytes) )
            else:
                raise error.AutotestError(
                      "Existing sibling containers hold "
                      "%d Mbytes needed by new container"
                      % ((needed_kbytes - kbytes)//1024) )
        mems = nodes[-nodecnt:]

        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)
def create_container_directly(name, mbytes, cpus):
    """Create a container using whichever style this kernel supports."""
    parent = os.path.dirname(name)
    if not fake_numa_containers:
        create_container_via_memcg(name, parent, mbytes << 20, cpus)
    else:
        _create_fake_numa_container_directly(name, parent, mbytes, cpus)
def create_container_with_mbytes_and_specific_cpus(name, mbytes,
        cpus=None, root=SUPER_ROOT, io=None, move_in=True, timeout=0):
    """\
    Create a cpuset container and move job's current pid into it
    Allocate the list "cpus" of cpus to that container

            name = arbitrary string tag
            mbytes = reqested memory for job in megabytes
            cpus = list of cpu indicies to associate with the cpuset
                    defaults to all cpus avail with given root
            root = the parent cpuset to nest this new set within
                    '': unnested top-level container
            io = arguments for proportional IO containers
            move_in = True: Move current process into the new container now.
            timeout = must be 0: persist until explicitly deleted.
    """
    # io=None replaces the original mutable default argument io={}
    if io is None:
        io = {}
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                  % root)
    if cpus is None:
        # default to biggest container we can make under root
        cpus = get_cpus(root)
    else:
        cpus = set(cpus)  # interface uses list
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')
    name = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(name)):
        raise error.AutotestError('Container %s already exists' % name)
    create_container_directly(name, mbytes, cpus)
    set_io_controls(name, **io)
    if move_in:
        move_self_into_container(name)
    return name
def get_boot_numa():
    """Boot-time numa=fake=xyz option value for current boot.

    e.g. numa=fake=nnn, numa=fake=nnnM, or '' when not set.
    """
    label = 'numa=fake='
    cmdline = utils.read_one_line('/proc/cmdline')
    for arg in cmdline.split():
        if arg.startswith(label):
            return arg[len(label):]
    return ''