# Copyright 2007-2010 Google Inc.  Released under the GPL v2

__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"

# A basic cpuset/cgroup container manager for limiting memory use during tests
#   for use on kernels not running some site-specific container manager

import os, sys, re, glob, fcntl, logging
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import error

SUPER_ROOT = ''      # root of all containers or cgroups
NO_LIMIT = (1 << 63) - 1   # containername/memory.limit_in_bytes if no limit

# propio service classes:
PROPIO_PRIO = 1
PROPIO_NORMAL = 2
PROPIO_IDLE = 3

super_root_path = ''     # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
cpuset_prefix = None     # usually 'cpuset.'; '' on 2.6.18
fake_numa_containers = False  # container mem via numa=fake mem nodes, else pages
mem_isolation_on = False
node_mbytes = 0          # mbytes in one typical mem node
root_container_bytes = 0  # squishy limit on effective size of root container

def discover_container_style():
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes
    if super_root_path != '':
        return  # already looked up
    if os.path.exists('/dev/cgroup/tasks'):
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        if get_boot_numa():
            mem_isolation_on = fake_numa_containers = True
        else:  # memcg containers IFF compiled-in & mounted & non-fakenuma boot
            fake_numa_containers = False
            mem_isolation_on = os.path.exists(
                    '/dev/cgroup/memory.limit_in_bytes')
            # TODO: handle possibility of where memcg is mounted as its own
            #       cgroup hierarchy, separate from cpuset??
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        mem_isolation_on = fake_numa_containers = get_boot_numa() != ''
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = fake_numa_containers = False

    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)
    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
    elif mem_isolation_on:  # memcg-style containers
        # For now, limit total of all containers to using just 98% of system's
        #   visible total ram, to avoid oom events at system level, and avoid
        #   page reclaim overhead from going above kswapd highwater mark.
        system_visible_pages = utils.memtotal() >> 2  # kbytes to 4k-page count
        usable_pages = int(system_visible_pages * 0.98)
        root_container_bytes = usable_pages << 12  # pages back to bytes
        logging.debug('root_container_bytes: %s',
                      utils.human_format(root_container_bytes))

def need_mem_containers():
    discover_container_style()
    if not mem_isolation_on:
        raise error.AutotestError('Mem-isolation containers not enabled '
                                  'by latest reboot')


def need_fake_numa():
    discover_container_style()
    if not fake_numa_containers:
        raise error.AutotestError('fake=numa not enabled by latest reboot')

def full_path(container_name):
    discover_container_style()
    return os.path.join(super_root_path, container_name)

def unpath(container_path):
    return container_path[len(super_root_path)+1:]

def cpuset_attr(container_name, attr):
    discover_container_style()
    return os.path.join(super_root_path, container_name, cpuset_prefix+attr)

def io_attr(container_name, attr):
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    return os.path.join(super_root_path, container_name, 'io.'+attr)

def tasks_path(container_name):
    return os.path.join(full_path(container_name), 'tasks')


def mems_path(container_name):
    return cpuset_attr(container_name, 'mems')


def memory_path(container_name):
    return os.path.join(super_root_path, container_name, 'memory')


def cpus_path(container_name):
    return cpuset_attr(container_name, 'cpus')


def container_exists(name):
    return name is not None and os.path.exists(tasks_path(name))

def move_tasks_into_container(name, tasks):
    task_file = tasks_path(name)
    for task in tasks:
        try:
            logging.debug('moving task %s into container "%s"', task, name)
            utils.write_one_line(task_file, task)
        except Exception:
            if utils.pid_is_alive(task):
                raise   # task exists but couldn't move it
            # task is gone or zombie so ignore this exception

def move_self_into_container(name):
    me = str(os.getpid())
    move_tasks_into_container(name, [me])
    logging.debug('running self (pid %s) in container "%s"', me, name)

def _avail_mbytes_via_nodes(parent):
    # total mbytes of mem nodes available for new containers in parent
    free_nodes = available_exclusive_mem_nodes(parent)
    mbytes = nodes_avail_mbytes(free_nodes)
    # don't have exact model for how container mgr measures mem space
    # better here to underestimate than overestimate
    mbytes = max(mbytes - node_mbytes//2, 0)
    return mbytes

def _avail_bytes_via_pages(parent):
    # Get memory bytes available to parent container which could
    #   be allocated exclusively to new child containers.
    # This excludes mem previously allocated to existing children.
    available = container_bytes(parent)
    mem_files_pattern = os.path.join(full_path(parent),
                                     '*', 'memory.limit_in_bytes')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = unpath(os.path.dirname(mem_file))
        available -= container_bytes(child_container)
    return available

def avail_mbytes(parent=SUPER_ROOT):
    # total mbytes available in parent, for exclusive use in new containers
    if fake_numa_containers:
        return _avail_mbytes_via_nodes(parent)
    else:
        return _avail_bytes_via_pages(parent) >> 20

def delete_leftover_test_containers():
    # recover mems and cores tied up by containers of prior failed tests:
    for child in inner_containers_of(SUPER_ROOT):
        _release_container_nest(child)

def my_lock(lockname):
    # lockname is 'inner'
    lockdir = os.environ['AUTODIR']
    lockname = os.path.join(lockdir, '.cpuset.lock.'+lockname)
    lockfile = open(lockname, 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    return lockfile

def my_unlock(lockfile):
    fcntl.flock(lockfile, fcntl.LOCK_UN)
    lockfile.close()

# Convert '1-3,7,9-12' to set(1,2,3,7,9,10,11,12)
def rangelist_to_set(rangelist):
    result = set()
    if not rangelist:
        return result
    for x in rangelist.split(','):
        if re.match(r'^(\d+)$', x):
            result.add(int(x))
        else:
            m = re.match(r'^(\d+)-(\d+)$', x)
            if m:
                start = int(m.group(1))
                end = int(m.group(2))
                result.update(set(range(start, end+1)))
            else:
                msg = 'Cannot understand data input: %s %s' % (x, rangelist)
                raise ValueError(msg)
    return result

def my_container_name():
    # Get current process's inherited or self-built container name
    #   within /dev/cpuset or /dev/cgroup.  Is '' for root container.
    name = utils.read_one_line('/proc/%i/cpuset' % os.getpid())
    return name[1:]   # strip leading /

def get_mem_nodes(container_name):
    # all mem nodes now available to a container, both exclusive & shared
    file_name = mems_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()

def _busy_mem_nodes(parent_container):
    # Get set of numa memory nodes now used (exclusively or shared)
    #   by existing children of parent container
    busy = set()
    mem_files_pattern = os.path.join(full_path(parent_container),
                                     '*', cpuset_prefix+'mems')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = os.path.dirname(mem_file)
        busy |= get_mem_nodes(child_container)
    return busy

def available_exclusive_mem_nodes(parent_container):
    # Get subset of numa memory nodes of parent container which could
    #   be allocated exclusively to new child containers.
    # This excludes nodes now allocated to existing children.
    need_fake_numa()
    available = get_mem_nodes(parent_container)
    available -= _busy_mem_nodes(parent_container)
    return available

def my_mem_nodes():
    # Get set of numa memory nodes owned by current process's container.
    discover_container_style()
    if not mem_isolation_on:
        return set()    # as expected by vmstress
    return get_mem_nodes(my_container_name())

def my_available_exclusive_mem_nodes():
    # Get subset of numa memory nodes owned by current process's
    # container, which could be allocated exclusively to new child
    # containers.  This excludes any nodes now allocated
    # to existing children.
    return available_exclusive_mem_nodes(my_container_name())

def node_avail_kbytes(node):
    return node_mbytes << 10  # crude; fixed numa node size

def nodes_avail_mbytes(nodes):
    # nodes' combined user+avail size, in Mbytes
    return sum(node_avail_kbytes(n) for n in nodes) // 1024

def container_bytes(name):
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20
    else:
        while True:
            file = memory_path(name) + '.limit_in_bytes'
            limit = int(utils.read_one_line(file))
            if limit < NO_LIMIT:
                return limit
            if name == SUPER_ROOT:
                return root_container_bytes
            name = os.path.dirname(name)  # unlimited, so inherit from parent

def container_mbytes(name):
    return container_bytes(name) >> 20

def mbytes_per_mem_node():
    # Get mbyte size of standard numa mem node, as float
    #   (some nodes are bigger than this)
    # Replaces utils.node_size().
    numa = get_boot_numa()
    if numa.endswith('M'):
        return float(numa[:-1])  # mbyte size of fake nodes
    elif numa:
        nodecnt = int(numa)  # fake numa mem nodes for container isolation
    else:
        nodecnt = len(utils.numa_nodes())  # phys mem-controller nodes
    # Use guessed total physical mem size, not kernel's
    #   lesser 'available memory' after various system tables.
    return utils.rounded_memtotal() / (nodecnt * 1024.0)

def get_cpus(container_name):
    file_name = cpus_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()

def get_tasks(container_name):
    file_name = tasks_path(container_name)
    try:
        tasks = [x.rstrip() for x in open(file_name).readlines()]
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []   # container doesn't exist anymore
    return tasks

def inner_containers_of(parent):
    pattern = os.path.join(full_path(parent), '*/tasks')
    return [unpath(os.path.dirname(task_file))
            for task_file in glob.glob(pattern)]

def _release_container_nest(nest):
    # Destroy a container, and any nested sub-containers
    nest_path = full_path(nest)
    if os.path.exists(nest_path):

        # bottom-up walk of tree, releasing all nested sub-containers
        for child in inner_containers_of(nest):
            _release_container_nest(child)

        logging.debug("releasing container %s", nest)

        # Transfer any survivor tasks (e.g. self) to parent container
        parent = os.path.dirname(nest)
        move_tasks_into_container(parent, get_tasks(nest))

        # remove the now-empty outermost container of this nest
        if os.path.exists(nest_path):
            os.rmdir(nest_path)  # nested, or dead manager

def release_container(container_name=None):
    # Destroy a container
    my_container = my_container_name()
    if container_name is None:
        container_name = my_container
    _release_container_nest(container_name)
    displaced = my_container_name()
    if displaced != my_container:
        logging.debug('now running self (pid %d) in container "%s"',
                      os.getpid(), displaced)

def remove_empty_prio_classes(prios):
    # remove prio classes whose set of allowed priorities is empty
    #    e.g  'no:3;rt:;be:3;id:'  -->  'no:3;be:3'
    return ';'.join(p for p in prios.split(';') if p.split(':')[1])

def all_drive_names():
    # list of all disk drives sda,sdb,...
    paths = glob.glob('/sys/block/sd*')
    if not paths:
        paths = glob.glob('/sys/block/hd*')
    return [os.path.basename(path) for path in paths]

def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL],
                    io_shares=[95], io_limits=[0]):
    # set the propio controls for one container, for selected disks
    # writing directly to /dev/cgroup/container_name/io.io_service_level
    #    without using containerd or container.py
    # See wiki ProportionalIOScheduler for definitions
    # ioprio_classes: list of service classes, one per disk
    #    using numeric propio service classes as used by kernel API, namely
    #       1: RT, Real Time, aka PROPIO_PRIO
    #       2: BE, Best Effort, aka PROPIO_NORMAL
    #       3: ID, Idle, aka PROPIO_IDLE
    # io_shares: list of disk-time-fractions, one per disk,
    #       as percentage integer 0..100
    # io_limits: list of limit on/off, one per disk
    #       0: no limit, shares use of other containers' unused disk time
    #       1: limited, container's use of disk time is capped to given DTF
    # ioprio_classes defaults to best-effort
    # io_limit defaults to no limit, use slack time
    if not disks:  # defaults to all drives
        disks = all_drive_names()
        io_shares      = [io_shares     [0]] * len(disks)
        ioprio_classes = [ioprio_classes[0]] * len(disks)
        io_limits      = [io_limits     [0]] * len(disks)
    if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares)
            and len(disks) == len(io_limits)):
        raise error.AutotestError('Unequal number of values for io controls')
    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        return  # kernel predates propio features
                #   or io cgroup is mounted separately from cpusets
    disk_infos = []
    for disk, ioclass, limit, share in zip(disks, ioprio_classes,
                                           io_limits, io_shares):
        parts = (disk, str(ioclass), str(limit), str(share))
        disk_info = ' '.join(parts)
        utils.write_one_line(service_level, disk_info)
        disk_infos.append(disk_info)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(disk_infos))

def abbrev_list(vals):
    """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'."""
    ranges = []
    lower = 0
    upper = -2
    for val in sorted(vals)+[-1]:
        if val != upper+1:
            if lower == upper:
                ranges.append(str(lower))
            elif lower < upper:
                ranges.append('%d-%d' % (lower, upper))
            lower = val
        upper = val
    return ','.join(ranges)

def create_container_with_specific_mems_cpus(name, mems, cpus):
    need_fake_numa()
    os.mkdir(full_path(name))
    utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
    utils.write_one_line(mems_path(name), ','.join(map(str, mems)))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, len(cpus), len(get_mem_nodes(name)),
                  utils.human_format(container_bytes(name)))

def create_container_via_memcg(name, parent, bytes, cpus):
    # create container via direct memcg cgroup writes
    os.mkdir(full_path(name))
    nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), nodes)  # inherit parent's nodes
    utils.write_one_line(memory_path(name)+'.limit_in_bytes', str(bytes))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, len(cpus), utils.human_format(container_bytes(name)))

def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    need_fake_numa()
    lockfile = my_lock('inner')   # serialize race between parallel tests
    try:
        # Pick specific mem nodes for new cpuset's exclusive use
        # For now, arbitrarily pick highest available node numbers
        needed_kbytes = mbytes * 1024
        nodes = sorted(list(available_exclusive_mem_nodes(parent)))
        kbytes = 0
        nodecnt = 0
        while kbytes < needed_kbytes and nodecnt < len(nodes):
            nodecnt += 1
            kbytes += node_avail_kbytes(nodes[-nodecnt])
        if kbytes < needed_kbytes:
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                raise error.AutotestError(
                      "New container's %d Mbytes exceeds "
                      "parent container's %d Mbyte size"
                      % (mbytes, parent_mbytes) )
            else:
                raise error.AutotestError(
                      "Existing sibling containers hold "
                      "%d Mbytes needed by new container"
                      % ((needed_kbytes - kbytes)//1024) )
        mems = nodes[-nodecnt:]

        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)

def create_container_directly(name, mbytes, cpus):
    parent = os.path.dirname(name)
    if fake_numa_containers:
        _create_fake_numa_container_directly(name, parent, mbytes, cpus)
    else:
        create_container_via_memcg(name, parent, mbytes<<20, cpus)

def create_container_with_mbytes_and_specific_cpus(name, mbytes,
        cpus=None, root=SUPER_ROOT, io={}, move_in=True, timeout=0):
    """\
    Create a cpuset container and move job's current pid into it
    Allocate the list "cpus" of cpus to that container

    name = arbitrary string tag
    mbytes = requested memory for job in megabytes
    cpus = list of cpu indices to associate with the cpuset
        defaults to all cpus avail with given root
    root = the parent cpuset to nest this new set within
        '': unnested top-level container
    io = arguments for proportional IO containers
    move_in = True: Move current process into the new container now.
    timeout = must be 0: persist until explicitly deleted.
    """
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                  % root)
    if cpus is None:
        # default to biggest container we can make under root
        cpus = get_cpus(root)
    else:
        cpus = set(cpus)  # interface uses list
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')
    name = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(name)):
        raise error.AutotestError('Container %s already exists' % name)
    create_container_directly(name, mbytes, cpus)
    set_io_controls(name, **io)
    if move_in:
        move_self_into_container(name)
    return name

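# Typical lifecycle (hypothetical values): create a 512-mbyte container,
#   run the test workload inside it, then tear it down:
#     name = create_container_with_mbytes_and_specific_cpus('test1', 512)
#     ...   # run workload
#     release_container(name)
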
def get_boot_numa():
    # get boot-time numa=fake=xyz option for current boot
    #   eg  numa=fake=nnn,  numa=fake=nnnM, or nothing
    label = 'numa=fake='
    for arg in utils.read_one_line('/proc/cmdline').split():
        if arg.startswith(label):
            return arg[len(label):]
    return ''