Reading the cmdline of a dead process raises an exception too.
[iotop.git] / iotop.py
blobb46dbd81efb687e39ba5e9d8aeac8264259d625d
1 #!/usr/bin/python
2 # iotop: Display I/O usage of processes in a top like UI
3 # Copyright (c) 2007 Guillaume Chazarain <guichaz@yahoo.fr>, GPLv2
4 # See ./iotop.py --help for some help
6 import curses
7 import errno
8 import optparse
9 import os
10 import pwd
11 import select
12 import socket
13 import struct
14 import sys
15 import time
18 # Check for requirements:
19 # o Python >= 2.5 for AF_NETLINK sockets
20 # o Linux >= 2.6.20 with I/O accounting
# Startup probes: both must succeed, otherwise a report of what is missing is
# printed and the program exits with status 1.
# The AF_NETLINK constants are only present in the socket module of
# Python >= 2.5.
python25 = hasattr(socket, 'NETLINK_ROUTE')

# Linux >= 2.6.20 with I/O accounting exposes this file.
ioaccounting = os.path.exists('/proc/self/io')

if not (python25 and ioaccounting):
    def boolean2string(boolean):
        """Render the result of a requirement probe for the report."""
        if boolean:
            return 'Found'
        return 'Not found'

    print('Could not run iotop as some of the requirements are not met:')
    print('- Python >= 2.5 for AF_NETLINK support: %s'
          % boolean2string(python25))
    print('- Linux >= 2.6.20 with I/O accounting support: %s'
          % boolean2string(ioaccounting))
    sys.exit(1)
40 # Netlink stuff
41 # Based on code from pynl80211: Netlink message generation/parsing
42 # http://git.sipsolutions.net/?p=pynl80211.git
43 # Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
44 # GPLv2
# Netlink message header flags
NLM_F_REQUEST = 1

# Netlink message types. NLMSG_ERROR marks a reply carrying an error code;
# NLMSG_MIN_TYPE (0x10) is reused below as the generic netlink controller id.
NLMSG_ERROR = 2
NLMSG_MIN_TYPE = 16
class Attr:
    """One netlink attribute: a numeric type and its raw payload.

    When extra arguments are given, `str` is a struct format and the extra
    arguments are packed with it to form the payload. Without extra
    arguments, `str` itself is stored as the payload.
    """

    def __init__(self, type, str, *kw):
        self.type = type
        if kw:
            self.data = struct.pack(str, *kw)
        else:
            self.data = str

    def _dump(self):
        """Serialize as a (length, type) header, the payload, then padding."""
        size = len(self.data)
        # The length field counts the 4 header bytes and the payload, but not
        # the zero bytes that round the payload up to a multiple of 4.
        header = struct.pack('HH', size + 4, self.type)
        padded_size = (size + 3) & ~3
        return header + self.data + struct.pack('%dx' % (padded_size - size))

    def u16(self):
        """Decode the payload as one native unsigned 16-bit integer."""
        return struct.unpack('H', self.data)[0]
class NulStrAttr(Attr):
    """Netlink attribute whose payload is `str` followed by one zero byte."""

    def __init__(self, type, str):
        # '<n>s' copies the n bytes of the string, 'B' appends the terminator.
        fmt = '%dsB' % len(str)
        Attr.__init__(self, type, fmt, str, 0)
class U32Attr(Attr):
    """Netlink attribute whose payload is one unsigned 32-bit integer."""

    def __init__(self, type, val):
        # 'I' packs to 4 bytes. The native 'L' used previously is as wide as
        # a C long, i.e. 8 bytes on 64-bit Linux, which made the attribute
        # larger than the 32 bits its name promises.
        Attr.__init__(self, type, 'I', val)
# Protocol number handed to socket() when opening the netlink connection.
NETLINK_GENERIC = 16


class Message:
    """A netlink message: header fields plus a serialized payload.

    `payload` is either a ready-made string or a list of objects providing
    _dump(), whose serialized forms are concatenated.
    """

    def __init__(self, tp, flags = 0, seq = -1, payload = []):
        self.type = tp
        self.flags = flags
        self.seq = seq
        # Filled in from the connection by send(), or by the receiver.
        self.pid = -1
        if type(payload) == list:
            payload = ''.join([item._dump() for item in payload])
        self.payload = payload

    def send(self, conn):
        """Stamp the message with `conn`'s seq/pid and send it over `conn`."""
        if self.seq == -1:
            # No sequence number was chosen by the caller: take the next one.
            self.seq = conn.seq()
        self.pid = conn.pid

        # The 'IHHII' header is 16 bytes and is counted in the length field.
        total_length = len(self.payload) + 16
        header = struct.pack('IHHII', total_length, self.type, self.flags,
                             self.seq, self.pid)
        conn.send(header + self.payload)
class Connection:
    """A raw AF_NETLINK socket together with its request sequence counter."""

    def __init__(self, nltype, groups=0, unexpected_msg_handler = None):
        self.fd = socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, nltype)
        self.fd.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 65536)
        self.fd.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 65536)
        # Bind to (0, groups), then read back the address actually assigned.
        self.fd.bind((0, groups))
        self.pid, self.groups = self.fd.getsockname()
        self._seq = 0
        # Stored for callers; nothing in this class invokes it.
        self.unexpected = unexpected_msg_handler

    def send(self, msg):
        """Send the already serialized message `msg`."""
        self.fd.send(msg)

    def recv(self):
        """Receive one message and return it as a Message.

        Raises OSError, with errno set, when the reply is an NLMSG_ERROR
        carrying a non-zero error code.
        """
        contents = self.fd.recv(65536)
        # should check msgflags for TRUNC!
        msg_len, msg_type, flags, seq, pid = struct.unpack('IHHII',
                                                           contents[:16])
        msg = Message(msg_type, flags, seq, contents[16:])
        msg.pid = pid
        if msg.type == NLMSG_ERROR:
            # The payload starts with the negated errno value.
            code = -struct.unpack('i', msg.payload[:4])[0]
            if code != 0:
                err = OSError('Netlink error: %s (%d)'
                              % (os.strerror(code), code))
                err.errno = code
                # The exception used to be built and then dropped, so callers
                # catching OSError never saw netlink errors.
                raise err
        return msg

    def seq(self):
        """Return the next request sequence number (starting at 1)."""
        self._seq += 1
        return self._seq
def parse_attributes(str):
    """Split a serialized attribute stream into a {type: Attr} dictionary.

    Each attribute is a 4-byte (length, type) header followed by its payload,
    with the next attribute starting on the following 4-byte boundary. When
    a type occurs more than once, the last occurrence wins.

    Parsing stops at an attribute whose length field is smaller than its own
    header: such a stream cannot be walked further, and a length of 0 used to
    make this loop spin forever.
    """
    attrs = {}
    remaining = str
    while remaining:
        length, tp = struct.unpack('HH', remaining[:4])
        if length < 4:
            break
        attrs[tp] = Attr(tp, remaining[4:length])
        # Skip the payload and its padding.
        remaining = remaining[(length + 3) & ~3:]
    return attrs
# Controller command and attribute numbers used by Controller.get_family_id().
CTRL_CMD_GETFAMILY = 3

CTRL_ATTR_FAMILY_ID = 1
CTRL_ATTR_FAMILY_NAME = 2


class GenlHdr:
    """Generic netlink header: a command byte and a version byte."""

    def __init__(self, cmd, version = 0):
        self.cmd, self.version = cmd, version

    def _dump(self):
        """Serialize to 4 bytes: cmd, version and two padding bytes."""
        packed = struct.pack('BBxx', self.cmd, self.version)
        return packed
def _genl_hdr_parse(data):
    """Build a GenlHdr from the 4 bytes of a serialized header."""
    cmd, version = struct.unpack('BBxx', data)
    return GenlHdr(cmd, version)
# Family id under which the generic netlink controller is addressed.
GENL_ID_CTRL = NLMSG_MIN_TYPE


class GeNlMessage(Message):
    """A netlink message made of a generic netlink header plus attributes."""

    def __init__(self, family, cmd, attrs=None, flags=0):
        # A fresh list per message: the former `attrs=[]` default was stored
        # as self.attrs and therefore shared between all messages built
        # without attributes.
        if attrs is None:
            attrs = []
        self.cmd = cmd
        self.attrs = attrs
        self.family = family
        Message.__init__(self, family, flags=flags,
                         payload=[GenlHdr(self.cmd)] + list(attrs))
class Controller:
    """Queries the generic netlink controller through a Connection."""

    def __init__(self, conn):
        self.conn = conn

    def get_family_id(self, family):
        """Return the numeric id registered for the family named `family`."""
        name_attr = NulStrAttr(CTRL_ATTR_FAMILY_NAME, family)
        request = GeNlMessage(GENL_ID_CTRL, CTRL_CMD_GETFAMILY,
                              flags=NLM_F_REQUEST, attrs=[name_attr])
        request.send(self.conn)
        reply = self.conn.recv()
        # The first 4 bytes are the generic netlink header. It is unpacked
        # (which fails on a shorter payload) but its fields are not used.
        header, body = reply.payload[:4], reply.payload[4:]
        _genl_hdr_parse(header)
        return parse_attributes(body)[CTRL_ATTR_FAMILY_ID].u16()
186 # Netlink usage for taskstats
TASKSTATS_CMD_GET = 1
# Request attribute selecting a single task or a whole thread group.
TASKSTATS_CMD_ATTR_PID = 1
TASKSTATS_CMD_ATTR_TGID = 2


class TaskStatsNetlink(object):
    """Fetches per-task I/O counters from the TASKSTATS netlink family."""

    # (name, offset) pairs: each counter is an 8-byte unsigned integer found
    # at `offset` in the reply payload once its first 20 bytes are skipped.
    # Keep in sync with human_stats(stats, duration) and pinfo.did_some_io()
    members_offsets = [
        ('blkio_delay_total', 40),
        ('swapin_delay_total', 56),
        ('ac_etime', 144),
        ('read_bytes', 248),
        ('write_bytes', 256),
        ('cancelled_write_bytes', 264),
    ]

    def __init__(self, options):
        self.options = options
        self.connection = Connection(NETLINK_GENERIC)
        self.family_id = Controller(self.connection).get_family_id('TASKSTATS')

    def get_task_stats(self, pid):
        """Return a {name: value} dict of counters for `pid`.

        Returns None when the task no longer exists (ESRCH) or when the
        reply is too short to hold all the counters. Other OSErrors from the
        connection propagate.
        """
        attr = TASKSTATS_CMD_ATTR_PID
        if self.options.processes:
            attr = TASKSTATS_CMD_ATTR_TGID
        request = GeNlMessage(self.family_id, cmd=TASKSTATS_CMD_GET,
                              attrs=[U32Attr(attr, pid)],
                              flags=NLM_F_REQUEST)
        request.send(self.connection)
        try:
            reply = self.connection.recv()
        except OSError:
            err = sys.exc_info()[1]
            if err.errno != errno.ESRCH:
                raise
            # OSError: Netlink error: No such process (3)
            return None

        payload = reply.payload
        if len(payload) < 292:
            # Short reply
            return None

        reply_length, reply_type = struct.unpack('HH', payload[4:8])
        reply_version = struct.unpack('H', payload[20:22])[0]
        assert reply_length >= 288
        assert reply_type == attr + 3
        assert reply_version >= 4

        counters = payload[20:]
        return dict([(name, struct.unpack('Q', counters[start:start + 8])[0])
                     for name, start in TaskStatsNetlink.members_offsets])
245 # PIDs manipulations
def find_uids(options):
    """Resolve options.users into numeric uids stored in options.uids.

    Each entry is either a decimal uid or a login name looked up in the
    password database. Every unknown name is reported on stderr, and the
    program exits with status 1 if there was at least one.
    """
    options.uids = []
    failed = False
    for user in options.users or []:
        try:
            uid = int(user)
        except ValueError:
            try:
                uid = pwd.getpwnam(user).pw_uid
            except KeyError:
                sys.stderr.write('Unknown user: %s\n' % user)
                failed = True
        if failed:
            # Keep scanning so every bad name is reported, but stop
            # collecting: the program is going to exit anyway.
            continue
        options.uids.append(uid)
    if failed:
        sys.exit(1)
class pinfo(object):
    """What is known about one task: identity from /proc plus I/O counters.

    `stats` maps each counter name to a (total, delta) pair, where delta is
    the change between the last two add_stats() calls.
    """

    def __init__(self, pid, options):
        """Read /proc/<pid>/status; raises IOError if it cannot be opened."""
        # Set by ProcessList before a refresh, cleared when fresh stats arrive.
        self.mark = False
        self.pid = pid
        self.stats = {}
        for name, offset in TaskStatsNetlink.members_offsets:
            self.stats[name] = (0, 0) # Total, Delta
        self.parse_status('/proc/%d/status' % pid, options)

    def check_if_valid(self, uid, options):
        """Record in self.valid whether this task passes the pid/uid filters."""
        self.valid = options.pids or not options.uids or uid in options.uids

    def parse_status(self, path, options):
        """Extract the name and owner from the status file at `path`."""
        status = open(path)
        try:
            for line in status:
                if line.startswith('Name:'):
                    # Name kernel threads. Everything after the first colon
                    # is kept, so names containing spaces are not truncated
                    # and an empty name does not raise.
                    self.name = '[' + line.split(':', 1)[1].strip() + ']'
                elif line.startswith('Uid:'):
                    uid = int(line.split()[1])
                    # We check monitored PIDs only here
                    self.check_if_valid(uid, options)
                    try:
                        self.user = pwd.getpwuid(uid).pw_name
                    except KeyError:
                        # No passwd entry: show the number instead.
                        self.user = str(uid)
                    break
        finally:
            # Do not leave the descriptor to the garbage collector.
            status.close()

    def add_stats(self, stats):
        """Store new totals and their difference with the previous totals."""
        self.stats_timestamp = time.time()
        for name, value in stats.items():
            prev_value = self.stats[name][0]
            self.stats[name] = (value, value - prev_value)

    def get_cmdline(self):
        """Return the command line for display, or self.name if it is empty.

        Returns '{no such process}' when /proc/<pid>/cmdline cannot be read.
        """
        # A process may exec, so we must always reread its cmdline
        try:
            proc_cmdline = open('/proc/%d/cmdline' % self.pid)
            try:
                cmdline = proc_cmdline.read(4096)
            finally:
                proc_cmdline.close()
        except IOError:
            return '{no such process}'
        parts = cmdline.split('\0')
        if parts[0].startswith('/'):
            # Show only the executable name of an absolute path.
            first_command_char = parts[0].rfind('/') + 1
            parts[0] = parts[0][first_command_char:]
        cmdline = ' '.join(parts).strip()
        return cmdline.encode('string_escape') or self.name

    def did_some_io(self):
        """Return True if any counter other than ac_etime changed."""
        for name, (total, delta) in self.stats.items():
            if name != 'ac_etime' and delta:
                return True
        return False
class ProcessList(object):
    """Keeps one pinfo per monitored task and refreshes their counters."""

    def __init__(self, taskstats_connection, options):
        # {pid: pinfo}
        self.processes = {}
        self.taskstats_connection = taskstats_connection
        self.options = options

        # A first time as we are interested in the delta
        self.update_process_counts()

    def get_process(self, pid):
        """Return the pinfo for `pid`, creating and caching it if needed.

        Returns None when the task cannot be read from /proc or is rejected
        by the pid/uid filters.
        """
        known = self.processes.get(pid, None)
        if known:
            return known
        try:
            created = pinfo(pid, self.options)
        except IOError:
            # IOError: [Errno 2] No such file or directory: '/proc/...'
            return None
        if not created.valid:
            return None
        self.processes[pid] = created
        return created

    def list_pids(self, tgid):
        """Return the ids to query for `tgid`.

        That is `tgid` itself when showing processes or explicit pids,
        otherwise the entries of /proc/<tgid>/task (empty if unreadable).
        """
        if self.options.processes or self.options.pids:
            return [tgid]
        try:
            tids = os.listdir('/proc/%d/task' % tgid)
        except OSError:
            return []
        return [int(tid) for tid in tids]

    def update_process_counts(self):
        """Pull fresh stats for every task; return (read, write, duration).

        read and write are the summed deltas of all tasks. duration is the
        first non-zero ac_etime delta encountered, divided by 1000000.
        """
        total_read = total_write = duration = 0
        tgids = self.options.pids
        if not tgids:
            # Entries of /proc starting with a digit.
            tgids = [int(entry) for entry in os.listdir('/proc')
                     if '0' <= entry[0] <= '9']
        for tgid in tgids:
            for pid in self.list_pids(tgid):
                process = self.get_process(pid)
                if not process:
                    continue
                stats = self.taskstats_connection.get_task_stats(pid)
                if not stats:
                    continue
                # Seen alive during this pass.
                process.mark = False
                process.add_stats(stats)
                total_read += process.stats['read_bytes'][1]
                total_write += process.stats['write_bytes'][1]
                if not duration:
                    duration = process.stats['ac_etime'][1] / 1000000.0
        return total_read, total_write, duration

    def refresh_processes(self):
        """Update all tasks, forget those that yielded no stats.

        Returns the totals from update_process_counts().
        """
        for process in self.processes.values():
            process.mark = True
        totals = self.update_process_counts()
        # Whatever is still marked was not refreshed during this pass.
        stale = [pid for pid, process in self.processes.items()
                 if process.mark]
        for pid in stale:
            del self.processes[pid]
        return totals
383 # Utility functions for the UI
# Unit suffixes; the unit at index i stands for 2 ** (10 * i) bytes.
UNITS = ['B', 'K', 'M', 'G', 'T', 'P', 'E']


def human_bandwidth(size, duration):
    """Format `size` bytes moved in `duration` seconds as a rate string.

    The largest unit such that the rate exceeds twice that unit is used,
    with two decimals; smaller rates are printed in bytes. A zero `size` or
    a zero `duration` yields '0 B/s' (the latter used to raise
    ZeroDivisionError).
    """
    if size and duration:
        bw = float(size) / duration
    else:
        bw = 0
    for i in range(len(UNITS) - 1, 0, -1):
        base = 1 << (10 * i)
        # Pick the unit from the rate being displayed, not from the raw byte
        # count, so the two agree when duration is not one second.
        if 2 * base < bw:
            return '%.2f %s/s' % (bw / base, UNITS[i])
    return '%s %s/s' % (bw, UNITS[0])
def human_stats(stats):
    """Turn a task's (total, delta) counters into four display strings.

    Returns (io_delay, swapin_delay, read_bytes, write_bytes). The delays are
    percentages of the elapsed time capped at 99.99, or 'KERNBUG' when the
    elapsed time is zero. The written amount has cancelled writes subtracted
    and is never negative.
    """
    # Keep in sync with TaskStatsNetlink.members_offsets and
    # IOTopUI.get_data(self)
    def delta(name):
        return stats[name][1]

    duration = delta('ac_etime') / 1000000.0

    def delay2percent(name): # delay in ns, duration in s
        if duration:
            ratio = delta(name) / (duration * 10000000.0)
            return '%.2f %%' % min(99.99, ratio)
        return 'KERNBUG'

    io_delay = delay2percent('blkio_delay_total')
    swapin_delay = delay2percent('swapin_delay_total')
    read_bytes = human_bandwidth(delta('read_bytes'), duration)
    written = max(0, delta('write_bytes') - delta('cancelled_write_bytes'))
    write_bytes = human_bandwidth(written, duration)
    return io_delay, swapin_delay, read_bytes, write_bytes
416 # The UI
class IOTopUI(object):
    """The display, either an interactive curses screen or batch output."""

    # One (key function, reverse by default) pair per displayed column, in
    # column order.
    sorting_keys = [
        (lambda p: p.pid, False),
        (lambda p: p.user, False),
        (lambda p: p.stats['read_bytes'][1], True),
        (lambda p: p.stats['write_bytes'][1] -
                   p.stats['cancelled_write_bytes'][1], True),
        (lambda p: p.stats['swapin_delay_total'][1], True),
        # The default sorting (by I/O % time) should show processes doing
        # only writes, without waiting on them
        (lambda p: p.stats['blkio_delay_total'][1] or
                   int(not(not(p.stats['read_bytes'][1] or
                               p.stats['write_bytes'][1]))), True),
        (lambda p: p.get_cmdline(), False),
    ]

    def __init__(self, win, process_list, options):
        """`win` is the curses window; it is not used in batch mode."""
        self.process_list = process_list
        self.options = options
        self.sorting_key = 5
        self.sorting_reverse = IOTopUI.sorting_keys[5][1]
        if not self.options.batch:
            self.win = win
            self.resize()
            curses.use_default_colors()
            curses.start_color()
            try:
                curses.curs_set(0)
            except curses.error:
                # This call can fail with misconfigured terminals, for example
                # TERM=xterm-color. This is harmless
                pass

    def resize(self):
        """Re-read the window dimensions."""
        self.height, self.width = self.win.getmaxyx()

    def run(self):
        """Refresh and display until options.iterations passes are done.

        Between passes, wait options.delay_seconds or, in interactive mode,
        until a key is pressed.
        """
        iterations = 0
        poll = select.poll()
        if not self.options.batch:
            poll.register(sys.stdin.fileno(), select.POLLIN|select.POLLPRI)
        while self.options.iterations is None or \
              iterations < self.options.iterations:
            total = self.process_list.refresh_processes()
            total_read, total_write, duration = total
            self.refresh_display(total_read, total_write, duration)
            if self.options.iterations is not None:
                iterations += 1
                if iterations >= self.options.iterations:
                    # Do not wait after the last pass.
                    break

            try:
                events = poll.poll(self.options.delay_seconds * 1000.0)
            except select.error:
                err = sys.exc_info()[1]
                if not (err.args and err.args[0] == errno.EINTR):
                    raise
                # Interrupted by a signal: treat it as a timeout.
                events = 0
            if not self.options.batch:
                self.resize()
            if events:
                key = self.win.getch()
                self.handle_key(key)

    def reverse_sorting(self):
        """Flip the current sorting direction."""
        self.sorting_reverse = not self.sorting_reverse

    def adjust_sorting_key(self, delta):
        """Move the sorting column by `delta`, clamped to the valid columns.

        When the column changes, its default direction is restored.
        """
        orig_sorting_key = self.sorting_key
        last_key = len(IOTopUI.sorting_keys) - 1
        self.sorting_key = min(last_key, max(0, self.sorting_key + delta))
        if orig_sorting_key != self.sorting_key:
            self.sorting_reverse = IOTopUI.sorting_keys[self.sorting_key][1]

    def handle_key(self, key):
        """Run the action bound to `key`; unbound keys do nothing."""
        key_bindings = {
            ord('q'):
                lambda: sys.exit(0),
            ord('Q'):
                lambda: sys.exit(0),
            ord('r'):
                lambda: self.reverse_sorting(),
            ord('R'):
                lambda: self.reverse_sorting(),
            curses.KEY_LEFT:
                lambda: self.adjust_sorting_key(-1),
            curses.KEY_RIGHT:
                lambda: self.adjust_sorting_key(1),
            curses.KEY_HOME:
                lambda: self.adjust_sorting_key(-len(IOTopUI.sorting_keys)),
            curses.KEY_END:
                lambda: self.adjust_sorting_key(len(IOTopUI.sorting_keys)),
        }

        action = key_bindings.get(key, lambda: None)
        action()

    def get_data(self):
        """Return the formatted, sorted lines to display, one per task."""
        def format(p):
            stats = human_stats(p.stats)
            io_delay, swapin_delay, read_bytes, write_bytes = stats
            line = '%5d %-8s %11s %11s %7s %7s ' % (p.pid, p.user[:8],
                                read_bytes, write_bytes, swapin_delay, io_delay)
            if self.options.batch:
                max_cmdline_length = 4096
            else:
                # Never negative: a negative bound would keep most of the
                # command line instead of none of it.
                max_cmdline_length = max(0, self.width - len(line))
            line += p.get_cmdline()[:max_cmdline_length]
            return line

        def should_format(p):
            return not self.options.only or p.did_some_io()

        # Filter before cutting to the screen height, otherwise idle tasks
        # sorted first would use up rows and hide tasks that did some I/O.
        processes = [p for p in self.process_list.processes.values()
                     if should_format(p)]
        key = IOTopUI.sorting_keys[self.sorting_key][0]
        processes.sort(key=key, reverse=self.sorting_reverse)
        if not self.options.batch:
            # Two rows are taken by the summary and the titles.
            del processes[max(0, self.height - 2):]
        return [format(p) for p in processes]

    def refresh_display(self, total_read, total_write, duration):
        """Print (batch) or redraw (curses) the summary, titles and tasks."""
        summary = 'Total DISK READ: %s | Total DISK WRITE: %s' % (
                                        human_bandwidth(total_read, duration),
                                        human_bandwidth(total_write, duration))
        # NOTE(review): the padding inside these titles is what lines them
        # up with the widths used in get_data() -- confirm it is intact.
        titles = [' PID', ' USER', ' DISK READ', ' DISK WRITE',
                  ' SWAPIN', ' IO', ' COMMAND']
        lines = self.get_data()
        if self.options.batch:
            print(summary)
            print(''.join(titles))
            for l in lines:
                print(l)
        else:
            self.win.clear()
            self.win.addstr(summary)
            self.win.hline(1, 0, ord(' ') | curses.A_REVERSE, self.width)
            for i, title in enumerate(titles):
                attr = curses.A_REVERSE
                if i == self.sorting_key:
                    # Highlight the sorting column and show its direction.
                    attr |= curses.A_BOLD
                    title += self.sorting_reverse and '>' or '<'
                self.win.addstr(title, attr)
            for i, line in enumerate(lines):
                self.win.insstr(i + 2, 0, line)
            self.win.refresh()
def run_iotop(win, options):
    """Connect to taskstats, build the process list and run the UI.

    `win` is handed to IOTopUI unchanged; main() passes None in batch mode.
    """
    process_list = ProcessList(TaskStatsNetlink(options), options)
    IOTopUI(win, process_list, options).run()
575 # Main program
# Reported by --version as 'iotop <VERSION>'.
VERSION = '0.2'

# Usage text given to optparse; the program name is substituted right away.
USAGE = '''%s [OPTIONS]

DISK READ and DISK WRITE are the block I/O bandwidth used during the sampling
period. SWAPIN and IO are the percentages of time the thread spent respectively
while swapping in and waiting on I/O more generally.
Controls: left and right arrows to change the sorting column, r to invert the
sorting order, q to quit, any other key to force a refresh''' % sys.argv[0]
def main():
    """Parse the command line, then run iotop in batch or curses mode."""
    parser = optparse.OptionParser(usage=USAGE, version='iotop ' + VERSION)
    add = parser.add_option
    add('-d', '--delay', dest='delay_seconds', type='float',
        metavar='SEC', default=1,
        help='delay between iterations [1 second]')
    add('-p', '--pid', dest='pids', type='int', action='append',
        metavar='PID',
        help='processes to monitor [all]')
    add('-u', '--user', dest='users', type='str', action='append',
        metavar='USER',
        help='users to monitor [all]')
    add('-b', '--batch', dest='batch', action='store_true',
        help='non-interactive mode')
    add('-P', '--processes', dest='processes', action='store_true',
        help='show only processes, not all threads')
    add('-o', '--only', dest='only', action='store_true',
        help='only show processes or threads actually doing I/O')
    add('-n', '--iter', dest='iterations', type='int',
        metavar='NUM',
        help='number of iterations before ending [infinite]')

    options, args = parser.parse_args()
    if args:
        parser.error('Unexpected arguments: ' + ' '.join(args))
    find_uids(options)
    # optparse leaves an unused 'append' option at None.
    options.pids = options.pids or []

    if not options.batch:
        curses.wrapper(run_iotop, options)
        return
    # Batch mode runs without a curses window.
    run_iotop(None, options)
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Leaving with Ctrl-C is not reported as an error.
        pass
    sys.exit(0)