bot.py: add minor log message
[buildbot.git] / buildbot / slave / bot.py
blob794b09deb075a81cee26af53c4c4cba06504238b
1 #! /usr/bin/python
3 import time, os, os.path, re, sys
5 from twisted.spread import pb
6 from twisted.python import log, usage, failure
7 from twisted.internet import reactor, defer
8 from twisted.application import service, internet
9 from twisted.cred import credentials
11 from buildbot.util import now
12 from buildbot.pbutil import ReconnectingPBClientFactory
13 from buildbot.slave import registry
14 # make sure the standard commands get registered
15 from buildbot.slave import commands
class NoCommandRunning(pb.Error):
    """Error type derived from pb.Error.

    NOTE(review): not raised in the code visible here; judging by the name
    it is meant for requests that need a running command when there is none.
    """
class WrongCommandRunning(pb.Error):
    """Error type derived from pb.Error.

    NOTE(review): not raised in the code visible here; judging by the name
    it is meant for requests that refer to a command other than the one
    currently running.
    """
class UnknownCommand(pb.Error):
    """Raised by SlaveBuilder.remote_startCommand when the requested command
    name is not present in registry.commandRegistry."""
class Master:
    """Simple record holding the host, port, username and password that were
    handed to the constructor. Nothing is validated or connected here."""

    def __init__(self, host, port, username, password):
        # where to find the master ...
        self.host, self.port = host, port
        # ... and the credentials to present to it
        self.username, self.password = username, password
class SlaveBuild:
    """State holder shared by the steps of a single build.

    One of these is created for each build so that state can be carried
    from one step to the next. All SlaveCommands have access to it.
    """

    def __init__(self, builder):
        # the SlaveBuilder that created us (see remote_startBuild)
        self.builder = builder
class SlaveBuilder(pb.Referenceable, service.Service):
    """This is the local representation of a single Builder: it handles a
    single kind of build (like an all-warnings build). It has a name and a
    home directory. The rest of its behavior is determined by the master.
    """

    # when true, stopService() and lostRemoteStep() halt whatever command is
    # currently running
    stopCommandOnShutdown = True

    # remote is a ref to the Builder object on the master side, and is set
    # when they attach. We use it to detect when the connection to the master
    # is severed.
    remote = None

    # .build points to a SlaveBuild object, a new one for each build
    build = None

    # .command points to a SlaveCommand instance, and is set while the step
    # is running. We use it to implement remote_interruptCommand() and
    # stopCommand().
    command = None

    # .remoteStep is a ref to the master-side BuildStep object, and is set
    # when the step is started
    remoteStep = None

    def __init__(self, name, not_really):
        """Record the builder name (as the service name) and the
        'not_really' flag, which is stored but not interpreted here."""
        #service.Service.__init__(self) # Service has no __init__ method
        self.setName(name)
        self.not_really = not_really

    def __repr__(self):
        return "<SlaveBuilder '%s'>" % self.name

    def setServiceParent(self, parent):
        """Attach to the parent service and remember it as self.bot."""
        service.Service.setServiceParent(self, parent)
        self.bot = self.parent
        # note that self.parent will go away when the buildmaster's config
        # file changes and this Builder is removed (possibly because it has
        # been changed, so the Builder will be re-added again in a moment).
        # This may occur during a build, while a step is running.

    def setBuilddir(self, builddir):
        """Set our working directory to 'builddir' underneath the bot's
        basedir, creating that directory if it does not exist yet. Must be
        called after setServiceParent()."""
        assert self.parent
        self.builddir = builddir
        self.basedir = os.path.join(self.bot.basedir, self.builddir)
        if not os.path.isdir(self.basedir):
            # os.mkdir creates a single level only: the bot's basedir must
            # already exist
            os.mkdir(self.basedir)

    def stopService(self):
        """Stop the service, halting any running command if
        stopCommandOnShutdown is set."""
        service.Service.stopService(self)
        if self.stopCommandOnShutdown:
            self.stopCommand()

    def activity(self):
        """Tell the BotFactory (found through our parent's parent) that we
        have seen some activity. Silently does nothing if we have been
        detached from either parent."""
        bot = self.parent
        if bot:
            buildslave = bot.parent
            if buildslave:
                bf = buildslave.bf
                bf.activity()

    def remote_setMaster(self, remote):
        """Remember the remote reference and ask to be told (via
        lostRemote) when it disconnects."""
        self.remote = remote
        self.remote.notifyOnDisconnect(self.lostRemote)

    def remote_print(self, message):
        """Log a message from the master. The special message "ping" is
        answered the same way as remote_ping()."""
        log.msg("SlaveBuilder.remote_print(%s): message from master: %s" %
                (self.name, message))
        if message == "ping":
            return self.remote_ping()

    def remote_ping(self):
        """Answer a ping. Normally returns None right away; the debugOpts
        of our grandparent service can make the answer stall (returning a
        Deferred) or fail once."""
        log.msg("SlaveBuilder.remote_ping(%s)" % self)
        if self.bot and self.bot.parent:
            debugOpts = self.bot.parent.debugOpts
            if debugOpts.get("stallPings"):
                log.msg(" debug_stallPings")
                timeout, timers = debugOpts["stallPings"]
                d = defer.Deferred()
                t = reactor.callLater(timeout, d.callback, None)
                # stash the timer so whoever set the option can cancel it
                timers.append(t)
                return d
            if debugOpts.get("failPingOnce"):
                log.msg(" debug_failPingOnce")
                class FailPingError(pb.Error): pass
                # remove the flag first so that only this one ping fails
                del debugOpts['failPingOnce']
                raise FailPingError("debug_failPingOnce means we should fail")

    def lostRemote(self, remote):
        """Disconnect callback registered by remote_setMaster()."""
        log.msg("lost remote")
        self.remote = None

    def lostRemoteStep(self, remotestep):
        """Disconnect callback registered by remote_startCommand(): forget
        the step and, if stopCommandOnShutdown is set, halt the command."""
        log.msg("lost remote step")
        self.remoteStep = None
        if self.stopCommandOnShutdown:
            self.stopCommand()

    # the following are Commands that can be invoked by the master-side
    # Builder
    def remote_startBuild(self):
        """This is invoked before the first step of any new build is run. It
        creates a new SlaveBuild object, which holds slave-side state from
        one step to the next."""
        self.build = SlaveBuild(self)
        log.msg("%s.startBuild" % self)

    def remote_startCommand(self, stepref, stepId, command, args):
        """
        This gets invoked by L{buildbot.process.step.RemoteCommand.start}, as
        part of various master-side BuildSteps, to start various commands
        that actually do the build. I return nothing. Eventually I will call
        .commandComplete() to notify the master-side RemoteCommand that I'm
        done.

        Raises UnknownCommand if 'command' is not in the command registry.
        """

        self.activity()

        if self.command:
            log.msg("leftover command, dropping it")
            self.stopCommand()

        try:
            factory, version = registry.commandRegistry[command]
        except KeyError:
            # call form of raise: same behavior as the old
            # "raise Class, arg" statement, but valid on newer Pythons too
            raise UnknownCommand("unrecognized SlaveCommand '%s'" % command)
        self.command = factory(self, stepId, args)

        log.msg(" startCommand:%s [id %s]" % (command,stepId))
        self.remoteStep = stepref
        self.remoteStep.notifyOnDisconnect(self.lostRemoteStep)
        d = self.command.doStart()
        # discard the command's result, so that commandComplete() receives
        # None on success and a Failure otherwise
        d.addCallback(lambda res: None)
        d.addBoth(self.commandComplete)
        return None

    def remote_interruptCommand(self, stepId, why):
        """Halt the current step."""
        log.msg("asked to interrupt current command: %s" % why)
        self.activity()
        if not self.command:
            # TODO: just log it, a race could result in their interrupting a
            # command that wasn't actually running
            log.msg(" .. but none was running")
            return
        self.command.doInterrupt()

    def stopCommand(self):
        """Make any currently-running command die, with no further status
        output. This is used when the buildslave is shutting down or the
        connection to the master has been lost. Interrupt the command,
        silence it, and then forget about it."""
        if not self.command:
            return
        log.msg("stopCommand: halting current command %s" % self.command)
        self.command.doInterrupt() # shut up! and die!
        self.command = None # forget you!

    # sendUpdate is invoked by the Commands we spawn
    def sendUpdate(self, data):
        """This sends the status update to the master-side
        L{buildbot.process.step.RemoteCommand} object, giving it a sequence
        number in the process. It adds the update to a queue, and asks the
        master to acknowledge the update so it can be removed from that
        queue."""

        if not self.running:
            # .running comes from service.Service, and says whether the
            # service is running or not. If we aren't running, don't send any
            # status messages.
            return
        # the update[1]=0 comes from the leftover 'updateNum', which the
        # master still expects to receive. Provide it to avoid significant
        # interoperability issues between new slaves and old masters.
        if self.remoteStep:
            update = [data, 0]
            updates = [update]
            d = self.remoteStep.callRemote("update", updates)
            d.addCallback(self.ackUpdate)
            d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate")

    def ackUpdate(self, acknum):
        self.activity() # update the "last activity" timer

    def ackComplete(self, dummy):
        self.activity() # update the "last activity" timer

    def _ackFailed(self, why, where):
        """Errback for the update/complete calls: just note where the
        failure happened."""
        log.msg("SlaveBuilder._ackFailed:", where)
        #log.err(why) # we don't really care


    # this is fired by the Deferred attached to each Command
    def commandComplete(self, failure):
        """Forget the finished command and, if we are still running and
        still have a remote step, send it "complete" along with the failure
        (None on success)."""
        if failure:
            log.msg("SlaveBuilder.commandFailed", self.command)
            log.err(failure)
            # failure, if present, is a failure.Failure. To send it across
            # the wire, we must turn it into a pb.CopyableFailure.
            failure = pb.CopyableFailure(failure)
            failure.unsafeTracebacks = True
        else:
            # failure is None
            log.msg("SlaveBuilder.commandComplete", self.command)
        self.command = None
        if not self.running:
            log.msg(" but we weren't running, quitting silently")
            return
        if self.remoteStep:
            self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep)
            d = self.remoteStep.callRemote("complete", failure)
            d.addCallback(self.ackComplete)
            d.addErrback(self._ackFailed, "sendComplete")
            self.remoteStep = None


    def remote_shutdown(self):
        """Stop the reactor at the master's request."""
        # a single parenthesized string prints identically under the print
        # statement and the print function
        print("slave shutting down on command from master")
        reactor.stop()
class Bot(pb.Referenceable, service.MultiService):
    """I represent the slave-side bot."""
    usePTY = None
    name = "bot"

    def __init__(self, basedir, usePTY, not_really=0):
        """Remember the base directory and the usePTY / not_really flags
        (both are handed on to every SlaveBuilder we create), and start
        with no builders."""
        service.MultiService.__init__(self)
        self.basedir = basedir
        self.usePTY = usePTY
        self.not_really = not_really
        # maps builder name to SlaveBuilder
        self.builders = {}

    def startService(self):
        assert os.path.isdir(self.basedir)
        service.MultiService.startService(self)

    def remote_getDirs(self):
        """Return a list of the names of the subdirectories of basedir."""
        # os.listdir() yields bare names, so each one must be joined with
        # basedir before it is tested; a bare name would be resolved against
        # the process's current directory instead.
        return [d for d in os.listdir(self.basedir)
                if os.path.isdir(os.path.join(self.basedir, d))]

    def remote_getCommands(self):
        """Return a dict mapping each registered command name to its
        version."""
        versions = {}
        for name, (factory, version) in registry.commandRegistry.items():
            versions[name] = version
        return versions

    def remote_setBuilderList(self, wanted):
        """Make our set of SlaveBuilders match 'wanted', a sequence of
        (name, builddir) pairs: existing builders are kept (and moved if
        their builddir changed), new ones are created, and any builder that
        is no longer wanted is removed. Returns a dict mapping each wanted
        name to its SlaveBuilder."""
        retval = {}
        wanted_names = []
        for (name, builddir) in wanted:
            wanted_names.append(name)
            b = self.builders.get(name, None)
            if b:
                if b.builddir != builddir:
                    log.msg("changing builddir for builder %s from %s to %s" \
                            % (name, b.builddir, builddir))
                    b.setBuilddir(builddir)
            else:
                b = SlaveBuilder(name, self.not_really)
                b.usePTY = self.usePTY
                b.setServiceParent(self)
                b.setBuilddir(builddir)
                self.builders[name] = b
            retval[name] = b
        # iterate over a snapshot of the keys, since entries are deleted
        # from the dict inside the loop
        for name in list(self.builders.keys()):
            if name not in wanted_names:
                log.msg("removing old builder %s" % name)
                self.builders[name].disownServiceParent()
                del self.builders[name]
        return retval

    def remote_print(self, message):
        log.msg("message from master:", message)

    def remote_getSlaveInfo(self):
        """This command retrieves data from the files in SLAVEDIR/info/* and
        sends the contents to the buildmaster. These are used to describe
        the slave and its configuration, and should be created and
        maintained by the slave administrator. They will be retrieved each
        time the master-slave connection is established.

        Returns a dict mapping file name to file contents; the dict is empty
        if there is no info directory.
        """

        files = {}
        basedir = os.path.join(self.basedir, "info")
        if not os.path.isdir(basedir):
            return files
        for f in os.listdir(basedir):
            filename = os.path.join(basedir, f)
            if os.path.isfile(filename):
                infofile = open(filename, "r")
                # close the file promptly, even if the read fails
                try:
                    files[f] = infofile.read()
                finally:
                    infofile.close()
        return files

    def debug_forceBuild(self, name):
        """Ask the master to force a build of the named builder, logging the
        outcome."""
        # NOTE(review): self.perspective is not assigned anywhere in this
        # class as seen here; it has to be provided from outside before this
        # is called.
        d = self.perspective.callRemote("forceBuild", name)
        d.addCallbacks(log.msg, log.err)
class BotFactory(ReconnectingPBClientFactory):
    """Reconnecting PB client factory that layers application-level
    keepalives and a master-activity watchdog on top of the connection."""

    # 'keepaliveInterval' serves two purposes. The first is to keep the
    # connection alive: it guarantees that there will be at least some
    # traffic once every 'keepaliveInterval' seconds, which may help keep an
    # interposed NAT gateway from dropping the address mapping because it
    # thinks the connection has been abandoned. The second is to put an upper
    # limit on how long the buildmaster might have gone away before we notice
    # it. For this second purpose, we insist upon seeing *some* evidence of
    # the buildmaster at least once every 'keepaliveInterval' seconds.
    keepaliveInterval = None # None = do not use keepalives

    # 'keepaliveTimeout' seconds before the interval expires, we will send a
    # keepalive request, both to add some traffic to the connection, and to
    # prompt a response from the master in case all our builders are idle. We
    # don't insist upon receiving a timely response from this message: a slow
    # link might put the request at the wrong end of a large build message.
    keepaliveTimeout = 30 # how long we will go without a response

    # pending DelayedCalls for doKeepalive / checkActivity, None when idle
    keepaliveTimer = None
    activityTimer = None
    # timestamp (from now()) of the last sign of life from the master
    lastActivity = 0
    unsafeTracebacks = 1
    # set while we are logged in to the master, None otherwise
    perspective = None

    def __init__(self, keepaliveInterval, keepaliveTimeout):
        ReconnectingPBClientFactory.__init__(self)
        self.keepaliveInterval = keepaliveInterval
        self.keepaliveTimeout = keepaliveTimeout

    def startedConnecting(self, connector):
        ReconnectingPBClientFactory.startedConnecting(self, connector)
        self.connector = connector

    def gotPerspective(self, perspective):
        """Login succeeded: remember the perspective, try to enable TCP
        keepalives, and start the keepalive timers if an interval is set."""
        ReconnectingPBClientFactory.gotPerspective(self, perspective)
        self.perspective = perspective
        try:
            perspective.broker.transport.setTcpKeepAlive(1)
        except:
            # best effort only: without SO_KEEPALIVE, fall back to
            # application-level keepalives every ten minutes unless an
            # interval was configured
            log.msg("unable to set SO_KEEPALIVE")
            if not self.keepaliveInterval:
                self.keepaliveInterval = 10*60
        self.activity()
        if self.keepaliveInterval:
            log.msg("sending application-level keepalives every %d seconds" \
                    % self.keepaliveInterval)
            self.startTimers()

    def clientConnectionFailed(self, connector, reason):
        self.connector = None
        ReconnectingPBClientFactory.clientConnectionFailed(self,
                                                           connector, reason)

    def clientConnectionLost(self, connector, reason):
        """The connection went away: drop the timers and the perspective
        before letting the base class handle the loss."""
        self.connector = None
        self.stopTimers()
        self.perspective = None
        ReconnectingPBClientFactory.clientConnectionLost(self,
                                                         connector, reason)

    def startTimers(self):
        """Schedule the next keepalive and the next activity check. Neither
        timer may be pending already."""
        assert self.keepaliveInterval
        assert not self.keepaliveTimer
        assert not self.activityTimer
        # Insist that doKeepalive fires before checkActivity. Really, it
        # needs to happen at least one RTT beforehand.
        assert self.keepaliveInterval > self.keepaliveTimeout

        # arrange to send a keepalive a little while before our deadline
        keepalive_delay = self.keepaliveInterval - self.keepaliveTimeout
        self.keepaliveTimer = reactor.callLater(keepalive_delay,
                                                self.doKeepalive)
        # and check for activity too
        self.activityTimer = reactor.callLater(self.keepaliveInterval,
                                               self.checkActivity)

    def stopTimers(self):
        """Cancel and forget whichever timers are pending."""
        for attrname in ("keepaliveTimer", "activityTimer"):
            timer = getattr(self, attrname)
            if timer:
                timer.cancel()
                setattr(self, attrname, None)

    def activity(self, res=None):
        """Note that we just heard from the master. 'res' is ignored; it is
        accepted so this can be used directly as a Deferred callback."""
        self.lastActivity = now()

    def doKeepalive(self):
        # send the keepalive request. If it fails outright, the connection
        # was already dropped, so just log and ignore.
        self.keepaliveTimer = None
        log.msg("sending app-level keepalive")
        request = self.perspective.callRemote("keepalive")
        request.addCallback(self.activity)
        request.addErrback(self.keepaliveLost)

    def keepaliveLost(self, f):
        log.msg("BotFactory.keepaliveLost")

    def checkActivity(self):
        """Fired once per keepaliveInterval: if the master has shown signs
        of life within the interval, re-arm the timers; otherwise drop the
        connection."""
        self.activityTimer = None
        deadline = self.lastActivity + self.keepaliveInterval
        if deadline >= now():
            self.startTimers()
            return
        log.msg("BotFactory.checkActivity: nothing from master for "
                "%d secs" % (now() - self.lastActivity))
        self.perspective.broker.transport.loseConnection()

    def stopFactory(self):
        ReconnectingPBClientFactory.stopFactory(self)
        self.stopTimers()
class BuildSlave(service.MultiService):
    """Top-level service: owns the Bot, the BotFactory that logs in to the
    master, and the TCP client connection that carries it."""

    # class used to create our bot; a class attribute so it can be replaced
    botClass = Bot

    # debugOpts is a dictionary used during unit tests.

    # debugOpts['stallPings'] can be set to a tuple of (timeout, []). Any
    # ping (SlaveBuilder.remote_ping, or remote_print with the message
    # "ping") will stall for 'timeout' seconds before returning. The
    # DelayedCalls used to implement this are stashed in the list so they
    # can be cancelled later.

    # debugOpts['failPingOnce'] can be set to True to make the slaveping fail
    # exactly once.

    def __init__(self, host, port, name, passwd, basedir, keepalive,
                 usePTY, keepaliveTimeout=30, umask=None, debugOpts={}):
        """Create the bot, the login factory and the TCP client, all as
        children of this service. A 'keepalive' of 0 means no
        application-level keepalives."""
        service.MultiService.__init__(self)
        # work on a private copy: the shared default dict is never mutated
        self.debugOpts = debugOpts.copy()

        the_bot = self.botClass(basedir, usePTY)
        the_bot.setServiceParent(self)
        self.bot = the_bot

        if keepalive == 0:
            keepalive = None
        self.umask = umask

        factory = BotFactory(keepalive, keepaliveTimeout)
        self.bf = factory
        login = credentials.UsernamePassword(name, passwd)
        factory.startLogin(login, client=the_bot)

        client = internet.TCPClient(host, port, factory)
        self.connection = client
        client.setServiceParent(self)

    def waitUntilDisconnected(self):
        # utility method for testing. Returns a Deferred that will fire when
        # we lose the connection to the master.
        perspective = self.bf.perspective
        if not perspective:
            # not connected at all: nothing to wait for
            return defer.succeed(None)
        disconnected = defer.Deferred()
        perspective.notifyOnDisconnect(lambda res: disconnected.callback(None))
        return disconnected

    def startService(self):
        """Apply the configured umask (if any) and start the child
        services."""
        if self.umask is not None:
            os.umask(self.umask)
        service.MultiService.startService(self)

    def stopService(self):
        """Stop reconnecting, stop the child services, and make sure the
        TCP connection is torn down."""
        self.bf.continueTrying = 0
        self.bf.stopTrying()
        service.MultiService.stopService(self)
        # now kill the TCP connection
        # twisted >2.0.1 does this for us, and leaves _connection=None
        leftover = self.connection._connection
        if leftover:
            leftover.disconnect()