finally remove forceBuild
[buildbot.git] / buildbot / slave / bot.py
blob6cb29f60c1d1b67f4ed2f6f1865bd377b219e3d7
1 #! /usr/bin/python
3 import os.path
5 from twisted.spread import pb
6 from twisted.python import log
7 from twisted.internet import reactor, defer
8 from twisted.application import service, internet
9 from twisted.cred import credentials
11 from buildbot.util import now
12 from buildbot.pbutil import ReconnectingPBClientFactory
13 from buildbot.slave import registry
14 # make sure the standard commands get registered
15 from buildbot.slave import commands
class NoCommandRunning(pb.Error):
    """A pb.Error subclass that adds no behaviour of its own.

    NOTE(review): nothing in this module raises it. Judging by the name it
    is presumably meant for requests that need an active command when none
    is running -- confirm against the callers before relying on that.
    """
class WrongCommandRunning(pb.Error):
    """A pb.Error subclass that adds no behaviour of its own.

    NOTE(review): nothing in this module raises it. Judging by the name it
    is presumably meant for requests that refer to a command other than the
    one currently running -- confirm against the callers.
    """
class UnknownCommand(pb.Error):
    """Raised by SlaveBuilder.remote_startCommand when the requested command
    name has no entry in registry.commandRegistry.
    """
class Master:
    """Simple record of four connection settings: host, port, username and
    password. It only stores the values it is given.
    """

    def __init__(self, host, port, username, password):
        # where to connect
        self.host, self.port = host, port
        # who to connect as
        self.username, self.password = username, password
class SlaveBuild:
    """Per-build scratch object.

    A fresh instance is created for each build, so state stored on it can be
    carried from one step to the next within that build. All SlaveCommands
    have access to it.
    """

    def __init__(self, builder):
        # the object that created this build (a SlaveBuilder, in this module)
        self.builder = builder
class SlaveBuilder(pb.Referenceable, service.Service):
    """This is the local representation of a single Builder: it handles a
    single kind of build (like an all-warnings build). It has a name and a
    home directory. The rest of its behavior is determined by the master,
    which drives it through the remote_* methods.
    """

    # when true, any running command is interrupted and dropped when this
    # service stops or when the master-side step reference goes away
    stopCommandOnShutdown = True

    # remote is a ref to the Builder object on the master side, and is set
    # when they attach. We use it to detect when the connection to the master
    # is severed.
    remote = None

    # .build points to a SlaveBuild object, a new one for each build
    build = None

    # .command points to a SlaveCommand instance, and is set while the step
    # is running. We use it to implement the stopBuild method.
    command = None

    # .remoteStep is a ref to the master-side BuildStep object, and is set
    # when the step is started
    remoteStep = None

    # .bot is our parent Bot, filled in by setServiceParent. The class-level
    # default keeps remote_ping from raising AttributeError if it is invoked
    # before we have been attached to a parent.
    bot = None

    def __init__(self, name, not_really):
        """
        @param name: the builder name; becomes the service name
        @param not_really: stored on the instance unchanged; this class
                           itself never inspects it
        """
        # service.Service has no __init__ method, so there is nothing to
        # chain up to
        self.setName(name)
        self.not_really = not_really

    def __repr__(self):
        return "<SlaveBuilder '%s' at %d>" % (self.name, id(self))

    def setServiceParent(self, parent):
        """Attach to the parent service and remember it as self.bot."""
        service.Service.setServiceParent(self, parent)
        self.bot = self.parent
        # note that self.parent will go away when the buildmaster's config
        # file changes and this Builder is removed (possibly because it has
        # been changed, so the Builder will be re-added again in a moment).
        # This may occur during a build, while a step is running.

    def setBuilddir(self, builddir):
        """Record the build directory and create it if it is missing.

        The directory is builddir joined onto the bot's basedir. We must
        already have a parent when this is called.
        """
        assert self.parent
        self.builddir = builddir
        self.basedir = os.path.join(self.bot.basedir, self.builddir)
        if not os.path.isdir(self.basedir):
            os.mkdir(self.basedir)

    def stopService(self):
        """Stop the service, halting any running command if
        stopCommandOnShutdown is set."""
        service.Service.stopService(self)
        if self.stopCommandOnShutdown:
            self.stopCommand()

    def activity(self):
        """Tell the BotFactory that we have seen traffic from the master.

        The factory is found by walking up the parent links (our parent,
        then its parent, then that object's .bf attribute); if either link
        is missing this silently does nothing.
        """
        bot = self.parent
        if bot:
            buildslave = bot.parent
            if buildslave:
                bf = buildslave.bf
                bf.activity()

    def remote_setMaster(self, remote):
        """Remember the master-side reference and arrange for lostRemote to
        be called when it disconnects."""
        self.remote = remote
        self.remote.notifyOnDisconnect(self.lostRemote)

    def remote_print(self, message):
        """Log a message from the master. The special message "ping" is
        additionally handled like remote_ping, whose result is returned."""
        log.msg("SlaveBuilder.remote_print(%s): message from master: %s" %
                (self.name, message))
        if message == "ping":
            return self.remote_ping()

    def remote_ping(self):
        """Answer a ping from the master.

        Normally returns None. The debugOpts dictionary of the bot's parent
        can change that: 'stallPings' makes this return a Deferred that
        fires after the configured timeout, and 'failPingOnce' makes this
        raise a pb.Error subclass once (the option is removed when used).
        """
        log.msg("SlaveBuilder.remote_ping(%s)" % self)
        if self.bot and self.bot.parent:
            debugOpts = self.bot.parent.debugOpts
            if debugOpts.get("stallPings"):
                log.msg(" debug_stallPings")
                timeout, timers = debugOpts["stallPings"]
                d = defer.Deferred()
                t = reactor.callLater(timeout, d.callback, None)
                # stash the timer so whoever set the option can cancel it
                timers.append(t)
                return d
            if debugOpts.get("failPingOnce"):
                log.msg(" debug_failPingOnce")
                class FailPingError(pb.Error): pass
                del debugOpts['failPingOnce']
                raise FailPingError("debug_failPingOnce means we should fail")

    def lostRemote(self, remote):
        """Disconnect callback for the master-side Builder reference."""
        log.msg("lost remote")
        self.remote = None

    def lostRemoteStep(self, remotestep):
        """Disconnect callback for the master-side step reference: forget
        the step and, if stopCommandOnShutdown is set, halt the command."""
        log.msg("lost remote step")
        self.remoteStep = None
        if self.stopCommandOnShutdown:
            self.stopCommand()

    # the following are Commands that can be invoked by the master-side
    # Builder
    def remote_startBuild(self):
        """This is invoked before the first step of any new build is run. It
        creates a new SlaveBuild object, which holds slave-side state from
        one step to the next."""
        self.build = SlaveBuild(self)
        log.msg("%s.startBuild" % self)

    def remote_startCommand(self, stepref, stepId, command, args):
        """
        This gets invoked by L{buildbot.process.step.RemoteCommand.start}, as
        part of various master-side BuildSteps, to start various commands
        that actually do the build. I return nothing. Eventually I will call
        .commandComplete() to notify the master-side RemoteCommand that I'm
        done.

        @param stepref: reference to the master-side step; it receives the
                        'update' and 'complete' messages for this command
        @param stepId: passed through to the command factory and logged
        @param command: name looked up in registry.commandRegistry
        @param args: passed through to the command factory
        @raise UnknownCommand: if the command name is not registered
        """

        self.activity()

        if self.command:
            log.msg("leftover command, dropping it")
            # NOTE(review): the dropped command's Deferred still has
            # commandComplete attached. If it fires after the new command
            # has been installed below, it would clear self.command and
            # report completion to the new step -- confirm whether
            # doInterrupt can lead to that.
            self.stopCommand()

        try:
            factory, version = registry.commandRegistry[command]
        except KeyError:
            raise UnknownCommand("unrecognized SlaveCommand '%s'" % command)
        self.command = factory(self, stepId, args)

        log.msg(" startCommand:%s [id %s]" % (command,stepId))
        self.remoteStep = stepref
        self.remoteStep.notifyOnDisconnect(self.lostRemoteStep)
        d = self.command.doStart()
        # discard the command's result so that commandComplete receives
        # None on success and a Failure otherwise
        d.addCallback(lambda res: None)
        d.addBoth(self.commandComplete)
        return None

    def remote_interruptCommand(self, stepId, why):
        """Halt the current step.

        @param stepId: accepted but not examined here
        @param why: reason for the interruption; only logged
        """
        log.msg("asked to interrupt current command: %s" % why)
        self.activity()
        if not self.command:
            # TODO: just log it, a race could result in their interrupting a
            # command that wasn't actually running
            log.msg(" .. but none was running")
            return
        self.command.doInterrupt()

    def stopCommand(self):
        """Make any currently-running command die, with no further status
        output. This is used when the buildslave is shutting down or the
        connection to the master has been lost. Interrupt the command,
        silence it, and then forget about it."""
        if not self.command:
            return
        log.msg("stopCommand: halting current command %s" % self.command)
        self.command.doInterrupt() # shut up! and die!
        self.command = None # forget you!

    # sendUpdate is invoked by the Commands we spawn
    def sendUpdate(self, data):
        """This sends the status update to the master-side
        L{buildbot.process.step.RemoteCommand} object, as a one-element
        list of [data, 0] pairs. Nothing is sent if the service is not
        running or there is no remote step. The master's reply only
        refreshes the activity timer; a failed send is logged and
        otherwise ignored."""

        if not self.running:
            # .running comes from service.Service, and says whether the
            # service is running or not. If we aren't running, don't send any
            # status messages.
            return
        # the update[1]=0 comes from the leftover 'updateNum', which the
        # master still expects to receive. Provide it to avoid significant
        # interoperability issues between new slaves and old masters.
        if self.remoteStep:
            update = [data, 0]
            updates = [update]
            d = self.remoteStep.callRemote("update", updates)
            d.addCallback(self.ackUpdate)
            d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate")

    def ackUpdate(self, acknum):
        """Callback for the master's reply to an 'update' message."""
        self.activity()   # update the "last activity" timer

    def ackComplete(self, dummy):
        """Callback for the master's reply to a 'complete' message."""
        self.activity()   # update the "last activity" timer

    def _ackFailed(self, why, where):
        """Errback for 'update'/'complete' messages: log where the failure
        happened and swallow it."""
        log.msg("SlaveBuilder._ackFailed:", where)
        #log.err(why) # we don't really care


    # this is fired by the Deferred attached to each Command
    def commandComplete(self, failure):
        """Finish the current command and report the outcome to the master.

        @param failure: None if the command succeeded, otherwise the
                        failure.Failure describing what went wrong
        """
        if failure:
            log.msg("SlaveBuilder.commandFailed", self.command)
            log.err(failure)
            # failure, if present, is a failure.Failure. To send it across
            # the wire, we must turn it into a pb.CopyableFailure.
            failure = pb.CopyableFailure(failure)
            failure.unsafeTracebacks = True
        else:
            # failure is None
            log.msg("SlaveBuilder.commandComplete", self.command)
        self.command = None
        if not self.running:
            log.msg(" but we weren't running, quitting silently")
            return
        if self.remoteStep:
            # the step is finished, so we no longer care if it disconnects
            self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep)
            d = self.remoteStep.callRemote("complete", failure)
            d.addCallback(self.ackComplete)
            d.addErrback(self._ackFailed, "sendComplete")
            self.remoteStep = None


    def remote_shutdown(self):
        """Stop the reactor at the master's request."""
        # parenthesised single-argument form: prints the same text whether
        # 'print' is a statement or a function
        print("slave shutting down on command from master")
        reactor.stop()
class Bot(pb.Referenceable, service.MultiService):
    """I represent the slave-side bot.

    I own one SlaveBuilder child service per builder the master has asked
    for (kept in self.builders, keyed by builder name), and I answer the
    master's questions about this slave's directories, commands and
    description files.
    """
    usePTY = None
    name = "bot"

    def __init__(self, basedir, usePTY, not_really=0):
        """
        @param basedir: directory that holds one subdirectory per builder
        @param usePTY: copied onto every SlaveBuilder created here
        @param not_really: passed to every SlaveBuilder created here
        """
        service.MultiService.__init__(self)
        self.basedir = basedir
        self.usePTY = usePTY
        self.not_really = not_really
        self.builders = {}

    def startService(self):
        """Start the service; basedir must already exist."""
        assert os.path.isdir(self.basedir)
        service.MultiService.startService(self)

    def _isBasedirSubdir(self, name):
        """Return True if 'name' is a directory directly inside basedir."""
        # os.listdir returns bare names, so they must be joined back onto
        # basedir; testing the bare name would resolve it against the
        # process's current directory instead
        return os.path.isdir(os.path.join(self.basedir, name))

    def remote_getDirs(self):
        """Return a list of the names of the directories inside basedir."""
        return [d for d in os.listdir(self.basedir)
                if self._isBasedirSubdir(d)]

    def remote_getCommands(self):
        """Return a dict mapping each registered command name to its
        version."""
        versions = {}
        for name, (factory, version) in registry.commandRegistry.items():
            versions[name] = version
        return versions

    def remote_setBuilderList(self, wanted):
        """Make my set of SlaveBuilders match what the master wants.

        Builders that already exist are kept (and moved if their builddir
        changed), missing ones are created, and ones that are no longer
        wanted are removed. Directories in basedir that no wanted builder
        uses are only logged, never deleted.

        @param wanted: sequence of (name, builddir) pairs
        @return: dict mapping each wanted name to its SlaveBuilder
        """
        retval = {}
        wanted_dirs = []
        for (name, builddir) in wanted:
            wanted_dirs.append(builddir)
            b = self.builders.get(name, None)
            if b:
                if b.builddir != builddir:
                    log.msg("changing builddir for builder %s from %s to %s" \
                            % (name, b.builddir, builddir))
                    b.setBuilddir(builddir)
            else:
                b = SlaveBuilder(name, self.not_really)
                b.usePTY = self.usePTY
                b.setServiceParent(self)
                b.setBuilddir(builddir)
                self.builders[name] = b
            retval[name] = b
        # retval now has exactly the wanted names as keys, so it doubles as
        # the membership test. Iterate over a snapshot of the keys because
        # entries are deleted inside the loop.
        for name in list(self.builders.keys()):
            if name not in retval:
                log.msg("removing old builder %s" % name)
                self.builders[name].disownServiceParent()
                del self.builders[name]

        for d in os.listdir(self.basedir):
            if self._isBasedirSubdir(d):
                if d not in wanted_dirs:
                    log.msg("I have a leftover directory '%s' that is not "
                            "being used by the buildmaster: you can delete "
                            "it now" % d)
        return retval

    def remote_print(self, message):
        """Log a message sent by the master."""
        log.msg("message from master:", message)

    def remote_getSlaveInfo(self):
        """This command retrieves data from the files in SLAVEDIR/info/* and
        sends the contents to the buildmaster. These are used to describe
        the slave and its configuration, and should be created and
        maintained by the slave administrator. They will be retrieved each
        time the master-slave connection is established.

        @return: dict mapping each file's name to its contents; empty if
                 the info directory does not exist. Subdirectories and
                 other non-files are skipped.
        """

        files = {}
        basedir = os.path.join(self.basedir, "info")
        if not os.path.isdir(basedir):
            return files
        for f in os.listdir(basedir):
            filename = os.path.join(basedir, f)
            if os.path.isfile(filename):
                # close each file explicitly instead of leaving it to the
                # garbage collector (try/finally keeps this usable on
                # interpreters that lack the 'with' statement)
                infofile = open(filename, "r")
                try:
                    files[f] = infofile.read()
                finally:
                    infofile.close()
        return files
class BotFactory(ReconnectingPBClientFactory):
    """Client factory for the connection to the buildmaster, with optional
    application-level keepalives and an inactivity check on top of what
    ReconnectingPBClientFactory provides."""

    # 'keepaliveInterval' serves two purposes. The first is to keep the
    # connection alive: it guarantees that there will be at least some
    # traffic once every 'keepaliveInterval' seconds, which may help keep an
    # interposed NAT gateway from dropping the address mapping because it
    # thinks the connection has been abandoned. The second is to put an upper
    # limit on how long the buildmaster might have gone away before we notice
    # it. For this second purpose, we insist upon seeing *some* evidence of
    # the buildmaster at least once every 'keepaliveInterval' seconds.
    keepaliveInterval = None # None = do not use keepalives

    # 'keepaliveTimeout' seconds before the interval expires, we will send a
    # keepalive request, both to add some traffic to the connection, and to
    # prompt a response from the master in case all our builders are idle. We
    # don't insist upon receiving a timely response from this message: a slow
    # link might put the request at the wrong end of a large build message.
    keepaliveTimeout = 30 # how long we will go without a response

    # pending DelayedCalls for doKeepalive / checkActivity, or None
    keepaliveTimer = None
    activityTimer = None
    # timestamp (from now()) of the last sign of life from the master
    lastActivity = 0
    unsafeTracebacks = 1
    # the perspective obtained from the master, while connected
    perspective = None

    def __init__(self, keepaliveInterval, keepaliveTimeout):
        """
        @param keepaliveInterval: seconds between activity checks, or a
                                  false value to disable keepalives
        @param keepaliveTimeout: how many seconds before each check the
                                 keepalive request is sent; must be smaller
                                 than keepaliveInterval when keepalives are
                                 in use
        """
        ReconnectingPBClientFactory.__init__(self)
        self.keepaliveInterval = keepaliveInterval
        self.keepaliveTimeout = keepaliveTimeout

    def startedConnecting(self, connector):
        ReconnectingPBClientFactory.startedConnecting(self, connector)
        self.connector = connector

    def gotPerspective(self, perspective):
        """Called once logged in: enable TCP keepalives on the transport
        and, if configured, start the application-level keepalive timers."""
        ReconnectingPBClientFactory.gotPerspective(self, perspective)
        self.perspective = perspective
        try:
            perspective.broker.transport.setTcpKeepAlive(1)
        except Exception:
            # best effort only; 'except Exception' rather than a bare
            # 'except' so that SystemExit and KeyboardInterrupt are not
            # swallowed here
            log.msg("unable to set SO_KEEPALIVE")
            # without SO_KEEPALIVE, fall back to application-level
            # keepalives even if none were requested
            if not self.keepaliveInterval:
                self.keepaliveInterval = 10*60
        self.activity()
        if self.keepaliveInterval:
            log.msg("sending application-level keepalives every %d seconds" \
                    % self.keepaliveInterval)
            self.startTimers()

    def clientConnectionFailed(self, connector, reason):
        self.connector = None
        ReconnectingPBClientFactory.clientConnectionFailed(self,
                                                           connector, reason)

    def clientConnectionLost(self, connector, reason):
        """Forget the connection state and cancel the timers before letting
        the base class handle the loss."""
        self.connector = None
        self.stopTimers()
        self.perspective = None
        ReconnectingPBClientFactory.clientConnectionLost(self,
                                                         connector, reason)

    def startTimers(self):
        """Schedule the next keepalive and the next activity check. Neither
        timer may already be pending."""
        assert self.keepaliveInterval
        assert not self.keepaliveTimer
        assert not self.activityTimer
        # Insist that doKeepalive fires before checkActivity. Really, it
        # needs to happen at least one RTT beforehand.
        assert self.keepaliveInterval > self.keepaliveTimeout

        # arrange to send a keepalive a little while before our deadline
        when = self.keepaliveInterval - self.keepaliveTimeout
        self.keepaliveTimer = reactor.callLater(when, self.doKeepalive)
        # and check for activity too
        self.activityTimer = reactor.callLater(self.keepaliveInterval,
                                               self.checkActivity)

    def stopTimers(self):
        """Cancel whichever timers are pending."""
        if self.keepaliveTimer:
            self.keepaliveTimer.cancel()
            self.keepaliveTimer = None
        if self.activityTimer:
            self.activityTimer.cancel()
            self.activityTimer = None

    def activity(self, res=None):
        """Record that the master was heard from just now. 'res' is ignored;
        it lets this method serve directly as a Deferred callback."""
        self.lastActivity = now()

    def doKeepalive(self):
        """Timer callback: send one keepalive request to the master."""
        # send the keepalive request. If it fails outright, the connection
        # was already dropped, so just log and ignore.
        self.keepaliveTimer = None
        log.msg("sending app-level keepalive")
        d = self.perspective.callRemote("keepalive")
        d.addCallback(self.activity)
        d.addErrback(self.keepaliveLost)

    def keepaliveLost(self, f):
        """Errback for a failed keepalive request: log it and move on."""
        log.msg("BotFactory.keepaliveLost")

    def checkActivity(self):
        """Timer callback: drop the connection if the master has been silent
        for longer than keepaliveInterval, otherwise re-arm the timers."""
        self.activityTimer = None
        # read the clock once so the comparison and the logged idle time
        # are based on the same instant
        current = now()
        if self.lastActivity + self.keepaliveInterval < current:
            log.msg("BotFactory.checkActivity: nothing from master for "
                    "%d secs" % (current - self.lastActivity))
            self.perspective.broker.transport.loseConnection()
            return
        self.startTimers()

    def stopFactory(self):
        ReconnectingPBClientFactory.stopFactory(self)
        self.stopTimers()
class BuildSlave(service.MultiService):
    """Top-level slave service: it owns the Bot, the BotFactory that logs in
    to the master, and the TCP client service that connects to it."""

    # class used to create the bot; may be overridden
    botClass = Bot

    # debugOpts is a dictionary used during unit tests.

    # debugOpts['stallPings'] can be set to a tuple of (timeout, []). Any
    # calls to remote_ping (including remote_print("ping"), which forwards
    # to it) will stall for 'timeout' seconds before returning. The
    # DelayedCalls used to implement this are stashed in the list so they
    # can be cancelled later.

    # debugOpts['failPingOnce'] can be set to True to make the slaveping fail
    # exactly once.

    def __init__(self, host, port, name, passwd, basedir, keepalive,
                 usePTY, keepaliveTimeout=30, umask=None, debugOpts=None):
        """
        @param host: master host name to connect to
        @param port: master port to connect to
        @param name: login name presented to the master
        @param passwd: login password presented to the master
        @param basedir: passed to the bot as its base directory
        @param keepalive: keepalive interval in seconds; 0 is treated as
                          None (no keepalives requested)
        @param usePTY: passed to the bot
        @param keepaliveTimeout: passed to the BotFactory
        @param umask: if not None, applied with os.umask at startService
        @param debugOpts: optional dict of unit-test hooks (see the comments
                          on the class); a copy is stored, so the caller's
                          dict is never modified
        """
        service.MultiService.__init__(self)
        # None stands in for "no options", which avoids a mutable default
        # argument in the signature
        if debugOpts is None:
            self.debugOpts = {}
        else:
            self.debugOpts = debugOpts.copy()
        bot = self.botClass(basedir, usePTY)
        bot.setServiceParent(self)
        self.bot = bot
        if keepalive == 0:
            keepalive = None
        self.umask = umask
        bf = self.bf = BotFactory(keepalive, keepaliveTimeout)
        bf.startLogin(credentials.UsernamePassword(name, passwd), client=bot)
        self.connection = c = internet.TCPClient(host, port, bf)
        c.setServiceParent(self)

    def waitUntilDisconnected(self):
        # utility method for testing. Returns a Deferred that will fire when
        # we lose the connection to the master.
        if not self.bf.perspective:
            # not connected right now, so there is nothing to wait for
            return defer.succeed(None)
        d = defer.Deferred()
        self.bf.perspective.notifyOnDisconnect(lambda res: d.callback(None))
        return d

    def startService(self):
        """Apply the configured umask (if any), then start the children."""
        if self.umask is not None:
            os.umask(self.umask)
        service.MultiService.startService(self)

    def stopService(self):
        """Stop reconnecting, stop the child services, and make sure the TCP
        connection is torn down."""
        self.bf.continueTrying = 0
        self.bf.stopTrying()
        service.MultiService.stopService(self)
        # now kill the TCP connection
        # twisted >2.0.1 does this for us, and leaves _connection=None
        if self.connection._connection:
            self.connection._connection.disconnect()