Merge branch 'master' of git://github.com/bbangert/buildbot
[buildbot.git] / buildbot / buildslave.py
blob154384ec2b026dd588bab932a584daa1a28556b9
1 # Portions copyright Canonical Ltd. 2009
3 import time
4 from email.Message import Message
5 from email.Utils import formatdate
6 from zope.interface import implements
7 from twisted.python import log
8 from twisted.internet import defer, reactor
9 from twisted.application import service
10 import twisted.spread.pb
12 from buildbot.pbutil import NewCredPerspective
13 from buildbot.status.builder import SlaveStatus
14 from buildbot.status.mail import MailNotifier
15 from buildbot.interfaces import IBuildSlave, ILatentBuildSlave
16 from buildbot.process.properties import Properties
class AbstractBuildSlave(NewCredPerspective, service.MultiService):
    """This is the master-side representative for a remote buildbot slave.
    There is exactly one for each slave described in the config file (the
    c['slaves'] list). When buildbots connect in (.attach), they get a
    reference to this instance. The BotMaster object is stashed as the
    .botmaster attribute. The BotMaster is also our '.parent' Service.

    I represent a build slave -- a remote machine capable of
    running builds. I am instantiated by the configuration file, and can be
    subclassed to add extra functionality."""

    implements(IBuildSlave)

    def __init__(self, name, password, max_builds=None,
                 notify_on_missing=None, missing_timeout=3600,
                 properties=None):
        """
        @param name: botname this machine will supply when it connects
        @param password: password this machine will supply when
                         it connects
        @param max_builds: maximum number of simultaneous builds that will
                           be run concurrently on this buildslave (the
                           default is None for no limit)
        @param notify_on_missing: a string, or a list of strings, naming the
                                  recipients of the mail sent when the
                                  missing timer fires. None or an empty list
                                  means no timer is started.
        @param missing_timeout: delay, in seconds, between starting the
                                missing timer and sending the mail
        @param properties: properties that will be applied to builds run on
                           this slave
        @type properties: dictionary
        """
        # None stands in for "empty" so that no mutable default object is
        # shared between instances (self.notify_on_missing keeps a reference
        # to whatever list it is given).
        if notify_on_missing is None:
            notify_on_missing = []
        if properties is None:
            properties = {}

        service.MultiService.__init__(self)
        self.slavename = name
        self.password = password
        self.botmaster = None # no buildmaster yet
        self.slave_status = SlaveStatus(name)
        self.slave = None # a RemoteReference to the Bot, when connected
        self.slave_commands = None
        self.slavebuilders = {}
        self.max_builds = max_builds

        self.properties = Properties()
        self.properties.update(properties, "BuildSlave")
        self.properties.setProperty("slavename", name, "BuildSlave")

        self.lastMessageReceived = 0
        # accept a single recipient as a convenience
        if isinstance(notify_on_missing, str):
            notify_on_missing = [notify_on_missing]
        self.notify_on_missing = notify_on_missing
        for i in notify_on_missing:
            assert isinstance(i, str)
        self.missing_timeout = missing_timeout
        self.missing_timer = None

    def update(self, new):
        """
        Given a new BuildSlave, configure this one identically. Because
        BuildSlave objects are remotely referenced, we can't replace them
        without disconnecting the slave, yet there's no reason to do that.
        """
        # the reconfiguration logic should guarantee this:
        assert self.slavename == new.slavename
        assert self.password == new.password
        assert self.__class__ == new.__class__
        # NOTE(review): only max_builds is copied here; notify_on_missing,
        # missing_timeout and properties of 'new' are not applied.
        self.max_builds = new.max_builds

    def __repr__(self):
        """Describe this slave by class name, slavename and, once a
        botmaster is known, the names of the builders assigned to it."""
        if self.botmaster:
            builders = self.botmaster.getBuildersForSlave(self.slavename)
            return "<%s '%s', current builders: %s>" % \
                   (self.__class__.__name__, self.slavename,
                    ','.join([b.name for b in builders]))
        else:
            return "<%s '%s', (no builders yet)>" % \
                   (self.__class__.__name__, self.slavename)

    def setBotmaster(self, botmaster):
        """Remember the botmaster (may only be done once) and start the
        missing timer."""
        assert not self.botmaster, "BuildSlave already has a botmaster"
        self.botmaster = botmaster
        self.startMissingTimer()

    def stopMissingTimer(self):
        """Cancel the pending missing timer, if there is one."""
        if self.missing_timer:
            self.missing_timer.cancel()
            self.missing_timer = None

    def startMissingTimer(self):
        """(Re)start the missing timer.

        Nothing is scheduled unless there are recipients to notify, a
        non-zero timeout, and we have a parent service."""
        if self.notify_on_missing and self.missing_timeout and self.parent:
            self.stopMissingTimer() # in case it's already running
            self.missing_timer = reactor.callLater(self.missing_timeout,
                                                   self._missing_timer_fired)

    def _missing_timer_fired(self):
        """Called when the missing timer expires: mail the notify_on_missing
        recipients that this slave went away.

        @return: None if we have no parent any more, otherwise whatever
                 _mail_missing_message returns."""
        self.missing_timer = None
        # notify people, but only if we're still in the config
        if not self.parent:
            return

        buildmaster = self.botmaster.parent
        status = buildmaster.getStatus()
        text = "The Buildbot working for '%s'\n" % status.getProjectName()
        text += ("has noticed that the buildslave named %s went away\n" %
                 self.slavename)
        text += "\n"
        text += ("It last disconnected at %s (buildmaster-local time)\n" %
                 time.ctime(time.time() - self.missing_timeout)) # approx
        text += "\n"
        text += "The admin on record (as reported by BUILDSLAVE:info/admin)\n"
        text += "was '%s'.\n" % self.slave_status.getAdmin()
        text += "\n"
        text += "Sincerely,\n"
        text += " The Buildbot\n"
        text += " %s\n" % status.getProjectURL()
        subject = "Buildbot: buildslave %s was lost" % self.slavename
        return self._mail_missing_message(subject, text)

    def updateSlave(self):
        """Called to add or remove builders after the slave has connected.

        @return: a Deferred that indicates when an attached slave has
        accepted the new builders and/or released the old ones."""
        if self.slave:
            return self.sendBuilderList()
        else:
            return defer.succeed(None)

    def updateSlaveStatus(self, buildStarted=None, buildFinished=None):
        """Forward build start/finish notifications to our SlaveStatus.

        Each argument, when true, is passed to the SlaveStatus method of
        the same name."""
        if buildStarted:
            self.slave_status.buildStarted(buildStarted)
        if buildFinished:
            self.slave_status.buildFinished(buildFinished)

    def attached(self, bot):
        """This is called when the slave connects.

        @return: a Deferred that fires with a suitable pb.IPerspective to
                 give to the slave (i.e. 'self')"""

        if self.slave:
            # uh-oh, we've got a duplicate slave. The most likely
            # explanation is that the slave is behind a slow link, thinks we
            # went away, and has attempted to reconnect, so we've got two
            # "connections" from the same slave, but the previous one is
            # stale. Give the new one precedence.
            log.msg("duplicate slave %s replacing old one" % self.slavename)

            # just in case we've got two identically-configured slaves,
            # report the IP addresses of both so someone can resolve the
            # squabble
            tport = self.slave.broker.transport
            log.msg("old slave was connected from", tport.getPeer())
            log.msg("new slave is from", bot.broker.transport.getPeer())
            d = self.disconnect()
        else:
            d = defer.succeed(None)
        # now we go through a sequence of calls, gathering information, then
        # tell the Botmaster that it can finally give this slave to all the
        # Builders that care about it.

        # we accumulate slave information in this 'state' dictionary, then
        # set it atomically if we make it far enough through the process
        state = {}

        # Reset graceful shutdown status
        self.slave_status.setGraceful(False)
        # We want to know when the graceful shutdown flag changes
        self.slave_status.addGracefulWatcher(self._gracefulChanged)

        def _log_attachment_on_slave(res):
            d1 = bot.callRemote("print", "attached")
            # failure to print on the remote side is ignored
            d1.addErrback(lambda why: None)
            return d1
        d.addCallback(_log_attachment_on_slave)

        def _get_info(res):
            d1 = bot.callRemote("getSlaveInfo")
            def _got_info(info):
                log.msg("Got slaveinfo from '%s'" % self.slavename)
                # TODO: info{} might have other keys
                state["admin"] = info.get("admin")
                state["host"] = info.get("host")
            def _info_unavailable(why):
                # maybe an old slave, doesn't implement remote_getSlaveInfo
                log.msg("BuildSlave.info_unavailable")
                log.err(why)
            d1.addCallbacks(_got_info, _info_unavailable)
            return d1
        d.addCallback(_get_info)

        def _get_commands(res):
            d1 = bot.callRemote("getCommands")
            def _got_commands(commands):
                state["slave_commands"] = commands
            def _commands_unavailable(why):
                # probably an old slave
                log.msg("BuildSlave._commands_unavailable")
                if why.check(AttributeError):
                    return
                log.err(why)
            d1.addCallbacks(_got_commands, _commands_unavailable)
            return d1
        d.addCallback(_get_commands)

        def _accept_slave(res):
            self.slave_status.setAdmin(state.get("admin"))
            self.slave_status.setHost(state.get("host"))
            self.slave_status.setConnected(True)
            self.slave_commands = state.get("slave_commands")
            self.slave = bot
            log.msg("bot attached")
            self.messageReceivedFromSlave()
            self.stopMissingTimer()

            return self.updateSlave()
        d.addCallback(_accept_slave)
        d.addCallback(lambda res: self.botmaster.maybeStartAllBuilds())

        # Finally, the slave gets a reference to this BuildSlave. They
        # receive this later, after we've started using them.
        d.addCallback(lambda res: self)
        return d

    def messageReceivedFromSlave(self):
        """Record the current time as the moment we last heard from the
        slave, both locally and in the SlaveStatus."""
        now = time.time()
        self.lastMessageReceived = now
        self.slave_status.setLastMessageReceived(now)

    def detached(self, mind):
        """Forget the remote reference, stop watching the graceful flag and
        mark the slave as disconnected. 'mind' is not used here."""
        self.slave = None
        self.slave_status.removeGracefulWatcher(self._gracefulChanged)
        self.slave_status.setConnected(False)
        log.msg("BuildSlave.detached(%s)" % self.slavename)

    def disconnect(self):
        """Forcibly disconnect the slave.

        This severs the TCP connection and returns a Deferred that will fire
        (with None) when the connection is probably gone.

        If the slave is still alive, they will probably try to reconnect
        again in a moment.

        This is called in two circumstances. The first is when a slave is
        removed from the config file. In this case, when they try to
        reconnect, they will be rejected as an unknown slave. The second is
        when we wind up with two connections for the same slave, in which
        case we disconnect the older connection.
        """

        if not self.slave:
            return defer.succeed(None)
        log.msg("disconnecting old slave %s now" % self.slavename)
        # When this Deferred fires, we'll be ready to accept the new slave
        return self._disconnect(self.slave)

    def _disconnect(self, slave):
        """Close the connection behind the given remote reference.

        @return: a Deferred that fires with None one reactor iteration
                 after the reference's disconnect notification runs."""
        # all kinds of teardown will happen as a result of
        # loseConnection(), but it happens after a reactor iteration or
        # two. Hook the actual disconnect so we can know when it is safe
        # to connect the new slave. We have to wait one additional
        # iteration (with callLater(0)) to make sure the *other*
        # notifyOnDisconnect handlers have had a chance to run.
        d = defer.Deferred()

        # notifyOnDisconnect runs the callback with one argument, the
        # RemoteReference being disconnected.
        def _disconnected(rref):
            reactor.callLater(0, d.callback, None)
        slave.notifyOnDisconnect(_disconnected)
        tport = slave.broker.transport
        # this is the polite way to request that a socket be closed
        tport.loseConnection()
        try:
            # but really we don't want to wait for the transmit queue to
            # drain. The remote end is unlikely to ACK the data, so we'd
            # probably have to wait for a (20-minute) TCP timeout.
            #tport._closeSocket()
            # however, doing _closeSocket (whether before or after
            # loseConnection) somehow prevents the notifyOnDisconnect
            # handlers from being run. Bummer.
            tport.offset = 0
            tport.dataBuffer = ""
        except Exception:
            # however, these hacks are pretty internal, so don't blow up if
            # they fail or are unavailable
            log.msg("failed to accelerate the shutdown process")
        log.msg("waiting for slave to finish disconnecting")

        return d

    def sendBuilderList(self):
        """Send the (name, builddir) pairs of our builders to the slave.

        @return: the Deferred from the remote "setBuilderList" call."""
        our_builders = self.botmaster.getBuildersForSlave(self.slavename)
        blist = [(b.name, b.builddir) for b in our_builders]
        d = self.slave.callRemote("setBuilderList", blist)
        return d

    def perspective_keepalive(self):
        # Deliberately does nothing. Presumably exposed to the slave as a
        # remote "keepalive" call via NewCredPerspective -- confirm there.
        pass

    def addSlaveBuilder(self, sb):
        """Register 'sb' under its builder_name, logging whether it is new
        or replaces a different object. Re-adding the same object is a
        silent no-op."""
        if sb.builder_name not in self.slavebuilders:
            log.msg("%s adding %s" % (self, sb))
        elif sb is not self.slavebuilders[sb.builder_name]:
            log.msg("%s replacing %s" % (self, sb))
        else:
            return
        self.slavebuilders[sb.builder_name] = sb

    def removeSlaveBuilder(self, sb):
        """Drop the entry registered under sb.builder_name, if any."""
        try:
            del self.slavebuilders[sb.builder_name]
        except KeyError:
            pass
        else:
            log.msg("%s removed %s" % (self, sb))

    def canStartBuild(self):
        """
        I am called when a build is requested to see if this buildslave
        can start a build. This function can be used to limit overall
        concurrency on the buildslave.
        """
        # If we're waiting to shutdown gracefully, then we shouldn't
        # accept any new jobs.
        if self.slave_status.getGraceful():
            return False

        if self.max_builds:
            active_builders = [sb for sb in self.slavebuilders.values()
                               if sb.isBusy()]
            if len(active_builders) >= self.max_builds:
                return False
        return True

    def _mail_missing_message(self, subject, text):
        """Mail 'text' with the given subject to the notify_on_missing
        recipients.

        @return: the Deferred from the MailNotifier's sendMessage."""
        # first, see if we have a MailNotifier we can use. This gives us a
        # fromaddr and a relayhost.
        buildmaster = self.botmaster.parent
        for st in buildmaster.statusTargets:
            if isinstance(st, MailNotifier):
                break
        else:
            # if not, they get a default MailNotifier, which always uses SMTP
            # to localhost and uses a dummy fromaddr of "buildbot".
            log.msg("buildslave-missing msg using default MailNotifier")
            st = MailNotifier("buildbot")
        # now construct the mail

        m = Message()
        m.set_payload(text)
        m['Date'] = formatdate(localtime=True)
        m['Subject'] = subject
        m['From'] = st.fromaddr
        recipients = self.notify_on_missing
        m['To'] = ", ".join(recipients)
        d = st.sendMessage(m, recipients)
        # return the Deferred for testing purposes
        return d

    def _gracefulChanged(self, graceful):
        """This is called when our graceful shutdown setting changes"""
        if graceful:
            active_builders = [sb for sb in self.slavebuilders.values()
                               if sb.isBusy()]
            if len(active_builders) == 0:
                # Shut down!
                self.shutdown()

    def shutdown(self):
        """Shutdown the slave

        @return: the Deferred of the remote "shutdown" call, or an
                 already-fired Deferred when no slavebuilder has a remote
                 reference to call it on."""
        # Look for a builder with a remote reference to the client side
        # slave. If we can find one, then call "shutdown" on the remote
        # builder, which will cause the slave buildbot process to exit.
        d = None
        for b in self.slavebuilders.values():
            if b.remote:
                d = b.remote.callRemote("shutdown")
                break

        if d:
            log.msg("Shutting down slave: %s" % self.slavename)
            # The remote shutdown call will not complete successfully since the
            # buildbot process exits almost immediately after getting the
            # shutdown request.
            # Here we look at the reason why the remote call failed, and if
            # it's because the connection was lost, that means the slave
            # shutdown as expected.
            def _errback(why):
                if why.check(twisted.spread.pb.PBConnectionLost):
                    log.msg("Lost connection to %s" % self.slavename)
                else:
                    # log the Failure itself; passing only a string to
                    # log.err would discard the real error
                    log.msg("Unexpected error when trying to shutdown %s"
                            % self.slavename)
                    log.err(why)
            d.addErrback(_errback)
            return d
        log.err("Couldn't find remote builder to shut down slave")
        return defer.succeed(None)

class BuildSlave(AbstractBuildSlave):
    """Build slave that hands itself to the botmaster's builders once the
    remote side has accepted the builder list."""

    def sendBuilderList(self):
        """Send the builder list, then attach every builder the slave
        answered for.

        @return: a Deferred. On success it fires with the result of a
                 DeferredList over the builders' attached() Deferreds; a
                 failure of the remote call is logged and not propagated."""
        def _attach_builders(remotes):
            attaching = []
            for builder_name, remote_ref in remotes.items():
                # use get() since we might have changed our mind since then
                builder = self.botmaster.builders.get(builder_name)
                if not builder:
                    continue
                attaching.append(
                    builder.attached(self, remote_ref, self.slave_commands))
            return defer.DeferredList(attaching)

        def _report_failure(why):
            log.msg("BuildSlave.sendBuilderList (%s) failed" % self)
            log.err(why)
            # TODO: hang up on them?, without setBuilderList we can't use
            # them

        sent = AbstractBuildSlave.sendBuilderList(self)
        sent.addCallbacks(_attach_builders, _report_failure)
        return sent

    def detached(self, mind):
        """Do the base-class bookkeeping, tell the botmaster the slave is
        lost, and restart the missing timer."""
        AbstractBuildSlave.detached(self, mind)
        self.botmaster.slaveLost(self)
        self.startMissingTimer()

    def buildFinished(self, sb):
        """This is called when a build on this slave is finished.

        @return: the Deferred from shutdown() when a graceful shutdown is
                 pending and no slavebuilder is busy, otherwise an
                 already-fired Deferred."""
        if self.slave_status.getGraceful():
            # If we're gracefully shutting down, and we have no more active
            # builders, then it's safe to disconnect
            busy = [b for b in self.slavebuilders.values() if b.isBusy()]
            if not busy:
                # Shut down!
                return self.shutdown()
        return defer.succeed(None)

class AbstractLatentBuildSlave(AbstractBuildSlave):
    """A build slave that will start up a slave instance when needed.

    To use, subclass and implement start_instance and stop_instance.

    See ec2buildslave.py for a concrete example. Also see the stub example in
    test/test_slaves.py.
    """

    implements(ILatentBuildSlave)

    # class-level defaults; instances overwrite these as they change state
    substantiated = False
    substantiation_deferred = None
    build_wait_timer = None
    _start_result = _shutdown_callback_handle = None

    def __init__(self, name, password, max_builds=None,
                 notify_on_missing=None, missing_timeout=60*20,
                 build_wait_timeout=60*10,
                 properties=None):
        """
        The arguments other than build_wait_timeout are passed straight to
        AbstractBuildSlave.__init__.

        @param build_wait_timeout: seconds to keep an idle instance around
                                   before it is soft-disconnected
        """
        # Normalise here rather than relying on the base class, so that no
        # mutable default object is shared between instances.
        if notify_on_missing is None:
            notify_on_missing = []
        if properties is None:
            properties = {}
        AbstractBuildSlave.__init__(
            self, name, password, max_builds, notify_on_missing,
            missing_timeout, properties)
        self.building = set()
        self.build_wait_timeout = build_wait_timeout

    def start_instance(self):
        # responsible for starting instance that will try to connect with
        # this master. Should return deferred. Problems should use an
        # errback.
        raise NotImplementedError

    def stop_instance(self, fast=False):
        # responsible for shutting down instance.
        raise NotImplementedError

    def substantiate(self, sb):
        """Make sure an instance is running, or on its way.

        @return: a Deferred. If we are already substantiated it has already
                 fired with self and the build-wait timer is restarted.
                 Otherwise it is the pending substantiation Deferred, which
                 is shared by all callers."""
        if self.substantiated:
            self._clearBuildWaitTimer()
            self._setBuildWaitTimer()
            return defer.succeed(self)
        if self.substantiation_deferred is None:
            if self.parent and not self.missing_timer:
                # start timer. if timer times out, fail deferred
                self.missing_timer = reactor.callLater(
                    self.missing_timeout,
                    self._substantiation_failed, defer.TimeoutError())
            self.substantiation_deferred = defer.Deferred()
            if self.slave is None:
                self._substantiate() # start up instance
            # else: we're waiting for an old one to detach. the _substantiate
            # will be done in ``detached`` below.
        return self.substantiation_deferred

    def _substantiate(self):
        """Start the instance and arrange for it to be soft-disconnected
        before the reactor shuts down.

        @return: the Deferred from start_instance(), with its result stashed
                 in _start_result on success and the substantiation failed
                 on error."""
        # register event trigger
        d = self.start_instance()
        self._shutdown_callback_handle = reactor.addSystemEventTrigger(
            'before', 'shutdown', self._soft_disconnect, fast=True)
        def stash_reply(result):
            self._start_result = result
        def clean_up(failure):
            if self.missing_timer is not None:
                self.missing_timer.cancel()
            self._substantiation_failed(failure)
            if self._shutdown_callback_handle is not None:
                handle = self._shutdown_callback_handle
                self._shutdown_callback_handle = None
                reactor.removeSystemEventTrigger(handle)
            return failure
        d.addCallbacks(stash_reply, clean_up)
        return d

    def attached(self, bot):
        """Accept the connection only if we are trying to substantiate;
        otherwise disconnect the bot and return a failed Deferred."""
        if self.substantiation_deferred is None:
            msg = 'Slave %s received connection while not trying to ' \
                  'substantiate. Disconnecting.' % (self.slavename,)
            log.msg(msg)
            self._disconnect(bot)
            return defer.fail(RuntimeError(msg))
        return AbstractBuildSlave.attached(self, bot)

    def detached(self, mind):
        """Do the base-class bookkeeping; if a substantiation is pending,
        start the new instance now that the old one is gone."""
        AbstractBuildSlave.detached(self, mind)
        if self.substantiation_deferred is not None:
            self._substantiate()

    def _substantiation_failed(self, failure):
        """Fail the pending substantiation, stop the instance, and mail the
        notify_on_missing recipients if there are any.

        @param failure: passed to the substantiation Deferred's errback
        @return: None when nobody is notified, otherwise whatever
                 _mail_missing_message returns."""
        d = self.substantiation_deferred
        self.substantiation_deferred = None
        self.missing_timer = None
        d.errback(failure)
        self.insubstantiate()
        # notify people, but only if we're still in the config
        if not self.parent or not self.notify_on_missing:
            return

        buildmaster = self.botmaster.parent
        status = buildmaster.getStatus()
        text = "The Buildbot working for '%s'\n" % status.getProjectName()
        text += ("has noticed that the latent buildslave named %s \n" %
                 self.slavename)
        text += "never substantiated after a request\n"
        text += "\n"
        text += ("The request was made at %s (buildmaster-local time)\n" %
                 time.ctime(time.time() - self.missing_timeout)) # approx
        text += "\n"
        text += "Sincerely,\n"
        text += " The Buildbot\n"
        text += " %s\n" % status.getProjectURL()
        subject = "Buildbot: buildslave %s never substantiated" % self.slavename
        return self._mail_missing_message(subject, text)

    def buildStarted(self, sb):
        """Note that sb's builder is building; the idle timer is cleared
        while a build runs."""
        assert self.substantiated
        self._clearBuildWaitTimer()
        self.building.add(sb.builder_name)

    def buildFinished(self, sb):
        """Note that sb's builder is done; once nothing is building, start
        the idle (build-wait) timer."""
        # discard, not remove: insubstantiate() clears self.building, so the
        # name may already be gone by the time a build reports in.
        self.building.discard(sb.builder_name)
        if not self.building:
            self._setBuildWaitTimer()

    def _clearBuildWaitTimer(self):
        """Cancel the build-wait timer if it is still pending and forget
        it."""
        if self.build_wait_timer is not None:
            if self.build_wait_timer.active():
                self.build_wait_timer.cancel()
            self.build_wait_timer = None

    def _setBuildWaitTimer(self):
        """(Re)start the timer that soft-disconnects us after
        build_wait_timeout seconds."""
        self._clearBuildWaitTimer()
        self.build_wait_timer = reactor.callLater(
            self.build_wait_timeout, self._soft_disconnect)

    def insubstantiate(self, fast=False):
        """Stop the instance and reset our substantiation state.

        @return: whatever stop_instance(fast) returns."""
        self._clearBuildWaitTimer()
        d = self.stop_instance(fast)
        if self._shutdown_callback_handle is not None:
            handle = self._shutdown_callback_handle
            self._shutdown_callback_handle = None
            reactor.removeSystemEventTrigger(handle)
        self.substantiated = False
        self.building.clear() # just to be sure
        return d

    def _soft_disconnect(self, fast=False):
        """Disconnect the slave and stop (or stop starting) the instance.

        @return: a Deferred; when a slave is still referenced it is a
                 DeferredList over the disconnect and insubstantiate
                 results."""
        d = AbstractBuildSlave.disconnect(self)
        if self.slave is not None:
            # this could be called when the slave needs to shut down, such as
            # in BotMaster.removeSlave, *or* when a new slave requests a
            # connection when we already have a slave. It's not clear what to
            # do in the second case: this shouldn't happen, and if it
            # does...if it's a latent slave, shutting down will probably kill
            # something we want...but we can't know what the status is. So,
            # here, we just do what should be appropriate for the first case,
            # and put our heads in the sand for the second, at least for now.
            # The best solution to the odd situation is removing it as a
            # possibility: make the master in charge of connecting to the
            # slave, rather than vice versa. TODO.
            d = defer.DeferredList([d, self.insubstantiate(fast)])
        else:
            if self.substantiation_deferred is not None:
                # unlike the previous block, we don't expect this situation when
                # ``attached`` calls ``disconnect``, only when we get a simple
                # request to "go away".
                # errback() needs an explicit error here: we are not inside
                # an except block, so there is no current exception to wrap.
                pending = self.substantiation_deferred
                self.substantiation_deferred = None
                pending.errback(RuntimeError(
                    "soft disconnect aborted substantiation"))
                if self.missing_timer:
                    self.missing_timer.cancel()
                    self.missing_timer = None
                self.stop_instance()
        return d

    def disconnect(self):
        """Soft-disconnect and remove this slave from all builders.

        @return: the Deferred from _soft_disconnect(), so that callers such
                 as AbstractBuildSlave.attached can chain on it."""
        d = self._soft_disconnect()
        # this removes the slave from all builders. It won't come back
        # without a restart (or maybe a sighup)
        self.botmaster.slaveLost(self)
        return d

    def stopService(self):
        """Stop the service, soft-disconnecting a connected slave too.

        @return: a Deferred; a DeferredList over both operations when a
                 slave was connected."""
        res = defer.maybeDeferred(AbstractBuildSlave.stopService, self)
        if self.slave is not None:
            d = self._soft_disconnect()
            res = defer.DeferredList([res, d])
        return res

    def updateSlave(self):
        """Called to add or remove builders after the slave has connected.

        Also called after botmaster's builders are initially set.

        @return: a Deferred that indicates when an attached slave has
        accepted the new builders and/or released the old ones."""
        for b in self.botmaster.getBuildersForSlave(self.slavename):
            if b.name not in self.slavebuilders:
                b.addLatentSlave(self)
        return AbstractBuildSlave.updateSlave(self)

    def sendBuilderList(self):
        """Send the builder list, attach our slavebuilders, and complete
        the pending substantiation.

        @return: a Deferred that fires once we are marked substantiated; a
                 failure of the remote call is logged, handed to the
                 pending substantiation Deferred, and propagated."""
        d = AbstractBuildSlave.sendBuilderList(self)
        def _sent(slist):
            dl = []
            for name, remote in slist.items():
                # use get() since we might have changed our mind since then.
                # we're checking on the builder in addition to the
                # slavebuilders out of a bit of paranoia.
                b = self.botmaster.builders.get(name)
                sb = self.slavebuilders.get(name)
                if b and sb:
                    d1 = sb.attached(self, remote, self.slave_commands)
                    dl.append(d1)
            return defer.DeferredList(dl)
        def _set_failed(why):
            log.msg("BuildSlave.sendBuilderList (%s) failed" % self)
            log.err(why)
            # TODO: hang up on them?, without setBuilderList we can't use
            # them
            if self.substantiation_deferred:
                # pass the real failure on; errback() without an argument
                # has no current exception to wrap here
                pending = self.substantiation_deferred
                self.substantiation_deferred = None
                pending.errback(why)
            if self.missing_timer:
                self.missing_timer.cancel()
                self.missing_timer = None
            # TODO: maybe log? send an email?
            return why
        d.addCallbacks(_sent, _set_failed)
        def _substantiated(res):
            self.substantiated = True
            if self.substantiation_deferred:
                pending = self.substantiation_deferred
                self.substantiation_deferred = None
                # reset rather than del: _start_result may still be only the
                # class-level default if start_instance() has not fired yet,
                # and del would then raise AttributeError
                res = self._start_result
                self._start_result = None
                pending.callback(res)
            # note that the missing_timer is already handled within
            # ``attached``
            if not self.building:
                self._setBuildWaitTimer()
        d.addCallback(_substantiated)
        return d