(closes #575) explicitly import set() on py2.3
[buildbot.git] / buildbot / buildslave.py
blobb18882f2e35c17ced1a64f51cdc13817780d123b
1 # Portions copyright Canonical Ltd. 2009
3 import time
4 from email.Message import Message
5 from email.Utils import formatdate
6 from zope.interface import implements
7 from twisted.python import log
8 from twisted.internet import defer, reactor
9 from twisted.application import service
10 import twisted.spread.pb
12 from buildbot.pbutil import NewCredPerspective
13 from buildbot.status.builder import SlaveStatus
14 from buildbot.status.mail import MailNotifier
15 from buildbot.interfaces import IBuildSlave, ILatentBuildSlave
16 from buildbot.process.properties import Properties
18 import sys
19 if sys.version_info[:3] < (2,4,0):
20 from sets import Set as set
class AbstractBuildSlave(NewCredPerspective, service.MultiService):
    """This is the master-side representative for a remote buildbot slave.
    There is exactly one for each slave described in the config file (the
    c['slaves'] list). When buildbots connect in (.attach), they get a
    reference to this instance. The BotMaster object is stashed as the
    .botmaster attribute. The BotMaster is also our '.parent' Service.

    I represent a build slave -- a remote machine capable of
    running builds. I am instantiated by the configuration file, and can be
    subclassed to add extra functionality."""

    implements(IBuildSlave)

    def __init__(self, name, password, max_builds=None,
                 notify_on_missing=[], missing_timeout=3600,
                 properties={}):
        """
        @param name: botname this machine will supply when it connects
        @param password: password this machine will supply when
                         it connects
        @param max_builds: maximum number of simultaneous builds that will
                           be run concurrently on this buildslave (the
                           default is None for no limit)
        @param notify_on_missing: a single string, or a sequence of strings,
                                  used as the recipients of the "slave is
                                  missing" mail
        @param missing_timeout: delay (handed to reactor.callLater) before
                                the missing-slave notification fires
        @param properties: properties that will be applied to builds run on
                           this slave
        @type properties: dictionary
        """
        service.MultiService.__init__(self)
        self.slavename = name
        self.password = password
        self.botmaster = None # no buildmaster yet
        self.slave_status = SlaveStatus(name)
        self.slave = None # a RemoteReference to the Bot, when connected
        self.slave_commands = None
        self.slavebuilders = {}
        self.max_builds = max_builds

        self.properties = Properties()
        self.properties.update(properties, "BuildSlave")
        self.properties.setProperty("slavename", name, "BuildSlave")

        self.lastMessageReceived = 0
        if isinstance(notify_on_missing, str):
            notify_on_missing = [notify_on_missing]
        else:
            # Take a private copy: the default argument is a single list
            # object shared by every call, and a caller-supplied list should
            # not be aliased either.
            notify_on_missing = list(notify_on_missing)
        self.notify_on_missing = notify_on_missing
        for i in notify_on_missing:
            assert isinstance(i, str)
        self.missing_timeout = missing_timeout
        self.missing_timer = None

    def update(self, new):
        """
        Given a new BuildSlave, configure this one identically.  Because
        BuildSlave objects are remotely referenced, we can't replace them
        without disconnecting the slave, yet there's no reason to do that.
        """
        # the reconfiguration logic should guarantee this:
        assert self.slavename == new.slavename
        assert self.password == new.password
        assert self.__class__ == new.__class__
        self.max_builds = new.max_builds

    def __repr__(self):
        """Show the class name, the slave name and, once a botmaster is
        set, the names of the builders the botmaster reports for us."""
        if self.botmaster:
            builders = self.botmaster.getBuildersForSlave(self.slavename)
            return "<%s '%s', current builders: %s>" % \
                   (self.__class__.__name__, self.slavename,
                    ','.join([b.name for b in builders]))
        else:
            return "<%s '%s', (no builders yet)>" % \
                   (self.__class__.__name__, self.slavename)

    def setBotmaster(self, botmaster):
        """Record our BotMaster (allowed only once) and start the
        missing-slave timer."""
        assert not self.botmaster, "BuildSlave already has a botmaster"
        self.botmaster = botmaster
        self.startMissingTimer()

    def stopMissingTimer(self):
        """Cancel the pending missing-slave timer, if there is one."""
        if self.missing_timer:
            self.missing_timer.cancel()
            self.missing_timer = None

    def startMissingTimer(self):
        """(Re)start the missing-slave timer.

        Nothing happens unless there is somebody to notify, a non-zero
        timeout, and we have a parent service."""
        if self.notify_on_missing and self.missing_timeout and self.parent:
            self.stopMissingTimer() # in case it's already running
            self.missing_timer = reactor.callLater(self.missing_timeout,
                                                   self._missing_timer_fired)

    def _missing_timer_fired(self):
        """Timer callback: mail the notify_on_missing recipients that this
        slave went away.

        @return: the Deferred from _mail_missing_message, or None when we
                 no longer have a parent."""
        self.missing_timer = None
        # notify people, but only if we're still in the config
        if not self.parent:
            return

        buildmaster = self.botmaster.parent
        status = buildmaster.getStatus()
        text = "The Buildbot working for '%s'\n" % status.getProjectName()
        text += ("has noticed that the buildslave named %s went away\n" %
                 self.slavename)
        text += "\n"
        text += ("It last disconnected at %s (buildmaster-local time)\n" %
                 time.ctime(time.time() - self.missing_timeout)) # approx
        text += "\n"
        text += "The admin on record (as reported by BUILDSLAVE:info/admin)\n"
        text += "was '%s'.\n" % self.slave_status.getAdmin()
        text += "\n"
        text += "Sincerely,\n"
        text += " The Buildbot\n"
        text += " %s\n" % status.getProjectURL()
        subject = "Buildbot: buildslave %s was lost" % self.slavename
        return self._mail_missing_message(subject, text)

    def updateSlave(self):
        """Called to add or remove builders after the slave has connected.

        @return: a Deferred that indicates when an attached slave has
                 accepted the new builders and/or released the old ones."""
        if self.slave:
            return self.sendBuilderList()
        else:
            return defer.succeed(None)

    def updateSlaveStatus(self, buildStarted=None, buildFinished=None):
        """Forward build start/finish events to our SlaveStatus; each
        argument is only forwarded when it is truthy."""
        if buildStarted:
            self.slave_status.buildStarted(buildStarted)
        if buildFinished:
            self.slave_status.buildFinished(buildFinished)

    def attached(self, bot):
        """This is called when the slave connects.

        @param bot: the remote reference for the newly connected slave
        @return: a Deferred that fires with a suitable pb.IPerspective to
                 give to the slave (i.e. 'self')"""

        if self.slave:
            # uh-oh, we've got a duplicate slave. The most likely
            # explanation is that the slave is behind a slow link, thinks we
            # went away, and has attempted to reconnect, so we've got two
            # "connections" from the same slave, but the previous one is
            # stale. Give the new one precedence.
            log.msg("duplicate slave %s replacing old one" % self.slavename)

            # just in case we've got two identically-configured slaves,
            # report the IP addresses of both so someone can resolve the
            # squabble
            tport = self.slave.broker.transport
            log.msg("old slave was connected from", tport.getPeer())
            log.msg("new slave is from", bot.broker.transport.getPeer())
            d = self.disconnect()
        else:
            d = defer.succeed(None)
        # now we go through a sequence of calls, gathering information, then
        # tell the Botmaster that it can finally give this slave to all the
        # Builders that care about it.

        # we accumulate slave information in this 'state' dictionary, then
        # set it atomically if we make it far enough through the process
        state = {}

        # Reset graceful shutdown status
        self.slave_status.setGraceful(False)
        # We want to know when the graceful shutdown flag changes
        self.slave_status.addGracefulWatcher(self._gracefulChanged)

        def _log_attachment_on_slave(res):
            d1 = bot.callRemote("print", "attached")
            # a failure of the remote print is ignored so that the rest of
            # the attach sequence still runs
            d1.addErrback(lambda why: None)
            return d1
        d.addCallback(_log_attachment_on_slave)

        def _get_info(res):
            d1 = bot.callRemote("getSlaveInfo")
            def _got_info(info):
                log.msg("Got slaveinfo from '%s'" % self.slavename)
                # TODO: info{} might have other keys
                state["admin"] = info.get("admin")
                state["host"] = info.get("host")
            def _info_unavailable(why):
                # maybe an old slave, doesn't implement remote_getSlaveInfo
                log.msg("BuildSlave.info_unavailable")
                log.err(why)
            d1.addCallbacks(_got_info, _info_unavailable)
            return d1
        d.addCallback(_get_info)

        def _get_commands(res):
            d1 = bot.callRemote("getCommands")
            def _got_commands(commands):
                state["slave_commands"] = commands
            def _commands_unavailable(why):
                # probably an old slave
                log.msg("BuildSlave._commands_unavailable")
                if why.check(AttributeError):
                    return
                log.err(why)
            d1.addCallbacks(_got_commands, _commands_unavailable)
            return d1
        d.addCallback(_get_commands)

        def _accept_slave(res):
            # everything was gathered: publish it and adopt the connection
            self.slave_status.setAdmin(state.get("admin"))
            self.slave_status.setHost(state.get("host"))
            self.slave_status.setConnected(True)
            self.slave_commands = state.get("slave_commands")
            self.slave = bot
            log.msg("bot attached")
            self.messageReceivedFromSlave()
            self.stopMissingTimer()

            return self.updateSlave()
        d.addCallback(_accept_slave)
        d.addCallback(lambda res: self.botmaster.maybeStartAllBuilds())

        # Finally, the slave gets a reference to this BuildSlave. They
        # receive this later, after we've started using them.
        d.addCallback(lambda res: self)
        return d

    def messageReceivedFromSlave(self):
        """Stamp 'now' as the time of the last message from the slave,
        both on this object and on its SlaveStatus."""
        now = time.time()
        self.lastMessageReceived = now
        self.slave_status.setLastMessageReceived(now)

    def detached(self, mind):
        """Drop the slave reference and mark the status as disconnected.

        @param mind: not used by this implementation"""
        self.slave = None
        self.slave_status.removeGracefulWatcher(self._gracefulChanged)
        self.slave_status.setConnected(False)
        log.msg("BuildSlave.detached(%s)" % self.slavename)

    def disconnect(self):
        """Forcibly disconnect the slave.

        This severs the TCP connection and returns a Deferred that will fire
        (with None) when the connection is probably gone.

        If the slave is still alive, they will probably try to reconnect
        again in a moment.

        This is called in two circumstances. The first is when a slave is
        removed from the config file. In this case, when they try to
        reconnect, they will be rejected as an unknown slave. The second is
        when we wind up with two connections for the same slave, in which
        case we disconnect the older connection.
        """

        if not self.slave:
            return defer.succeed(None)
        log.msg("disconnecting old slave %s now" % self.slavename)
        # When this Deferred fires, we'll be ready to accept the new slave
        return self._disconnect(self.slave)

    def _disconnect(self, slave):
        """Close the connection behind `slave`.

        @param slave: the remote reference whose transport is closed
        @return: a Deferred fired with None one reactor iteration after the
                 disconnect notification is delivered"""
        # all kinds of teardown will happen as a result of
        # loseConnection(), but it happens after a reactor iteration or
        # two. Hook the actual disconnect so we can know when it is safe
        # to connect the new slave. We have to wait one additional
        # iteration (with callLater(0)) to make sure the *other*
        # notifyOnDisconnect handlers have had a chance to run.
        d = defer.Deferred()

        # notifyOnDisconnect runs the callback with one argument, the
        # RemoteReference being disconnected.
        def _disconnected(rref):
            reactor.callLater(0, d.callback, None)
        slave.notifyOnDisconnect(_disconnected)
        tport = slave.broker.transport
        # this is the polite way to request that a socket be closed
        tport.loseConnection()
        try:
            # but really we don't want to wait for the transmit queue to
            # drain. The remote end is unlikely to ACK the data, so we'd
            # probably have to wait for a (20-minute) TCP timeout.
            #tport._closeSocket()
            # however, doing _closeSocket (whether before or after
            # loseConnection) somehow prevents the notifyOnDisconnect
            # handlers from being run. Bummer.
            tport.offset = 0
            tport.dataBuffer = ""
        except Exception:
            # however, these hacks are pretty internal, so don't blow up if
            # they fail or are unavailable
            log.msg("failed to accelerate the shutdown process")
        log.msg("waiting for slave to finish disconnecting")

        return d

    def sendBuilderList(self):
        """Send the slave the (name, builddir) pairs of our builders.

        @return: the Deferred from the remote "setBuilderList" call"""
        our_builders = self.botmaster.getBuildersForSlave(self.slavename)
        blist = [(b.name, b.builddir) for b in our_builders]
        d = self.slave.callRemote("setBuilderList", blist)
        return d

    def perspective_keepalive(self):
        """Remotely callable no-op."""
        pass

    def addSlaveBuilder(self, sb):
        """Register `sb` under its builder_name, logging whether it is new
        or replaces a different object; re-adding the same object is a
        silent no-op."""
        if sb.builder_name not in self.slavebuilders:
            log.msg("%s adding %s" % (self, sb))
        elif sb is not self.slavebuilders[sb.builder_name]:
            log.msg("%s replacing %s" % (self, sb))
        else:
            return
        self.slavebuilders[sb.builder_name] = sb

    def removeSlaveBuilder(self, sb):
        """Forget the entry registered under sb.builder_name; an unknown
        name is ignored."""
        try:
            del self.slavebuilders[sb.builder_name]
        except KeyError:
            pass
        else:
            log.msg("%s removed %s" % (self, sb))

    def canStartBuild(self):
        """
        I am called when a build is requested to see if this buildslave
        can start a build.  This function can be used to limit overall
        concurrency on the buildslave.
        """
        # If we're waiting to shutdown gracefully, then we shouldn't
        # accept any new jobs.
        if self.slave_status.getGraceful():
            return False

        if self.max_builds:
            active_builders = [sb for sb in self.slavebuilders.values()
                               if sb.isBusy()]
            if len(active_builders) >= self.max_builds:
                return False
        return True

    def _mail_missing_message(self, subject, text):
        """Mail `text` to the notify_on_missing recipients.

        @param subject: value for the Subject header
        @param text: message payload
        @return: the Deferred returned by the notifier's sendMessage"""
        # first, see if we have a MailNotifier we can use. This gives us a
        # fromaddr and a relayhost.
        buildmaster = self.botmaster.parent
        for st in buildmaster.statusTargets:
            if isinstance(st, MailNotifier):
                break
        else:
            # if not, they get a default MailNotifier, which always uses SMTP
            # to localhost and uses a dummy fromaddr of "buildbot".
            log.msg("buildslave-missing msg using default MailNotifier")
            st = MailNotifier("buildbot")
        # now construct the mail

        m = Message()
        m.set_payload(text)
        m['Date'] = formatdate(localtime=True)
        m['Subject'] = subject
        m['From'] = st.fromaddr
        recipients = self.notify_on_missing
        m['To'] = ", ".join(recipients)
        d = st.sendMessage(m, recipients)
        # return the Deferred for testing purposes
        return d

    def _gracefulChanged(self, graceful):
        """This is called when our graceful shutdown setting changes"""
        if graceful:
            active_builders = [sb for sb in self.slavebuilders.values()
                               if sb.isBusy()]
            if len(active_builders) == 0:
                # Shut down!
                self.shutdown()

    def shutdown(self):
        """Shutdown the slave

        @return: the Deferred of the remote "shutdown" call, or an already
                 fired Deferred when no builder has a remote reference"""
        # Look for a builder with a remote reference to the client side
        # slave.  If we can find one, then call "shutdown" on the remote
        # builder, which will cause the slave buildbot process to exit.
        d = None
        for b in self.slavebuilders.values():
            if b.remote:
                d = b.remote.callRemote("shutdown")
                break

        if d:
            log.msg("Shutting down slave: %s" % self.slavename)
            # The remote shutdown call will not complete successfully since the
            # buildbot process exits almost immediately after getting the
            # shutdown request.
            # Here we look at the reason why the remote call failed, and if
            # it's because the connection was lost, that means the slave
            # shutdown as expected.
            def _errback(why):
                if why.check(twisted.spread.pb.PBConnectionLost):
                    log.msg("Lost connection to %s" % self.slavename)
                else:
                    # log the failure itself, not just a message, so the
                    # real cause of the error ends up in the log
                    log.msg("Unexpected error when trying to shutdown %s" % self.slavename)
                    log.err(why)
            d.addErrback(_errback)
            return d
        log.err("Couldn't find remote builder to shut down slave")
        return defer.succeed(None)
class BuildSlave(AbstractBuildSlave):
    """Concrete build slave: attaches builders once the remote side has
    accepted the builder list and restarts the missing-slave timer when
    the connection goes away."""

    def sendBuilderList(self):
        """Send the builder list, then attach each builder the slave
        reports back that the botmaster still knows about.

        @return: a Deferred; on success it carries the DeferredList of the
                 individual builder attachments"""
        def _attach_builders(slist):
            pending = []
            for name, remote in slist.items():
                # use get() since we might have changed our mind since then
                builder = self.botmaster.builders.get(name)
                if not builder:
                    continue
                pending.append(
                    builder.attached(self, remote, self.slave_commands))
            return defer.DeferredList(pending)

        def _report_failure(why):
            log.msg("BuildSlave.sendBuilderList (%s) failed" % self)
            log.err(why)
            # TODO: hang up on them?, without setBuilderList we can't use
            # them

        sent = AbstractBuildSlave.sendBuilderList(self)
        sent.addCallbacks(_attach_builders, _report_failure)
        return sent

    def detached(self, mind):
        """Run the base-class bookkeeping, tell the botmaster the slave is
        lost, and start waiting for it to come back."""
        AbstractBuildSlave.detached(self, mind)
        self.botmaster.slaveLost(self)
        self.startMissingTimer()

    def buildFinished(self, sb):
        """This is called when a build on this slave is finished."""
        if not self.slave_status.getGraceful():
            return defer.succeed(None)
        # We're gracefully shutting down: it is only safe to disconnect
        # once no builder is busy any more.
        still_busy = [b for b in self.slavebuilders.values() if b.isBusy()]
        if still_busy:
            return defer.succeed(None)
        # Shut down!
        return self.shutdown()
class AbstractLatentBuildSlave(AbstractBuildSlave):
    """A build slave that will start up a slave instance when needed.

    To use, subclass and implement start_instance and stop_instance.

    See ec2buildslave.py for a concrete example.  Also see the stub example in
    test/test_slaves.py.
    """

    implements(ILatentBuildSlave)

    # Class-level defaults; methods below shadow them per instance.
    # True once the slave has attached and its builders were set up.
    substantiated = False
    # Deferred handed out by substantiate() while a start-up is in progress.
    substantiation_deferred = None
    # reactor.callLater handle that soft-disconnects an idle slave.
    build_wait_timer = None
    # Result of start_instance(), and the handle of the reactor shutdown
    # trigger registered by _substantiate().
    _start_result = _shutdown_callback_handle = None

    def __init__(self, name, password, max_builds=None,
                 notify_on_missing=[], missing_timeout=60*20,
                 build_wait_timeout=60*10,
                 properties={}):
        """
        Same arguments as AbstractBuildSlave, plus:

        @param build_wait_timeout: delay (handed to reactor.callLater)
                                   after which an idle slave is
                                   soft-disconnected
        """
        AbstractBuildSlave.__init__(
            self, name, password, max_builds, notify_on_missing,
            missing_timeout, properties)
        # names of the builders that currently run a build on this slave
        self.building = set()
        self.build_wait_timeout = build_wait_timeout

    def start_instance(self):
        """Subclass hook."""
        # responsible for starting instance that will try to connect with
        # this master.  Should return deferred.  Problems should use an
        # errback.
        raise NotImplementedError

    def stop_instance(self, fast=False):
        """Subclass hook."""
        # responsible for shutting down instance.
        raise NotImplementedError

    def substantiate(self, sb):
        """Make sure an instance is running or being started.

        @param sb: not used by this implementation
        @return: a Deferred; already fired with self when the slave is
                 substantiated, otherwise the pending substantiation
                 Deferred"""
        if self.substantiated:
            self._clearBuildWaitTimer()
            self._setBuildWaitTimer()
            return defer.succeed(self)
        if self.substantiation_deferred is None:
            if self.parent and not self.missing_timer:
                # start timer.  if timer times out, fail deferred
                self.missing_timer = reactor.callLater(
                    self.missing_timeout,
                    self._substantiation_failed, defer.TimeoutError())
            self.substantiation_deferred = defer.Deferred()
            if self.slave is None:
                self._substantiate() # start up instance
            # else: we're waiting for an old one to detach.  the _substantiate
            # will be done in ``detached`` below.
        return self.substantiation_deferred

    def _substantiate(self):
        """Call start_instance() and make sure the instance is torn down
        on reactor shutdown.

        @return: the Deferred from start_instance()"""
        # register event trigger
        d = self.start_instance()
        self._shutdown_callback_handle = reactor.addSystemEventTrigger(
            'before', 'shutdown', self._soft_disconnect, fast=True)
        def stash_reply(result):
            self._start_result = result
        def clean_up(failure):
            if self.missing_timer is not None:
                self.missing_timer.cancel()
            self._substantiation_failed(failure)
            if self._shutdown_callback_handle is not None:
                handle = self._shutdown_callback_handle
                # deleting the instance attribute re-exposes the class
                # default (None)
                del self._shutdown_callback_handle
                reactor.removeSystemEventTrigger(handle)
            return failure
        d.addCallbacks(stash_reply, clean_up)
        return d

    def attached(self, bot):
        """Accept a connection only while a substantiation is pending;
        otherwise disconnect `bot` and return a failed Deferred."""
        if self.substantiation_deferred is None:
            msg = 'Slave %s received connection while not trying to ' \
                  'substantiate.  Disconnecting.' % (self.slavename,)
            log.msg(msg)
            self._disconnect(bot)
            return defer.fail(RuntimeError(msg))
        return AbstractBuildSlave.attached(self, bot)

    def detached(self, mind):
        """Base-class bookkeeping; then, if a substantiation was waiting
        for the old connection to go away, start the instance now."""
        AbstractBuildSlave.detached(self, mind)
        if self.substantiation_deferred is not None:
            self._substantiate()

    def _substantiation_failed(self, failure):
        """Fail the pending substantiation, stop the instance and mail the
        notify_on_missing recipients.

        @param failure: passed to the pending Deferred's errback
        @return: the Deferred from _mail_missing_message, or None when
                 nobody is notified"""
        d = self.substantiation_deferred
        self.substantiation_deferred = None
        self.missing_timer = None
        d.errback(failure)
        self.insubstantiate()
        # notify people, but only if we're still in the config
        if not self.parent or not self.notify_on_missing:
            return

        buildmaster = self.botmaster.parent
        status = buildmaster.getStatus()
        text = "The Buildbot working for '%s'\n" % status.getProjectName()
        text += ("has noticed that the latent buildslave named %s \n" %
                 self.slavename)
        text += "never substantiated after a request\n"
        text += "\n"
        text += ("The request was made at %s (buildmaster-local time)\n" %
                 time.ctime(time.time() - self.missing_timeout)) # approx
        text += "\n"
        text += "Sincerely,\n"
        text += " The Buildbot\n"
        text += " %s\n" % status.getProjectURL()
        subject = "Buildbot: buildslave %s never substantiated" % self.slavename
        return self._mail_missing_message(subject, text)

    def buildStarted(self, sb):
        """Note that sb's builder is busy; the idle timer is stopped."""
        assert self.substantiated
        self._clearBuildWaitTimer()
        self.building.add(sb.builder_name)

    def buildFinished(self, sb):
        """Note that sb's builder is done; once no builder is busy, the
        idle timer is started."""
        self.building.remove(sb.builder_name)
        if not self.building:
            self._setBuildWaitTimer()

    def _clearBuildWaitTimer(self):
        """Cancel (if still active) and forget the idle timer."""
        if self.build_wait_timer is not None:
            if self.build_wait_timer.active():
                self.build_wait_timer.cancel()
            self.build_wait_timer = None

    def _setBuildWaitTimer(self):
        """Restart the idle timer; when it fires, _soft_disconnect runs."""
        self._clearBuildWaitTimer()
        self.build_wait_timer = reactor.callLater(
            self.build_wait_timeout, self._soft_disconnect)

    def insubstantiate(self, fast=False):
        """Stop the instance and reset the substantiation state.

        @param fast: handed through to stop_instance()
        @return: whatever stop_instance() returns"""
        self._clearBuildWaitTimer()
        d = self.stop_instance(fast)
        if self._shutdown_callback_handle is not None:
            handle = self._shutdown_callback_handle
            del self._shutdown_callback_handle
            reactor.removeSystemEventTrigger(handle)
        self.substantiated = False
        self.building.clear() # just to be sure
        return d

    def _soft_disconnect(self, fast=False):
        """Disconnect the slave and stop its instance, without telling the
        botmaster that the slave is lost.

        @param fast: handed through to insubstantiate()
        @return: a Deferred covering the disconnect (and, when a slave was
                 connected, the instance shutdown)"""
        d = AbstractBuildSlave.disconnect(self)
        if self.slave is not None:
            # this could be called when the slave needs to shut down, such as
            # in BotMaster.removeSlave, *or* when a new slave requests a
            # connection when we already have a slave. It's not clear what to
            # do in the second case: this shouldn't happen, and if it
            # does...if it's a latent slave, shutting down will probably kill
            # something we want...but we can't know what the status is. So,
            # here, we just do what should be appropriate for the first case,
            # and put our heads in the sand for the second, at least for now.
            # The best solution to the odd situation is removing it as a
            # possibility: make the master in charge of connecting to the
            # slave, rather than vice versa. TODO.
            d = defer.DeferredList([d, self.insubstantiate(fast)])
        else:
            if self.substantiation_deferred is not None:
                # unlike the previous block, we don't expect this situation when
                # ``attached`` calls ``disconnect``, only when we get a simple
                # request to "go away".
                pending = self.substantiation_deferred
                self.substantiation_deferred = None
                # errback() without an argument only works while an
                # exception is being handled, so hand it an explicit one
                pending.errback(
                    RuntimeError("soft disconnect aborted substantiation"))
                # NOTE(review): the two steps below are assumed to belong to
                # this ``if``; confirm the nesting against the original file.
                if self.missing_timer:
                    self.missing_timer.cancel()
                    self.missing_timer = None
                self.stop_instance()
        return d

    def disconnect(self):
        """Soft-disconnect and remove the slave from all builders.

        @return: the Deferred from _soft_disconnect(), so callers that
                 chain on disconnect() (as the base-class attached() does)
                 keep working"""
        d = self._soft_disconnect()
        # this removes the slave from all builders.  It won't come back
        # without a restart (or maybe a sighup)
        self.botmaster.slaveLost(self)
        return d

    def stopService(self):
        """Stop the service and, when a slave is connected, also
        soft-disconnect it.

        @return: a Deferred (a DeferredList when a disconnect was needed)"""
        res = defer.maybeDeferred(AbstractBuildSlave.stopService, self)
        if self.slave is not None:
            d = self._soft_disconnect()
            res = defer.DeferredList([res, d])
        return res

    def updateSlave(self):
        """Called to add or remove builders after the slave has connected.

        Also called after botmaster's builders are initially set.

        @return: a Deferred that indicates when an attached slave has
                 accepted the new builders and/or released the old ones."""
        for b in self.botmaster.getBuildersForSlave(self.slavename):
            if b.name not in self.slavebuilders:
                b.addLatentSlave(self)
        return AbstractBuildSlave.updateSlave(self)

    def sendBuilderList(self):
        """Send the builder list, attach the slave builders the slave
        reports back, and complete the pending substantiation.

        @return: a Deferred; a failure to set the builder list is passed on
                 to the caller after the pending substantiation was failed"""
        d = AbstractBuildSlave.sendBuilderList(self)
        def _sent(slist):
            dl = []
            for name, remote in slist.items():
                # use get() since we might have changed our mind since then.
                # we're checking on the builder in addition to the
                # slavebuilders out of a bit of paranoia.
                b = self.botmaster.builders.get(name)
                sb = self.slavebuilders.get(name)
                if b and sb:
                    d1 = sb.attached(self, remote, self.slave_commands)
                    dl.append(d1)
            return defer.DeferredList(dl)
        def _set_failed(why):
            log.msg("BuildSlave.sendBuilderList (%s) failed" % self)
            log.err(why)
            # TODO: hang up on them?, without setBuilderList we can't use
            # them
            if self.substantiation_deferred:
                pending = self.substantiation_deferred
                self.substantiation_deferred = None
                # pass the real failure on; a bare errback() needs an
                # exception that is currently being handled
                pending.errback(why)
            if self.missing_timer:
                self.missing_timer.cancel()
                self.missing_timer = None
            # TODO: maybe log?  send an email?
            return why
        d.addCallbacks(_sent, _set_failed)
        def _substantiated(res):
            self.substantiated = True
            if self.substantiation_deferred:
                pending = self.substantiation_deferred
                self.substantiation_deferred = None
                res = self._start_result
                # reset instead of ``del``: the attribute only exists on the
                # instance once start_instance()'s Deferred has fired
                self._start_result = None
                pending.callback(res)
            # note that the missing_timer is already handled within
            # ``attached``
            if not self.building:
                self._setBuildWaitTimer()
        d.addCallback(_substantiated)
        return d