(refs #376) prettify the ETA in the waterfall
[buildbot.git] / buildbot / buildslave.py
blob31f5f97185566ce34670a0cfc58e720f49e08be6
1 # Portions copyright Canonical Ltd. 2009
3 import time
4 from email.Message import Message
5 from email.Utils import formatdate
6 from zope.interface import implements
7 from twisted.python import log
8 from twisted.internet import defer, reactor
9 from twisted.application import service
10 import twisted.spread.pb
12 from buildbot.pbutil import NewCredPerspective
13 from buildbot.status.builder import SlaveStatus
14 from buildbot.status.mail import MailNotifier
15 from buildbot.interfaces import IBuildSlave, ILatentBuildSlave
16 from buildbot.process.properties import Properties
19 class AbstractBuildSlave(NewCredPerspective, service.MultiService):
20 """This is the master-side representative for a remote buildbot slave.
21 There is exactly one for each slave described in the config file (the
22 c['slaves'] list). When buildbots connect in (.attach), they get a
23 reference to this instance. The BotMaster object is stashed as the
24 .botmaster attribute. The BotMaster is also our '.parent' Service.
26 I represent a build slave -- a remote machine capable of
27 running builds. I am instantiated by the configuration file, and can be
28 subclassed to add extra functionality."""
30 implements(IBuildSlave)
32 def __init__(self, name, password, max_builds=None,
33 notify_on_missing=[], missing_timeout=3600,
34 properties={}):
35 """
36 @param name: botname this machine will supply when it connects
37 @param password: password this machine will supply when
38 it connects
39 @param max_builds: maximum number of simultaneous builds that will
40 be run concurrently on this buildslave (the
41 default is None for no limit)
42 @param properties: properties that will be applied to builds run on
43 this slave
44 @type properties: dictionary
45 """
46 service.MultiService.__init__(self)
47 self.slavename = name
48 self.password = password
49 self.botmaster = None # no buildmaster yet
50 self.slave_status = SlaveStatus(name)
51 self.slave = None # a RemoteReference to the Bot, when connected
52 self.slave_commands = None
53 self.slavebuilders = {}
54 self.max_builds = max_builds
56 self.properties = Properties()
57 self.properties.update(properties, "BuildSlave")
58 self.properties.setProperty("slavename", name, "BuildSlave")
60 self.lastMessageReceived = 0
61 if isinstance(notify_on_missing, str):
62 notify_on_missing = [notify_on_missing]
63 self.notify_on_missing = notify_on_missing
64 for i in notify_on_missing:
65 assert isinstance(i, str)
66 self.missing_timeout = missing_timeout
67 self.missing_timer = None
69 def update(self, new):
70 """
71 Given a new BuildSlave, configure this one identically. Because
72 BuildSlave objects are remotely referenced, we can't replace them
73 without disconnecting the slave, yet there's no reason to do that.
74 """
75 # the reconfiguration logic should guarantee this:
76 assert self.slavename == new.slavename
77 assert self.password == new.password
78 assert self.__class__ == new.__class__
79 self.max_builds = new.max_builds
81 def __repr__(self):
82 if self.botmaster:
83 builders = self.botmaster.getBuildersForSlave(self.slavename)
84 return "<%s '%s', current builders: %s>" % \
85 (self.__class__.__name__, self.slavename,
86 ','.join(map(lambda b: b.name, builders)))
87 else:
88 return "<%s '%s', (no builders yet)>" % \
89 (self.__class__.__name__, self.slavename)
91 def setBotmaster(self, botmaster):
92 assert not self.botmaster, "BuildSlave already has a botmaster"
93 self.botmaster = botmaster
95 def updateSlave(self):
96 """Called to add or remove builders after the slave has connected.
98 @return: a Deferred that indicates when an attached slave has
99 accepted the new builders and/or released the old ones."""
100 if self.slave:
101 return self.sendBuilderList()
102 else:
103 return defer.succeed(None)
105 def updateSlaveStatus(self, buildStarted=None, buildFinished=None):
106 if buildStarted:
107 self.slave_status.buildStarted(buildStarted)
108 if buildFinished:
109 self.slave_status.buildFinished(buildFinished)
111 def attached(self, bot):
112 """This is called when the slave connects.
114 @return: a Deferred that fires with a suitable pb.IPerspective to
115 give to the slave (i.e. 'self')"""
117 if self.slave:
118 # uh-oh, we've got a duplicate slave. The most likely
119 # explanation is that the slave is behind a slow link, thinks we
120 # went away, and has attempted to reconnect, so we've got two
121 # "connections" from the same slave, but the previous one is
122 # stale. Give the new one precedence.
123 log.msg("duplicate slave %s replacing old one" % self.slavename)
125 # just in case we've got two identically-configured slaves,
126 # report the IP addresses of both so someone can resolve the
127 # squabble
128 tport = self.slave.broker.transport
129 log.msg("old slave was connected from", tport.getPeer())
130 log.msg("new slave is from", bot.broker.transport.getPeer())
131 d = self.disconnect()
132 else:
133 d = defer.succeed(None)
134 # now we go through a sequence of calls, gathering information, then
135 # tell the Botmaster that it can finally give this slave to all the
136 # Builders that care about it.
138 # we accumulate slave information in this 'state' dictionary, then
139 # set it atomically if we make it far enough through the process
140 state = {}
142 # Reset graceful shutdown status
143 self.slave_status.setGraceful(False)
144 # We want to know when the graceful shutdown flag changes
145 self.slave_status.addGracefulWatcher(self._gracefulChanged)
147 def _log_attachment_on_slave(res):
148 d1 = bot.callRemote("print", "attached")
149 d1.addErrback(lambda why: None)
150 return d1
151 d.addCallback(_log_attachment_on_slave)
153 def _get_info(res):
154 d1 = bot.callRemote("getSlaveInfo")
155 def _got_info(info):
156 log.msg("Got slaveinfo from '%s'" % self.slavename)
157 # TODO: info{} might have other keys
158 state["admin"] = info.get("admin")
159 state["host"] = info.get("host")
160 def _info_unavailable(why):
161 # maybe an old slave, doesn't implement remote_getSlaveInfo
162 log.msg("BuildSlave.info_unavailable")
163 log.err(why)
164 d1.addCallbacks(_got_info, _info_unavailable)
165 return d1
166 d.addCallback(_get_info)
168 def _get_commands(res):
169 d1 = bot.callRemote("getCommands")
170 def _got_commands(commands):
171 state["slave_commands"] = commands
172 def _commands_unavailable(why):
173 # probably an old slave
174 log.msg("BuildSlave._commands_unavailable")
175 if why.check(AttributeError):
176 return
177 log.err(why)
178 d1.addCallbacks(_got_commands, _commands_unavailable)
179 return d1
180 d.addCallback(_get_commands)
182 def _accept_slave(res):
183 self.slave_status.setAdmin(state.get("admin"))
184 self.slave_status.setHost(state.get("host"))
185 self.slave_status.setConnected(True)
186 self.slave_commands = state.get("slave_commands")
187 self.slave = bot
188 log.msg("bot attached")
189 self.messageReceivedFromSlave()
190 if self.missing_timer:
191 self.missing_timer.cancel()
192 self.missing_timer = None
194 return self.updateSlave()
195 d.addCallback(_accept_slave)
197 # Finally, the slave gets a reference to this BuildSlave. They
198 # receive this later, after we've started using them.
199 d.addCallback(lambda res: self)
200 return d
202 def messageReceivedFromSlave(self):
203 now = time.time()
204 self.lastMessageReceived = now
205 self.slave_status.setLastMessageReceived(now)
207 def detached(self, mind):
208 self.slave = None
209 self.slave_status.removeGracefulWatcher(self._gracefulChanged)
210 self.slave_status.setConnected(False)
211 log.msg("BuildSlave.detached(%s)" % self.slavename)
213 def disconnect(self):
214 """Forcibly disconnect the slave.
216 This severs the TCP connection and returns a Deferred that will fire
217 (with None) when the connection is probably gone.
219 If the slave is still alive, they will probably try to reconnect
220 again in a moment.
222 This is called in two circumstances. The first is when a slave is
223 removed from the config file. In this case, when they try to
224 reconnect, they will be rejected as an unknown slave. The second is
225 when we wind up with two connections for the same slave, in which
226 case we disconnect the older connection.
229 if not self.slave:
230 return defer.succeed(None)
231 log.msg("disconnecting old slave %s now" % self.slavename)
232 # When this Deferred fires, we'll be ready to accept the new slave
233 return self._disconnect(self.slave)
235 def _disconnect(self, slave):
236 # all kinds of teardown will happen as a result of
237 # loseConnection(), but it happens after a reactor iteration or
238 # two. Hook the actual disconnect so we can know when it is safe
239 # to connect the new slave. We have to wait one additional
240 # iteration (with callLater(0)) to make sure the *other*
241 # notifyOnDisconnect handlers have had a chance to run.
242 d = defer.Deferred()
244 # notifyOnDisconnect runs the callback with one argument, the
245 # RemoteReference being disconnected.
246 def _disconnected(rref):
247 reactor.callLater(0, d.callback, None)
248 slave.notifyOnDisconnect(_disconnected)
249 tport = slave.broker.transport
250 # this is the polite way to request that a socket be closed
251 tport.loseConnection()
252 try:
253 # but really we don't want to wait for the transmit queue to
254 # drain. The remote end is unlikely to ACK the data, so we'd
255 # probably have to wait for a (20-minute) TCP timeout.
256 #tport._closeSocket()
257 # however, doing _closeSocket (whether before or after
258 # loseConnection) somehow prevents the notifyOnDisconnect
259 # handlers from being run. Bummer.
260 tport.offset = 0
261 tport.dataBuffer = ""
262 except:
263 # however, these hacks are pretty internal, so don't blow up if
264 # they fail or are unavailable
265 log.msg("failed to accelerate the shutdown process")
266 pass
267 log.msg("waiting for slave to finish disconnecting")
269 return d
271 def sendBuilderList(self):
272 our_builders = self.botmaster.getBuildersForSlave(self.slavename)
273 blist = [(b.name, b.builddir) for b in our_builders]
274 d = self.slave.callRemote("setBuilderList", blist)
275 return d
277 def perspective_keepalive(self):
278 pass
280 def addSlaveBuilder(self, sb):
281 if sb.builder_name not in self.slavebuilders:
282 log.msg("%s adding %s" % (self, sb))
283 elif sb is not self.slavebuilders[sb.builder_name]:
284 log.msg("%s replacing %s" % (self, sb))
285 else:
286 return
287 self.slavebuilders[sb.builder_name] = sb
289 def removeSlaveBuilder(self, sb):
290 try:
291 del self.slavebuilders[sb.builder_name]
292 except KeyError:
293 pass
294 else:
295 log.msg("%s removed %s" % (self, sb))
297 def canStartBuild(self):
299 I am called when a build is requested to see if this buildslave
300 can start a build. This function can be used to limit overall
301 concurrency on the buildslave.
303 # If we're waiting to shutdown gracefully, then we shouldn't
304 # accept any new jobs.
305 if self.slave_status.getGraceful():
306 return False
308 if self.max_builds:
309 active_builders = [sb for sb in self.slavebuilders.values()
310 if sb.isBusy()]
311 if len(active_builders) >= self.max_builds:
312 return False
313 return True
315 def _mail_missing_message(self, subject, text):
316 # first, see if we have a MailNotifier we can use. This gives us a
317 # fromaddr and a relayhost.
318 buildmaster = self.botmaster.parent
319 for st in buildmaster.statusTargets:
320 if isinstance(st, MailNotifier):
321 break
322 else:
323 # if not, they get a default MailNotifier, which always uses SMTP
324 # to localhost and uses a dummy fromaddr of "buildbot".
325 log.msg("buildslave-missing msg using default MailNotifier")
326 st = MailNotifier("buildbot")
327 # now construct the mail
329 m = Message()
330 m.set_payload(text)
331 m['Date'] = formatdate(localtime=True)
332 m['Subject'] = subject
333 m['From'] = st.fromaddr
334 recipients = self.notify_on_missing
335 m['To'] = ", ".join(recipients)
336 d = st.sendMessage(m, recipients)
337 # return the Deferred for testing purposes
338 return d
340 def _gracefulChanged(self, graceful):
341 """This is called when our graceful shutdown setting changes"""
342 if graceful:
343 active_builders = [sb for sb in self.slavebuilders.values()
344 if sb.isBusy()]
345 if len(active_builders) == 0:
346 # Shut down!
347 self.shutdown()
349 def shutdown(self):
350 """Shutdown the slave"""
351 # Look for a builder with a remote reference to the client side
352 # slave. If we can find one, then call "shutdown" on the remote
353 # builder, which will cause the slave buildbot process to exit.
354 d = None
355 for b in self.slavebuilders.values():
356 if b.remote:
357 d = b.remote.callRemote("shutdown")
358 break
360 if d:
361 log.msg("Shutting down slave: %s" % self.slavename)
362 # The remote shutdown call will not complete successfully since the
363 # buildbot process exits almost immediately after getting the
364 # shutdown request.
365 # Here we look at the reason why the remote call failed, and if
366 # it's because the connection was lost, that means the slave
367 # shutdown as expected.
368 def _errback(why):
369 if why.check(twisted.spread.pb.PBConnectionLost):
370 log.msg("Lost connection to %s" % self.slavename)
371 else:
372 log.err("Unexpected error when trying to shutdown %s" % self.slavename)
373 d.addErrback(_errback)
374 return d
375 log.err("Couldn't find remote builder to shut down slave")
376 return defer.succeed(None)
378 class BuildSlave(AbstractBuildSlave):
380 def sendBuilderList(self):
381 d = AbstractBuildSlave.sendBuilderList(self)
382 def _sent(slist):
383 dl = []
384 for name, remote in slist.items():
385 # use get() since we might have changed our mind since then
386 b = self.botmaster.builders.get(name)
387 if b:
388 d1 = b.attached(self, remote, self.slave_commands)
389 dl.append(d1)
390 return defer.DeferredList(dl)
391 def _set_failed(why):
392 log.msg("BuildSlave.sendBuilderList (%s) failed" % self)
393 log.err(why)
394 # TODO: hang up on them?, without setBuilderList we can't use
395 # them
396 d.addCallbacks(_sent, _set_failed)
397 return d
399 def detached(self, mind):
400 AbstractBuildSlave.detached(self, mind)
401 self.botmaster.slaveLost(self)
402 if self.notify_on_missing and self.parent and not self.missing_timer:
403 self.missing_timer = reactor.callLater(self.missing_timeout,
404 self._missing_timer_fired)
406 def _missing_timer_fired(self):
407 self.missing_timer = None
408 # notify people, but only if we're still in the config
409 if not self.parent:
410 return
412 buildmaster = self.botmaster.parent
413 status = buildmaster.getStatus()
414 text = "The Buildbot working for '%s'\n" % status.getProjectName()
415 text += ("has noticed that the buildslave named %s went away\n" %
416 self.slavename)
417 text += "\n"
418 text += ("It last disconnected at %s (buildmaster-local time)\n" %
419 time.ctime(time.time() - self.missing_timeout)) # approx
420 text += "\n"
421 text += "The admin on record (as reported by BUILDSLAVE:info/admin)\n"
422 text += "was '%s'.\n" % self.slave_status.getAdmin()
423 text += "\n"
424 text += "Sincerely,\n"
425 text += " The Buildbot\n"
426 text += " %s\n" % status.getProjectURL()
427 subject = "Buildbot: buildslave %s was lost" % self.slavename
428 return self._mail_missing_message(subject, text)
430 def buildFinished(self, sb):
431 """This is called when a build on this slave is finished."""
432 # If we're gracefully shutting down, and we have no more active
433 # builders, then it's safe to disconnect
434 if self.slave_status.getGraceful():
435 active_builders = [sb for sb in self.slavebuilders.values()
436 if sb.isBusy()]
437 if len(active_builders) == 0:
438 # Shut down!
439 return self.shutdown()
440 return defer.succeed(None)
442 class AbstractLatentBuildSlave(AbstractBuildSlave):
443 """A build slave that will start up a slave instance when needed.
445 To use, subclass and implement start_instance and stop_instance.
447 See ec2buildslave.py for a concrete example. Also see the stub example in
448 test/test_slaves.py.
451 implements(ILatentBuildSlave)
453 substantiated = False
454 substantiation_deferred = None
455 build_wait_timer = None
456 _start_result = _shutdown_callback_handle = None
458 def __init__(self, name, password, max_builds=None,
459 notify_on_missing=[], missing_timeout=60*20,
460 build_wait_timeout=60*10,
461 properties={}):
462 AbstractBuildSlave.__init__(
463 self, name, password, max_builds, notify_on_missing,
464 missing_timeout, properties)
465 self.building = set()
466 self.build_wait_timeout = build_wait_timeout
468 def start_instance(self):
469 # responsible for starting instance that will try to connect with
470 # this master. Should return deferred. Problems should use an
471 # errback.
472 raise NotImplementedError
474 def stop_instance(self, fast=False):
475 # responsible for shutting down instance.
476 raise NotImplementedError
478 def substantiate(self, sb):
479 if self.substantiated:
480 self._clearBuildWaitTimer()
481 self._setBuildWaitTimer()
482 return defer.succeed(self)
483 if self.substantiation_deferred is None:
484 if self.parent and not self.missing_timer:
485 # start timer. if timer times out, fail deferred
486 self.missing_timer = reactor.callLater(
487 self.missing_timeout,
488 self._substantiation_failed, defer.TimeoutError())
489 self.substantiation_deferred = defer.Deferred()
490 if self.slave is None:
491 self._substantiate() # start up instance
492 # else: we're waiting for an old one to detach. the _substantiate
493 # will be done in ``detached`` below.
494 return self.substantiation_deferred
496 def _substantiate(self):
497 # register event trigger
498 d = self.start_instance()
499 self._shutdown_callback_handle = reactor.addSystemEventTrigger(
500 'before', 'shutdown', self._soft_disconnect, fast=True)
501 def stash_reply(result):
502 self._start_result = result
503 def clean_up(failure):
504 if self.missing_timer is not None:
505 self.missing_timer.cancel()
506 self._substantiation_failed(failure)
507 if self._shutdown_callback_handle is not None:
508 handle = self._shutdown_callback_handle
509 del self._shutdown_callback_handle
510 reactor.removeSystemEventTrigger(handle)
511 return failure
512 d.addCallbacks(stash_reply, clean_up)
513 return d
515 def attached(self, bot):
516 if self.substantiation_deferred is None:
517 log.msg('Slave %s received connection while not trying to '
518 'substantiate. Disconnecting.' % (self.slavename,))
519 self._disconnect(bot)
520 return defer.fail()
521 return AbstractBuildSlave.attached(self, bot)
523 def detached(self, mind):
524 AbstractBuildSlave.detached(self, mind)
525 if self.substantiation_deferred is not None:
526 self._substantiate()
528 def _substantiation_failed(self, failure):
529 d = self.substantiation_deferred
530 self.substantiation_deferred = None
531 self.missing_timer = None
532 d.errback(failure)
533 self.insubstantiate()
534 # notify people, but only if we're still in the config
535 if not self.parent or not self.notify_on_missing:
536 return
538 status = buildmaster.getStatus()
539 text = "The Buildbot working for '%s'\n" % status.getProjectName()
540 text += ("has noticed that the latent buildslave named %s \n" %
541 self.slavename)
542 text += "never substantiated after a request\n"
543 text += "\n"
544 text += ("The request was made at %s (buildmaster-local time)\n" %
545 time.ctime(time.time() - self.missing_timeout)) # approx
546 text += "\n"
547 text += "Sincerely,\n"
548 text += " The Buildbot\n"
549 text += " %s\n" % status.getProjectURL()
550 subject = "Buildbot: buildslave %s never substantiated" % self.slavename
551 return self._mail_missing_message(subject, text)
553 def buildStarted(self, sb):
554 assert self.substantiated
555 self._clearBuildWaitTimer()
556 self.building.add(sb.builder_name)
558 def buildFinished(self, sb):
559 self.building.remove(sb.builder_name)
560 if not self.building:
561 self._setBuildWaitTimer()
563 def _clearBuildWaitTimer(self):
564 if self.build_wait_timer is not None:
565 if self.build_wait_timer.active():
566 self.build_wait_timer.cancel()
567 self.build_wait_timer = None
569 def _setBuildWaitTimer(self):
570 self._clearBuildWaitTimer()
571 self.build_wait_timer = reactor.callLater(
572 self.build_wait_timeout, self._soft_disconnect)
574 def insubstantiate(self, fast=False):
575 self._clearBuildWaitTimer()
576 d = self.stop_instance(fast)
577 if self._shutdown_callback_handle is not None:
578 handle = self._shutdown_callback_handle
579 del self._shutdown_callback_handle
580 reactor.removeSystemEventTrigger(handle)
581 self.substantiated = False
582 self.building.clear() # just to be sure
583 return d
585 def _soft_disconnect(self, fast=False):
586 d = AbstractBuildSlave.disconnect(self)
587 if self.slave is not None:
588 # this could be called when the slave needs to shut down, such as
589 # in BotMaster.removeSlave, *or* when a new slave requests a
590 # connection when we already have a slave. It's not clear what to
591 # do in the second case: this shouldn't happen, and if it
592 # does...if it's a latent slave, shutting down will probably kill
593 # something we want...but we can't know what the status is. So,
594 # here, we just do what should be appropriate for the first case,
595 # and put our heads in the sand for the second, at least for now.
596 # The best solution to the odd situation is removing it as a
597 # possibilty: make the master in charge of connecting to the
598 # slave, rather than vice versa. TODO.
599 d = defer.DeferredList([d, self.insubstantiate(fast)])
600 else:
601 if self.substantiation_deferred is not None:
602 # unlike the previous block, we don't expect this situation when
603 # ``attached`` calls ``disconnect``, only when we get a simple
604 # request to "go away".
605 self.substantiation_deferred.errback()
606 self.substantiation_deferred = None
607 if self.missing_timer:
608 self.missing_timer.cancel()
609 self.missing_timer = None
610 self.stop_instance()
611 return d
613 def disconnect(self):
614 d = self._soft_disconnect()
615 # this removes the slave from all builders. It won't come back
616 # without a restart (or maybe a sighup)
617 self.botmaster.slaveLost(self)
619 def stopService(self):
620 res = defer.maybeDeferred(AbstractBuildSlave.stopService, self)
621 if self.slave is not None:
622 d = self._soft_disconnect()
623 res = defer.DeferredList([res, d])
624 return res
626 def updateSlave(self):
627 """Called to add or remove builders after the slave has connected.
629 Also called after botmaster's builders are initially set.
631 @return: a Deferred that indicates when an attached slave has
632 accepted the new builders and/or released the old ones."""
633 for b in self.botmaster.getBuildersForSlave(self.slavename):
634 if b.name not in self.slavebuilders:
635 b.addLatentSlave(self)
636 return AbstractBuildSlave.updateSlave(self)
638 def sendBuilderList(self):
639 d = AbstractBuildSlave.sendBuilderList(self)
640 def _sent(slist):
641 dl = []
642 for name, remote in slist.items():
643 # use get() since we might have changed our mind since then.
644 # we're checking on the builder in addition to the
645 # slavebuilders out of a bit of paranoia.
646 b = self.botmaster.builders.get(name)
647 sb = self.slavebuilders.get(name)
648 if b and sb:
649 d1 = sb.attached(self, remote, self.slave_commands)
650 dl.append(d1)
651 return defer.DeferredList(dl)
652 def _set_failed(why):
653 log.msg("BuildSlave.sendBuilderList (%s) failed" % self)
654 log.err(why)
655 # TODO: hang up on them?, without setBuilderList we can't use
656 # them
657 if self.substantiation_deferred:
658 self.substantiation_deferred.errback()
659 self.substantiation_deferred = None
660 if self.missing_timer:
661 self.missing_timer.cancel()
662 self.missing_timer = None
663 # TODO: maybe log? send an email?
664 return why
665 d.addCallbacks(_sent, _set_failed)
666 def _substantiated(res):
667 self.substantiated = True
668 if self.substantiation_deferred:
669 d = self.substantiation_deferred
670 del self.substantiation_deferred
671 res = self._start_result
672 del self._start_result
673 d.callback(res)
674 # note that the missing_timer is already handled within
675 # ``attached``
676 if not self.building:
677 self._setBuildWaitTimer()
678 d.addCallback(_substantiated)
679 return d