1 # Portions copyright Canonical Ltd. 2009
4 from email
.Message
import Message
5 from email
.Utils
import formatdate
6 from zope
.interface
import implements
7 from twisted
.python
import log
8 from twisted
.internet
import defer
, reactor
9 from twisted
.application
import service
10 import twisted
.spread
.pb
12 from buildbot
.pbutil
import NewCredPerspective
13 from buildbot
.status
.builder
import SlaveStatus
14 from buildbot
.status
.mail
import MailNotifier
15 from buildbot
.interfaces
import IBuildSlave
, ILatentBuildSlave
16 from buildbot
.process
.properties
import Properties
19 class AbstractBuildSlave(NewCredPerspective
, service
.MultiService
):
20 """This is the master-side representative for a remote buildbot slave.
21 There is exactly one for each slave described in the config file (the
22 c['slaves'] list). When buildbots connect in (.attach), they get a
23 reference to this instance. The BotMaster object is stashed as the
24 .botmaster attribute. The BotMaster is also our '.parent' Service.
26 I represent a build slave -- a remote machine capable of
27 running builds. I am instantiated by the configuration file, and can be
28 subclassed to add extra functionality."""
30 implements(IBuildSlave
)
32 def __init__(self
, name
, password
, max_builds
=None,
33 notify_on_missing
=[], missing_timeout
=3600,
36 @param name: botname this machine will supply when it connects
37 @param password: password this machine will supply when
39 @param max_builds: maximum number of simultaneous builds that will
40 be run concurrently on this buildslave (the
41 default is None for no limit)
42 @param properties: properties that will be applied to builds run on
44 @type properties: dictionary
46 service
.MultiService
.__init
__(self
)
48 self
.password
= password
49 self
.botmaster
= None # no buildmaster yet
50 self
.slave_status
= SlaveStatus(name
)
51 self
.slave
= None # a RemoteReference to the Bot, when connected
52 self
.slave_commands
= None
53 self
.slavebuilders
= {}
54 self
.max_builds
= max_builds
56 self
.properties
= Properties()
57 self
.properties
.update(properties
, "BuildSlave")
58 self
.properties
.setProperty("slavename", name
, "BuildSlave")
60 self
.lastMessageReceived
= 0
61 if isinstance(notify_on_missing
, str):
62 notify_on_missing
= [notify_on_missing
]
63 self
.notify_on_missing
= notify_on_missing
64 for i
in notify_on_missing
:
65 assert isinstance(i
, str)
66 self
.missing_timeout
= missing_timeout
67 self
.missing_timer
= None
def update(self, new):
    """
    Given a new BuildSlave, configure this one identically. Because
    BuildSlave objects are remotely referenced, we can't replace them
    without disconnecting the slave, yet there's no reason to do that.

    Besides C{max_builds}, the notification settings
    (C{notify_on_missing}, C{missing_timeout}) and the C{properties}
    object are taken over from C{new}, so that a reconfiguration which
    only changes those values is not silently ignored.

    @param new: the freshly-configured BuildSlave whose settings should be
                adopted; it must have the same slavename, password and
                class as this instance.
    """
    # the reconfiguration logic should guarantee this:
    assert self.slavename == new.slavename
    assert self.password == new.password
    assert self.__class__ == new.__class__

    # adopt every setting that __init__ derives from the configuration
    self.max_builds = new.max_builds
    self.notify_on_missing = new.notify_on_missing
    self.missing_timeout = new.missing_timeout
    self.properties = new.properties
83 builders
= self
.botmaster
.getBuildersForSlave(self
.slavename
)
84 return "<%s '%s', current builders: %s>" % \
85 (self
.__class
__.__name
__, self
.slavename
,
86 ','.join(map(lambda b
: b
.name
, builders
)))
88 return "<%s '%s', (no builders yet)>" % \
89 (self
.__class
__.__name
__, self
.slavename
)
def setBotmaster(self, botmaster):
    """Remember the BotMaster for this slave and start the missing timer.

    A botmaster may only be set once; a second call trips an assertion.

    @param botmaster: the object to store as C{self.botmaster}
    """
    # spelled-out form of an assert statement: only active when __debug__
    if __debug__ and self.botmaster:
        raise AssertionError("BuildSlave already has a botmaster")
    self.botmaster = botmaster
    self.startMissingTimer()
def stopMissingTimer(self):
    """Cancel the pending missing-slave timer, if there is one.

    Afterwards C{self.missing_timer} is None. Calling this when no timer
    is set does nothing.
    """
    pending = self.missing_timer
    if not pending:
        return
    pending.cancel()
    self.missing_timer = None
def startMissingTimer(self):
    """(Re)start the timer that calls C{_missing_timer_fired}.

    The timer is scheduled only when there is somebody to notify
    (C{notify_on_missing}), a non-zero C{missing_timeout}, and this
    service has a parent; otherwise nothing happens.
    """
    wanted = (self.notify_on_missing
              and self.missing_timeout
              and self.parent)
    if not wanted:
        return
    # in case it's already running
    self.stopMissingTimer()
    self.missing_timer = reactor.callLater(self.missing_timeout,
                                           self._missing_timer_fired)
107 def _missing_timer_fired(self
):
108 self
.missing_timer
= None
109 # notify people, but only if we're still in the config
113 buildmaster
= self
.botmaster
.parent
114 status
= buildmaster
.getStatus()
115 text
= "The Buildbot working for '%s'\n" % status
.getProjectName()
116 text
+= ("has noticed that the buildslave named %s went away\n" %
119 text
+= ("It last disconnected at %s (buildmaster-local time)\n" %
120 time
.ctime(time
.time() - self
.missing_timeout
)) # approx
122 text
+= "The admin on record (as reported by BUILDSLAVE:info/admin)\n"
123 text
+= "was '%s'.\n" % self
.slave_status
.getAdmin()
125 text
+= "Sincerely,\n"
126 text
+= " The Buildbot\n"
127 text
+= " %s\n" % status
.getProjectURL()
128 subject
= "Buildbot: buildslave %s was lost" % self
.slavename
129 return self
._mail
_missing
_message
(subject
, text
)
132 def updateSlave(self
):
133 """Called to add or remove builders after the slave has connected.
135 @return: a Deferred that indicates when an attached slave has
136 accepted the new builders and/or released the old ones."""
138 return self
.sendBuilderList()
140 return defer
.succeed(None)
142 def updateSlaveStatus(self
, buildStarted
=None, buildFinished
=None):
144 self
.slave_status
.buildStarted(buildStarted
)
146 self
.slave_status
.buildFinished(buildFinished
)
148 def attached(self
, bot
):
149 """This is called when the slave connects.
151 @return: a Deferred that fires with a suitable pb.IPerspective to
152 give to the slave (i.e. 'self')"""
155 # uh-oh, we've got a duplicate slave. The most likely
156 # explanation is that the slave is behind a slow link, thinks we
157 # went away, and has attempted to reconnect, so we've got two
158 # "connections" from the same slave, but the previous one is
159 # stale. Give the new one precedence.
160 log
.msg("duplicate slave %s replacing old one" % self
.slavename
)
162 # just in case we've got two identically-configured slaves,
163 # report the IP addresses of both so someone can resolve the
165 tport
= self
.slave
.broker
.transport
166 log
.msg("old slave was connected from", tport
.getPeer())
167 log
.msg("new slave is from", bot
.broker
.transport
.getPeer())
168 d
= self
.disconnect()
170 d
= defer
.succeed(None)
171 # now we go through a sequence of calls, gathering information, then
172 # tell the Botmaster that it can finally give this slave to all the
173 # Builders that care about it.
175 # we accumulate slave information in this 'state' dictionary, then
176 # set it atomically if we make it far enough through the process
179 # Reset graceful shutdown status
180 self
.slave_status
.setGraceful(False)
181 # We want to know when the graceful shutdown flag changes
182 self
.slave_status
.addGracefulWatcher(self
._gracefulChanged
)
184 def _log_attachment_on_slave(res
):
185 d1
= bot
.callRemote("print", "attached")
186 d1
.addErrback(lambda why
: None)
188 d
.addCallback(_log_attachment_on_slave
)
191 d1
= bot
.callRemote("getSlaveInfo")
193 log
.msg("Got slaveinfo from '%s'" % self
.slavename
)
194 # TODO: info{} might have other keys
195 state
["admin"] = info
.get("admin")
196 state
["host"] = info
.get("host")
197 def _info_unavailable(why
):
198 # maybe an old slave, doesn't implement remote_getSlaveInfo
199 log
.msg("BuildSlave.info_unavailable")
201 d1
.addCallbacks(_got_info
, _info_unavailable
)
203 d
.addCallback(_get_info
)
205 def _get_commands(res
):
206 d1
= bot
.callRemote("getCommands")
207 def _got_commands(commands
):
208 state
["slave_commands"] = commands
209 def _commands_unavailable(why
):
210 # probably an old slave
211 log
.msg("BuildSlave._commands_unavailable")
212 if why
.check(AttributeError):
215 d1
.addCallbacks(_got_commands
, _commands_unavailable
)
217 d
.addCallback(_get_commands
)
219 def _accept_slave(res
):
220 self
.slave_status
.setAdmin(state
.get("admin"))
221 self
.slave_status
.setHost(state
.get("host"))
222 self
.slave_status
.setConnected(True)
223 self
.slave_commands
= state
.get("slave_commands")
225 log
.msg("bot attached")
226 self
.messageReceivedFromSlave()
227 self
.stopMissingTimer()
229 return self
.updateSlave()
230 d
.addCallback(_accept_slave
)
231 d
.addCallback(lambda res
: self
.botmaster
.maybeStartAllBuilds())
233 # Finally, the slave gets a reference to this BuildSlave. They
234 # receive this later, after we've started using them.
235 d
.addCallback(lambda res
: self
)
238 def messageReceivedFromSlave(self
):
240 self
.lastMessageReceived
= now
241 self
.slave_status
.setLastMessageReceived(now
)
243 def detached(self
, mind
):
245 self
.slave_status
.removeGracefulWatcher(self
._gracefulChanged
)
246 self
.slave_status
.setConnected(False)
247 log
.msg("BuildSlave.detached(%s)" % self
.slavename
)
249 def disconnect(self
):
250 """Forcibly disconnect the slave.
252 This severs the TCP connection and returns a Deferred that will fire
253 (with None) when the connection is probably gone.
255 If the slave is still alive, they will probably try to reconnect
258 This is called in two circumstances. The first is when a slave is
259 removed from the config file. In this case, when they try to
260 reconnect, they will be rejected as an unknown slave. The second is
261 when we wind up with two connections for the same slave, in which
262 case we disconnect the older connection.
266 return defer
.succeed(None)
267 log
.msg("disconnecting old slave %s now" % self
.slavename
)
268 # When this Deferred fires, we'll be ready to accept the new slave
269 return self
._disconnect
(self
.slave
)
271 def _disconnect(self
, slave
):
272 # all kinds of teardown will happen as a result of
273 # loseConnection(), but it happens after a reactor iteration or
274 # two. Hook the actual disconnect so we can know when it is safe
275 # to connect the new slave. We have to wait one additional
276 # iteration (with callLater(0)) to make sure the *other*
277 # notifyOnDisconnect handlers have had a chance to run.
280 # notifyOnDisconnect runs the callback with one argument, the
281 # RemoteReference being disconnected.
282 def _disconnected(rref
):
283 reactor
.callLater(0, d
.callback
, None)
284 slave
.notifyOnDisconnect(_disconnected
)
285 tport
= slave
.broker
.transport
286 # this is the polite way to request that a socket be closed
287 tport
.loseConnection()
289 # but really we don't want to wait for the transmit queue to
290 # drain. The remote end is unlikely to ACK the data, so we'd
291 # probably have to wait for a (20-minute) TCP timeout.
292 #tport._closeSocket()
293 # however, doing _closeSocket (whether before or after
294 # loseConnection) somehow prevents the notifyOnDisconnect
295 # handlers from being run. Bummer.
297 tport
.dataBuffer
= ""
299 # however, these hacks are pretty internal, so don't blow up if
300 # they fail or are unavailable
301 log
.msg("failed to accelerate the shutdown process")
303 log
.msg("waiting for slave to finish disconnecting")
307 def sendBuilderList(self
):
308 our_builders
= self
.botmaster
.getBuildersForSlave(self
.slavename
)
309 blist
= [(b
.name
, b
.builddir
) for b
in our_builders
]
310 d
= self
.slave
.callRemote("setBuilderList", blist
)
313 def perspective_keepalive(self
):
316 def addSlaveBuilder(self
, sb
):
317 if sb
.builder_name
not in self
.slavebuilders
:
318 log
.msg("%s adding %s" % (self
, sb
))
319 elif sb
is not self
.slavebuilders
[sb
.builder_name
]:
320 log
.msg("%s replacing %s" % (self
, sb
))
323 self
.slavebuilders
[sb
.builder_name
] = sb
325 def removeSlaveBuilder(self
, sb
):
327 del self
.slavebuilders
[sb
.builder_name
]
331 log
.msg("%s removed %s" % (self
, sb
))
333 def canStartBuild(self
):
335 I am called when a build is requested to see if this buildslave
336 can start a build. This function can be used to limit overall
337 concurrency on the buildslave.
339 # If we're waiting to shutdown gracefully, then we shouldn't
340 # accept any new jobs.
341 if self
.slave_status
.getGraceful():
345 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
347 if len(active_builders
) >= self
.max_builds
:
351 def _mail_missing_message(self
, subject
, text
):
352 # first, see if we have a MailNotifier we can use. This gives us a
353 # fromaddr and a relayhost.
354 buildmaster
= self
.botmaster
.parent
355 for st
in buildmaster
.statusTargets
:
356 if isinstance(st
, MailNotifier
):
359 # if not, they get a default MailNotifier, which always uses SMTP
360 # to localhost and uses a dummy fromaddr of "buildbot".
361 log
.msg("buildslave-missing msg using default MailNotifier")
362 st
= MailNotifier("buildbot")
363 # now construct the mail
367 m
['Date'] = formatdate(localtime
=True)
368 m
['Subject'] = subject
369 m
['From'] = st
.fromaddr
370 recipients
= self
.notify_on_missing
371 m
['To'] = ", ".join(recipients
)
372 d
= st
.sendMessage(m
, recipients
)
373 # return the Deferred for testing purposes
376 def _gracefulChanged(self
, graceful
):
377 """This is called when our graceful shutdown setting changes"""
379 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
381 if len(active_builders
) == 0:
386 """Shutdown the slave"""
387 # Look for a builder with a remote reference to the client side
388 # slave. If we can find one, then call "shutdown" on the remote
389 # builder, which will cause the slave buildbot process to exit.
391 for b
in self
.slavebuilders
.values():
393 d
= b
.remote
.callRemote("shutdown")
397 log
.msg("Shutting down slave: %s" % self
.slavename
)
398 # The remote shutdown call will not complete successfully since the
399 # buildbot process exits almost immediately after getting the
401 # Here we look at the reason why the remote call failed, and if
402 # it's because the connection was lost, that means the slave
403 # shutdown as expected.
405 if why
.check(twisted
.spread
.pb
.PBConnectionLost
):
406 log
.msg("Lost connection to %s" % self
.slavename
)
408 log
.err("Unexpected error when trying to shutdown %s" % self
.slavename
)
409 d
.addErrback(_errback
)
411 log
.err("Couldn't find remote builder to shut down slave")
412 return defer
.succeed(None)
414 class BuildSlave(AbstractBuildSlave
):
416 def sendBuilderList(self
):
417 d
= AbstractBuildSlave
.sendBuilderList(self
)
420 for name
, remote
in slist
.items():
421 # use get() since we might have changed our mind since then
422 b
= self
.botmaster
.builders
.get(name
)
424 d1
= b
.attached(self
, remote
, self
.slave_commands
)
426 return defer
.DeferredList(dl
)
427 def _set_failed(why
):
428 log
.msg("BuildSlave.sendBuilderList (%s) failed" % self
)
430 # TODO: hang up on them?, without setBuilderList we can't use
432 d
.addCallbacks(_sent
, _set_failed
)
def detached(self, mind):
    """Handle the slave connection going away.

    Runs the base-class C{detached} handling, reports the loss to the
    botmaster through C{slaveLost}, and then starts the missing timer.

    @param mind: passed through unchanged to C{AbstractBuildSlave.detached}
    """
    # base-class bookkeeping first
    AbstractBuildSlave.detached(self, mind)

    # then let the botmaster know this slave is gone
    master = self.botmaster
    master.slaveLost(self)

    # finally start the missing timer
    self.startMissingTimer()
440 def buildFinished(self
, sb
):
441 """This is called when a build on this slave is finished."""
442 # If we're gracefully shutting down, and we have no more active
443 # builders, then it's safe to disconnect
444 if self
.slave_status
.getGraceful():
445 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
447 if len(active_builders
) == 0:
449 return self
.shutdown()
450 return defer
.succeed(None)
452 class AbstractLatentBuildSlave(AbstractBuildSlave
):
453 """A build slave that will start up a slave instance when needed.
455 To use, subclass and implement start_instance and stop_instance.
457 See ec2buildslave.py for a concrete example. Also see the stub example in
461 implements(ILatentBuildSlave
)
463 substantiated
= False
464 substantiation_deferred
= None
465 build_wait_timer
= None
466 _start_result
= _shutdown_callback_handle
= None
468 def __init__(self
, name
, password
, max_builds
=None,
469 notify_on_missing
=[], missing_timeout
=60*20,
470 build_wait_timeout
=60*10,
472 AbstractBuildSlave
.__init
__(
473 self
, name
, password
, max_builds
, notify_on_missing
,
474 missing_timeout
, properties
)
475 self
.building
= set()
476 self
.build_wait_timeout
= build_wait_timeout
def start_instance(self):
    """Start the instance that will try to connect with this master.

    Subclasses must override this and should return a Deferred.
    NOTE(review): the original comment about how problems are reported is
    cut off in this view -- presumably via the Deferred's errback; confirm.

    @raise NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
def stop_instance(self, fast=False):
    """Shut the instance down; subclasses must override this.

    @param fast: flag forwarded by C{insubstantiate}; this base
                 implementation ignores it
    @raise NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
def substantiate(self, sb):
    """Make sure an instance is running (or being started) for a build.

    @param sb: the requesting slave builder (not used here)
    @return: a Deferred already fired with C{self} when the slave is
             substantiated; otherwise whatever C{substantiation_deferred}
             holds once any needed start-up has been initiated.
    """
    if self.substantiated:
        # already up: just restart the idle (build-wait) countdown
        self._clearBuildWaitTimer()
        self._setBuildWaitTimer()
        return defer.succeed(self)

    pending = self.substantiation_deferred
    if pending is not None:
        # a substantiation is already in flight; share its Deferred
        return pending

    if self.parent and not self.missing_timer:
        # start timer. if timer times out, fail deferred
        self.missing_timer = reactor.callLater(
            self.missing_timeout,
            self._substantiation_failed, defer.TimeoutError())
    self.substantiation_deferred = defer.Deferred()
    if self.slave is None:
        # start up instance
        self._substantiate()
    # otherwise we're waiting for an old slave to detach; the
    # _substantiate will be done in ``detached``.

    # re-read the attribute: _substantiate may already have replaced it
    return self.substantiation_deferred
506 def _substantiate(self
):
507 # register event trigger
508 d
= self
.start_instance()
509 self
._shutdown
_callback
_handle
= reactor
.addSystemEventTrigger(
510 'before', 'shutdown', self
._soft
_disconnect
, fast
=True)
511 def stash_reply(result
):
512 self
._start
_result
= result
513 def clean_up(failure
):
514 if self
.missing_timer
is not None:
515 self
.missing_timer
.cancel()
516 self
._substantiation
_failed
(failure
)
517 if self
._shutdown
_callback
_handle
is not None:
518 handle
= self
._shutdown
_callback
_handle
519 del self
._shutdown
_callback
_handle
520 reactor
.removeSystemEventTrigger(handle
)
522 d
.addCallbacks(stash_reply
, clean_up
)
525 def attached(self
, bot
):
526 if self
.substantiation_deferred
is None:
527 msg
= 'Slave %s received connection while not trying to ' \
528 'substantiate. Disconnecting.' % (self
.slavename
,)
530 self
._disconnect
(bot
)
531 return defer
.fail(RuntimeError(msg
))
532 return AbstractBuildSlave
.attached(self
, bot
)
534 def detached(self
, mind
):
535 AbstractBuildSlave
.detached(self
, mind
)
536 if self
.substantiation_deferred
is not None:
539 def _substantiation_failed(self
, failure
):
540 d
= self
.substantiation_deferred
541 self
.substantiation_deferred
= None
542 self
.missing_timer
= None
544 self
.insubstantiate()
545 # notify people, but only if we're still in the config
546 if not self
.parent
or not self
.notify_on_missing
:
549 buildmaster
= self
.botmaster
.parent
550 status
= buildmaster
.getStatus()
551 text
= "The Buildbot working for '%s'\n" % status
.getProjectName()
552 text
+= ("has noticed that the latent buildslave named %s \n" %
554 text
+= "never substantiated after a request\n"
556 text
+= ("The request was made at %s (buildmaster-local time)\n" %
557 time
.ctime(time
.time() - self
.missing_timeout
)) # approx
559 text
+= "Sincerely,\n"
560 text
+= " The Buildbot\n"
561 text
+= " %s\n" % status
.getProjectURL()
562 subject
= "Buildbot: buildslave %s never substantiated" % self
.slavename
563 return self
._mail
_missing
_message
(subject
, text
)
def buildStarted(self, sb):
    """Note that a build has begun on this (substantiated) slave.

    Stops the build-wait timer and records the builder's name in
    C{self.building}.

    @param sb: slave builder whose C{builder_name} is recorded
    """
    if __debug__ and not self.substantiated:
        raise AssertionError
    self._clearBuildWaitTimer()
    busy_name = sb.builder_name
    self.building.add(busy_name)
def buildFinished(self, sb):
    """Note that a build has ended on this slave.

    Removes the builder's name from C{self.building}; once no builds
    remain, the build-wait timer is started.

    @param sb: slave builder whose C{builder_name} is removed
    @raise KeyError: if that name was not recorded as building
    """
    self.building.remove(sb.builder_name)
    if self.building:
        # other builds are still running; keep the slave as it is
        return
    self._setBuildWaitTimer()
575 def _clearBuildWaitTimer(self
):
576 if self
.build_wait_timer
is not None:
577 if self
.build_wait_timer
.active():
578 self
.build_wait_timer
.cancel()
579 self
.build_wait_timer
= None
def _setBuildWaitTimer(self):
    """Restart the build-wait countdown.

    Any existing timer is cleared first; then C{_soft_disconnect} is
    scheduled to run after C{build_wait_timeout}.
    """
    self._clearBuildWaitTimer()
    on_timeout = self._soft_disconnect
    self.build_wait_timer = reactor.callLater(self.build_wait_timeout,
                                              on_timeout)
586 def insubstantiate(self
, fast
=False):
587 self
._clearBuildWaitTimer
()
588 d
= self
.stop_instance(fast
)
589 if self
._shutdown
_callback
_handle
is not None:
590 handle
= self
._shutdown
_callback
_handle
591 del self
._shutdown
_callback
_handle
592 reactor
.removeSystemEventTrigger(handle
)
593 self
.substantiated
= False
594 self
.building
.clear() # just to be sure
597 def _soft_disconnect(self
, fast
=False):
598 d
= AbstractBuildSlave
.disconnect(self
)
599 if self
.slave
is not None:
600 # this could be called when the slave needs to shut down, such as
601 # in BotMaster.removeSlave, *or* when a new slave requests a
602 # connection when we already have a slave. It's not clear what to
603 # do in the second case: this shouldn't happen, and if it
604 # does...if it's a latent slave, shutting down will probably kill
605 # something we want...but we can't know what the status is. So,
606 # here, we just do what should be appropriate for the first case,
607 # and put our heads in the sand for the second, at least for now.
608 # The best solution to the odd situation is removing it as a
609 # possibility: make the master in charge of connecting to the
610 # slave, rather than vice versa. TODO.
611 d
= defer
.DeferredList([d
, self
.insubstantiate(fast
)])
613 if self
.substantiation_deferred
is not None:
614 # unlike the previous block, we don't expect this situation when
615 # ``attached`` calls ``disconnect``, only when we get a simple
616 # request to "go away".
617 self
.substantiation_deferred
.errback()
618 self
.substantiation_deferred
= None
619 if self
.missing_timer
:
620 self
.missing_timer
.cancel()
621 self
.missing_timer
= None
625 def disconnect(self
):
626 d
= self
._soft
_disconnect
()
627 # this removes the slave from all builders. It won't come back
628 # without a restart (or maybe a sighup)
629 self
.botmaster
.slaveLost(self
)
631 def stopService(self
):
632 res
= defer
.maybeDeferred(AbstractBuildSlave
.stopService
, self
)
633 if self
.slave
is not None:
634 d
= self
._soft
_disconnect
()
635 res
= defer
.DeferredList([res
, d
])
def updateSlave(self):
    """Called to add or remove builders after the slave has connected.

    Also called after botmaster's builders are initially set.

    Every builder the botmaster lists for this slave's name that has no
    entry in C{self.slavebuilders} yet gets C{addLatentSlave(self)}
    called on it before the base-class update runs.

    @return: a Deferred that indicates when an attached slave has
    accepted the new builders and/or released the old ones."""
    assigned = self.botmaster.getBuildersForSlave(self.slavename)
    for builder in assigned:
        if builder.name in self.slavebuilders:
            # already registered with this slave
            continue
        builder.addLatentSlave(self)
    return AbstractBuildSlave.updateSlave(self)
650 def sendBuilderList(self
):
651 d
= AbstractBuildSlave
.sendBuilderList(self
)
654 for name
, remote
in slist
.items():
655 # use get() since we might have changed our mind since then.
656 # we're checking on the builder in addition to the
657 # slavebuilders out of a bit of paranoia.
658 b
= self
.botmaster
.builders
.get(name
)
659 sb
= self
.slavebuilders
.get(name
)
661 d1
= sb
.attached(self
, remote
, self
.slave_commands
)
663 return defer
.DeferredList(dl
)
664 def _set_failed(why
):
665 log
.msg("BuildSlave.sendBuilderList (%s) failed" % self
)
667 # TODO: hang up on them?, without setBuilderList we can't use
669 if self
.substantiation_deferred
:
670 self
.substantiation_deferred
.errback()
671 self
.substantiation_deferred
= None
672 if self
.missing_timer
:
673 self
.missing_timer
.cancel()
674 self
.missing_timer
= None
675 # TODO: maybe log? send an email?
677 d
.addCallbacks(_sent
, _set_failed
)
678 def _substantiated(res
):
679 self
.substantiated
= True
680 if self
.substantiation_deferred
:
681 d
= self
.substantiation_deferred
682 del self
.substantiation_deferred
683 res
= self
._start
_result
684 del self
._start
_result
686 # note that the missing_timer is already handled within
688 if not self
.building
:
689 self
._setBuildWaitTimer
()
690 d
.addCallback(_substantiated
)