1 # Portions copyright Canonical Ltd. 2009
4 from email
.Message
import Message
5 from email
.Utils
import formatdate
6 from zope
.interface
import implements
7 from twisted
.python
import log
8 from twisted
.internet
import defer
, reactor
9 from twisted
.application
import service
10 import twisted
.spread
.pb
12 from buildbot
.pbutil
import NewCredPerspective
13 from buildbot
.status
.builder
import SlaveStatus
14 from buildbot
.status
.mail
import MailNotifier
15 from buildbot
.interfaces
import IBuildSlave
, ILatentBuildSlave
16 from buildbot
.process
.properties
import Properties
19 class AbstractBuildSlave(NewCredPerspective
, service
.MultiService
):
20 """This is the master-side representative for a remote buildbot slave.
21 There is exactly one for each slave described in the config file (the
22 c['slaves'] list). When buildbots connect in (.attach), they get a
23 reference to this instance. The BotMaster object is stashed as the
24 .botmaster attribute. The BotMaster is also our '.parent' Service.
26 I represent a build slave -- a remote machine capable of
27 running builds. I am instantiated by the configuration file, and can be
28 subclassed to add extra functionality."""
30 implements(IBuildSlave
)
32 def __init__(self
, name
, password
, max_builds
=None,
33 notify_on_missing
=[], missing_timeout
=3600,
36 @param name: botname this machine will supply when it connects
37 @param password: password this machine will supply when
39 @param max_builds: maximum number of simultaneous builds that will
40 be run concurrently on this buildslave (the
41 default is None for no limit)
42 @param properties: properties that will be applied to builds run on
44 @type properties: dictionary
46 service
.MultiService
.__init
__(self
)
48 self
.password
= password
49 self
.botmaster
= None # no buildmaster yet
50 self
.slave_status
= SlaveStatus(name
)
51 self
.slave
= None # a RemoteReference to the Bot, when connected
52 self
.slave_commands
= None
53 self
.slavebuilders
= {}
54 self
.max_builds
= max_builds
56 self
.properties
= Properties()
57 self
.properties
.update(properties
, "BuildSlave")
58 self
.properties
.setProperty("slavename", name
, "BuildSlave")
60 self
.lastMessageReceived
= 0
61 if isinstance(notify_on_missing
, str):
62 notify_on_missing
= [notify_on_missing
]
63 self
.notify_on_missing
= notify_on_missing
64 for i
in notify_on_missing
:
65 assert isinstance(i
, str)
66 self
.missing_timeout
= missing_timeout
67 self
.missing_timer
= None
69 def update(self
, new
):
71 Given a new BuildSlave, configure this one identically. Because
72 BuildSlave objects are remotely referenced, we can't replace them
73 without disconnecting the slave, yet there's no reason to do that.
75 # the reconfiguration logic should guarantee this:
76 assert self
.slavename
== new
.slavename
77 assert self
.password
== new
.password
78 assert self
.__class
__ == new
.__class
__
79 self
.max_builds
= new
.max_builds
83 builders
= self
.botmaster
.getBuildersForSlave(self
.slavename
)
84 return "<%s '%s', current builders: %s>" % \
85 (self
.__class
__.__name
__, self
.slavename
,
86 ','.join(map(lambda b
: b
.name
, builders
)))
88 return "<%s '%s', (no builders yet)>" % \
89 (self
.__class
__.__name
__, self
.slavename
)
def setBotmaster(self, botmaster):
    """Remember the BotMaster that owns this slave.

    May only be done once: if a (truthy) botmaster has already been
    stored, an AssertionError is raised and the existing value is kept.

    @param botmaster: the object to store in C{self.botmaster}
    """
    already_set = self.botmaster
    assert not already_set, "BuildSlave already has a botmaster"
    self.botmaster = botmaster
95 def updateSlave(self
):
96 """Called to add or remove builders after the slave has connected.
98 @return: a Deferred that indicates when an attached slave has
99 accepted the new builders and/or released the old ones."""
101 return self
.sendBuilderList()
103 return defer
.succeed(None)
105 def updateSlaveStatus(self
, buildStarted
=None, buildFinished
=None):
107 self
.slave_status
.buildStarted(buildStarted
)
109 self
.slave_status
.buildFinished(buildFinished
)
111 def attached(self
, bot
):
112 """This is called when the slave connects.
114 @return: a Deferred that fires with a suitable pb.IPerspective to
115 give to the slave (i.e. 'self')"""
118 # uh-oh, we've got a duplicate slave. The most likely
119 # explanation is that the slave is behind a slow link, thinks we
120 # went away, and has attempted to reconnect, so we've got two
121 # "connections" from the same slave, but the previous one is
122 # stale. Give the new one precedence.
123 log
.msg("duplicate slave %s replacing old one" % self
.slavename
)
125 # just in case we've got two identically-configured slaves,
126 # report the IP addresses of both so someone can resolve the
128 tport
= self
.slave
.broker
.transport
129 log
.msg("old slave was connected from", tport
.getPeer())
130 log
.msg("new slave is from", bot
.broker
.transport
.getPeer())
131 d
= self
.disconnect()
133 d
= defer
.succeed(None)
134 # now we go through a sequence of calls, gathering information, then
135 # tell the Botmaster that it can finally give this slave to all the
136 # Builders that care about it.
138 # we accumulate slave information in this 'state' dictionary, then
139 # set it atomically if we make it far enough through the process
142 # Reset graceful shutdown status
143 self
.slave_status
.setGraceful(False)
144 # We want to know when the graceful shutdown flag changes
145 self
.slave_status
.addGracefulWatcher(self
._gracefulChanged
)
147 def _log_attachment_on_slave(res
):
148 d1
= bot
.callRemote("print", "attached")
149 d1
.addErrback(lambda why
: None)
151 d
.addCallback(_log_attachment_on_slave
)
154 d1
= bot
.callRemote("getSlaveInfo")
156 log
.msg("Got slaveinfo from '%s'" % self
.slavename
)
157 # TODO: info{} might have other keys
158 state
["admin"] = info
.get("admin")
159 state
["host"] = info
.get("host")
160 def _info_unavailable(why
):
161 # maybe an old slave, doesn't implement remote_getSlaveInfo
162 log
.msg("BuildSlave.info_unavailable")
164 d1
.addCallbacks(_got_info
, _info_unavailable
)
166 d
.addCallback(_get_info
)
168 def _get_commands(res
):
169 d1
= bot
.callRemote("getCommands")
170 def _got_commands(commands
):
171 state
["slave_commands"] = commands
172 def _commands_unavailable(why
):
173 # probably an old slave
174 log
.msg("BuildSlave._commands_unavailable")
175 if why
.check(AttributeError):
178 d1
.addCallbacks(_got_commands
, _commands_unavailable
)
180 d
.addCallback(_get_commands
)
182 def _accept_slave(res
):
183 self
.slave_status
.setAdmin(state
.get("admin"))
184 self
.slave_status
.setHost(state
.get("host"))
185 self
.slave_status
.setConnected(True)
186 self
.slave_commands
= state
.get("slave_commands")
188 log
.msg("bot attached")
189 self
.messageReceivedFromSlave()
190 if self
.missing_timer
:
191 self
.missing_timer
.cancel()
192 self
.missing_timer
= None
194 return self
.updateSlave()
195 d
.addCallback(_accept_slave
)
197 # Finally, the slave gets a reference to this BuildSlave. They
198 # receive this later, after we've started using them.
199 d
.addCallback(lambda res
: self
)
202 def messageReceivedFromSlave(self
):
204 self
.lastMessageReceived
= now
205 self
.slave_status
.setLastMessageReceived(now
)
207 def detached(self
, mind
):
209 self
.slave_status
.removeGracefulWatcher(self
._gracefulChanged
)
210 self
.slave_status
.setConnected(False)
211 log
.msg("BuildSlave.detached(%s)" % self
.slavename
)
213 def disconnect(self
):
214 """Forcibly disconnect the slave.
216 This severs the TCP connection and returns a Deferred that will fire
217 (with None) when the connection is probably gone.
219 If the slave is still alive, they will probably try to reconnect
222 This is called in two circumstances. The first is when a slave is
223 removed from the config file. In this case, when they try to
224 reconnect, they will be rejected as an unknown slave. The second is
225 when we wind up with two connections for the same slave, in which
226 case we disconnect the older connection.
230 return defer
.succeed(None)
231 log
.msg("disconnecting old slave %s now" % self
.slavename
)
232 # When this Deferred fires, we'll be ready to accept the new slave
233 return self
._disconnect
(self
.slave
)
235 def _disconnect(self
, slave
):
236 # all kinds of teardown will happen as a result of
237 # loseConnection(), but it happens after a reactor iteration or
238 # two. Hook the actual disconnect so we can know when it is safe
239 # to connect the new slave. We have to wait one additional
240 # iteration (with callLater(0)) to make sure the *other*
241 # notifyOnDisconnect handlers have had a chance to run.
244 # notifyOnDisconnect runs the callback with one argument, the
245 # RemoteReference being disconnected.
246 def _disconnected(rref
):
247 reactor
.callLater(0, d
.callback
, None)
248 slave
.notifyOnDisconnect(_disconnected
)
249 tport
= slave
.broker
.transport
250 # this is the polite way to request that a socket be closed
251 tport
.loseConnection()
253 # but really we don't want to wait for the transmit queue to
254 # drain. The remote end is unlikely to ACK the data, so we'd
255 # probably have to wait for a (20-minute) TCP timeout.
256 #tport._closeSocket()
257 # however, doing _closeSocket (whether before or after
258 # loseConnection) somehow prevents the notifyOnDisconnect
259 # handlers from being run. Bummer.
261 tport
.dataBuffer
= ""
263 # however, these hacks are pretty internal, so don't blow up if
264 # they fail or are unavailable
265 log
.msg("failed to accelerate the shutdown process")
267 log
.msg("waiting for slave to finish disconnecting")
271 def sendBuilderList(self
):
272 our_builders
= self
.botmaster
.getBuildersForSlave(self
.slavename
)
273 blist
= [(b
.name
, b
.builddir
) for b
in our_builders
]
274 d
= self
.slave
.callRemote("setBuilderList", blist
)
277 def perspective_keepalive(self
):
280 def addSlaveBuilder(self
, sb
):
281 if sb
.builder_name
not in self
.slavebuilders
:
282 log
.msg("%s adding %s" % (self
, sb
))
283 elif sb
is not self
.slavebuilders
[sb
.builder_name
]:
284 log
.msg("%s replacing %s" % (self
, sb
))
287 self
.slavebuilders
[sb
.builder_name
] = sb
289 def removeSlaveBuilder(self
, sb
):
291 del self
.slavebuilders
[sb
.builder_name
]
295 log
.msg("%s removed %s" % (self
, sb
))
297 def canStartBuild(self
):
299 I am called when a build is requested to see if this buildslave
300 can start a build. This function can be used to limit overall
301 concurrency on the buildslave.
303 # If we're waiting to shutdown gracefully, then we shouldn't
304 # accept any new jobs.
305 if self
.slave_status
.getGraceful():
309 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
311 if len(active_builders
) >= self
.max_builds
:
315 def _mail_missing_message(self
, subject
, text
):
316 # first, see if we have a MailNotifier we can use. This gives us a
317 # fromaddr and a relayhost.
318 buildmaster
= self
.botmaster
.parent
319 for st
in buildmaster
.statusTargets
:
320 if isinstance(st
, MailNotifier
):
323 # if not, they get a default MailNotifier, which always uses SMTP
324 # to localhost and uses a dummy fromaddr of "buildbot".
325 log
.msg("buildslave-missing msg using default MailNotifier")
326 st
= MailNotifier("buildbot")
327 # now construct the mail
331 m
['Date'] = formatdate(localtime
=True)
332 m
['Subject'] = subject
333 m
['From'] = st
.fromaddr
334 recipients
= self
.notify_on_missing
335 m
['To'] = ", ".join(recipients
)
336 d
= st
.sendMessage(m
, recipients
)
337 # return the Deferred for testing purposes
340 def _gracefulChanged(self
, graceful
):
341 """This is called when our graceful shutdown setting changes"""
343 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
345 if len(active_builders
) == 0:
350 """Shutdown the slave"""
351 # Look for a builder with a remote reference to the client side
352 # slave. If we can find one, then call "shutdown" on the remote
353 # builder, which will cause the slave buildbot process to exit.
355 for b
in self
.slavebuilders
.values():
357 d
= b
.remote
.callRemote("shutdown")
361 log
.msg("Shutting down slave: %s" % self
.slavename
)
362 # The remote shutdown call will not complete successfully since the
363 # buildbot process exits almost immediately after getting the
365 # Here we look at the reason why the remote call failed, and if
366 # it's because the connection was lost, that means the slave
367 # shutdown as expected.
369 if why
.check(twisted
.spread
.pb
.PBConnectionLost
):
370 log
.msg("Lost connection to %s" % self
.slavename
)
372 log
.err("Unexpected error when trying to shutdown %s" % self
.slavename
)
373 d
.addErrback(_errback
)
375 log
.err("Couldn't find remote builder to shut down slave")
376 return defer
.succeed(None)
378 class BuildSlave(AbstractBuildSlave
):
380 def sendBuilderList(self
):
381 d
= AbstractBuildSlave
.sendBuilderList(self
)
384 for name
, remote
in slist
.items():
385 # use get() since we might have changed our mind since then
386 b
= self
.botmaster
.builders
.get(name
)
388 d1
= b
.attached(self
, remote
, self
.slave_commands
)
390 return defer
.DeferredList(dl
)
391 def _set_failed(why
):
392 log
.msg("BuildSlave.sendBuilderList (%s) failed" % self
)
394 # TODO: hang up on them?, without setBuilderList we can't use
396 d
.addCallbacks(_sent
, _set_failed
)
def detached(self, mind):
    """Handle the slave going away.

    Runs the AbstractBuildSlave detach handling, tells the botmaster the
    slave was lost, and then -- only when there is somebody to notify,
    this service still has a parent, and no timer is already pending --
    schedules C{_missing_timer_fired} to run after C{missing_timeout}.

    @param mind: passed through unchanged to AbstractBuildSlave.detached
    """
    AbstractBuildSlave.detached(self, mind)
    self.botmaster.slaveLost(self)

    # Nothing to schedule unless all three conditions hold.
    if (not self.notify_on_missing
            or not self.parent
            or self.missing_timer):
        return

    self.missing_timer = reactor.callLater(
        self.missing_timeout, self._missing_timer_fired)
406 def _missing_timer_fired(self
):
407 self
.missing_timer
= None
408 # notify people, but only if we're still in the config
412 buildmaster
= self
.botmaster
.parent
413 status
= buildmaster
.getStatus()
414 text
= "The Buildbot working for '%s'\n" % status
.getProjectName()
415 text
+= ("has noticed that the buildslave named %s went away\n" %
418 text
+= ("It last disconnected at %s (buildmaster-local time)\n" %
419 time
.ctime(time
.time() - self
.missing_timeout
)) # approx
421 text
+= "The admin on record (as reported by BUILDSLAVE:info/admin)\n"
422 text
+= "was '%s'.\n" % self
.slave_status
.getAdmin()
424 text
+= "Sincerely,\n"
425 text
+= " The Buildbot\n"
426 text
+= " %s\n" % status
.getProjectURL()
427 subject
= "Buildbot: buildslave %s was lost" % self
.slavename
428 return self
._mail
_missing
_message
(subject
, text
)
430 def buildFinished(self
, sb
):
431 """This is called when a build on this slave is finished."""
432 # If we're gracefully shutting down, and we have no more active
433 # builders, then it's safe to disconnect
434 if self
.slave_status
.getGraceful():
435 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
437 if len(active_builders
) == 0:
439 return self
.shutdown()
440 return defer
.succeed(None)
442 class AbstractLatentBuildSlave(AbstractBuildSlave
):
443 """A build slave that will start up a slave instance when needed.
445 To use, subclass and implement start_instance and stop_instance.
447 See ec2buildslave.py for a concrete example. Also see the stub example in
451 implements(ILatentBuildSlave
)
453 substantiated
= False
454 substantiation_deferred
= None
455 build_wait_timer
= None
456 _start_result
= _shutdown_callback_handle
= None
458 def __init__(self
, name
, password
, max_builds
=None,
459 notify_on_missing
=[], missing_timeout
=60*20,
460 build_wait_timeout
=60*10,
462 AbstractBuildSlave
.__init
__(
463 self
, name
, password
, max_builds
, notify_on_missing
,
464 missing_timeout
, properties
)
465 self
.building
= set()
466 self
.build_wait_timeout
= build_wait_timeout
def start_instance(self):
    """Start the instance that will try to connect with this master.

    Subclasses must override this; the base version always raises
    NotImplementedError. An implementation should return a Deferred.
    NOTE(review): the original comment about how problems are reported is
    cut off in this copy -- presumably via an errback; confirm.
    """
    raise NotImplementedError
def stop_instance(self, fast=False):
    """Shut the instance down.

    Subclasses must override this; the base version always raises
    NotImplementedError.

    @param fast: flag handed down from C{insubstantiate}; unused here
    """
    raise NotImplementedError
def substantiate(self, sb):
    """Ask for this latent slave to be brought up.

    @param sb: not used by this method
    @return: when already substantiated, a Deferred that has already
             fired with this slave; otherwise C{substantiation_deferred}
             (created here if no substantiation was in progress)
    """
    if self.substantiated:
        # Already up: restart the build-wait timer and succeed right away.
        self._clearBuildWaitTimer()
        self._setBuildWaitTimer()
        return defer.succeed(self)

    if self.substantiation_deferred is not None:
        # A substantiation is already under way; hand back its Deferred.
        return self.substantiation_deferred

    if self.parent and not self.missing_timer:
        # start timer. if timer times out, fail deferred
        self.missing_timer = reactor.callLater(
            self.missing_timeout,
            self._substantiation_failed, defer.TimeoutError())

    self.substantiation_deferred = defer.Deferred()

    if self.slave is None:
        # start up instance
        self._substantiate()
    # else: we're waiting for an old one to detach. the _substantiate
    # will be done in ``detached`` below.

    # Re-read the attribute on purpose: it is returned as it stands after
    # the calls above, exactly as the original did.
    return self.substantiation_deferred
496 def _substantiate(self
):
497 # register event trigger
498 d
= self
.start_instance()
499 self
._shutdown
_callback
_handle
= reactor
.addSystemEventTrigger(
500 'before', 'shutdown', self
._soft
_disconnect
, fast
=True)
501 def stash_reply(result
):
502 self
._start
_result
= result
503 def clean_up(failure
):
504 if self
.missing_timer
is not None:
505 self
.missing_timer
.cancel()
506 self
._substantiation
_failed
(failure
)
507 if self
._shutdown
_callback
_handle
is not None:
508 handle
= self
._shutdown
_callback
_handle
509 del self
._shutdown
_callback
_handle
510 reactor
.removeSystemEventTrigger(handle
)
512 d
.addCallbacks(stash_reply
, clean_up
)
515 def attached(self
, bot
):
516 if self
.substantiation_deferred
is None:
517 log
.msg('Slave %s received connection while not trying to '
518 'substantiate. Disconnecting.' % (self
.slavename
,))
519 self
._disconnect
(bot
)
521 return AbstractBuildSlave
.attached(self
, bot
)
523 def detached(self
, mind
):
524 AbstractBuildSlave
.detached(self
, mind
)
525 if self
.substantiation_deferred
is not None:
528 def _substantiation_failed(self
, failure
):
529 d
= self
.substantiation_deferred
530 self
.substantiation_deferred
= None
531 self
.missing_timer
= None
533 self
.insubstantiate()
534 # notify people, but only if we're still in the config
535 if not self
.parent
or not self
.notify_on_missing
:
538 status
= buildmaster
.getStatus()
539 text
= "The Buildbot working for '%s'\n" % status
.getProjectName()
540 text
+= ("has noticed that the latent buildslave named %s \n" %
542 text
+= "never substantiated after a request\n"
544 text
+= ("The request was made at %s (buildmaster-local time)\n" %
545 time
.ctime(time
.time() - self
.missing_timeout
)) # approx
547 text
+= "Sincerely,\n"
548 text
+= " The Buildbot\n"
549 text
+= " %s\n" % status
.getProjectURL()
550 subject
= "Buildbot: buildslave %s never substantiated" % self
.slavename
551 return self
._mail
_missing
_message
(subject
, text
)
def buildStarted(self, sb):
    """Note that a build has begun on this (substantiated) slave.

    Cancels any pending build-wait timer and records the builder's name
    in C{self.building}.

    @param sb: object whose C{builder_name} identifies the builder
    """
    assert self.substantiated
    self._clearBuildWaitTimer()
    builder_name = sb.builder_name
    self.building.add(builder_name)
def buildFinished(self, sb):
    """Note that a build on this slave has ended.

    Removes the builder's name from C{self.building} (KeyError if it is
    not there) and, once nothing is left building, starts the build-wait
    timer.

    @param sb: object whose C{builder_name} identifies the builder
    """
    in_progress = self.building
    in_progress.remove(sb.builder_name)
    if in_progress:
        return
    self._setBuildWaitTimer()
563 def _clearBuildWaitTimer(self
):
564 if self
.build_wait_timer
is not None:
565 if self
.build_wait_timer
.active():
566 self
.build_wait_timer
.cancel()
567 self
.build_wait_timer
= None
def _setBuildWaitTimer(self):
    """(Re)start the build-wait timer.

    Any existing timer is cleared first; then C{_soft_disconnect} is
    scheduled via the reactor to run after C{build_wait_timeout}.
    """
    self._clearBuildWaitTimer()
    delay = self.build_wait_timeout
    self.build_wait_timer = reactor.callLater(delay, self._soft_disconnect)
574 def insubstantiate(self
, fast
=False):
575 self
._clearBuildWaitTimer
()
576 d
= self
.stop_instance(fast
)
577 if self
._shutdown
_callback
_handle
is not None:
578 handle
= self
._shutdown
_callback
_handle
579 del self
._shutdown
_callback
_handle
580 reactor
.removeSystemEventTrigger(handle
)
581 self
.substantiated
= False
582 self
.building
.clear() # just to be sure
585 def _soft_disconnect(self
, fast
=False):
586 d
= AbstractBuildSlave
.disconnect(self
)
587 if self
.slave
is not None:
588 # this could be called when the slave needs to shut down, such as
589 # in BotMaster.removeSlave, *or* when a new slave requests a
590 # connection when we already have a slave. It's not clear what to
591 # do in the second case: this shouldn't happen, and if it
592 # does...if it's a latent slave, shutting down will probably kill
593 # something we want...but we can't know what the status is. So,
594 # here, we just do what should be appropriate for the first case,
595 # and put our heads in the sand for the second, at least for now.
596 # The best solution to the odd situation is removing it as a
597 # possibility: make the master in charge of connecting to the
598 # slave, rather than vice versa. TODO.
599 d
= defer
.DeferredList([d
, self
.insubstantiate(fast
)])
601 if self
.substantiation_deferred
is not None:
602 # unlike the previous block, we don't expect this situation when
603 # ``attached`` calls ``disconnect``, only when we get a simple
604 # request to "go away".
605 self
.substantiation_deferred
.errback()
606 self
.substantiation_deferred
= None
607 if self
.missing_timer
:
608 self
.missing_timer
.cancel()
609 self
.missing_timer
= None
613 def disconnect(self
):
614 d
= self
._soft
_disconnect
()
615 # this removes the slave from all builders. It won't come back
616 # without a restart (or maybe a sighup)
617 self
.botmaster
.slaveLost(self
)
619 def stopService(self
):
620 res
= defer
.maybeDeferred(AbstractBuildSlave
.stopService
, self
)
621 if self
.slave
is not None:
622 d
= self
._soft
_disconnect
()
623 res
= defer
.DeferredList([res
, d
])
def updateSlave(self):
    """Called to add or remove builders after the slave has connected.

    Also called after botmaster's builders are initially set.

    Every builder the botmaster lists for this slave that has no entry
    in C{self.slavebuilders} yet is given this slave through
    C{addLatentSlave}; the rest is delegated to
    AbstractBuildSlave.updateSlave.

    @return: a Deferred that indicates when an attached slave has
    accepted the new builders and/or released the old ones."""
    assigned = self.botmaster.getBuildersForSlave(self.slavename)
    for builder in assigned:
        # Check membership per iteration, just before each call.
        if builder.name in self.slavebuilders:
            continue
        builder.addLatentSlave(self)
    return AbstractBuildSlave.updateSlave(self)
638 def sendBuilderList(self
):
639 d
= AbstractBuildSlave
.sendBuilderList(self
)
642 for name
, remote
in slist
.items():
643 # use get() since we might have changed our mind since then.
644 # we're checking on the builder in addition to the
645 # slavebuilders out of a bit of paranoia.
646 b
= self
.botmaster
.builders
.get(name
)
647 sb
= self
.slavebuilders
.get(name
)
649 d1
= sb
.attached(self
, remote
, self
.slave_commands
)
651 return defer
.DeferredList(dl
)
652 def _set_failed(why
):
653 log
.msg("BuildSlave.sendBuilderList (%s) failed" % self
)
655 # TODO: hang up on them?, without setBuilderList we can't use
657 if self
.substantiation_deferred
:
658 self
.substantiation_deferred
.errback()
659 self
.substantiation_deferred
= None
660 if self
.missing_timer
:
661 self
.missing_timer
.cancel()
662 self
.missing_timer
= None
663 # TODO: maybe log? send an email?
665 d
.addCallbacks(_sent
, _set_failed
)
666 def _substantiated(res
):
667 self
.substantiated
= True
668 if self
.substantiation_deferred
:
669 d
= self
.substantiation_deferred
670 del self
.substantiation_deferred
671 res
= self
._start
_result
672 del self
._start
_result
674 # note that the missing_timer is already handled within
676 if not self
.building
:
677 self
._setBuildWaitTimer
()
678 d
.addCallback(_substantiated
)