1 # Portions copyright Canonical Ltd. 2009
4 from email
.Message
import Message
5 from email
.Utils
import formatdate
6 from zope
.interface
import implements
7 from twisted
.python
import log
8 from twisted
.internet
import defer
, reactor
9 from twisted
.application
import service
10 import twisted
.spread
.pb
12 from buildbot
.pbutil
import NewCredPerspective
13 from buildbot
.status
.builder
import SlaveStatus
14 from buildbot
.status
.mail
import MailNotifier
15 from buildbot
.interfaces
import IBuildSlave
, ILatentBuildSlave
16 from buildbot
.process
.properties
import Properties
19 if sys
.version_info
[:3] < (2,4,0):
20 from sets
import Set
as set
22 class AbstractBuildSlave(NewCredPerspective
, service
.MultiService
):
23 """This is the master-side representative for a remote buildbot slave.
24 There is exactly one for each slave described in the config file (the
25 c['slaves'] list). When buildbots connect in (.attach), they get a
26 reference to this instance. The BotMaster object is stashed as the
27 .botmaster attribute. The BotMaster is also our '.parent' Service.
29 I represent a build slave -- a remote machine capable of
30 running builds. I am instantiated by the configuration file, and can be
31 subclassed to add extra functionality."""
33 implements(IBuildSlave
)
35 def __init__(self
, name
, password
, max_builds
=None,
36 notify_on_missing
=[], missing_timeout
=3600,
39 @param name: botname this machine will supply when it connects
40 @param password: password this machine will supply when
42 @param max_builds: maximum number of simultaneous builds that will
43 be run concurrently on this buildslave (the
44 default is None for no limit)
45 @param properties: properties that will be applied to builds run on
47 @type properties: dictionary
49 service
.MultiService
.__init
__(self
)
51 self
.password
= password
52 self
.botmaster
= None # no buildmaster yet
53 self
.slave_status
= SlaveStatus(name
)
54 self
.slave
= None # a RemoteReference to the Bot, when connected
55 self
.slave_commands
= None
56 self
.slavebuilders
= {}
57 self
.max_builds
= max_builds
59 self
.properties
= Properties()
60 self
.properties
.update(properties
, "BuildSlave")
61 self
.properties
.setProperty("slavename", name
, "BuildSlave")
63 self
.lastMessageReceived
= 0
64 if isinstance(notify_on_missing
, str):
65 notify_on_missing
= [notify_on_missing
]
66 self
.notify_on_missing
= notify_on_missing
67 for i
in notify_on_missing
:
68 assert isinstance(i
, str)
69 self
.missing_timeout
= missing_timeout
70 self
.missing_timer
= None
72 def update(self
, new
):
74 Given a new BuildSlave, configure this one identically. Because
75 BuildSlave objects are remotely referenced, we can't replace them
76 without disconnecting the slave, yet there's no reason to do that.
78 # the reconfiguration logic should guarantee this:
79 assert self
.slavename
== new
.slavename
80 assert self
.password
== new
.password
81 assert self
.__class
__ == new
.__class
__
82 self
.max_builds
= new
.max_builds
86 builders
= self
.botmaster
.getBuildersForSlave(self
.slavename
)
87 return "<%s '%s', current builders: %s>" % \
88 (self
.__class
__.__name
__, self
.slavename
,
89 ','.join(map(lambda b
: b
.name
, builders
)))
91 return "<%s '%s', (no builders yet)>" % \
92 (self
.__class
__.__name
__, self
.slavename
)
def setBotmaster(self, botmaster):
    """Attach this slave to its BotMaster and arm the missing-slave timer.

    A slave may be given a botmaster only once: calling this while
    ``self.botmaster`` is already set trips the assertion below.

    @param botmaster: the BotMaster object to store as ``self.botmaster``
    """
    current_owner = self.botmaster
    assert not current_owner, "BuildSlave already has a botmaster"
    self.botmaster = botmaster
    # Start the countdown after which a missing-slave notice is sent.
    self.startMissingTimer()
def stopMissingTimer(self):
    """Cancel the pending missing-slave timer, if there is one.

    Does nothing when ``self.missing_timer`` is unset; otherwise cancels
    the delayed call and clears the attribute.
    """
    pending = self.missing_timer
    if not pending:
        return
    pending.cancel()
    self.missing_timer = None
def startMissingTimer(self):
    """(Re)arm the timer that fires ``_missing_timer_fired``.

    The timer is only armed when there is somebody to notify
    (``notify_on_missing``), a non-zero ``missing_timeout``, and this
    service has a parent. Otherwise the call is a no-op.
    """
    armable = (self.notify_on_missing
               and self.missing_timeout
               and self.parent)
    if not armable:
        return
    self.stopMissingTimer()  # in case it's already running
    delay = self.missing_timeout
    self.missing_timer = reactor.callLater(delay, self._missing_timer_fired)
110 def _missing_timer_fired(self
):
111 self
.missing_timer
= None
112 # notify people, but only if we're still in the config
116 buildmaster
= self
.botmaster
.parent
117 status
= buildmaster
.getStatus()
118 text
= "The Buildbot working for '%s'\n" % status
.getProjectName()
119 text
+= ("has noticed that the buildslave named %s went away\n" %
122 text
+= ("It last disconnected at %s (buildmaster-local time)\n" %
123 time
.ctime(time
.time() - self
.missing_timeout
)) # approx
125 text
+= "The admin on record (as reported by BUILDSLAVE:info/admin)\n"
126 text
+= "was '%s'.\n" % self
.slave_status
.getAdmin()
128 text
+= "Sincerely,\n"
129 text
+= " The Buildbot\n"
130 text
+= " %s\n" % status
.getProjectURL()
131 subject
= "Buildbot: buildslave %s was lost" % self
.slavename
132 return self
._mail
_missing
_message
(subject
, text
)
135 def updateSlave(self
):
136 """Called to add or remove builders after the slave has connected.
138 @return: a Deferred that indicates when an attached slave has
139 accepted the new builders and/or released the old ones."""
141 return self
.sendBuilderList()
143 return defer
.succeed(None)
145 def updateSlaveStatus(self
, buildStarted
=None, buildFinished
=None):
147 self
.slave_status
.buildStarted(buildStarted
)
149 self
.slave_status
.buildFinished(buildFinished
)
151 def attached(self
, bot
):
152 """This is called when the slave connects.
154 @return: a Deferred that fires with a suitable pb.IPerspective to
155 give to the slave (i.e. 'self')"""
158 # uh-oh, we've got a duplicate slave. The most likely
159 # explanation is that the slave is behind a slow link, thinks we
160 # went away, and has attempted to reconnect, so we've got two
161 # "connections" from the same slave, but the previous one is
162 # stale. Give the new one precedence.
163 log
.msg("duplicate slave %s replacing old one" % self
.slavename
)
165 # just in case we've got two identically-configured slaves,
166 # report the IP addresses of both so someone can resolve the
168 tport
= self
.slave
.broker
.transport
169 log
.msg("old slave was connected from", tport
.getPeer())
170 log
.msg("new slave is from", bot
.broker
.transport
.getPeer())
171 d
= self
.disconnect()
173 d
= defer
.succeed(None)
174 # now we go through a sequence of calls, gathering information, then
175 # tell the Botmaster that it can finally give this slave to all the
176 # Builders that care about it.
178 # we accumulate slave information in this 'state' dictionary, then
179 # set it atomically if we make it far enough through the process
182 # Reset graceful shutdown status
183 self
.slave_status
.setGraceful(False)
184 # We want to know when the graceful shutdown flag changes
185 self
.slave_status
.addGracefulWatcher(self
._gracefulChanged
)
187 def _log_attachment_on_slave(res
):
188 d1
= bot
.callRemote("print", "attached")
189 d1
.addErrback(lambda why
: None)
191 d
.addCallback(_log_attachment_on_slave
)
194 d1
= bot
.callRemote("getSlaveInfo")
196 log
.msg("Got slaveinfo from '%s'" % self
.slavename
)
197 # TODO: info{} might have other keys
198 state
["admin"] = info
.get("admin")
199 state
["host"] = info
.get("host")
200 def _info_unavailable(why
):
201 # maybe an old slave, doesn't implement remote_getSlaveInfo
202 log
.msg("BuildSlave.info_unavailable")
204 d1
.addCallbacks(_got_info
, _info_unavailable
)
206 d
.addCallback(_get_info
)
208 def _get_commands(res
):
209 d1
= bot
.callRemote("getCommands")
210 def _got_commands(commands
):
211 state
["slave_commands"] = commands
212 def _commands_unavailable(why
):
213 # probably an old slave
214 log
.msg("BuildSlave._commands_unavailable")
215 if why
.check(AttributeError):
218 d1
.addCallbacks(_got_commands
, _commands_unavailable
)
220 d
.addCallback(_get_commands
)
222 def _accept_slave(res
):
223 self
.slave_status
.setAdmin(state
.get("admin"))
224 self
.slave_status
.setHost(state
.get("host"))
225 self
.slave_status
.setConnected(True)
226 self
.slave_commands
= state
.get("slave_commands")
228 log
.msg("bot attached")
229 self
.messageReceivedFromSlave()
230 self
.stopMissingTimer()
232 return self
.updateSlave()
233 d
.addCallback(_accept_slave
)
234 d
.addCallback(lambda res
: self
.botmaster
.maybeStartAllBuilds())
236 # Finally, the slave gets a reference to this BuildSlave. They
237 # receive this later, after we've started using them.
238 d
.addCallback(lambda res
: self
)
241 def messageReceivedFromSlave(self
):
243 self
.lastMessageReceived
= now
244 self
.slave_status
.setLastMessageReceived(now
)
246 def detached(self
, mind
):
248 self
.slave_status
.removeGracefulWatcher(self
._gracefulChanged
)
249 self
.slave_status
.setConnected(False)
250 log
.msg("BuildSlave.detached(%s)" % self
.slavename
)
252 def disconnect(self
):
253 """Forcibly disconnect the slave.
255 This severs the TCP connection and returns a Deferred that will fire
256 (with None) when the connection is probably gone.
258 If the slave is still alive, they will probably try to reconnect
261 This is called in two circumstances. The first is when a slave is
262 removed from the config file. In this case, when they try to
263 reconnect, they will be rejected as an unknown slave. The second is
264 when we wind up with two connections for the same slave, in which
265 case we disconnect the older connection.
269 return defer
.succeed(None)
270 log
.msg("disconnecting old slave %s now" % self
.slavename
)
271 # When this Deferred fires, we'll be ready to accept the new slave
272 return self
._disconnect
(self
.slave
)
274 def _disconnect(self
, slave
):
275 # all kinds of teardown will happen as a result of
276 # loseConnection(), but it happens after a reactor iteration or
277 # two. Hook the actual disconnect so we can know when it is safe
278 # to connect the new slave. We have to wait one additional
279 # iteration (with callLater(0)) to make sure the *other*
280 # notifyOnDisconnect handlers have had a chance to run.
283 # notifyOnDisconnect runs the callback with one argument, the
284 # RemoteReference being disconnected.
285 def _disconnected(rref
):
286 reactor
.callLater(0, d
.callback
, None)
287 slave
.notifyOnDisconnect(_disconnected
)
288 tport
= slave
.broker
.transport
289 # this is the polite way to request that a socket be closed
290 tport
.loseConnection()
292 # but really we don't want to wait for the transmit queue to
293 # drain. The remote end is unlikely to ACK the data, so we'd
294 # probably have to wait for a (20-minute) TCP timeout.
295 #tport._closeSocket()
296 # however, doing _closeSocket (whether before or after
297 # loseConnection) somehow prevents the notifyOnDisconnect
298 # handlers from being run. Bummer.
300 tport
.dataBuffer
= ""
302 # however, these hacks are pretty internal, so don't blow up if
303 # they fail or are unavailable
304 log
.msg("failed to accelerate the shutdown process")
306 log
.msg("waiting for slave to finish disconnecting")
310 def sendBuilderList(self
):
311 our_builders
= self
.botmaster
.getBuildersForSlave(self
.slavename
)
312 blist
= [(b
.name
, b
.builddir
) for b
in our_builders
]
313 d
= self
.slave
.callRemote("setBuilderList", blist
)
316 def perspective_keepalive(self
):
319 def addSlaveBuilder(self
, sb
):
320 if sb
.builder_name
not in self
.slavebuilders
:
321 log
.msg("%s adding %s" % (self
, sb
))
322 elif sb
is not self
.slavebuilders
[sb
.builder_name
]:
323 log
.msg("%s replacing %s" % (self
, sb
))
326 self
.slavebuilders
[sb
.builder_name
] = sb
328 def removeSlaveBuilder(self
, sb
):
330 del self
.slavebuilders
[sb
.builder_name
]
334 log
.msg("%s removed %s" % (self
, sb
))
336 def canStartBuild(self
):
338 I am called when a build is requested to see if this buildslave
339 can start a build. This function can be used to limit overall
340 concurrency on the buildslave.
342 # If we're waiting to shutdown gracefully, then we shouldn't
343 # accept any new jobs.
344 if self
.slave_status
.getGraceful():
348 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
350 if len(active_builders
) >= self
.max_builds
:
354 def _mail_missing_message(self
, subject
, text
):
355 # first, see if we have a MailNotifier we can use. This gives us a
356 # fromaddr and a relayhost.
357 buildmaster
= self
.botmaster
.parent
358 for st
in buildmaster
.statusTargets
:
359 if isinstance(st
, MailNotifier
):
362 # if not, they get a default MailNotifier, which always uses SMTP
363 # to localhost and uses a dummy fromaddr of "buildbot".
364 log
.msg("buildslave-missing msg using default MailNotifier")
365 st
= MailNotifier("buildbot")
366 # now construct the mail
370 m
['Date'] = formatdate(localtime
=True)
371 m
['Subject'] = subject
372 m
['From'] = st
.fromaddr
373 recipients
= self
.notify_on_missing
374 m
['To'] = ", ".join(recipients
)
375 d
= st
.sendMessage(m
, recipients
)
376 # return the Deferred for testing purposes
379 def _gracefulChanged(self
, graceful
):
380 """This is called when our graceful shutdown setting changes"""
382 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
384 if len(active_builders
) == 0:
389 """Shutdown the slave"""
390 # Look for a builder with a remote reference to the client side
391 # slave. If we can find one, then call "shutdown" on the remote
392 # builder, which will cause the slave buildbot process to exit.
394 for b
in self
.slavebuilders
.values():
396 d
= b
.remote
.callRemote("shutdown")
400 log
.msg("Shutting down slave: %s" % self
.slavename
)
401 # The remote shutdown call will not complete successfully since the
402 # buildbot process exits almost immediately after getting the
404 # Here we look at the reason why the remote call failed, and if
405 # it's because the connection was lost, that means the slave
406 # shutdown as expected.
408 if why
.check(twisted
.spread
.pb
.PBConnectionLost
):
409 log
.msg("Lost connection to %s" % self
.slavename
)
411 log
.err("Unexpected error when trying to shutdown %s" % self
.slavename
)
412 d
.addErrback(_errback
)
414 log
.err("Couldn't find remote builder to shut down slave")
415 return defer
.succeed(None)
417 class BuildSlave(AbstractBuildSlave
):
419 def sendBuilderList(self
):
420 d
= AbstractBuildSlave
.sendBuilderList(self
)
423 for name
, remote
in slist
.items():
424 # use get() since we might have changed our mind since then
425 b
= self
.botmaster
.builders
.get(name
)
427 d1
= b
.attached(self
, remote
, self
.slave_commands
)
429 return defer
.DeferredList(dl
)
430 def _set_failed(why
):
431 log
.msg("BuildSlave.sendBuilderList (%s) failed" % self
)
433 # TODO: hang up on them?, without setBuilderList we can't use
435 d
.addCallbacks(_sent
, _set_failed
)
def detached(self, mind):
    """Handle the slave's connection going away.

    Runs the AbstractBuildSlave teardown first, then tells the botmaster
    that this slave is lost, and finally restarts the missing-slave timer.

    @param mind: passed through unchanged to AbstractBuildSlave.detached
    """
    AbstractBuildSlave.detached(self, mind)
    master = self.botmaster
    master.slaveLost(self)
    # From now on the slave counts as missing until it reconnects.
    self.startMissingTimer()
443 def buildFinished(self
, sb
):
444 """This is called when a build on this slave is finished."""
445 # If we're gracefully shutting down, and we have no more active
446 # builders, then it's safe to disconnect
447 if self
.slave_status
.getGraceful():
448 active_builders
= [sb
for sb
in self
.slavebuilders
.values()
450 if len(active_builders
) == 0:
452 return self
.shutdown()
453 return defer
.succeed(None)
455 class AbstractLatentBuildSlave(AbstractBuildSlave
):
456 """A build slave that will start up a slave instance when needed.
458 To use, subclass and implement start_instance and stop_instance.
460 See ec2buildslave.py for a concrete example. Also see the stub example in
464 implements(ILatentBuildSlave
)
466 substantiated
= False
467 substantiation_deferred
= None
468 build_wait_timer
= None
469 _start_result
= _shutdown_callback_handle
= None
471 def __init__(self
, name
, password
, max_builds
=None,
472 notify_on_missing
=[], missing_timeout
=60*20,
473 build_wait_timeout
=60*10,
475 AbstractBuildSlave
.__init
__(
476 self
, name
, password
, max_builds
, notify_on_missing
,
477 missing_timeout
, properties
)
478 self
.building
= set()
479 self
.build_wait_timeout
= build_wait_timeout
def start_instance(self):
    """Start the instance that will try to connect with this master.

    Subclasses must override this hook. It should return a Deferred.
    NOTE(review): the original comment is cut off after "Problems should
    use an" -- presumably problems are reported through the Deferred's
    errback; confirm against a concrete subclass.
    """
    raise NotImplementedError()
def stop_instance(self, fast=False):
    """Shut the instance down.

    Subclasses must override this hook; the base implementation always
    raises NotImplementedError.

    @param fast: flag forwarded by callers such as ``insubstantiate``;
                 its exact meaning is left to the subclass
    """
    raise NotImplementedError()
def substantiate(self, sb):
    """Make sure a slave instance is running, or on its way up.

    @param sb: the requesting slave builder (not used by this method)
    @return: an already-fired Deferred carrying ``self`` when the slave
             is substantiated; otherwise the shared
             ``substantiation_deferred`` for the start-up in progress
    """
    if self.substantiated:
        # Already up: restart the idle countdown.
        self._clearBuildWaitTimer()
        self._setBuildWaitTimer()
        return defer.succeed(self)

    if self.substantiation_deferred is not None:
        # A start-up is already in flight; hand back its Deferred.
        return self.substantiation_deferred

    if self.parent and not self.missing_timer:
        # start timer. if timer times out, fail deferred
        timeout_failure = defer.TimeoutError()
        self.missing_timer = reactor.callLater(
            self.missing_timeout,
            self._substantiation_failed, timeout_failure)
    self.substantiation_deferred = defer.Deferred()
    if self.slave is None:
        self._substantiate()  # start up instance
    # else: we're waiting for an old one to detach. the _substantiate
    # will be done in ``detached`` below.

    # Re-read the attribute on purpose: _substantiate() may have changed it.
    return self.substantiation_deferred
509 def _substantiate(self
):
510 # register event trigger
511 d
= self
.start_instance()
512 self
._shutdown
_callback
_handle
= reactor
.addSystemEventTrigger(
513 'before', 'shutdown', self
._soft
_disconnect
, fast
=True)
514 def stash_reply(result
):
515 self
._start
_result
= result
516 def clean_up(failure
):
517 if self
.missing_timer
is not None:
518 self
.missing_timer
.cancel()
519 self
._substantiation
_failed
(failure
)
520 if self
._shutdown
_callback
_handle
is not None:
521 handle
= self
._shutdown
_callback
_handle
522 del self
._shutdown
_callback
_handle
523 reactor
.removeSystemEventTrigger(handle
)
525 d
.addCallbacks(stash_reply
, clean_up
)
528 def attached(self
, bot
):
529 if self
.substantiation_deferred
is None:
530 msg
= 'Slave %s received connection while not trying to ' \
531 'substantiate. Disconnecting.' % (self
.slavename
,)
533 self
._disconnect
(bot
)
534 return defer
.fail(RuntimeError(msg
))
535 return AbstractBuildSlave
.attached(self
, bot
)
537 def detached(self
, mind
):
538 AbstractBuildSlave
.detached(self
, mind
)
539 if self
.substantiation_deferred
is not None:
542 def _substantiation_failed(self
, failure
):
543 d
= self
.substantiation_deferred
544 self
.substantiation_deferred
= None
545 self
.missing_timer
= None
547 self
.insubstantiate()
548 # notify people, but only if we're still in the config
549 if not self
.parent
or not self
.notify_on_missing
:
552 buildmaster
= self
.botmaster
.parent
553 status
= buildmaster
.getStatus()
554 text
= "The Buildbot working for '%s'\n" % status
.getProjectName()
555 text
+= ("has noticed that the latent buildslave named %s \n" %
557 text
+= "never substantiated after a request\n"
559 text
+= ("The request was made at %s (buildmaster-local time)\n" %
560 time
.ctime(time
.time() - self
.missing_timeout
)) # approx
562 text
+= "Sincerely,\n"
563 text
+= " The Buildbot\n"
564 text
+= " %s\n" % status
.getProjectURL()
565 subject
= "Buildbot: buildslave %s never substantiated" % self
.slavename
566 return self
._mail
_missing
_message
(subject
, text
)
def buildStarted(self, sb):
    """Record that a build has started on this (substantiated) slave.

    Stops the idle build-wait timer and adds the builder's name to
    ``self.building``.

    @param sb: slave builder; only its ``builder_name`` is read
    """
    assert self.substantiated
    # A build is running, so the idle countdown must not fire.
    self._clearBuildWaitTimer()
    active_name = sb.builder_name
    self.building.add(active_name)
def buildFinished(self, sb):
    """Record that a build on this slave has finished.

    Removes the builder's name from ``self.building`` (KeyError if it was
    not recorded) and, once nothing is building any more, starts the
    build-wait timer.

    @param sb: slave builder; only its ``builder_name`` is read
    """
    finished_name = sb.builder_name
    self.building.remove(finished_name)
    if self.building:
        # Other builds are still running; keep the instance alive.
        return
    self._setBuildWaitTimer()
578 def _clearBuildWaitTimer(self
):
579 if self
.build_wait_timer
is not None:
580 if self
.build_wait_timer
.active():
581 self
.build_wait_timer
.cancel()
582 self
.build_wait_timer
= None
def _setBuildWaitTimer(self):
    """(Re)start the build-wait countdown.

    Any existing timer is cleared first; after ``build_wait_timeout``
    seconds the reactor calls ``_soft_disconnect``.
    """
    self._clearBuildWaitTimer()
    idle_delay = self.build_wait_timeout
    on_idle = self._soft_disconnect
    self.build_wait_timer = reactor.callLater(idle_delay, on_idle)
589 def insubstantiate(self
, fast
=False):
590 self
._clearBuildWaitTimer
()
591 d
= self
.stop_instance(fast
)
592 if self
._shutdown
_callback
_handle
is not None:
593 handle
= self
._shutdown
_callback
_handle
594 del self
._shutdown
_callback
_handle
595 reactor
.removeSystemEventTrigger(handle
)
596 self
.substantiated
= False
597 self
.building
.clear() # just to be sure
600 def _soft_disconnect(self
, fast
=False):
601 d
= AbstractBuildSlave
.disconnect(self
)
602 if self
.slave
is not None:
603 # this could be called when the slave needs to shut down, such as
604 # in BotMaster.removeSlave, *or* when a new slave requests a
605 # connection when we already have a slave. It's not clear what to
606 # do in the second case: this shouldn't happen, and if it
607 # does...if it's a latent slave, shutting down will probably kill
608 # something we want...but we can't know what the status is. So,
609 # here, we just do what should be appropriate for the first case,
610 # and put our heads in the sand for the second, at least for now.
611 # The best solution to the odd situation is removing it as a
612 # possibility: make the master in charge of connecting to the
613 # slave, rather than vice versa. TODO.
614 d
= defer
.DeferredList([d
, self
.insubstantiate(fast
)])
616 if self
.substantiation_deferred
is not None:
617 # unlike the previous block, we don't expect this situation when
618 # ``attached`` calls ``disconnect``, only when we get a simple
619 # request to "go away".
620 self
.substantiation_deferred
.errback()
621 self
.substantiation_deferred
= None
622 if self
.missing_timer
:
623 self
.missing_timer
.cancel()
624 self
.missing_timer
= None
def disconnect(self):
    """Disconnect the slave and remove it from all builders.

    Performs a soft disconnect and then reports the slave as lost to the
    botmaster.

    @return: whatever ``_soft_disconnect`` returns (presumably a Deferred
             that fires once the disconnect has completed). The result
             used to be computed and then dropped, so callers that chain
             callbacks on ``disconnect()`` received None.
    """
    d = self._soft_disconnect()
    # this removes the slave from all builders. It won't come back
    # without a restart (or maybe a sighup)
    self.botmaster.slaveLost(self)
    return d
634 def stopService(self
):
635 res
= defer
.maybeDeferred(AbstractBuildSlave
.stopService
, self
)
636 if self
.slave
is not None:
637 d
= self
._soft
_disconnect
()
638 res
= defer
.DeferredList([res
, d
])
641 def updateSlave(self
):
642 """Called to add or remove builders after the slave has connected.
644 Also called after botmaster's builders are initially set.
646 @return: a Deferred that indicates when an attached slave has
647 accepted the new builders and/or released the old ones."""
648 for b
in self
.botmaster
.getBuildersForSlave(self
.slavename
):
649 if b
.name
not in self
.slavebuilders
:
650 b
.addLatentSlave(self
)
651 return AbstractBuildSlave
.updateSlave(self
)
653 def sendBuilderList(self
):
654 d
= AbstractBuildSlave
.sendBuilderList(self
)
657 for name
, remote
in slist
.items():
658 # use get() since we might have changed our mind since then.
659 # we're checking on the builder in addition to the
660 # slavebuilders out of a bit of paranoia.
661 b
= self
.botmaster
.builders
.get(name
)
662 sb
= self
.slavebuilders
.get(name
)
664 d1
= sb
.attached(self
, remote
, self
.slave_commands
)
666 return defer
.DeferredList(dl
)
667 def _set_failed(why
):
668 log
.msg("BuildSlave.sendBuilderList (%s) failed" % self
)
670 # TODO: hang up on them?, without setBuilderList we can't use
672 if self
.substantiation_deferred
:
673 self
.substantiation_deferred
.errback()
674 self
.substantiation_deferred
= None
675 if self
.missing_timer
:
676 self
.missing_timer
.cancel()
677 self
.missing_timer
= None
678 # TODO: maybe log? send an email?
680 d
.addCallbacks(_sent
, _set_failed
)
681 def _substantiated(res
):
682 self
.substantiated
= True
683 if self
.substantiation_deferred
:
684 d
= self
.substantiation_deferred
685 del self
.substantiation_deferred
686 res
= self
._start
_result
687 del self
._start
_result
689 # note that the missing_timer is already handled within
691 if not self
.building
:
692 self
._setBuildWaitTimer
()
693 d
.addCallback(_substantiated
)