Try to reclobber on retry.
[buildbot.git] / buildbot / buildslave.py
blob0daf74c1aa65ae00cec7d88fb27f78c066ae2418
2 import time
3 from email.Message import Message
4 from email.Utils import formatdate
5 from zope.interface import implements
6 from twisted.python import log
7 from twisted.internet import defer, reactor
8 from twisted.application import service
10 from buildbot.pbutil import NewCredPerspective
11 from buildbot.status.builder import SlaveStatus
12 from buildbot.status.mail import MailNotifier
13 from buildbot.interfaces import IBuildSlave
14 from buildbot.process.properties import Properties
class BuildSlave(NewCredPerspective, service.MultiService):
    """Master-side stand-in for one remote buildbot slave.

    The config file's c['slaves'] list describes the slaves, and exactly one
    of these objects exists for each entry. A buildbot that connects in
    (.attach) is handed a reference to the matching instance. The BotMaster
    is kept in the .botmaster attribute and is also our '.parent' Service.

    A build slave is a remote machine capable of running builds. Instances
    are created by the configuration file, and the class may be subclassed
    to add extra functionality."""

    implements(IBuildSlave)
def __init__(self, name, password, max_builds=None,
             notify_on_missing=None, missing_timeout=3600,
             properties=None):
    """
    @param name: botname this machine will supply when it connects
    @param password: password this machine will supply when
                     it connects
    @param max_builds: maximum number of simultaneous builds that will
                       be run concurrently on this buildslave (the
                       default is None for no limit)
    @param notify_on_missing: a single address string, or a list of
                              address strings, used as the recipients
                              of the "buildslave was lost" mail (the
                              default is None for no recipients)
    @param missing_timeout: delay, in seconds, between the slave
                            detaching and the missing-slave mail
                            being sent
    @param properties: properties that will be applied to builds run on
                       this slave (the default is None for no extra
                       properties)
    @type properties: dictionary
    """
    service.MultiService.__init__(self)
    self.slavename = name
    self.password = password
    self.botmaster = None # no buildmaster yet
    self.slave_status = SlaveStatus(name)
    self.slave = None # a RemoteReference to the Bot, when connected
    self.slave_commands = None
    self.slavebuilders = []
    self.max_builds = max_builds

    # None sentinels replace the former mutable defaults ([] and {}), so
    # no list or dict object is ever shared between BuildSlave instances.
    if properties is None:
        properties = {}
    self.properties = Properties()
    self.properties.update(properties, "BuildSlave")
    self.properties.setProperty("slavename", name, "BuildSlave")

    self.lastMessageReceived = 0
    if notify_on_missing is None:
        notify_on_missing = []
    elif isinstance(notify_on_missing, str):
        # a lone address is shorthand for a one-element list
        notify_on_missing = [notify_on_missing]
    self.notify_on_missing = notify_on_missing
    for i in notify_on_missing:
        assert isinstance(i, str)
    self.missing_timeout = missing_timeout
    self.missing_timer = None
def update(self, new):
    """
    Given a new BuildSlave, configure this one identically. Because
    BuildSlave objects are remotely referenced, we can't replace them
    without disconnecting the slave, yet there's no reason to do that.

    @param new: the freshly configured BuildSlave whose settings are
                copied onto this instance; it must have the same name,
                password and class as this one
    """
    # the reconfiguration logic should guarantee this:
    assert self.slavename == new.slavename
    assert self.password == new.password
    assert self.__class__ == new.__class__
    # Copy every reconfigurable setting, not just max_builds: otherwise a
    # changed notification list, timeout or property set in the new
    # config would be silently ignored until the master restarts.
    self.max_builds = new.max_builds
    self.notify_on_missing = new.notify_on_missing
    self.missing_timeout = new.missing_timeout
    self.properties = new.properties
def __repr__(self):
    """Describe this slave by name, listing its builders once a
    botmaster is available to ask for them."""
    if not self.botmaster:
        return "<BuildSlave '%s', (no builders yet)>" % self.slavename
    attached_builders = self.botmaster.getBuildersForSlave(self.slavename)
    builder_names = ','.join([b.name for b in attached_builders])
    return "<BuildSlave '%s', current builders: %s>" % \
           (self.slavename, builder_names)
def setBotmaster(self, botmaster):
    """Remember the given botmaster. This may only happen once: a second
    call on a slave that already has a botmaster trips the assertion."""
    already_owned = bool(self.botmaster)
    assert not already_owned, "BuildSlave already has a botmaster"
    self.botmaster = botmaster
def updateSlave(self):
    """Called to add or remove builders after the slave has connected.

    @return: a Deferred that indicates when an attached slave has
             accepted the new builders and/or released the old ones."""
    if not self.slave:
        # nobody is connected, so there is nothing to send
        return defer.succeed(None)
    return self.sendBuilderList()
def updateSlaveStatus(self, buildStarted=None, buildFinished=None):
    """Forward build start/finish events to this slave's status object.

    @param buildStarted: if true, passed to slave_status.buildStarted
    @param buildFinished: if true, passed to slave_status.buildFinished
    """
    if buildStarted:
        self.slave_status.buildStarted(buildStarted)
    if not buildFinished:
        return
    self.slave_status.buildFinished(buildFinished)
def attached(self, bot):
    """This is called when the slave connects.

    Any connection we already hold is dropped first. The new bot is then
    asked for its info and its command list, and only after those calls
    have completed is it recorded as our slave and given its builders.

    @param bot: reference to the newly connected bot
    @return: a Deferred that fires with a suitable pb.IPerspective to
             give to the slave (i.e. 'self')"""

    if not self.slave:
        d = defer.succeed(None)
    else:
        # uh-oh, we've got a duplicate slave. The most likely
        # explanation is that the slave is behind a slow link, thinks we
        # went away, and has attempted to reconnect, so we've got two
        # "connections" from the same slave, but the previous one is
        # stale. Give the new one precedence.
        log.msg("duplicate slave %s replacing old one" % self.slavename)

        # just in case we've got two identically-configured slaves,
        # report the IP addresses of both so someone can resolve the
        # squabble
        old_transport = self.slave.broker.transport
        log.msg("old slave was connected from", old_transport.getPeer())
        log.msg("new slave is from", bot.broker.transport.getPeer())
        d = self.disconnect()

    # From here on we make a sequence of remote calls to gather
    # information, and only then tell the Botmaster that it can finally
    # give this slave to all the Builders that care about it.

    # Slave information is collected in this 'state' dictionary and
    # applied in one go if we make it far enough through the process.
    state = {}

    def _log_attachment_on_slave(res):
        printing = bot.callRemote("print", "attached")
        # a failure of this courtesy message is ignored
        printing.addErrback(lambda why: None)
        return printing
    d.addCallback(_log_attachment_on_slave)

    def _get_info(res):
        def _got_info(info):
            log.msg("Got slaveinfo from '%s'" % self.slavename)
            # TODO: info{} might have other keys
            state["admin"] = info.get("admin")
            state["host"] = info.get("host")

        def _info_unavailable(why):
            # maybe an old slave, doesn't implement remote_getSlaveInfo
            log.msg("BuildSlave.info_unavailable")
            log.err(why)

        fetching_info = bot.callRemote("getSlaveInfo")
        fetching_info.addCallbacks(_got_info, _info_unavailable)
        return fetching_info
    d.addCallback(_get_info)

    def _get_commands(res):
        def _got_commands(commands):
            state["slave_commands"] = commands

        def _commands_unavailable(why):
            # probably an old slave
            log.msg("BuildSlave._commands_unavailable")
            if why.check(AttributeError):
                return
            log.err(why)

        fetching_commands = bot.callRemote("getCommands")
        fetching_commands.addCallbacks(_got_commands, _commands_unavailable)
        return fetching_commands
    d.addCallback(_get_commands)

    def _accept_slave(res):
        status = self.slave_status
        status.setAdmin(state.get("admin"))
        status.setHost(state.get("host"))
        status.setConnected(True)
        self.slave_commands = state.get("slave_commands")
        self.slave = bot
        log.msg("bot attached")
        self.messageReceivedFromSlave()
        # the slave is back, so any pending "missing" notification is moot
        if self.missing_timer:
            self.missing_timer.cancel()
            self.missing_timer = None

        return self.updateSlave()
    d.addCallback(_accept_slave)

    # Finally, the slave gets a reference to this BuildSlave. They
    # receive this later, after we've started using them.
    d.addCallback(lambda res: self)
    return d
def messageReceivedFromSlave(self):
    """Stamp the current time as the moment we last heard from the
    slave, both on this object and on its status object."""
    timestamp = time.time()
    self.lastMessageReceived = timestamp
    self.slave_status.setLastMessageReceived(timestamp)
def detached(self, mind):
    """Record that the slave has gone away and tell the botmaster.

    If there are addresses to notify, we are still parented, and no
    timer is already pending, a timer is started that will fire
    _missing_timer_fired after missing_timeout.

    @param mind: not used by this method
    """
    self.slave = None
    self.slave_status.setConnected(False)
    self.botmaster.slaveLost(self)
    log.msg("BuildSlave.detached(%s)" % self.slavename)
    if not (self.notify_on_missing and self.parent):
        return
    if self.missing_timer:
        # a notification is already scheduled; don't restart it
        return
    self.missing_timer = reactor.callLater(self.missing_timeout,
                                           self._missing_timer_fired)
def _missing_timer_fired(self):
    """Mail the notify_on_missing recipients that this slave was lost.

    Nothing is sent if we are no longer parented (i.e. no longer in the
    config).

    @return: the Deferred from sendMessage (returned for testing
             purposes), or None when no mail is sent
    """
    self.missing_timer = None
    # notify people, but only if we're still in the config
    if not self.parent:
        return

    # first, see if we have a MailNotifier we can use. This gives us a
    # fromaddr and a relayhost.
    buildmaster = self.botmaster.parent
    status = buildmaster.getStatus()
    notifier = None
    for target in buildmaster.statusTargets:
        if isinstance(target, MailNotifier):
            notifier = target
            break
    if notifier is None:
        # if not, they get a default MailNotifier, which always uses SMTP
        # to localhost and uses a dummy fromaddr of "buildbot".
        log.msg("buildslave-missing msg using default MailNotifier")
        notifier = MailNotifier("buildbot")

    # now construct the mail
    last_seen = time.time() - self.missing_timeout # close enough
    pieces = [
        "The Buildbot working for '%s'\n" % status.getProjectName(),
        "has noticed that the buildslave named %s went away\n" %
        self.slavename,
        "\n",
        "It last disconnected at %s (buildmaster-local time)\n" %
        time.ctime(last_seen),
        "\n",
        "The admin on record (as reported by BUILDSLAVE:info/admin)\n",
        "was '%s'.\n" % self.slave_status.getAdmin(),
        "\n",
        "Sincerely,\n",
        " The Buildbot\n",
        " %s\n" % status.getProjectURL(),
    ]
    text = "".join(pieces)

    recipients = self.notify_on_missing
    m = Message()
    m.set_payload(text)
    m['Date'] = formatdate(localtime=True)
    m['Subject'] = "Buildbot: buildslave %s was lost" % self.slavename
    m['From'] = notifier.fromaddr
    m['To'] = ", ".join(recipients)
    # return the Deferred for testing purposes
    return notifier.sendMessage(m, recipients)
def disconnect(self):
    """Forcibly disconnect the slave.

    This severs the TCP connection and returns a Deferred that will fire
    (with None) when the connection is probably gone.

    If the slave is still alive, they will probably try to reconnect
    again in a moment.

    This is called in two circumstances. The first is when a slave is
    removed from the config file. In this case, when they try to
    reconnect, they will be rejected as an unknown slave. The second is
    when we wind up with two connections for the same slave, in which
    case we disconnect the older connection.

    @return: a Deferred that fires with None; it has already fired if no
             slave is connected
    """

    if not self.slave:
        return defer.succeed(None)
    log.msg("disconnecting old slave %s now" % self.slavename)

    # all kinds of teardown will happen as a result of
    # loseConnection(), but it happens after a reactor iteration or
    # two. Hook the actual disconnect so we can know when it is safe
    # to connect the new slave. We have to wait one additional
    # iteration (with callLater(0)) to make sure the *other*
    # notifyOnDisconnect handlers have had a chance to run.
    d = defer.Deferred()

    # notifyOnDisconnect runs the callback with one argument, the
    # RemoteReference being disconnected.
    def _disconnected(rref):
        reactor.callLater(0, d.callback, None)
    self.slave.notifyOnDisconnect(_disconnected)
    tport = self.slave.broker.transport
    # this is the polite way to request that a socket be closed
    tport.loseConnection()
    try:
        # but really we don't want to wait for the transmit queue to
        # drain. The remote end is unlikely to ACK the data, so we'd
        # probably have to wait for a (20-minute) TCP timeout.
        # Calling tport._closeSocket() (whether before or after
        # loseConnection) would be the direct way, but it somehow
        # prevents the notifyOnDisconnect handlers from being run, so
        # we throw away the pending output instead.
        tport.offset = 0
        tport.dataBuffer = ""
    except Exception:
        # these hacks are pretty internal, so don't blow up if they fail
        # or are unavailable. Only Exception is caught, so that
        # KeyboardInterrupt and SystemExit are not swallowed here.
        log.msg("failed to accelerate the shutdown process")
    log.msg("waiting for slave to finish disconnecting")

    # When this Deferred fires, we'll be ready to accept the new slave
    return d
def sendBuilderList(self):
    """Send the slave the (name, builddir) list of its builders, then
    attach each remote builder it reports to the matching Builder.

    @return: the Deferred of the remote setBuilderList call, with the
             attach step (or failure logging) chained on
    """
    def _sent(slist):
        attaching = []
        for name, remote in slist.items():
            # use get() since we might have changed our mind since then
            builder = self.botmaster.builders.get(name)
            if not builder:
                continue
            attaching.append(builder.attached(self, remote,
                                              self.slave_commands))
        return defer.DeferredList(attaching)

    def _set_failed(why):
        log.msg("BuildSlave.sendBuilderList (%s) failed" % self)
        log.err(why)
        # TODO: hang up on them?, without setBuilderList we can't use
        # them

    wanted = self.botmaster.getBuildersForSlave(self.slavename)
    blist = [(builder.name, builder.builddir) for builder in wanted]
    d = self.slave.callRemote("setBuilderList", blist)
    d.addCallbacks(_sent, _set_failed)
    return d
def perspective_keepalive(self):
    """Do nothing and return None.

    NOTE(review): the name suggests this is invoked remotely by the
    slave as a keepalive ping -- confirm against the slave side."""
    return None
def addSlaveBuilder(self, sb):
    """Log the addition and append sb to the slavebuilders list."""
    message = "%s adding %s" % (self, sb)
    log.msg(message)
    self.slavebuilders.append(sb)
def removeSlaveBuilder(self, sb):
    """Log the removal and drop sb from the slavebuilders list; an sb
    that is not in the list is quietly ignored."""
    message = "%s removing %s" % (self, sb)
    log.msg(message)
    if sb not in self.slavebuilders:
        return
    self.slavebuilders.remove(sb)
def canStartBuild(self):
    """
    I am called when a build is requested to see if this buildslave
    can start a build. This function can be used to limit overall
    concurrency on the buildslave.

    @return: False when max_builds is set and at least that many of our
             slavebuilders report being busy, True otherwise
    """
    if not self.max_builds:
        # no limit configured
        return True
    busy_count = len([sb for sb in self.slavebuilders if sb.isBusy()])
    return busy_count < self.max_builds