1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015 Ludovic Courtès <ludo@gnu.org>
4 ;;; This file is part of GNU Guix.
6 ;;; GNU Guix is free software; you can redistribute it and/or modify it
7 ;;; under the terms of the GNU General Public License as published by
8 ;;; the Free Software Foundation; either version 3 of the License, or (at
9 ;;; your option) any later version.
11 ;;; GNU Guix is distributed in the hope that it will be useful, but
12 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;;; GNU General Public License for more details.
16 ;;; You should have received a copy of the GNU General Public License
17 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19 (define-module (guix scripts offload)
20 #:use-module (guix config)
21 #:use-module (guix records)
22 #:use-module (guix store)
23 #:use-module (guix derivations)
24 #:use-module (guix serialization)
25 #:use-module (guix nar)
26 #:use-module (guix utils)
27 #:use-module ((guix build utils) #:select (which mkdir-p))
28 #:use-module (guix ui)
29 #:use-module (srfi srfi-1)
30 #:use-module (srfi srfi-11)
31 #:use-module (srfi srfi-26)
32 #:use-module (srfi srfi-34)
33 #:use-module (srfi srfi-35)
34 #:use-module (ice-9 popen)
35 #:use-module (ice-9 rdelim)
36 #:use-module (ice-9 match)
37 #:use-module (ice-9 regex)
38 #:use-module (ice-9 format)
39 #:use-module (rnrs io ports)
40 #:export (build-machine
46 ;;; Attempt to offload builds to the machines listed in
47 ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
48 ;;; retrieving the build output(s) over SSH upon success.
50 ;;; This command should not be used directly; instead, it is called on-demand
51 ;;; by the daemon, unless it was started with '--no-build-hook' or a client
52 ;;; inhibited build hooks.
57 (define-record-type* <build-machine>
58 build-machine make-build-machine
60 (name build-machine-name) ; string
61 (port build-machine-port ; number
63 (system build-machine-system) ; string
64 (user build-machine-user) ; string
65 (private-key build-machine-private-key ; file name
66 (default (user-lsh-private-key)))
67 (parallel-builds build-machine-parallel-builds ; number
69 (speed build-machine-speed ; inexact real
71 (features build-machine-features ; list of strings
73 (ssh-options build-machine-ssh-options ; list of strings
76 (define-record-type* <build-requirements>
77 build-requirements make-build-requirements
79 (system build-requirements-system) ; string
80 (features build-requirements-features ; list of strings
84 ;; File that lists machines available as build slaves.
85 (string-append %config-directory "/machines.scm"))
91 ;; FIXME: 'lshg' fails to pass large amounts of data, see
92 ;; <http://lists.lysator.liu.se/pipermail/lsh-bugs/2014q1/000639.html>.
95 (define (user-lsh-private-key)
96 "Return the user's default lsh private key, or #f if it could not be
98 (and=> (getenv "HOME")
99 (cut string-append <> "/.lsh/identity")))
102 ;; Module in which the machine description file is loaded.
103 (let ((module (make-fresh-user-module)))
104 (module-use! module (resolve-interface '(guix scripts offload)))
107 (define* (build-machines #:optional (file %machine-file))
108 "Read the list of build machines from FILE and return it."
111 ;; Avoid ABI incompatibility with the <build-machine> record.
112 (set! %fresh-auto-compile #t)
114 (save-module-excursion
116 (set-current-module %user-module)
117 (primitive-load file))))
121 (let ((err (system-error-errno args)))
122 ;; Silently ignore missing file since this is a common case.
125 (leave (_ "failed to open machine file '~a': ~a~%")
126 file (strerror err)))))
127 (('syntax-error proc message properties form . rest)
128 (let ((loc (source-properties->location properties)))
129 (leave (_ "~a: ~a~%")
130 (location->string loc) message)))
132 (leave (_ "failed to load machine file '~a': ~s~%")
135 ;;; FIXME: The idea was to open the connection to MACHINE once for all, but
136 ;;; lshg is currently non-functional.
137 ;; (define (open-ssh-gateway machine)
138 ;; "Initiate an SSH connection gateway to MACHINE, and return the PID of the
139 ;; running lsh gateway upon success, or #f on failure."
140 ;; (catch 'system-error
142 ;; (let* ((port (open-pipe* OPEN_READ %lsh-command
143 ;; "-l" (build-machine-user machine)
144 ;; "-i" (build-machine-private-key machine)
145 ;; ;; XXX: With lsh 2.1, passing '--write-pid'
146 ;; ;; last causes the PID not to be printed.
147 ;; "--write-pid" "--gateway" "--background"
148 ;; (build-machine-name machine)))
149 ;; (line (read-line port))
150 ;; (status (close-pipe port)))
151 ;; (if (zero? status)
152 ;; (let ((pid (string->number line)))
153 ;; (if (integer? pid)
156 ;; (warning (_ "'~a' did not write its PID on stdout: ~s~%")
157 ;; %lsh-command line)
160 ;; (warning (_ "failed to initiate SSH connection to '~a':\
161 ;; '~a' exited with ~a~%")
162 ;; (build-machine-name machine)
164 ;; (status:exit-val status))
167 ;; (leave (_ "failed to execute '~a': ~a~%")
168 ;; %lsh-command (strerror (system-error-errno args))))))
170 (define-syntax with-error-to-port
172 ((_ port exp0 exp ...)
174 (old (current-error-port)))
177 (set-current-error-port new))
181 (set-current-error-port old)))))))
183 (define* (remote-pipe machine mode command
184 #:key (error-port (current-error-port)) (quote? #t))
185 "Run COMMAND (a string list) on MACHINE, assuming an lsh gateway has been
186 set up. When QUOTE? is true, perform shell-quotation of all the elements of
187 COMMAND. Return either a pipe opened with MODE, or #f if the lsh client could
189 (define (shell-quote str)
190 ;; Sort-of shell-quote STR so it can be passed as an argument to the
192 (with-output-to-string
196 ;; Let the child inherit ERROR-PORT.
197 (with-error-to-port error-port
198 (apply open-pipe* mode %lshg-command
199 "-l" (build-machine-user machine)
200 "-p" (number->string (build-machine-port machine))
202 ;; XXX: Remove '-i' when %LSHG-COMMAND really is lshg.
203 "-i" (build-machine-private-key machine)
205 (append (build-machine-ssh-options machine)
206 (list (build-machine-name machine))
208 (map shell-quote command)
216 (define (lock-file file)
217 "Wait and acquire an exclusive lock on FILE. Return an open port."
218 (mkdir-p (dirname file))
219 (let ((port (open-file file "w0")))
220 (fcntl-flock port 'write-lock)
223 (define (unlock-file lock)
225 (fcntl-flock lock 'unlock)
229 (define-syntax-rule (with-file-lock file exp ...)
230 "Wait to acquire a lock on FILE and evaluate EXP in that context."
231 (let ((port (lock-file file)))
238 (unlock-file port)))))
240 (define-syntax-rule (with-machine-lock machine hint exp ...)
241 "Wait to acquire MACHINE's exclusive lock for HINT, and evaluate EXP in that
243 (with-file-lock (machine-lock-file machine hint)
247 (define (machine-slot-file machine slot)
248 "Return the file name of MACHINE's file for SLOT."
249 ;; For each machine we have a bunch of files representing each build slot.
250 ;; When choosing a build machine, we attempt to get an exclusive lock on one
251 ;; of these; if we fail, that means all the build slots are already taken.
252 ;; Inspired by Nix's build-remote.pl.
253 (string-append (string-append %state-directory "/offload/"
254 (build-machine-name machine)
255 "/" (number->string slot))))
257 (define (acquire-build-slot machine)
258 "Attempt to acquire a build slot on MACHINE. Return the port representing
259 the slot, or #f if none is available.
261 This mechanism allows us to set a hard limit on the number of simultaneous
262 connections allowed to MACHINE."
263 (mkdir-p (dirname (machine-slot-file machine 0)))
264 (with-machine-lock machine 'slots
266 (let ((port (open-file (machine-slot-file machine slot)
270 (fcntl-flock port 'write-lock #:wait? #f)
272 (format (current-error-port)
273 "process ~a acquired build slot '~a'~%"
274 (getpid) (port-filename port))
277 ;; PORT is already locked by another process.
280 (iota (build-machine-parallel-builds machine)))))
282 (define (release-build-slot slot)
283 "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
291 (define (build-log-port)
292 "Return the default port where build logs should be sent. The default is
293 file descriptor 4, which is open by the daemon before running the offload
295 (let ((port (fdopen 4 "w0")))
296 ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
297 (set-port-revealed! port 1)
300 (define %gc-root-file
301 ;; File name of the temporary GC root we install.
302 (format #f "offload-~a-~a" (gethostname) (getpid)))
304 (define (register-gc-root file machine)
305 "Mark FILE, a store item, as a garbage collector root on MACHINE."
308 (use-modules (guix config))
310 ;; Note: we can't use 'add-indirect-root' because dangling links under
311 ;; gcroots/auto are automatically deleted by the GC. This strategy
312 ;; doesn't have this problem, but it requires write access to that
314 (let ((root-directory (string-append %state-directory
318 (mkdir root-directory))
320 (unless (= EEXIST (system-error-errno args))
321 (error "failed to create remote GC root directory"
322 root-directory (system-error-errno args)))))
327 (string-append root-directory "/" ,%gc-root-file)))
329 ;; If FILE already exists, we can assume that either it's a stale
330 ;; reference (which is fine), or another process is already
331 ;; building the derivation represented by FILE (which is fine
332 ;; too.) Thus, do nothing in that case.
333 (unless (= EEXIST (system-error-errno args))
334 (apply throw args)))))))
336 (let ((pipe (remote-pipe machine OPEN_READ
337 `("guile" "-c" ,(object->string script)))))
338 (get-string-all pipe)
339 (let ((status (close-pipe pipe)))
340 (unless (zero? status)
341 ;; Better be safe than sorry: if we ignore the error here, then FILE
342 ;; may be GC'd just before we start using it.
343 (leave (_ "failed to register GC root for '~a' on '~a' (status: ~a)~%")
344 file (build-machine-name machine) status)))))
346 (define (remove-gc-roots machine)
347 "Remove from MACHINE the GC roots previously installed with
351 (use-modules (guix config) (ice-9 ftw)
352 (srfi srfi-1) (srfi srfi-26))
354 (let ((root-directory (string-append %state-directory
358 (string-append root-directory "/" ,%gc-root-file)))
360 ;; These ones were created with 'guix build -r' (there can be more
361 ;; than one in case of multiple-output derivations.)
362 (let ((roots (filter (cut string-prefix? ,%gc-root-file <>)
364 (for-each (lambda (file)
365 (false-if-exception (delete-file file)))
368 (let ((pipe (remote-pipe machine OPEN_READ
369 `("guile" "-c" ,(object->string script)))))
370 (get-string-all pipe)
373 (define* (offload drv machine
374 #:key print-build-trace? (max-silent-time 3600)
375 build-timeout (log-port (build-log-port)))
376 "Perform DRV on MACHINE, assuming DRV and its prerequisites are available
377 there, and write the build log to LOG-PORT. Return the exit status."
378 (format (current-error-port) "offloading '~a' to '~a'...~%"
379 (derivation-file-name drv) (build-machine-name machine))
380 (format (current-error-port) "@ build-remote ~a ~a~%"
381 (derivation-file-name drv) (build-machine-name machine))
383 ;; Normally DRV has already been protected from GC when it was transferred.
384 ;; The '-r' flag below prevents the build result from being GC'd.
385 (let ((pipe (remote-pipe machine OPEN_READ
388 ,(format #f "--max-silent-time=~a"
391 (list (format #f "--timeout=~a"
394 ,(derivation-file-name drv))
396 ;; Since 'guix build' writes the build log to its
397 ;; stderr, everything will go directly to LOG-PORT.
398 #:error-port log-port)))
399 (let loop ((line (read-line pipe)))
400 (unless (eof-object? line)
401 (display line log-port)
403 (loop (read-line pipe))))
407 (define* (transfer-and-offload drv machine
411 (max-silent-time 3600)
414 "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
415 INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
418 (register-gc-root (derivation-file-name drv) machine)
419 (send-files (cons (derivation-file-name drv) inputs)
421 (let ((status (offload drv machine
422 #:print-build-trace? print-build-trace?
423 #:max-silent-time max-silent-time
424 #:build-timeout build-timeout)))
427 (retrieve-files outputs machine)
428 (remove-gc-roots machine)
429 (format (current-error-port)
430 "done with offloaded '~a'~%"
431 (derivation-file-name drv)))
433 (remove-gc-roots machine)
434 (format (current-error-port)
435 "derivation '~a' offloaded to '~a' failed \
437 (derivation-file-name drv)
438 (build-machine-name machine)
439 (status:exit-val status))
441 ;; Use exit code 100 for a permanent build failure. The daemon
442 ;; interprets other non-zero codes as transient build failures.
443 (primitive-exit 100))))))
445 (define (send-files files machine)
446 "Send the subset of FILES that's missing to MACHINE's store. Return #t on
447 success, #f otherwise."
448 (define (missing-files files)
449 ;; Return the subset of FILES not already on MACHINE.
450 (let*-values (((files)
451 (format #f "~{~a~%~}" files))
454 (append (list (which %lshg-command)
455 "-l" (build-machine-user machine)
457 (build-machine-port machine))
458 "-i" (build-machine-private-key machine))
459 (build-machine-ssh-options machine)
460 (cons (build-machine-name machine)
461 '("guix" "archive" "--missing")))
462 (open-input-string files)))
464 (get-string-all missing)))
465 (for-each waitpid pids)
466 (string-tokenize result)))
469 (guard (c ((nix-protocol-error? c)
470 (warning (_ "failed to export files for '~a': ~s~%")
471 (build-machine-name machine)
475 ;; Compute the subset of FILES missing on MACHINE, and send them in
476 ;; topologically sorted order so that they can actually be imported.
477 (let* ((files (missing-files (topologically-sorted store files)))
478 (pipe (remote-pipe machine OPEN_WRITE
480 "guix" "archive" "--import")
482 (format #t (_ "sending ~a store files to '~a'...~%")
483 (length files) (build-machine-name machine))
484 (call-with-compressed-output-port 'xz pipe
488 (export-paths store files compressed))
490 (warning (_ "failed while exporting files to '~a': ~a~%")
491 (build-machine-name machine)
492 (strerror (system-error-errno args)))))))
494 ;; Wait for the 'lsh' process to complete.
495 (zero? (close-pipe pipe))))))
497 (define (retrieve-files files machine)
498 "Retrieve FILES from MACHINE's store, and import them."
500 (build-machine-name machine))
502 (let ((pipe (remote-pipe machine OPEN_READ
503 `("guix" "archive" "--export" ,@files
508 (guard (c ((nix-protocol-error? c)
509 (warning (_ "failed to import files from '~a': ~s~%")
512 (format (current-error-port) "retrieving ~a files from '~a'...~%"
515 ;; We cannot use the 'import-paths' RPC here because we already
516 ;; hold the locks for FILES.
517 (call-with-decompressed-port 'xz pipe
518 (lambda (decompressed)
519 (restore-file-set decompressed
520 #:log-port (current-error-port)
523 ;; Wait for the 'lsh' process to complete.
524 (zero? (close-pipe pipe)))))))
531 (define (machine-matches? machine requirements)
532 "Return #t if MACHINE matches REQUIREMENTS."
533 (and (string=? (build-requirements-system requirements)
534 (build-machine-system machine))
536 (build-requirements-features requirements)
537 (build-machine-features machine))))
539 (define (machine-load machine)
540 "Return the load of MACHINE, divided by the number of parallel builds
542 (let* ((pipe (remote-pipe machine OPEN_READ `("cat" "/proc/loadavg")))
543 (line (read-line pipe))
544 (status (close-pipe pipe)))
545 (unless (eqv? 0 (status:exit-val status))
546 (warning (_ "failed to obtain load of '~a': SSH client exited with ~a~%")
547 (build-machine-name machine)
548 (status:exit-val status)))
550 (if (eof-object? line)
551 +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
552 (match (string-tokenize line)
553 ((one five fifteen . _)
554 (let* ((raw (string->number five))
555 (jobs (build-machine-parallel-builds machine))
556 (normalized (/ raw jobs)))
557 (format (current-error-port) "load on machine '~a' is ~s\
559 (build-machine-name machine) raw normalized)
562 +inf.0))))) ;something's fishy about MACHINE, so avoid it
564 (define (machine-power-factor m)
565 "Return a factor that aggregates the speed and load of M. The higher the
567 (/ (build-machine-speed m)
568 (+ 1 (machine-load m))))
570 (define (machine-less-loaded-or-faster? m1 m2)
571 "Return #t if M1 is either less loaded or faster than M2. (This relation
572 defines a total order on machines.)"
573 (> (machine-power-factor m1) (machine-power-factor m2)))
575 (define (machine-lock-file machine hint)
576 "Return the name of MACHINE's lock file for HINT."
577 (string-append %state-directory "/offload/"
578 (build-machine-name machine)
579 "." (symbol->string hint) ".lock"))
581 (define (machine-choice-lock-file)
582 "Return the name of the file used as a lock when choosing a build machine."
583 (string-append %state-directory "/offload/machine-choice.lock"))
587 ;; List of acquired build slots (open ports).
590 (define (choose-build-machine machines)
591 "Return the best machine among MACHINES, or #f."
593 ;; Proceed like this:
594 ;; 1. Acquire the global machine-choice lock.
595 ;; 2. For all MACHINES, attempt to acquire a build slot, and filter out
596 ;; those machines for which we failed.
597 ;; 3. Choose the best machine among those that are left.
598 ;; 4. Release the previously-acquired build slots of the other machines.
599 ;; 5. Release the global machine-choice lock.
601 (with-file-lock (machine-choice-lock-file)
602 (define machines+slots
603 (filter-map (lambda (machine)
604 (let ((slot (acquire-build-slot machine)))
605 (and slot (list machine slot))))
608 (define (undecorate pred)
614 (pred machine1 machine2)))))))
616 (let loop ((machines+slots
618 (undecorate machine-less-loaded-or-faster?))))
619 (match machines+slots
620 (((best slot) others ...)
621 ;; Return the best machine unless it's already overloaded.
622 (if (< (machine-load best) 2.)
624 (((machines slots) ...)
625 ;; Release slots from the uninteresting machines.
626 (for-each release-build-slot slots)
628 ;; Prevent SLOT from being GC'd.
629 (set! %slots (cons slot %slots))
632 ;; BEST is overloaded, so try the next one.
633 (release-build-slot slot)
637 (define* (process-request wants-local? system drv features
639 print-build-trace? (max-silent-time 3600)
641 "Process a request to build DRV."
642 (let* ((local? (and wants-local? (string=? system (%current-system))))
643 (reqs (build-requirements
645 (features features)))
646 (candidates (filter (cut machine-matches? <> reqs)
650 ;; We'll never be able to match REQS.
651 (display "# decline\n"))
653 (let ((machine (choose-build-machine candidates)))
656 ;; Offload DRV to MACHINE.
657 (display "# accept\n")
658 (let ((inputs (string-tokenize (read-line)))
659 (outputs (string-tokenize (read-line))))
660 (transfer-and-offload drv machine
663 #:max-silent-time max-silent-time
664 #:build-timeout build-timeout
665 #:print-build-trace? print-build-trace?)))
667 ;; Not now, all the machines are busy.
668 (display "# postpone\n")))))))
670 (define-syntax-rule (with-nar-error-handling body ...)
671 "Execute BODY with any &nar-error suitably reported to the user."
672 (guard (c ((nar-error? c)
673 (let ((file (nar-error-file c)))
674 (if (condition-has-type? c &message)
675 (leave (_ "while importing file '~a': ~a~%")
676 file (gettext (condition-message c)))
677 (leave (_ "failed to import file '~a'~%")
686 (define (guix-offload . args)
687 (define request-line-rx
688 ;; The request format. See 'tryBuildHook' method in build.cc.
689 (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
692 (char-set-complement (char-set #\,)))
694 ;; Make sure $HOME really corresponds to the current user. This is
695 ;; necessary since lsh uses that to determine the location of the yarrow
696 ;; seed file, and fails if it's owned by someone else.
697 (and=> (passwd:dir (getpw (getuid)))
698 (cut setenv "HOME" <>))
701 ((system max-silent-time print-build-trace? build-timeout)
702 (let ((max-silent-time (string->number max-silent-time))
703 (build-timeout (string->number build-timeout))
704 (print-build-trace? (string=? print-build-trace? "1")))
705 (parameterize ((%current-system system))
706 (let loop ((line (read-line)))
707 (unless (eof-object? line)
708 (cond ((regexp-exec request-line-rx line)
711 (with-nar-error-handling
712 (process-request (equal? (match:substring match 1) "1")
713 (match:substring match 2) ; system
714 (call-with-input-file
715 (match:substring match 3)
718 (match:substring match 4) not-coma)
719 #:print-build-trace? print-build-trace?
720 #:max-silent-time max-silent-time
721 #:build-timeout build-timeout))))
723 (leave (_ "invalid request line: ~s~%") line)))
724 (loop (read-line)))))))
726 (show-version-and-exit "guix offload"))
728 (format #t (_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
729 Process build offload requests written on the standard input, possibly
730 offloading builds to the machines listed in '~a'.~%")
733 This tool is meant to be used internally by 'guix-daemon'.\n"))
734 (show-bug-report-information))
736 (leave (_ "invalid arguments: ~{~s ~}~%") x))))
739 ;;; eval: (put 'with-machine-lock 'scheme-indent-function 2)
740 ;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
741 ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
744 ;;; offload.scm ends here