1
0
mirror of https://git.savannah.gnu.org/git/guix.git synced 2026-04-06 13:10:33 +02:00

build/syscalls: Introduce new safe-clone and use it.

* guix/build/syscalls.scm (without-automatic-finalization): Accept multiple
expressions.
(without-garbage-collection): New syntax.
(without-threads): Likewise.
(ensure-signal-delivery-thread, safe-clone): New procedures.
* tests/syscalls.scm: ("clone and unshare triggers EINVAL")
("safe-clone and unshare succeeds"): New tests.
* gnu/build/linux-container.scm (run-container): Adjust to use 'safe-clone'.

Relates-to: #1169
Change-Id: I044c11a899e24e547a7aed97f30c8e7250ab5363
This commit is contained in:
Maxim Cournoyer
2025-10-17 23:12:27 +09:00
parent 3966f76297
commit 1eccea7ffb
3 changed files with 168 additions and 94 deletions

View File

@@ -263,100 +263,93 @@ that host UIDs (respectively GIDs) map to in the namespace."
;; child process blocks until the parent writes to it. ;; child process blocks until the parent writes to it.
(match (socketpair PF_UNIX (logior SOCK_CLOEXEC SOCK_STREAM) 0) (match (socketpair PF_UNIX (logior SOCK_CLOEXEC SOCK_STREAM) 0)
((child . parent) ((child . parent)
(let ((flags (namespaces->bit-mask namespaces))) (safe-clone
(match (clone flags) (namespaces->bit-mask namespaces)
(0 (lambda ()
;; Inhibit thread creation until after the unshare call. (call-with-clean-exit
(gc-disable) (lambda ()
(call-with-clean-exit (close-port parent)
(lambda () ;; Wait for parent to set things up.
(close-port parent) (match (read child)
;; Wait for parent to set things up. ('ready
(match (read child) (purify-environment)
('ready (when (and (memq 'mnt namespaces)
(purify-environment) (not (string=? root "/")))
(when (and (memq 'mnt namespaces) (catch #t
(not (string=? root "/"))) (lambda ()
(catch #t (mount-file-systems root mounts
(lambda () #:mount-/proc? (memq 'pid namespaces)
(mount-file-systems root mounts #:mount-/sys? (memq 'net
#:mount-/proc? (memq 'pid namespaces) namespaces)
#:mount-/sys? (memq 'net #:populate-file-system
namespaces) (lambda ()
#:populate-file-system (populate-file-system)
(lambda () (when (and (memq 'net namespaces)
(populate-file-system) loopback-network?)
(when (and (memq 'net namespaces) (set-network-interface-up "lo")
loopback-network?)
(set-network-interface-up "lo")
;; When isolated from the ;; When isolated from the
;; network, provide a minimal ;; network, provide a minimal
;; /etc/hosts to resolve ;; /etc/hosts to resolve
;; "localhost". ;; "localhost".
(mkdir-p "/etc") (mkdir-p "/etc")
(call-with-output-file "/etc/hosts" (call-with-output-file "/etc/hosts"
(lambda (port) (lambda (port)
(display "127.0.0.1 localhost\n" port) (display "127.0.0.1 localhost\n" port)
(chmod port #o444))))) (chmod port #o444)))))
#:writable-root? #:writable-root?
(or writable-root? (or writable-root?
(not (memq 'mnt namespaces))))) (not (memq 'mnt namespaces)))))
(lambda args (lambda args
;; Forward the exception to the parent process. ;; Forward the exception to the parent process.
;; FIXME: SRFI-35 conditions and non-trivial objects ;; FIXME: SRFI-35 conditions and non-trivial objects
;; cannot be 'read' so they shouldn't be written as is. ;; cannot be 'read' so they shouldn't be written as is.
(write args child) (write args child)
(primitive-exit 3)))) (primitive-exit 3))))
(when (and lock-mounts? (when (and lock-mounts?
(memq 'mnt namespaces) (memq 'mnt namespaces)
(memq 'user namespaces)) (memq 'user namespaces))
;; Create a new mount namespace owned by a new user ;; Create a new mount namespace owned by a new user
;; namespace to "lock" together previous mounts, such that ;; namespace to "lock" together previous mounts, such that
;; they cannot be unmounted or remounted separately--see ;; they cannot be unmounted or remounted separately--see
;; mount_namespaces(7). ;; mount_namespaces(7).
;; (let ((uid (getuid)) (gid (getgid)))
;; Note: at this point, the process is single-threaded (no (unshare (logior CLONE_NEWUSER CLONE_NEWNS))
;; GC mark threads, no finalization thread, etc.) which is (when (file-exists? "/proc/self")
;; why unshare(CLONE_NEWUSER) can be used. (initialize-user-namespace (getpid)
(let ((uid (getuid)) (gid (getgid))) host-uids
(unshare (logior CLONE_NEWUSER CLONE_NEWNS)) #:host-uid uid
(gc-enable) #:host-gid gid
(when (file-exists? "/proc/self") #:guest-uid guest-uid
(initialize-user-namespace (getpid) #:guest-gid guest-gid))))
host-uids
#:host-uid uid
#:host-gid gid
#:guest-uid guest-uid
#:guest-gid guest-gid))))
;; TODO: Manage capabilities. ;; TODO: Manage capabilities.
(write 'ready child) (write 'ready child)
(close-port child) (close-port child)
(thunk)) (thunk))
(_ ;parent died or something (_ ;parent died or something
(primitive-exit 2)))))) (primitive-exit 2))))))
(pid (lambda (pid)
(close-port child) (close-port child)
(when (memq 'user namespaces) (when (memq 'user namespaces)
(initialize-user-namespace pid host-uids (initialize-user-namespace pid host-uids
#:guest-uid guest-uid #:guest-uid guest-uid
#:guest-gid guest-gid)) #:guest-gid guest-gid))
;; TODO: Initialize cgroups. ;; TODO: Initialize cgroups.
(write 'ready parent) (write 'ready parent)
(newline parent) (newline parent)
;; Check whether the child process' setup phase succeeded. ;; Check whether the child process' setup phase succeeded.
(let ((message (read parent))) (let ((message (read parent)))
(close-port parent) (close-port parent)
(match message (match message
('ready ;success ('ready ;success
pid) pid)
(((? symbol? key) args ...) ;exception (((? symbol? key) args ...) ;exception
(apply throw key args)) (apply throw key args))
(_ ;unexpected termination (_ ;unexpected termination
#f))))))))) #f))))))))
;; FIXME: This is copied from (guix utils), which we cannot use because it ;; FIXME: This is copied from (guix utils), which we cannot use because it
;; would pull (guix config) and all. ;; would pull (guix config) and all.

View File

@@ -150,6 +150,7 @@
CLONE_THREAD CLONE_THREAD
CLONE_VM CLONE_VM
clone clone
safe-clone
unshare unshare
setns setns
get-user-ns get-user-ns
@@ -1170,17 +1171,45 @@ caller lacks root privileges."
Turning finalization off shuts down the finalization thread as a side effect." Turning finalization off shuts down the finalization thread as a side effect."
(->bool ((force proc) (if enabled? 1 0)))))) (->bool ((force proc) (if enabled? 1 0))))))
(define-syntax-rule (without-automatic-finalization exp) (define-syntax-rule (without-automatic-finalization body ...)
"Turn off automatic finalization within the dynamic extent of EXP." "Turn off automatic finalization within the dynamic extent of BODY. This is
useful to ensure there is no finalization thread."
(let ((enabled? #t)) (let ((enabled? #t))
(dynamic-wind (dynamic-wind
(lambda () (lambda ()
(set! enabled? (%set-automatic-finalization-enabled?! #f))) (set! enabled? (%set-automatic-finalization-enabled?! #f)))
(lambda () (lambda ()
exp) body ...)
(lambda () (lambda ()
(%set-automatic-finalization-enabled?! enabled?))))) (%set-automatic-finalization-enabled?! enabled?)))))
(define-syntax-rule (without-garbage-collection body ...)
"Turn off garbage collection within the dynamic extent of BODY. This is useful
to avoid the creation new garbage collection thread. Note that pre-existing
GC marker threads are only disabled, not terminated."
(dynamic-wind
(lambda ()
(gc-disable))
(lambda ()
body ...)
(lambda ()
(gc-enable))))
(define-syntax-rule (without-threads body ...)
"Ensure the Guile finalizer thread is stopped and that garbage collection does
not run. Note that pre-existing GC marker threads are only disabled, not
terminated. This also leaves the signal handling thread to be disabled by
another means, since there is no Guile API to do so."
;; Note: the three kind of threads that Guile can spawn are the finalization
;; thread, the signal thread, or the GC marker threads.
(without-automatic-finalization
(without-garbage-collection body ...)))
(define (ensure-signal-delivery-thread)
"Ensure the signal delivery thread is spawned and its state set
to 'RUNNING'. This is valid as of the implementation as of Guile 3.0.9."
(sigaction SIGUSR1)) ;could be any signal
;; The libc interface to sys_clone is not useful for Scheme programs, so the ;; The libc interface to sys_clone is not useful for Scheme programs, so the
;; low-level system call is wrapped instead. The 'syscall' function is ;; low-level system call is wrapped instead. The 'syscall' function is
;; declared in <unistd.h> as a variadic function; in practice, it expects 6 ;; declared in <unistd.h> as a variadic function; in practice, it expects 6
@@ -1223,6 +1252,24 @@ are shared between the parent and child processes."
(list err)) (list err))
ret))))) ret)))))
(define (safe-clone flags child parent)
"This is a raw clone syscall wrapper that ensures no Guile thread will be
spawned during execution of the child. `clone' is called with FLAGS. CHILD
is a thunk to run in the child process. PARENT is procedure that accepts the
child PID as argument. This is useful in many contexts, such as when calling
`unshare' or async-unsafe procedures in the child when the parent process
memory (CLONE_VM) or threads (CLONE_THREAD) are shared with it."
;; TODO: Contribute `clone' to Guile, and handle these complications there,
;; similarly to how it's handled for scm_fork in posix.c.
;; XXX: This is a hack: as of Guile 3.0.9, by starting the signal delivery
;; thread in the parent, its state will be known as RUNNING, and the child
;; won't attempt to start it itself.
(ensure-signal-delivery-thread)
(match (clone flags)
(0 (without-threads (child)))
(pid (parent pid))))
(define (thread-count) (define (thread-count)
"Return the complete thread count of the current process. Unlike "Return the complete thread count of the current process. Unlike
`all-threads', this also counts the Guile signal delivery, and finalizer `all-threads', this also counts the Guile signal delivery, and finalizer

View File

@@ -3,6 +3,7 @@
;;; Copyright © 2015 David Thompson <davet@gnu.org> ;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;; Copyright © 2020 Simon South <simon@simonsouth.net> ;;; Copyright © 2020 Simon South <simon@simonsouth.net>
;;; Copyright © 2020 Mathieu Othacehe <m.othacehe@gmail.com> ;;; Copyright © 2020 Mathieu Othacehe <m.othacehe@gmail.com>
;;; Copyright © 2025 Maxim Cournoyer <maxim@guixotic.coop>
;;; ;;;
;;; This file is part of GNU Guix. ;;; This file is part of GNU Guix.
;;; ;;;
@@ -29,7 +30,8 @@
#:use-module (srfi srfi-71) #:use-module (srfi srfi-71)
#:use-module (system foreign) #:use-module (system foreign)
#:use-module ((ice-9 ftw) #:select (scandir)) #:use-module ((ice-9 ftw) #:select (scandir))
#:use-module (ice-9 match)) #:use-module (ice-9 match)
#:use-module (ice-9 threads))
;; Test the (guix build syscalls) module, although there's not much that can ;; Test the (guix build syscalls) module, although there's not much that can
;; actually be tested without being root. ;; actually be tested without being root.
@@ -158,6 +160,38 @@
(lambda args (lambda args
(system-error-errno args)))) (system-error-errno args))))
(define child-thunk
(lambda ()
(gc) ;spawn GC threads
(primitive-exit
(catch 'system-error
(lambda ()
(unshare CLONE_THREAD)
0) ;no error
(lambda args
(system-error-errno args))))))
(define parent-proc
(lambda (pid)
(match (waitpid pid)
((_ . status)
(status:exit-val status)))))
(unless perform-container-tests?
(test-skip 1))
(test-equal "clone and unshare triggers EINVAL"
EINVAL
(match (clone (logior CLONE_NEWUSER SIGCHLD))
(0 (child-thunk))
(pid (parent-proc pid))))
(unless perform-container-tests?
(test-skip 1))
(test-equal "safe-clone and unshare succeeds"
0
(safe-clone (logior CLONE_NEWUSER SIGCHLD)
child-thunk parent-proc))
(unless perform-container-tests? (unless perform-container-tests?
(test-skip 1)) (test-skip 1))
(test-assert "setns" (test-assert "setns"