diff --git a/docs/PROGRESS.md b/docs/PROGRESS.md index b6b2188..862d32d 100644 --- a/docs/PROGRESS.md +++ b/docs/PROGRESS.md @@ -31,8 +31,48 @@ Current assessment: - Phase 1.1 is not fully complete yet because `i386` has not been checked and the subprocess crash needs investigation - verification harness committed as `e380e88` (`Add FreeBSD Guile verification harness`) +## 2026-04-01 — Phase 1.1 follow-up: subprocess crash isolated + +Completed work: + +- added a dedicated subprocess diagnostic harness: + - `tests/guile/run-subprocess-diagnostics.sh` + - `tests/guile/posix-spawn-freebsd-diagnostics.c` +- reproduced crashes for: + - `system*` + - `spawn` + - `open-pipe*` +- confirmed all three fail with `SIGSEGV` / `exit 139` +- confirmed native FreeBSD `posix_spawn` + `posix_spawn_file_actions_addclosefrom_np` works in a standalone C program +- confirmed FreeBSD behavior that triggers gnulib replacement logic: + - `posix_spawn_file_actions_adddup2` accepts an invalid fd in the gnulib probe + - `posix_spawn_file_actions_addopen` accepts an invalid fd in the gnulib probe + - `posix_spawnp` accepts a shebang-less executable script in the gnulib security probe +- wrote the analysis to `docs/reports/phase1-guile-subprocess-crash.md` + +Conclusion: + +- this is most likely an upstream Guile/gnulib ABI bug on FreeBSD, not a Guix-specific problem +- likely sequence: + 1. gnulib enables `REPLACE_POSIX_SPAWN=1` + 2. Guile still enables `HAVE_POSIX_SPAWN_FILE_ACTIONS_ADDCLOSEFROM_NP` + 3. Guile passes a gnulib replacement `posix_spawn_file_actions_t` object to native `posix_spawn_file_actions_addclosefrom_np` + 4. 
libc interprets gnulib struct fields as a native pointer and crashes +- evidence from the lldb core matches this hypothesis (`*fa = 0x0000000600000008`, consistent with gnulib `_allocated=8`, `_used=6`) + +Current assessment: + +- Phase 1.1 amd64 investigation is now much stronger and has a concrete root-cause hypothesis +- the next practical step is to validate a workaround or patch in Guile so subprocess helpers stop crashing +- after that, continue with Phase 1.2 (minimal native build environment / GNU Hello) + +Recent commits: + +- `e380e88` — `Add FreeBSD Guile verification harness` +- `cd721b1` — `Update progress after Guile verification` + Next recommended step: -1. investigate and isolate the `system*` / `open-pipe*` crash with a minimal reproducer -2. decide whether this requires a local Guile patch, a packaging fix, or Guix-side avoidance -3. once subprocess behavior is understood, continue with Phase 1.2 (minimal native build environment / GNU Hello) +1. patch or locally validate a fix for the `addclosefrom_np` / `REPLACE_POSIX_SPAWN` mismatch +2. re-run both Guile test harnesses after the fix +3. once subprocess behavior is stable, continue with Phase 1.2 (minimal native build environment / GNU Hello) diff --git a/docs/reports/phase1-guile-subprocess-crash.md b/docs/reports/phase1-guile-subprocess-crash.md new file mode 100644 index 0000000..d0add38 --- /dev/null +++ b/docs/reports/phase1-guile-subprocess-crash.md @@ -0,0 +1,182 @@ +# Phase 1.1 follow-up: Guile subprocess crash on FreeBSD + +Date: 2026-04-01 + +## Summary + +`guile3` on this `FreeBSD 15.0-STABLE` amd64 host crashes when Guile tries to create subprocesses through: + +- `system*` +- `spawn` +- `open-pipe*` + +The crash is reproducible and is **not** caused by FreeBSD's native `posix_spawn(3)` implementation by itself. 
The evidence points to an **upstream Guile/gnulib integration bug on FreeBSD**: + +- gnulib decides to replace `posix_spawn`/`posix_spawnp` on this platform +- Guile still calls the native FreeBSD extension `posix_spawn_file_actions_addclosefrom_np` +- that function receives a gnulib replacement `posix_spawn_file_actions_t` object with an incompatible ABI +- the process crashes inside libc when `addclosefrom_np` interprets gnulib's struct header as a native pointer + +## Repro artifacts added + +- `tests/guile/posix-spawn-freebsd-diagnostics.c` +- `tests/guile/run-subprocess-diagnostics.sh` + +Run with: + +```sh +./tests/guile/run-subprocess-diagnostics.sh +``` + +Expected output on the current host includes: + +```text +native-spawn-closefrom=ok +adddup2-invalid-fd-accepted=yes +addopen-invalid-fd-accepted=yes +posix_spawn-secure-exec-result=0 +posix_spawnp-secure-exec-result=3 +issue-profile-match=yes +system-star exit=139 +spawn exit=139 +open-pipe-star exit=139 +``` + +## Minimal Guile reproducers + +```sh +guile3 -c '(system* "/usr/bin/true")' +guile3 -c '(spawn "/usr/bin/true" (list "/usr/bin/true"))' +guile3 -c '(use-modules (ice-9 popen)) (open-pipe* OPEN_READ "/usr/bin/true")' +``` + +All three terminate with `SIGSEGV` (`exit 139`) on this machine. + +## Native FreeBSD `posix_spawn` is not the direct problem + +A standalone C test using FreeBSD's native APIs works correctly: + +- `posix_spawn_file_actions_init` +- `posix_spawn_file_actions_adddup2` +- `posix_spawn_file_actions_addclosefrom_np` +- `posix_spawn` + +The diagnostic program in `tests/guile/posix-spawn-freebsd-diagnostics.c` confirms this with: + +```text +native-spawn-closefrom=ok +``` + +So the crash is above libc, in how Guile/gnulib prepares the file-actions object. + +## Why gnulib replaces `posix_spawn` on this host + +Upstream Guile 3.0.10 vendors gnulib logic in `m4/posix_spawn.m4`. + +Two FreeBSD-relevant observations from the local diagnostics match gnulib's replacement logic: + +1. 
`posix_spawnp` is considered insecure by gnulib's test because it accepts a script without a shebang and ends up running it successfully instead of rejecting it with `ENOEXEC`. +2. FreeBSD's `posix_spawn_file_actions_adddup2` and `posix_spawn_file_actions_addopen` accept obviously invalid file descriptors in the gnulib probe cases, so gnulib also wants wrapper/replacement behavior there. + +Observed locally: + +```text +adddup2-invalid-fd-accepted=yes +addopen-invalid-fd-accepted=yes +posix_spawnp-secure-exec-result=3 +``` + +That strongly indicates `REPLACE_POSIX_SPAWN=1` in the Guile build on this system. + +## Root cause hypothesis + +### 1. Guile uses `addclosefrom_np` when the symbol exists + +In upstream Guile 3.0.10, `libguile/posix.c` contains: + +- `#ifdef HAVE_POSIX_SPAWN_FILE_ACTIONS_ADDCLOSEFROM_NP` +- `#define HAVE_ADDCLOSEFROM 1` +- later in `do_spawn(...)`: + +```c +#ifdef HAVE_ADDCLOSEFROM + posix_spawn_file_actions_addclosefrom_np (&actions, 3); +#else + close_inherited_fds (&actions, max_fd); +#endif +``` + +### 2. But gnulib can replace the `posix_spawn` ABI + +In upstream gnulib's `lib/spawn.in.h`, when `REPLACE_POSIX_SPAWN=1`, `posix_spawn_file_actions_t` becomes a gnulib-defined struct instead of the native FreeBSD opaque-pointer type. + +FreeBSD's native `/usr/include/spawn.h` defines: + +```c +typedef struct __posix_spawn_file_actions *posix_spawn_file_actions_t; +``` + +So native FreeBSD expects `posix_spawn_file_actions_t` to be pointer-like, while gnulib replacement mode uses an in-memory struct. + +### 3. 
The crash signature matches that ABI mismatch exactly + +The lldb backtrace from the core file shows the crash in: + +```text +libc.so.7`posix_spawn_file_actions_addclosefrom_np +``` + +with: + +```text +*fa = 0x0000000600000008 +``` + +That value matches the first two 32-bit fields of gnulib's replacement file-actions struct interpreted as a pointer: + +- `_allocated = 8` +- `_used = 6` + +Those values are entirely plausible after Guile schedules six `dup2` actions in `do_spawn(...)`. + +In other words, libc is reading gnulib's struct header as though it were a native pointer to `struct __posix_spawn_file_actions`, which explains the segmentation fault. + +## Assessment + +This looks like an **upstream Guile bug on FreeBSD-family systems where**: + +- gnulib decides `REPLACE_POSIX_SPAWN=1`, **and** +- the platform exposes native `posix_spawn_file_actions_addclosefrom_np` + +It does **not** look like a Guix-specific bug, nor primarily a local packaging mistake. + +## Recommended fix direction + +The safest fix is in Guile's `libguile/posix.c`: + +- only use `posix_spawn_file_actions_addclosefrom_np` when Guile is using the **native** `posix_spawn` / `posix_spawn_file_actions_t` ABI +- if gnulib replacement `posix_spawn` is active, fall back to `close_inherited_fds(&actions, max_fd)` instead + +In practice that likely means guarding the `HAVE_ADDCLOSEFROM` path with an additional condition equivalent to: + +```c +#if defined(HAVE_POSIX_SPAWN_FILE_ACTIONS_ADDCLOSEFROM_NP) && !defined(REPLACE_POSIX_SPAWN) +``` + +or another build-time condition that guarantees ABI compatibility. Caveat to confirm before patching: gnulib appears to apply `REPLACE_POSIX_SPAWN` as a configure-time substitution inside its generated `spawn.h` rather than as a `config.h` macro, so `!defined(REPLACE_POSIX_SPAWN)` may always be true inside `posix.c`; the real guard must test a symbol that is actually defined when the replacement is active. + +## Impact on the Guix-on-FreeBSD port + +This is an important blocker because Guix and Guile code frequently depend on subprocess creation helpers.
+
+However, the investigation also confirms:
+
+- lower-level process primitives still work (`primitive-fork`, `waitpid`)
+- sockets, file I/O, and FFI still work
+- the problem is narrow enough to patch or work around
+
+So the Guix port remains viable, but robust subprocess handling on FreeBSD will likely require either:
+
+1. a local Guile patch, or
+2. an upstream fix to Guile/gnulib integration, or
+3. temporary Guix-side avoidance of the crashing subprocess helpers while bootstrapping the port
diff --git a/tests/guile/posix-spawn-freebsd-diagnostics.c b/tests/guile/posix-spawn-freebsd-diagnostics.c
new file mode 100644
index 0000000..9a86037
--- /dev/null
+++ b/tests/guile/posix-spawn-freebsd-diagnostics.c
@@ -0,0 +1,186 @@
+/*
+ * Diagnostics for the native FreeBSD posix_spawn behaviour discussed in
+ * docs/reports/phase1-guile-subprocess-crash.md.
+ *
+ * Prints one key=value line per probe and exits 0 only when every probe
+ * matches the issue profile checked in main().
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <spawn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+extern char **environ;
+
+/*
+ * Spawn /usr/bin/true with three dup2 actions plus addclosefrom_np(3).
+ * Returns 0 on success, otherwise 1..8 identifying the step that failed.
+ */
+static int native_spawn_closefrom_test(void) {
+    posix_spawn_file_actions_t actions;
+    pid_t pid;
+    int status;
+    int rc = 0;
+    char *argv[] = { "/usr/bin/true", NULL };
+
+    if (posix_spawn_file_actions_init(&actions) != 0)
+        return 1;
+    if (posix_spawn_file_actions_adddup2(&actions, STDIN_FILENO, STDIN_FILENO) != 0)
+        rc = 2;
+    else if (posix_spawn_file_actions_adddup2(&actions, STDOUT_FILENO, STDOUT_FILENO) != 0)
+        rc = 3;
+    else if (posix_spawn_file_actions_adddup2(&actions, STDERR_FILENO, STDERR_FILENO) != 0)
+        rc = 4;
+    else if (posix_spawn_file_actions_addclosefrom_np(&actions, 3) != 0)
+        rc = 5;
+    else if (posix_spawn(&pid, "/usr/bin/true", &actions, NULL, argv, environ) != 0)
+        rc = 6;
+    /* The actions object is no longer needed once posix_spawn has returned. */
+    posix_spawn_file_actions_destroy(&actions);
+    if (rc != 0)
+        return rc;
+    if (waitpid(pid, &status, 0) < 0)
+        return 7;
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+        return 8;
+    return 0;
+}
+
+/*
+ * Returns 1 if adddup2 accepts source fd 10000000, 0 if it rejects it,
+ * and -1 if the file-actions object could not be initialised.
+ */
+static int adddup2_invalid_fd_accepted(void) {
+    posix_spawn_file_actions_t actions;
+    int accepted;
+
+    if (posix_spawn_file_actions_init(&actions) != 0)
+        return -1;
+    accepted = posix_spawn_file_actions_adddup2(&actions, 10000000, 2) == 0;
+    posix_spawn_file_actions_destroy(&actions);
+    return accepted;
+}
+
+/*
+ * Returns 1 if addopen accepts fd 10000000, 0 if it rejects it,
+ * and -1 if the file-actions object could not be initialised.
+ */
+static int addopen_invalid_fd_accepted(void) {
+    posix_spawn_file_actions_t actions;
+    int accepted;
+
+    if (posix_spawn_file_actions_init(&actions) != 0)
+        return -1;
+    accepted = posix_spawn_file_actions_addopen(&actions, 10000000, "foo", 0, O_RDONLY) == 0;
+    posix_spawn_file_actions_destroy(&actions);
+    return accepted;
+}
+
+/*
+ * Create an executable (mode 0700) script without a shebang line under /tmp
+ * and store its name in path (capacity size).  Returns 0 on success, -1 on
+ * failure; a partially created file is removed.
+ */
+static int write_script(char *path, size_t size) {
+    int fd;
+    const char script[] = ":\n";
+
+    if (snprintf(path, size, "/tmp/fruix-posix-spawn-script-XXXXXX") >= (int)size)
+        return -1;
+    fd = mkstemp(path);
+    if (fd < 0)
+        return -1;
+    if (write(fd, script, sizeof(script) - 1) != (ssize_t)(sizeof(script) - 1)) {
+        close(fd);
+        unlink(path);
+        return -1;
+    }
+    if (fchmod(fd, 0700) != 0) {
+        close(fd);
+        unlink(path);
+        return -1;
+    }
+    if (close(fd) != 0) {
+        unlink(path);
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Try to run the shebang-less script with posix_spawnp (use_path != 0) or
+ * posix_spawn (use_path == 0).
+ *
+ * Returns:
+ *   0  the spawn call failed with ENOEXEC, or the child exited with 127
+ *   1  the spawn call failed with another error
+ *   2  the child did not exit normally
+ *   3  the child exited normally with a status other than 127
+ *  -1  the probe could not be completed (script creation or waitpid failed)
+ */
+static int secure_exec_result(int use_path) {
+    char script[128];
+    pid_t child;
+    int err;
+    int status = 0;
+    char *argv[] = { script, NULL };
+    char *env[] = { "PATH=/tmp:/usr/bin:/bin", NULL };
+
+    if (write_script(script, sizeof(script)) != 0)
+        return -1;
+
+    err = use_path
+        ? posix_spawnp(&child, script, NULL, NULL, argv, env)
+        : posix_spawn(&child, script, NULL, NULL, argv, env);
+
+    if (err == ENOEXEC) {
+        unlink(script);
+        return 0;
+    }
+    if (err != 0) {
+        unlink(script);
+        return 1;
+    }
+    while (waitpid(child, &status, 0) != child) {
+        /* Retry only after EINTR; any other failure would otherwise loop forever. */
+        if (errno != EINTR) {
+            unlink(script);
+            return -1;
+        }
+    }
+    unlink(script);
+    if (!WIFEXITED(status))
+        return 2;
+    if (WEXITSTATUS(status) != 127)
+        return 3;
+    return 0;
+}
+
+/* Run every probe, print the results, and exit 0 only on a full match. */
+int main(void) {
+    int native_ok = native_spawn_closefrom_test();
+    int dup2_broken = adddup2_invalid_fd_accepted();
+    int addopen_broken = addopen_invalid_fd_accepted();
+    int spawn_secure = secure_exec_result(0);
+    int spawnp_secure = secure_exec_result(1);
+    int issue_profile_match =
+        native_ok == 0 &&
+        dup2_broken == 1 &&
+        addopen_broken == 1 &&
+        spawn_secure == 0 &&
+        spawnp_secure == 3;
+
+    printf("native-spawn-closefrom=%s\n", native_ok == 0 ? "ok" : "fail");
+    printf("adddup2-invalid-fd-accepted=%s\n", dup2_broken == 1 ? "yes" : (dup2_broken == 0 ? "no" : "error"));
+    printf("addopen-invalid-fd-accepted=%s\n", addopen_broken == 1 ? "yes" : (addopen_broken == 0 ? "no" : "error"));
+    printf("posix_spawn-secure-exec-result=%d\n", spawn_secure);
+    printf("posix_spawnp-secure-exec-result=%d\n", spawnp_secure);
+    printf("issue-profile-match=%s\n", issue_profile_match ? "yes" : "no");
+
+    return issue_profile_match ? 0 : 1;
+}
diff --git a/tests/guile/run-subprocess-diagnostics.sh b/tests/guile/run-subprocess-diagnostics.sh
new file mode 100755
index 0000000..e61e166
--- /dev/null
+++ b/tests/guile/run-subprocess-diagnostics.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+# Reproduce the known FreeBSD Guile subprocess crash profile.
+#
+# Builds and runs the native posix_spawn diagnostic, then evaluates three
+# Guile one-liners and requires each of them to exit with status 139.
+# Exits 0 only when the whole profile is reproduced.
+set -eu
+
+script_dir=$(CDPATH= cd -- "$(dirname "$0")" && pwd)
+repo_root=$(CDPATH= cd -- "$script_dir/../.." && pwd)
+workdir=$(mktemp -d /tmp/fruix-guile-subprocess.XXXXXX)
+# Remove the work directory once, on exit.  INT/TERM must terminate the
+# script; a handler that only cleans up would let execution resume with the
+# work directory already gone.
+trap 'rm -rf "$workdir"' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+if command -v guile3 >/dev/null 2>&1; then
+  guile_bin=$(command -v guile3)
+elif command -v guile-3.0 >/dev/null 2>&1; then
+  guile_bin=$(command -v guile-3.0)
+else
+  echo "Unable to find guile3 or guile-3.0 in PATH" >&2
+  exit 1
+fi
+
+# Set the core file size limit to zero (best effort).
+ulimit -c 0 || true
+
+cc -Wall -Wextra -O2 "$repo_root/tests/guile/posix-spawn-freebsd-diagnostics.c" \
+  -o "$workdir/posix-spawn-freebsd-diagnostics"
+
+printf '== Native posix_spawn diagnostics ==\n'
+"$workdir/posix-spawn-freebsd-diagnostics"
+
+# run_guile_case NAME CODE
+# Evaluate CODE with "$guile_bin" -c, print "NAME exit=STATUS", and return 0
+# only when STATUS is 139; any other status is reported on stderr.
+run_guile_case() {
+  name=$1
+  code=$2
+  rc=0
+  "$guile_bin" -c "$code" >/dev/null 2>&1 || rc=$?
+  printf '%s exit=%s\n' "$name" "$rc"
+  if [ "$rc" -ne 139 ]; then
+    printf '%s: expected exit status 139 (SIGSEGV), got %s\n' "$name" "$rc" >&2
+    return 1
+  fi
+}
+
+printf '== Guile subprocess crash repro ==\n'
+run_guile_case system-star '(system* "/usr/bin/true")'
+run_guile_case spawn '(spawn "/usr/bin/true" (list "/usr/bin/true"))'
+run_guile_case open-pipe-star '(use-modules (ice-9 popen)) (open-pipe* OPEN_READ "/usr/bin/true")'
+
+printf 'known FreeBSD Guile subprocess crash profile reproduced\n'