From 808c28023241bfb888f47b190dd3ca79584bf8d6 Mon Sep 17 00:00:00 2001 From: Steffen Beyer Date: Tue, 14 Apr 2026 19:00:00 +0200 Subject: [PATCH] Stabilize XCP-ng self-hosted lifecycle validation --- docs/plan_2.md | 27 +-- modules/fruix/system/freebsd/build.scm | 158 ++++++++++-------- modules/fruix/system/freebsd/render.scm | 77 +++++---- tests/run-self-hosted-node-lifecycle-xcpng.sh | 36 ++-- tests/run-self-hosted-node-lifecycle.sh | 21 ++- 5 files changed, 195 insertions(+), 124 deletions(-) diff --git a/docs/plan_2.md b/docs/plan_2.md index 1ee77a8..8f75b78 100644 --- a/docs/plan_2.md +++ b/docs/plan_2.md @@ -193,21 +193,28 @@ Completed: - `tests/run-self-hosted-node-lifecycle-xcpng.sh` that reuses the established test VM path to build/import/boot a self-hosted image and then run the SSH lifecycle harness +- got the XCP-ng lifecycle path green again after tightening several real-host + assumptions: + - keep the XCP-ng guest on `ifconfig_xn0=SYNCDHCP` + - use a root/dev-only filesystem layout in the harness-generated guest so a + temporary-host store path is not hidden by guest tmpfs policy + - disable `clear_tmp` in that harness so closure/store paths staged under + `/tmp` survive boot + - use a fresh per-run host store in the harness to avoid dragging unrelated + historical store items into the guest image + - prepare a narrow node-local reuse store for reconfigure/rollback so the + guest can reuse the host-staged package stores it already holds + - harden the staged runtime for live-node reuse by: + - removing the materialized Guile/shepherd host-prefix leakage + - fixing root home ownership for SSH strict-modes + - avoiding external `cmp`/`tail` assumptions in node-local operations + - seeding `/usr/src/sys/sys` from the staged profile for the transitional + `freebsd-kernel-headers` path Remaining: - execute the real-node validation harness against a booted self-hosted node and record the outcome -- finish resolving the current XCP-ng live-node validation 
blocker; current - debugging indicates two concrete harness-level constraints: - - for the current Shepherd PID 1 / XCP-ng path, `ifconfig_xn0=SYNCDHCP` - keeps the guest alive, while the broader `xn0` + `em0` + `vtnet0` set can - cause the guest to halt before validation completes - - a live-booted validation image must not hide its copied store under a - tmpfs-mounted `/tmp`; the harness therefore needs a root/dev-only filesystem - layout when it stages a guest image from a temporary host store path - - there is still a remaining SSH bring-up issue to finish debugging after the - guest reaches the network on the narrowed XCP-ng path - decide whether generation-local `install.scm` should keep its current deployment-oriented schema or move closer to the initial install-generation shape diff --git a/modules/fruix/system/freebsd/build.scm b/modules/fruix/system/freebsd/build.scm index 654f48f..12662b2 100644 --- a/modules/fruix/system/freebsd/build.scm +++ b/modules/fruix/system/freebsd/build.scm @@ -896,7 +896,9 @@ (false-if-exception (readlink path)))) (define (same-file-contents? a b) - (zero? (system* "cmp" "-s" a b))) + (and (file-exists? a) + (file-exists? b) + (string=? (file-hash a) (file-hash b)))) (define (merge-tree-node/copy source destination) (let ((kind (stat:type (lstat source)))) @@ -1391,63 +1393,81 @@ (inputs . ,input-paths) (install-plan . ,(freebsd-package-install-plan package)))))) +(define (reusable-copy-build-store-path package store-dir) + (and (eq? (freebsd-package-build-system package) 'copy-build-system) + (file-exists? store-dir) + (let* ((display-name (string-append (freebsd-package-name package) + "-" + (freebsd-package-version package))) + (matches (filter (lambda (entry) + (let ((path (string-append store-dir "/" entry))) + (and (string-suffix? (string-append "-" display-name) entry) + (file-exists? path) + (file-exists? 
(string-append path "/.fruix-package"))))) + (directory-entries store-dir)))) + (and (= (length matches) 1) + (string-append store-dir "/" (car matches)))))) + (define* (materialize-freebsd-package package store-dir cache #:optional source-cache) (if (existing-store-package? package) (validate-existing-store-package package) - (let* ((source-cache (or source-cache (make-hash-table))) - (input-paths (map (lambda (input) - (materialize-freebsd-package input store-dir cache source-cache)) - (freebsd-package-inputs package))) - (prepared-package - (if (freebsd-native-build-package? package) - (let* ((source (plan-freebsd-source (freebsd-package-install-plan package))) - (source-result (materialize-freebsd-source/cached source store-dir source-cache)) - (plan (plan-with-materialized-source (freebsd-package-install-plan package) - source-result))) - (package-with-install-plan package plan)) - package)) - (effective-input-paths - (if (freebsd-native-build-package? package) - (cons (build-plan-ref (freebsd-package-install-plan prepared-package) - 'materialized-source-store - #f) - input-paths) - input-paths)) - (effective-input-paths (filter identity effective-input-paths)) - (materialization-cache-key - (package-materialization-cache-key prepared-package - store-dir - effective-input-paths)) - (cached (hash-ref cache materialization-cache-key #f))) - (if cached - cached - (let* ((manifest (package-manifest-string prepared-package effective-input-paths)) - (manifest-cache-key (sha256-string manifest)) - (manifest-cached (hash-ref cache manifest-cache-key #f))) - (if manifest-cached - (begin - (hash-set! cache materialization-cache-key manifest-cached) - manifest-cached) - (let* ((display-name (string-append (freebsd-package-name prepared-package) - "-" - (freebsd-package-version prepared-package))) - (output-path (make-store-path store-dir display-name manifest - #:kind 'freebsd-package))) - (unless (file-exists? 
output-path) - (case (freebsd-package-build-system prepared-package) - ((copy-build-system) - (materialize-copy-freebsd-package prepared-package - effective-input-paths - manifest - output-path)) - ((freebsd-world-build-system freebsd-kernel-build-system) - (materialize-native-freebsd-package prepared-package effective-input-paths manifest output-path)) - (else - (error (format #f "unsupported package build system: ~a" - (freebsd-package-build-system prepared-package)))))) - (hash-set! cache materialization-cache-key output-path) - (hash-set! cache manifest-cache-key output-path) - output-path))))))) + (let ((reusable-store-path (reusable-copy-build-store-path package store-dir))) + (if reusable-store-path + reusable-store-path + (let* ((source-cache (or source-cache (make-hash-table))) + (input-paths (map (lambda (input) + (materialize-freebsd-package input store-dir cache source-cache)) + (freebsd-package-inputs package))) + (prepared-package + (if (freebsd-native-build-package? package) + (let* ((source (plan-freebsd-source (freebsd-package-install-plan package))) + (source-result (materialize-freebsd-source/cached source store-dir source-cache)) + (plan (plan-with-materialized-source (freebsd-package-install-plan package) + source-result))) + (package-with-install-plan package plan)) + package)) + (effective-input-paths + (if (freebsd-native-build-package? 
package) + (cons (build-plan-ref (freebsd-package-install-plan prepared-package) + 'materialized-source-store + #f) + input-paths) + input-paths)) + (effective-input-paths (filter identity effective-input-paths)) + (materialization-cache-key + (package-materialization-cache-key prepared-package + store-dir + effective-input-paths)) + (cached (hash-ref cache materialization-cache-key #f))) + (if cached + cached + (let* ((manifest (package-manifest-string prepared-package effective-input-paths)) + (manifest-cache-key (sha256-string manifest)) + (manifest-cached (hash-ref cache manifest-cache-key #f))) + (if manifest-cached + (begin + (hash-set! cache materialization-cache-key manifest-cached) + manifest-cached) + (let* ((display-name (string-append (freebsd-package-name prepared-package) + "-" + (freebsd-package-version prepared-package))) + (output-path (make-store-path store-dir display-name manifest + #:kind 'freebsd-package))) + (unless (file-exists? output-path) + (case (freebsd-package-build-system prepared-package) + ((copy-build-system) + (materialize-copy-freebsd-package prepared-package + effective-input-paths + manifest + output-path)) + ((freebsd-world-build-system freebsd-kernel-build-system) + (materialize-native-freebsd-package prepared-package effective-input-paths manifest output-path)) + (else + (error (format #f "unsupported package build system: ~a" + (freebsd-package-build-system prepared-package)))))) + (hash-set! cache materialization-cache-key output-path) + (hash-set! cache manifest-cache-key output-path) + output-path))))))))) (define native-build-result-promotion-version "1") @@ -1901,38 +1921,44 @@ promoted-artifacts) 'store-path))))) -(define (sanitize-materialized-prefix name output-path) +(define (sanitize-materialized-prefix name source-path output-path) (cond ((string=? 
name "fruix-guile-extra") (rewrite-text-file (string-append output-path "/share/guile/site/3.0/fibers/config.scm") - '(("((getenv \"FIBERS_BUILD_DIR\")\n => (lambda (builddir) (in-vicinity builddir \".libs\")))\n (else \"/tmp/guile-gnutls-freebsd-validate-install/lib/guile/3.0/extensions\"))" + `((,(string-append "((getenv \"FIBERS_BUILD_DIR\")\n => (lambda (builddir) (in-vicinity builddir \".libs\")))\n (else \"" source-path "/lib/guile/3.0/extensions\"))") . "((getenv \"FIBERS_BUILD_DIR\")\n => (lambda (builddir) (in-vicinity builddir \".libs\")))\n ((getenv \"GUILE_EXTENSIONS_PATH\"))\n (else \"/usr/local/lib/guile/3.0/extensions\"))"))) (rewrite-text-file (string-append output-path "/share/guile/site/3.0/gnutls.scm") - '(("\"/tmp/guile-gnutls-freebsd-validate-install/lib/guile/3.0/extensions\"" + `((,(string-append "\"" source-path "/lib/guile/3.0/extensions\"") . "(or (getenv \"GUILE_EXTENSIONS_PATH\") \"/usr/local/lib/guile/3.0/extensions\")"))) + (for-each + (lambda (path) + (rewrite-text-file path + `((,source-path . ,output-path)))) + (list (string-append output-path "/lib/guile/3.0/extensions/fibers-libevent.la") + (string-append output-path "/lib/guile/3.0/extensions/guile-gnutls-v-2.la"))) (delete-file-if-exists (string-append output-path "/lib/guile/3.0/site-ccache/fibers/config.go")) (delete-file-if-exists (string-append output-path "/lib/guile/3.0/site-ccache/gnutls.go"))) ((string=? name "fruix-shepherd-runtime") (rewrite-text-file (string-append output-path "/share/guile/site/3.0/shepherd/config.scm") - '(("(define Prefix-dir \"/tmp/shepherd-freebsd-validate-install\")" + `((,(string-append "(define Prefix-dir \"" source-path "\")") . "(define Prefix-dir \"/frx\")") - ("(define %localstatedir \"/tmp/shepherd-freebsd-validate-install/var\")" + (,(string-append "(define %localstatedir \"" source-path "/var\")") . 
"(define %localstatedir \"/var\")") - ("(define %runstatedir \"/tmp/shepherd-freebsd-validate-install/var/run\")" + (,(string-append "(define %runstatedir \"" source-path "/var/run\")") . "(define %runstatedir \"/var/run\")") - ("(define %sysconfdir \"/tmp/shepherd-freebsd-validate-install/etc\")" + (,(string-append "(define %sysconfdir \"" source-path "/etc\")") . "(define %sysconfdir \"/etc\")") - ("(define %localedir \"/tmp/shepherd-freebsd-validate-install/share/locale\")" + (,(string-append "(define %localedir \"" source-path "/share/locale\")") . "(define %localedir \"/usr/share/locale\")") - ("(define %pkglibdir \"/tmp/shepherd-freebsd-validate-install/lib/shepherd\")" + (,(string-append "(define %pkglibdir \"" source-path "/lib/shepherd\")") . "(define %pkglibdir \"/usr/local/lib/shepherd\")"))) (delete-file-if-exists (string-append output-path "/lib/guile/3.0/site-ccache/shepherd/config.go")))) #t) -(define prefix-materializer-version "3") +(define prefix-materializer-version "4") (define (prefix-manifest-string source-path extra-files) (string-append @@ -1980,7 +2006,7 @@ (copy-extra-node (car entry) (string-append output-path "/" (cdr entry)))) extra-files) - (sanitize-materialized-prefix name output-path) + (sanitize-materialized-prefix name source-path output-path) (write-file (string-append output-path "/.fruix-package") manifest)) output-path)) diff --git a/modules/fruix/system/freebsd/render.scm b/modules/fruix/system/freebsd/render.scm index 99fa180..4b379ea 100644 --- a/modules/fruix/system/freebsd/render.scm +++ b/modules/fruix/system/freebsd/render.scm @@ -168,7 +168,8 @@ "PubkeyAuthentication yes\n" "AuthorizedKeysFile .ssh/authorized_keys\n" "PidFile /var/run/sshd.pid\n" - "UseDNS no\n")) + "UseDNS no\n" + "PerSourcePenalties no\n")) (define* (render-activation-script os #:key guile-store guile-extra-store shepherd-store) (let* ((users (operating-system-users os)) @@ -184,7 +185,7 @@ (system? (user-account-system? 
account))) (string-append "mkdir -p " home "\n" - (if (or (string=? name "root") system?) + (if (and system? (not (string=? name "root"))) "" (format #f "if [ -x /usr/sbin/chown ]; then /usr/sbin/chown ~a:~a ~a 2>/dev/null || true; fi\n" uid gid home))))) @@ -221,6 +222,12 @@ "trap 'status=$?; echo \"fruix-activate:exit status=$status\" >> \"$logfile\"' EXIT\n" "echo \"fruix-activate:start\" >> \"$logfile\"\n" "chmod 1777 /tmp\n" + "if [ ! -d /usr/src/sys/sys ]; then\n" + " mkdir -p /usr/src/sys\n" + " if [ -d /run/current-development/include/sys ]; then cp -a /run/current-development/include/sys /usr/src/sys/sys; fi\n" + " if [ ! -d /usr/src/sys/sys ] && [ -d /run/current-build/include/sys ]; then cp -a /run/current-build/include/sys /usr/src/sys/sys; fi\n" + " if [ ! -d /usr/src/sys/sys ] && [ -d /run/current-system/profile/include/sys ]; then cp -a /run/current-system/profile/include/sys /usr/src/sys/sys; fi\n" + "fi\n" refresh-db-input-files "if [ -x /usr/bin/cap_mkdb ] && [ -f /etc/login.conf ]; then\n" " if /usr/bin/cap_mkdb /etc/login.conf; then echo \"fruix-activate:cap_mkdb=ok\" >> \"$logfile\"; else echo \"fruix-activate:cap_mkdb=failed\" >> \"$logfile\"; fi\n" @@ -287,13 +294,13 @@ (define (render-shepherd-config os) (let* ((ready-marker (operating-system-ready-marker os)) (pid1? (pid1-init-mode? os)) - (start-sshd? (and pid1? (or (sshd-enabled? os) - (member 'sshd (operating-system-services os))))) + (start-sshd? (or (sshd-enabled? os) + (member 'sshd (operating-system-services os)))) (ready-requirements (if start-sshd? "'(fruix-logger sshd)" "'(fruix-logger)")) - (pid1-helpers - (if pid1? + (service-helpers + (if start-sshd? (string-append "(define (run-command program . args)\n" " (let ((status (apply system* program args)))\n" @@ -316,28 +323,32 @@ " #f)\n" " #:respawn? #f))\n\n") "")) - (pid1-services - (if pid1? - (string-append - (if start-sshd? - " (freebsd-rc-service '(netif) \"netif\"\n" - "") - (if start-sshd? 
- " #:requirement '(fruix-logger)\n" - "") - (if start-sshd? - " #:documentation \"Bring up FreeBSD networking from rc.conf.\")\n" - "") - (if start-sshd? - " (freebsd-rc-service '(sshd) \"sshd\"\n" - "") - (if start-sshd? - " #:requirement '(netif)\n" - "") - (if start-sshd? - " #:documentation \"Start OpenSSH under Shepherd PID 1.\")\n" - "")) - ""))) + (shepherd-services + (string-append + (if (and pid1? start-sshd?) + " (freebsd-rc-service '(netif) \"netif\"\n" + "") + (if (and pid1? start-sshd?) + " #:requirement '(fruix-logger)\n" + "") + (if (and pid1? start-sshd?) + " #:documentation \"Bring up FreeBSD networking from rc.conf.\")\n" + "") + (if start-sshd? + " (freebsd-rc-service '(sshd) \"sshd\"\n" + "") + (if (and pid1? start-sshd?) + " #:requirement '(netif)\n" + "") + (if (and (not pid1?) start-sshd?) + " #:requirement '(fruix-logger)\n" + "") + (if (and pid1? start-sshd?) + " #:documentation \"Start OpenSSH under Shepherd PID 1.\")\n" + "") + (if (and (not pid1?) start-sshd?) + " #:documentation \"Ensure OpenSSH is started for node-local access.\")\n" + "")))) (string-append "(use-modules (shepherd service)\n" " (ice-9 ftw)\n" @@ -351,7 +362,7 @@ " (mkdir dir)))\n\n" "(define (ensure-parent-directory file)\n" " (mkdir-p* (dirname file)))\n\n" - pid1-helpers + service-helpers "(register-services\n" " (list\n" " (service '(fruix-logger)\n" @@ -364,7 +375,7 @@ " #t)\n" " #:stop (lambda _ #f)\n" " #:respawn? #f)\n" - pid1-services + shepherd-services " (service '(fruix-ready)\n" " #:documentation \"Write the Fruix ready marker.\"\n" " #:requirement " ready-requirements "\n" @@ -779,7 +790,7 @@ " exit 1\n" " fi\n" " fi\n" - " result_root=$(sed -n 's/^result_root=//p' \"$build_output\" | tail -n 1)\n" + " result_root=$(sed -n 's/^result_root=//p' \"$build_output\" | sed -n '$p')\n" " [ -n \"$result_root\" ] || { cleanup_build_base; die \"failed to recover result_root from build-base output\"; }\n" " if ! 
run_node_cli native-build promote \"$result_root\" --store \"$store_dir\" > \"$promote_output\"; then\n" " cat \"$build_output\" >&2 || true\n" @@ -787,7 +798,7 @@ " cleanup_build_base\n" " exit 1\n" " fi\n" - " result_store=$(sed -n 's/^result_store=//p' \"$promote_output\" | tail -n 1)\n" + " result_store=$(sed -n 's/^result_store=//p' \"$promote_output\" | sed -n '$p')\n" " [ -n \"$result_store\" ] || { cleanup_build_base; die \"failed to recover result_store from promotion output\"; }\n" " cat \"$build_output\"\n" " cat \"$promote_output\"\n" @@ -831,7 +842,7 @@ " exit 1\n" " fi\n" " fi\n" - " closure=$(sed -n 's/^closure_path=//p' \"$build_output\" | tail -n 1)\n" + " closure=$(sed -n 's/^closure_path=//p' \"$build_output\" | sed -n '$p')\n" " [ -n \"$closure\" ] || die \"failed to recover closure_path from in-system build output\"\n" " cat \"$build_output\"\n" " rm -f \"$build_output\"\n" diff --git a/tests/run-self-hosted-node-lifecycle-xcpng.sh b/tests/run-self-hosted-node-lifecycle-xcpng.sh index 3618157..b909f3b 100755 --- a/tests/run-self-hosted-node-lifecycle-xcpng.sh +++ b/tests/run-self-hosted-node-lifecycle-xcpng.sh @@ -35,8 +35,8 @@ EOF repo_root=$(CDPATH= cd -- "$(dirname "$0")/.." 
&& pwd) vm_id=${VM_ID:-90490f2e-e8fc-4b7a-388e-5c26f0157289} -store_dir=${STORE_DIR:-/tmp/fruix-dev-pkgs-store.7tT6wR} -root_size=${ROOT_SIZE:-1g} +store_dir=${STORE_DIR:-} +root_size=${ROOT_SIZE:-2g} ssh_user=${SSH_USER:-root} ssh_port=${SSH_PORT:-22} identity_file=${ROOT_SSH_PRIVATE_KEY_FILE:-$HOME/.ssh/id_ed25519} @@ -126,6 +126,10 @@ if [ "${KEEP_WORKDIR:-0}" -eq 1 ]; then cleanup=0 fi +if [ -z "$store_dir" ]; then + store_dir=$workdir/store +fi + cleanup_workdir() { if [ "$cleanup" -eq 1 ]; then rm -rf "$workdir" @@ -178,7 +182,7 @@ scp_guest() { wait_for_ssh() { for attempt in $(jot 120 1 120); do - if ssh_guest 'service sshd onestatus >/dev/null 2>&1' >/dev/null 2>&1; then + if ssh_guest 'test -f /var/lib/fruix/ready' >/dev/null 2>&1; then return 0 fi sleep 2 @@ -202,7 +206,11 @@ candidate_remote_path=/root/candidate-self-hosted-dev.scm cat > "$current_os_file" < "$current_os_file" < "$candidate_os_file" < "$candidate_os_file" < "$vm_info_json" @@ -399,7 +411,7 @@ sshd_status=$(ssh_guest 'service sshd onestatus >/dev/null 2>&1 && echo running [ "$sshd_status" = running ] || { echo "sshd is not running in guest" >&2; exit 1; } echo "[5/6] copying candidate declaration to guest" -scp_guest "$candidate_os_file" "$ssh_user@$guest_ip:$candidate_remote_path" >/dev/null +ssh_guest "cat > $(shell_quote "$candidate_remote_path")" < "$candidate_os_file" if [ "$boot_only" -eq 0 ]; then echo "[6/6] running self-hosted node lifecycle validation" diff --git a/tests/run-self-hosted-node-lifecycle.sh b/tests/run-self-hosted-node-lifecycle.sh index 81a18b8..3163cd6 100755 --- a/tests/run-self-hosted-node-lifecycle.sh +++ b/tests/run-self-hosted-node-lifecycle.sh @@ -93,9 +93,9 @@ shell_quote() { ssh_invoke() { remote_command=$1 if [ -n "$identity_file" ]; then - ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -p "$port" -i "$identity_file" "$user@$host" "$remote_command" + ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p "$port" -i 
"$identity_file" "$user@$host" "$remote_command" else - ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -p "$port" "$user@$host" "$remote_command" + ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p "$port" "$user@$host" "$remote_command" fi } @@ -167,6 +167,7 @@ before_default_declaration=$(metadata_value "$status_before" default_declaration before_default_system_name=$(metadata_value "$status_before" default_system_name) before_current_generation_metadata=$(metadata_value "$status_before" current_generation_metadata) before_current_declaration_file=$(metadata_value "$status_before" current_declaration_file) +remote_store_dir=/var/tmp/fruix-node-lifecycle-store assert_non_empty current_generation "$before_current_generation" assert_non_empty current_closure "$before_current_closure" @@ -174,12 +175,26 @@ assert_non_empty default_declaration_file "$before_default_declaration" assert_non_empty default_system_name "$before_default_system_name" assert_non_empty current_generation_metadata "$before_current_generation_metadata" assert_non_empty current_declaration_file "$before_current_declaration_file" +assert_non_empty remote_store_dir "$remote_store_dir" assert_remote_file_exists "$before_current_generation_metadata" assert_remote_file_exists "$before_current_declaration_file" + +ssh_invoke 'sh -s' < "$reconfigure_output" +ssh_invoke "$(remote_program_command /usr/local/bin/fruix system reconfigure --store "$remote_store_dir" "$@")" > "$reconfigure_output" reconfigure_closure=$(metadata_value "$reconfigure_output" reconfigure_closure) reboot_required=$(metadata_value "$reconfigure_output" reboot_required) assert_non_empty reconfigure_closure "$reconfigure_closure"