feat: add cluster sender reboot scenario

Rename the clustered Sender fanout scenario to `cluster-sender-fanout-reboot` to reflect that it covers the edge reboot recovery path, and wire the renamed command through the CLI and npm scripts.

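Use provider default instances from runtime config instead of per-scenario instance constants (the Scaleway default changes from BASIC3-X2C-8G to DEV1-L), shorten the default edge reboot timeout from 6 to 3 minutes, allow cleanup diagnostics to tolerate expected timeouts, and document the updated scenario and live-debug helper guidance.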

Validated with SUPERTEST_CERT_MODE=self-signed SUPERTEST_WAIT_ON_FAILURE=1 npm run scenario:cluster-sender-fanout-reboot and npm run typecheck.
This commit is contained in:
2026-05-19 09:26:10 +02:00
parent b0dd242146
commit 2745c21675
9 changed files with 27 additions and 28 deletions
+6
View File
@@ -75,6 +75,12 @@ devenv shell -- scripts/supertest-legion rpc <node> 'Application.started_applica
`scripts/supertest-legion` reads `.state/supertest/latest.json` and reuses the generated `LEGION_STATE_DIR`, `LEGION_CACHE_DIR`, Legion repo, and CLI entrypoint from the current or latest scenario run. Do not enter a second `devenv` inside the Legion checkout for this; the supertest shell provides the Node tooling and the helper invokes Legion directly.
It is fine to run these `devenv shell -- scripts/supertest-legion ...`
inspection commands while a scenario is still running or is paused waiting
for failure inspection. The general devenv caution is about starting
unrelated commands in the same checkout at the same time; it does not apply
to nested live-debug helper calls that are part of an active supertest run.
Other live-debug tools:
- `legion ssh <node>`
+7 -5
View File
@@ -100,10 +100,10 @@ Run the single-node Sender ingest/HLS scenario:
npm run scenario:single-node-sender
```
Run the clustered Sender fanout scenario:
Run the clustered Sender fanout/reboot scenario:
```bash
npm run scenario:cluster-sender-fanout
npm run scenario:cluster-sender-fanout-reboot
```
Run the cluster lifecycle scenario:
@@ -130,7 +130,7 @@ Run one scenario directly with the generic entrypoint:
npm run scenario -- single-node-init
npm run scenario -- single-node-plugin-rollout-rollback
npm run scenario -- single-node-sender
npm run scenario -- cluster-sender-fanout
npm run scenario -- cluster-sender-fanout-reboot
npm run scenario -- cluster-plugin-rollout-sync-split-brain
npm run scenario -- cluster-lifecycle
```
@@ -170,8 +170,8 @@ main checkout contains unrelated local work.
Provisions one Hetzner init node, applies a plugin rollout through the public admin API, and rolls back to the pre-rollout generation.
- `single-node-sender`
Provisions one Hetzner init node, installs the `sender` plugin when needed, starts RTMP ingest through Legion, publishes an audio test stream, verifies HLS playlist and segment output, and stops the stream.
- `cluster-sender-fanout`
Builds a three-node OVH/Hetzner/Scaleway cluster, installs the `sender` plugin when needed, starts one origin plus two HLS edges through Legion, publishes a 6 Mbit/s video test stream, runs HLS viewers against every node, and verifies Sender viewer-count rollups through the admin API. Set `SUPERTEST_SENDER_VIEWERS_PER_NODE` to override the default of 3 viewers per node.
- `cluster-sender-fanout-reboot`
Builds a three-node OVH/Hetzner/Scaleway cluster, installs the `sender` plugin when needed, starts one origin plus two HLS edges through Legion, publishes a 6 Mbit/s video test stream, runs HLS viewers against every node, verifies Sender viewer-count rollups through the admin API, reboots one edge, and verifies recovery. Set `SUPERTEST_SENDER_VIEWERS_PER_NODE` to override the default of 3 viewers per node.
- `cluster-plugin-rollout-sync-split-brain`
Builds a three-node Hetzner/Scaleway/OVH cluster, applies the `supertest` plugin rollout, validates synced table writes and cluster pubsub across a temporary sync-port partition/rejoin, and rolls back every node.
- `cluster-lifecycle`
@@ -303,6 +303,8 @@ These tools are optional for normal runs, but they are practical when:
Override the Scaleway provider account id used in isolated state.
- `SUPERTEST_HETZNER_INSTANCE`
Override the Hetzner instance/offer selection.
- `SUPERTEST_OVH_INSTANCE`
Override the OVH instance/offer selection.
- `SUPERTEST_SCALEWAY_INSTANCE`
Override the Scaleway instance/offer selection.
- `SUPERTEST_HETZNER_BOOT_MODE`
+1 -1
View File
@@ -35,7 +35,7 @@
"scenario:single-node-init": "npm run scenario -- single-node-init",
"scenario:single-node-plugin-rollout-rollback": "npm run scenario -- single-node-plugin-rollout-rollback",
"scenario:single-node-sender": "npm run scenario -- single-node-sender",
"scenario:cluster-sender-fanout": "npm run scenario -- cluster-sender-fanout",
"scenario:cluster-sender-fanout-reboot": "npm run scenario -- cluster-sender-fanout-reboot",
"scenario:cluster-plugin-rollout-sync-split-brain": "npm run scenario -- cluster-plugin-rollout-sync-split-brain",
"scenario:cluster-lifecycle": "npm run scenario -- cluster-lifecycle"
},
+1 -1
View File
@@ -182,7 +182,7 @@ export function resolveRuntimeConfig(
scaleway: {
...buildProviderConfig({
accountId: readEnv(env, "SUPERTEST_SCALEWAY_ACCOUNT_ID") || "supertest-scaleway",
instance: readEnv(env, "SUPERTEST_SCALEWAY_INSTANCE") || "BASIC3-X2C-8G",
instance: readEnv(env, "SUPERTEST_SCALEWAY_INSTANCE") || "DEV1-L",
bootMode: parseBootMode(readEnv(env, "SUPERTEST_SCALEWAY_BOOT_MODE"))
}),
zone:
+2 -2
View File
@@ -6,7 +6,7 @@ import { runNamedScenario } from "./runner.js"
function printUsage(): void {
console.log(
"Usage: tribes-supertest <list|single-node-init|single-node-plugin-rollout-rollback|single-node-sender|cluster-sender-fanout|cluster-plugin-rollout-sync-split-brain|cluster-lifecycle>"
"Usage: tribes-supertest <list|single-node-init|single-node-plugin-rollout-rollback|single-node-sender|cluster-sender-fanout-reboot|cluster-plugin-rollout-sync-split-brain|cluster-lifecycle>"
)
console.log("")
console.log("Commands:")
@@ -20,7 +20,7 @@ function printUsage(): void {
console.log(
" Run single-node Sender rollout, RTMP ingest, and HLS validation."
)
console.log(" cluster-sender-fanout")
console.log(" cluster-sender-fanout-reboot")
console.log(
" Run clustered Sender origin/edge fanout and viewer-count validation."
)
+5 -2
View File
@@ -2266,6 +2266,7 @@ class LegionAdapter {
cwd: this.paths.scenarioRootDir,
env: process.env,
timeoutMs,
allowTimeout: allowFailure,
stdinText: `${script}\n`
},
outputDir,
@@ -2316,7 +2317,8 @@ class LegionAdapter {
],
cwd: this.paths.scenarioRootDir,
env: process.env,
timeoutMs
timeoutMs,
allowTimeout: allowFailure
},
dirname(localPath),
true
@@ -2673,6 +2675,7 @@ async function runCommand(
cwd: string
env: NodeJS.ProcessEnv
timeoutMs: number
allowTimeout?: boolean
stdinText?: string
},
outputDir: string,
@@ -2777,7 +2780,7 @@ async function runCommand(
})
await persistCommandResult(outputDir, result)
if (result.timedOut) {
if (result.timedOut && !input.allowTimeout) {
throw new Error(
`Command ${input.label} timed out after ${input.timeoutMs} ms.\nstdout:\n${result.stdout}\nstderr:\n${result.stderr}`
)
+2 -2
View File
@@ -2,7 +2,7 @@ import type { ProviderKind } from "./config.js"
import type { ScenarioExecution } from "./runner.js"
import { clusterLifecycleScenario } from "./scenarios/cluster-lifecycle.js"
import { clusterPluginRolloutSyncSplitBrainScenario } from "./scenarios/cluster-plugin-rollout-sync-split-brain.js"
import { clusterSenderFanoutScenario } from "./scenarios/cluster-sender-fanout.js"
import { clusterSenderFanoutRebootScenario } from "./scenarios/cluster-sender-fanout-reboot.js"
import { singleNodeInitScenario } from "./scenarios/single-node-init.js"
import { singleNodePluginRolloutRollbackScenario } from "./scenarios/single-node-plugin-rollout-rollback.js"
import { singleNodeSenderScenario } from "./scenarios/single-node-sender.js"
@@ -18,7 +18,7 @@ export const scenarioDefinitions: ScenarioDefinition[] = [
singleNodeInitScenario,
singleNodePluginRolloutRollbackScenario,
singleNodeSenderScenario,
clusterSenderFanoutScenario,
clusterSenderFanoutRebootScenario,
clusterPluginRolloutSyncSplitBrainScenario,
clusterLifecycleScenario
]
@@ -13,9 +13,6 @@ import type { ScenarioDefinition } from "../scenarios.js"
const DEFAULT_CLUSTER_SYNC_TIMEOUT_MS = 10 * 60_000
const DEFAULT_CLUSTER_SYNC_POLL_MS = 10_000
const DEFAULT_HETZNER_INSTANCE = "cpx31"
const DEFAULT_OVH_INSTANCE = "c3-8"
const DEFAULT_SCALEWAY_INSTANCE = "BASIC3-X2C-8G"
async function execute(context: ScenarioExecution): Promise<void> {
const primaryId = context.nodeId("hx-a")
@@ -27,7 +24,6 @@ async function execute(context: ScenarioExecution): Promise<void> {
await context.addManagedNode({
id: primaryId,
provider: "hetzner",
instance: DEFAULT_HETZNER_INSTANCE,
bootstrapMode: "init"
})
@@ -44,14 +40,12 @@ async function execute(context: ScenarioExecution): Promise<void> {
await context.planManagedNode({
id: joinScwId,
provider: "scaleway",
instance: DEFAULT_SCALEWAY_INSTANCE,
bootstrapMode: "join",
upstreamId: primaryId
})
await context.planManagedNode({
id: joinOvhId,
provider: "ovh",
instance: DEFAULT_OVH_INSTANCE,
bootstrapMode: "join",
upstreamId: primaryId
})
@@ -11,15 +11,12 @@ import type { LegionStoredState, LegionTrackedServer } from "../legion-state.js"
import type { ScenarioDefinition } from "../scenarios.js"
const SENDER_PLUGIN_NAME = "sender"
const DEFAULT_HETZNER_INSTANCE = "cpx31"
const DEFAULT_OVH_INSTANCE = "c3-8"
const DEFAULT_SCALEWAY_INSTANCE = "BASIC3-X2C-8G"
const DEFAULT_VIEWERS_PER_NODE = 3
const DEFAULT_CLUSTER_SYNC_TIMEOUT_MS = 10 * 60_000
const DEFAULT_CLUSTER_SYNC_POLL_MS = 10_000
const DEFAULT_SENDER_ROLLUP_TIMEOUT_MS = 4 * 60_000
const DEFAULT_SENDER_ROLLUP_POLL_MS = 3_000
const DEFAULT_EDGE_REBOOT_TIMEOUT_MS = 6 * 60_000
const DEFAULT_EDGE_REBOOT_TIMEOUT_MS = 3 * 60_000
const DEFAULT_SENDER_RECOVERY_TIMEOUT_MS = 4 * 60_000
async function execute(context: ScenarioExecution): Promise<void> {
@@ -35,7 +32,6 @@ async function execute(context: ScenarioExecution): Promise<void> {
await context.addManagedNode({
id: primaryId,
provider: "ovh",
instance: DEFAULT_OVH_INSTANCE,
bootstrapMode: "init"
})
@@ -52,14 +48,12 @@ async function execute(context: ScenarioExecution): Promise<void> {
await context.planManagedNode({
id: joinHxId,
provider: "hetzner",
instance: DEFAULT_HETZNER_INSTANCE,
bootstrapMode: "join",
upstreamId: primaryId
})
await context.planManagedNode({
id: joinScwId,
provider: "scaleway",
instance: DEFAULT_SCALEWAY_INSTANCE,
bootstrapMode: "join",
upstreamId: primaryId
})
@@ -954,8 +948,8 @@ function sleep(durationMs: number): Promise<void> {
return new Promise((resolvePromise) => setTimeout(resolvePromise, durationMs))
}
export const clusterSenderFanoutScenario: ScenarioDefinition = {
name: "cluster-sender-fanout",
export const clusterSenderFanoutRebootScenario: ScenarioDefinition = {
name: "cluster-sender-fanout-reboot",
description:
"Build a three-node OVH/Hetzner/Scaleway cluster, install Sender, start one origin plus two edges, push a 6 Mbit/s video stream, verify Sender viewer-count rollups, reboot one edge, and verify recovery.",
requiredProviders: ["ovh", "hetzner", "scaleway"],