Implement M7 Marmot hardening telemetry and ops checks

This commit is contained in:
2026-03-13 22:40:36 +01:00
parent 99983bbb32
commit f2a6ab5150
7 changed files with 529 additions and 24 deletions

View File

@@ -0,0 +1,143 @@
defmodule Parrhesia.FaultInjectionGroupFlowTest do
  @moduledoc false
  # Fault-injection coverage for kind-445 (Marmot group) ingestion: storage is
  # swapped for a failing adapter, the outage response is asserted, storage is
  # "healed", and the same flow is replayed to prove clean recovery.
  use ExUnit.Case, async: false

  alias Ecto.Adapters.SQL.Sandbox
  alias Parrhesia.Protocol.EventValidator
  alias Parrhesia.Repo
  alias Parrhesia.Storage
  alias Parrhesia.TestSupport.FailingEvents
  alias Parrhesia.TestSupport.PermissiveModeration
  alias Parrhesia.Web.Connection

  setup do
    :ok = Sandbox.checkout(Repo)

    # Remember the live storage config so each test can heal the outage and
    # on_exit can restore the pre-test state.
    original = Application.get_env(:parrhesia, :storage, [])

    faulty =
      original
      |> Keyword.put(:events, FailingEvents)
      |> Keyword.put(:moderation, PermissiveModeration)

    Application.put_env(:parrhesia, :storage, faulty)

    on_exit(fn ->
      Application.put_env(:parrhesia, :storage, original)
    end)

    %{previous_storage: original}
  end

  test "kind 445 commit recovers cleanly after storage outage", %{
    previous_storage: previous_storage
  } do
    {:ok, conn} = Connection.init(subscription_index: nil)

    commit =
      make_event(%{
        "kind" => 445,
        "tags" => [["h", String.duplicate("a", 64)]],
        "content" => Base.encode64("commit")
      })

    # While the failing adapter is installed the relay must reject the event.
    assert deliver(commit, conn) == ["OK", commit["id"], false, "error: :db_down"]

    heal_storage(previous_storage)

    # Identical frame, healed storage: the commit must now persist.
    assert deliver(commit, conn) == ["OK", commit["id"], true, "ok: event stored"]

    assert {:ok, stored} = Storage.events().get_event(%{}, commit["id"])
    assert stored["id"] == commit["id"]
  end

  test "reordered group flow remains deterministic after outage recovery", %{
    previous_storage: previous_storage
  } do
    {:ok, conn} = Connection.init(subscription_index: nil)

    group_id = String.duplicate("b", 64)
    now = System.system_time(:second)

    older =
      make_event(%{
        "created_at" => now - 10,
        "kind" => 445,
        "tags" => [["h", group_id]],
        "content" => Base.encode64("older")
      })

    newer =
      make_event(%{
        "created_at" => now - 5,
        "kind" => 445,
        "tags" => [["h", group_id]],
        "content" => Base.encode64("newer")
      })

    # The older event is lost to the outage first.
    assert deliver(older, conn) == ["OK", older["id"], false, "error: :db_down"]

    heal_storage(previous_storage)

    # Recovery delivers the events out of chronological order: newer first,
    # then the retried older event.
    assert deliver(newer, conn) == ["OK", newer["id"], true, "ok: event stored"]
    assert deliver(older, conn) == ["OK", older["id"], true, "ok: event stored"]

    assert {:ok, results} =
             Storage.events().query(
               %{},
               [%{"kinds" => [445], "#h" => [group_id]}],
               now: now + 1
             )

    # Query order must follow created_at (descending), not arrival order.
    assert Enum.map(results, & &1["id"]) == [newer["id"], older["id"]]
  end

  # Encodes `event` as an EVENT frame, pushes it through the websocket handler,
  # asserts the connection state is unchanged, and returns the decoded reply.
  defp deliver(event, state) do
    frame = {Jason.encode!(["EVENT", event]), [opcode: :text]}
    assert {:push, {:text, reply}, ^state} = Connection.handle_in(frame, state)
    Jason.decode!(reply)
  end

  # Reinstates the real events adapter while keeping permissive moderation,
  # simulating the database coming back up.
  defp heal_storage(previous_storage) do
    Application.put_env(
      :parrhesia,
      :storage,
      Keyword.put(previous_storage, :moderation, PermissiveModeration)
    )
  end

  # Builds a minimally valid event map, applies `overrides`, and stamps a
  # content-derived id so the validator accepts it.
  defp make_event(overrides) do
    template = %{
      "pubkey" => String.duplicate("1", 64),
      "created_at" => System.system_time(:second),
      "kind" => 1,
      "tags" => [],
      "content" => "fault-group",
      "sig" => String.duplicate("2", 128)
    }

    event = Map.merge(template, overrides)
    Map.put(event, "id", EventValidator.compute_id(event))
  end
end

View File

@@ -0,0 +1,135 @@
defmodule Parrhesia.Storage.Adapters.Postgres.QueryPlanRegressionTest do
  @moduledoc false
  # Guards the Marmot-critical query plans: tag-filtered kind queries must hit
  # the dedicated event_tags partial indexes instead of falling back to
  # sequential scans.
  use ExUnit.Case, async: false

  alias Ecto.Adapters.SQL.Sandbox
  alias Parrhesia.Protocol.EventValidator
  alias Parrhesia.Repo
  alias Parrhesia.Storage.Adapters.Postgres.Events

  setup_all do
    if is_nil(Process.whereis(Repo)) do
      start_supervised!(Repo)
    end

    Sandbox.mode(Repo, :manual)
    :ok
  end

  setup do
    :ok = Sandbox.checkout(Repo)
    # Disable seq scans so a missing/unused index surfaces in the plan text
    # rather than being hidden by the planner's cost model on a tiny dataset.
    # (Previously written as `:ok = Repo.query!(...) |> then(fn _ -> :ok end)`,
    # a needlessly indirect way to discard the result.)
    Repo.query!("SET enable_seqscan TO off")
    :ok
  end

  test "#h-heavy query plan uses dedicated event_tags h index" do
    group_id = String.duplicate("a", 64)

    # Seed a dominant group plus noise in another group so the index is the
    # only sane access path for the filtered query.
    Enum.each(1..150, fn idx ->
      persist_event(%{
        "kind" => 445,
        "created_at" => 1_700_010_000 + idx,
        "tags" => [["h", group_id]],
        "content" => Base.encode64("group-#{idx}")
      })
    end)

    Enum.each(1..50, fn idx ->
      persist_event(%{
        "kind" => 445,
        "created_at" => 1_700_020_000 + idx,
        "tags" => [["h", String.duplicate("b", 64)]],
        "content" => Base.encode64("other-#{idx}")
      })
    end)

    plan = explain_tag_query(445, "h", group_id)
    assert plan =~ "event_tags_h_value_created_at_idx"
  end

  test "#i-heavy query plan uses dedicated event_tags i index" do
    keypackage_ref = String.duplicate("c", 64)

    Enum.each(1..120, fn idx ->
      persist_event(%{
        "kind" => 443,
        "created_at" => 1_700_030_000 + idx,
        "tags" => [["i", keypackage_ref], ["encoding", "base64"]],
        "content" => Base.encode64("keypackage-#{idx}")
      })
    end)

    Enum.each(1..40, fn idx ->
      persist_event(%{
        "kind" => 443,
        "created_at" => 1_700_040_000 + idx,
        "tags" => [["i", String.duplicate("d", 64)], ["encoding", "base64"]],
        "content" => Base.encode64("other-#{idx}")
      })
    end)

    plan = explain_tag_query(443, "i", keypackage_ref)
    assert plan =~ "event_tags_i_value_created_at_idx"
  end

  # Runs EXPLAIN for the canonical "kind + single tag filter" relay query and
  # returns the plan as one newline-joined string. `kind` and `tag_name` are
  # test-controlled constants, so interpolating them into the SQL is safe; the
  # tag value stays a bind parameter, matching the real adapter's query shape.
  defp explain_tag_query(kind, tag_name, tag_value) do
    explain =
      Repo.query!(
        """
        EXPLAIN (FORMAT TEXT)
        SELECT e.id
        FROM events e
        WHERE e.kind = #{kind}
          AND e.deleted_at IS NULL
          AND EXISTS (
            SELECT 1
            FROM event_tags t
            WHERE t.event_created_at = e.created_at
              AND t.event_id = e.id
              AND t.name = '#{tag_name}'
              AND t.value = $1
          )
        ORDER BY e.created_at DESC, e.id ASC
        LIMIT 100
        """,
        [tag_value]
      )

    Enum.map_join(explain.rows, "\n", &hd/1)
  end

  # Persists a built event through the real Postgres adapter, failing the test
  # immediately if the write is rejected.
  defp persist_event(overrides) do
    event = build_event(overrides)
    assert {:ok, _persisted} = Events.put_event(%{}, event)
  end

  # Builds a minimally valid event, applies `overrides`, and stamps the
  # validator-computed id. Content embeds a unique integer so repeated seeds
  # never collide on id.
  defp build_event(overrides) do
    base_event = %{
      "pubkey" => String.duplicate("7", 64),
      "created_at" => System.system_time(:second),
      "kind" => 1,
      "tags" => [],
      "content" => "query-plan-#{System.unique_integer([:positive])}",
      "sig" => String.duplicate("8", 128)
    }

    event = Map.merge(base_event, overrides)
    Map.put(event, "id", EventValidator.compute_id(event))
  end
end

View File

@@ -0,0 +1,25 @@
defmodule Parrhesia.TelemetryTest do
  @moduledoc false
  # Smoke coverage for the Marmot hardening telemetry surface: the metric
  # catalogue exposes the expected names, and emit/3 tolerates traffic-class
  # metadata.
  use ExUnit.Case, async: true

  alias Parrhesia.Telemetry

  # Metric names the hardening work promises to publish.
  @expected_metric_names [
    [:parrhesia, :ingest, :duration, :ms],
    [:parrhesia, :query, :duration, :ms],
    [:parrhesia, :fanout, :duration, :ms],
    [:parrhesia, :connection, :outbound_queue, :depth],
    [:parrhesia, :connection, :outbound_queue, :pressure],
    [:parrhesia, :connection, :outbound_queue, :pressure_events, :count]
  ]

  test "exposes Marmot-focused telemetry metrics" do
    published = Enum.map(Telemetry.metrics(), & &1.name)

    for expected <- @expected_metric_names do
      assert expected in published
    end
  end

  test "emit/3 accepts traffic-class metadata" do
    result =
      Telemetry.emit(
        [:parrhesia, :ingest, :stop],
        %{duration: 1},
        %{traffic_class: :marmot}
      )

    assert result == :ok
  end
end