fix: Subscription workers restart strategy, sandbox ownership race condition
Some checks failed
CI / Test (OTP 27.2 / Elixir 1.18.2) (push) Failing after 0s
CI / Test (OTP 28.4 / Elixir 1.19.4 + Marmot E2E) (push) Failing after 0s

Clear OTP SSL PEM cache between listener terminate/restart so reloaded
certs are read from disk instead of serving stale cached data. Make
reconcile_worker idempotent to prevent unnecessary worker churn when
put_server is followed by start_server. Add request timeouts to
RelayInfoClient to prevent hanging connections.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-17 19:42:18 +01:00
parent 65b47ec191
commit a19b7d97f0
4 changed files with 58 additions and 35 deletions

View File

@@ -74,6 +74,7 @@ defmodule Parrhesia.API.Sync.Manager do
{:ok, normalized_server} ->
updated_state =
state
|> stop_worker_if_running(normalized_server.id)
|> put_server_state(normalized_server)
|> persist_and_reconcile!(normalized_server.id)
@@ -248,9 +249,7 @@ defmodule Parrhesia.API.Sync.Manager do
state
desired_running?(state, server_id) ->
state
|> stop_worker_if_running(server_id)
|> maybe_start_worker(server_id)
maybe_start_worker(state, server_id)
true ->
stop_worker_if_running(state, server_id)

View File

@@ -22,7 +22,8 @@ defmodule Parrhesia.Sync.RelayInfoClient do
url: url,
headers: [{"accept", "application/nostr+json"}],
decode_body: false,
connect_options: opts
connect_options: Keyword.merge([timeout: 5_000], opts),
receive_timeout: 5_000
) do
{:ok, response} -> {:ok, response}
{:error, reason} -> {:error, reason}

View File

@@ -16,6 +16,7 @@ defmodule Parrhesia.Web.Endpoint do
@spec reload_listener(Supervisor.supervisor(), atom()) :: :ok | {:error, term()}
def reload_listener(supervisor \\ __MODULE__, listener_id) when is_atom(listener_id) do
with :ok <- Supervisor.terminate_child(supervisor, {:listener, listener_id}),
:ok <- clear_pem_cache(),
{:ok, _pid} <- Supervisor.restart_child(supervisor, {:listener, listener_id}) do
:ok
else
@@ -27,17 +28,44 @@ defmodule Parrhesia.Web.Endpoint do
# Reloads every listener child of `supervisor` in three phases:
#
#   1. terminate all children whose id has the shape `{:listener, id}`,
#   2. clear the OTP SSL PEM cache,
#   3. restart the same listeners in the order they were collected.
#
# The cache is cleared only after all listeners are stopped, so no listener
# is restarted before the clear has happened.
#
# Returns `:ok` when every phase succeeds, otherwise the first
# `{:error, reason}` produced by a terminate or restart call. On an error the
# remaining steps are skipped, so listeners that were already terminated are
# not restarted by this function.
@spec reload_all(Supervisor.supervisor()) :: :ok | {:error, term()}
def reload_all(supervisor \\ __MODULE__) do
  # The generator pattern silently drops children that are not listeners.
  listener_ids =
    for {{:listener, listener_id}, _pid, _type, _modules} <-
          Supervisor.which_children(supervisor),
        do: listener_id

  with :ok <- terminate_listeners(supervisor, listener_ids),
       :ok <- clear_pem_cache() do
    restart_listeners(supervisor, listener_ids)
  end
end
# Terminates the listener children named by `listener_ids`, in order.
#
# Stops at the first id for which `Supervisor.terminate_child/2` reports an
# error and returns that `{:error, reason}` tuple unchanged; ids after it are
# not attempted. Returns `:ok` when the list is empty or every call succeeds.
defp terminate_listeners(supervisor, listener_ids) when is_list(listener_ids) do
  Enum.reduce_while(listener_ids, :ok, fn id, :ok ->
    case Supervisor.terminate_child(supervisor, {:listener, id}) do
      :ok -> {:cont, :ok}
      {:error, _reason} = failure -> {:halt, failure}
    end
  end)
end
# Restarts the listener children named by the given ids, in order.
#
# Returns `:ok` once every id has been restarted (or the list is empty), or
# the first `{:error, reason}` returned by `Supervisor.restart_child/2`.
# Listeners after the failing one are not attempted.
defp restart_listeners(_supervisor, []), do: :ok

defp restart_listeners(supervisor, [listener_id | rest]) do
  case Supervisor.restart_child(supervisor, {:listener, listener_id}) do
    {:ok, _pid} ->
      restart_listeners(supervisor, rest)

    # restart_child/2 can also succeed with a three-element tuple carrying
    # extra info from the child's start function; without this clause that
    # success would raise a CaseClauseError.
    {:ok, _pid, _info} ->
      restart_listeners(supervisor, rest)

    {:error, _reason} = error ->
      error
  end
end
# The ssl application caches the contents of PEM files, keyed by filename.
# Once certificate or key files have been swapped on disk, that cache has to
# be flushed, otherwise a restarted listener would not pick up the new files.
# Always returns `:ok`, whatever `:ssl.clear_pem_cache/0` itself returns.
defp clear_pem_cache do
  _ = :ssl.clear_pem_cache()
  :ok
end
@impl true