fix: Subscription workers restart strategy, sandbox ownership race condition
Some checks failed
CI / Test (OTP 27.2 / Elixir 1.18.2) (push) Failing after 0s
CI / Test (OTP 28.4 / Elixir 1.19.4 + Marmot E2E) (push) Failing after 0s

Clear OTP SSL PEM cache between listener terminate/restart so reloaded
certs are read from disk instead of serving stale cached data. Make
reconcile_worker idempotent to prevent unnecessary worker churn when
put_server is followed by start_server. Add request timeouts to
RelayInfoClient to prevent hanging connections.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-17 19:42:18 +01:00
parent 65b47ec191
commit a19b7d97f0
4 changed files with 58 additions and 35 deletions

View File

@@ -74,6 +74,7 @@ defmodule Parrhesia.API.Sync.Manager do
{:ok, normalized_server} ->
updated_state =
state
|> stop_worker_if_running(normalized_server.id)
|> put_server_state(normalized_server)
|> persist_and_reconcile!(normalized_server.id)
@@ -248,9 +249,7 @@ defmodule Parrhesia.API.Sync.Manager do
state
desired_running?(state, server_id) ->
state
|> stop_worker_if_running(server_id)
|> maybe_start_worker(server_id)
maybe_start_worker(state, server_id)
true ->
stop_worker_if_running(state, server_id)

View File

@@ -22,7 +22,8 @@ defmodule Parrhesia.Sync.RelayInfoClient do
url: url,
headers: [{"accept", "application/nostr+json"}],
decode_body: false,
connect_options: opts
connect_options: Keyword.merge([timeout: 5_000], opts),
receive_timeout: 5_000
) do
{:ok, response} -> {:ok, response}
{:error, reason} -> {:error, reason}

View File

@@ -16,6 +16,7 @@ defmodule Parrhesia.Web.Endpoint do
@spec reload_listener(Supervisor.supervisor(), atom()) :: :ok | {:error, term()}
def reload_listener(supervisor \\ __MODULE__, listener_id) when is_atom(listener_id) do
with :ok <- Supervisor.terminate_child(supervisor, {:listener, listener_id}),
:ok <- clear_pem_cache(),
{:ok, _pid} <- Supervisor.restart_child(supervisor, {:listener, listener_id}) do
:ok
else
@@ -27,17 +28,44 @@ defmodule Parrhesia.Web.Endpoint do
@spec reload_all(Supervisor.supervisor()) :: :ok | {:error, term()}
def reload_all(supervisor \\ __MODULE__) do
supervisor
|> Supervisor.which_children()
|> Enum.filter(fn {id, _pid, _type, _modules} ->
match?({:listener, _listener_id}, id)
end)
|> Enum.reduce_while(:ok, fn {{:listener, listener_id}, _pid, _type, _modules}, :ok ->
case reload_listener(supervisor, listener_id) do
:ok -> {:cont, :ok}
{:error, _reason} = error -> {:halt, error}
end
end)
listener_ids =
supervisor
|> Supervisor.which_children()
|> Enum.flat_map(fn
{{:listener, listener_id}, _pid, _type, _modules} -> [listener_id]
_other -> []
end)
with :ok <- terminate_listeners(supervisor, listener_ids),
:ok <- clear_pem_cache() do
restart_listeners(supervisor, listener_ids)
end
end
# Stops every listener child named in `listener_ids`, in list order.
#
# Returns `:ok` once all of them have been terminated, or the first
# `{:error, reason}` reported by `Supervisor.terminate_child/2`. Listeners
# that come after a failing one are not touched.
defp terminate_listeners(supervisor, listener_ids) when is_list(listener_ids) do
  Enum.reduce_while(listener_ids, :ok, fn listener_id, :ok ->
    case Supervisor.terminate_child(supervisor, {:listener, listener_id}) do
      :ok -> {:cont, :ok}
      {:error, _reason} = failure -> {:halt, failure}
    end
  end)
end
# Restarts each listener child named in the list, in list order.
#
# Returns `:ok` when every listener has been restarted, or the first
# `{:error, reason}` from `Supervisor.restart_child/2`. Listeners that come
# after a failing one are not restarted by this function.
defp restart_listeners(_supervisor, []), do: :ok

defp restart_listeners(supervisor, [listener_id | rest]) do
  case Supervisor.restart_child(supervisor, {:listener, listener_id}) do
    {:ok, _child} ->
      restart_listeners(supervisor, rest)

    # A child start function may return `{:ok, pid, info}`, which
    # `Supervisor.restart_child/2` passes through unchanged. Treat it as
    # success instead of crashing with a CaseClauseError.
    {:ok, _child, _info} ->
      restart_listeners(supervisor, rest)

    {:error, _reason} = error ->
      error
  end
end
# The ssl application in OTP keeps PEM file contents cached per filename.
# After cert/key files are swapped on disk, that cache has to be flushed so
# a restarted listener loads the new files rather than the cached copies.
# Always returns `:ok`, whatever `:ssl.clear_pem_cache/0` itself returns.
defp clear_pem_cache do
  _ = :ssl.clear_pem_cache()
  :ok
end
@impl true

View File

@@ -296,29 +296,24 @@ defmodule Parrhesia.Web.TLSE2ETest do
end
defp server_cert_fingerprint(port) do
command =
"printf '' | /usr/bin/openssl s_client -connect 127.0.0.1:#{port} -servername localhost -showcerts"
ssl_opts = [verify: :verify_none, server_name_indication: ~c"localhost"]
case System.cmd("/bin/sh", ["-c", command], stderr_to_stdout: true) do
{output, 0} ->
with {:ok, pem_entry} <- first_certificate_pem(output),
[entry | _rest] <- :public_key.pem_decode(pem_entry),
cert_der <- elem(entry, 1) do
{:ok, Base.encode64(:crypto.hash(:sha256, cert_der))}
else
[] -> {:error, :missing_certificate}
{:error, _reason} = error -> error
case :ssl.connect({127, 0, 0, 1}, port, ssl_opts, 5_000) do
{:ok, ssl_socket} ->
try do
case :ssl.peercert(ssl_socket) do
{:ok, cert_der} ->
{:ok, Base.encode64(:crypto.hash(:sha256, cert_der))}
{:error, reason} ->
{:error, reason}
end
after
:ssl.close(ssl_socket)
end
{output, status} ->
{:error, {:openssl_failed, status, output}}
end
end
defp first_certificate_pem(output) do
case Regex.run(~r/-----BEGIN CERTIFICATE-----.*?-----END CERTIFICATE-----/ms, output) do
[pem] -> {:ok, pem}
_other -> {:error, :missing_certificate}
{:error, reason} ->
{:error, reason}
end
end