From a19b7d97f00300ccd885ebab40b0c38fdf0e840f Mon Sep 17 00:00:00 2001 From: Steffen Beyer Date: Tue, 17 Mar 2026 19:42:18 +0100 Subject: [PATCH] fix: Subscription workers restart strategy, sandbox ownership race condition Clear OTP SSL PEM cache between listener terminate/restart so reloaded certs are read from disk instead of serving stale cached data. Make reconcile_worker idempotent to prevent unnecessary worker churn when put_server is followed by start_server. Add request timeouts to RelayInfoClient to prevent hanging connections. Co-Authored-By: Claude Opus 4.6 --- lib/parrhesia/api/sync/manager.ex | 5 +-- lib/parrhesia/sync/relay_info_client.ex | 3 +- lib/parrhesia/web/endpoint.ex | 50 +++++++++++++++++++------ test/parrhesia/web/tls_e2e_test.exs | 35 ++++++++--------- 4 files changed, 58 insertions(+), 35 deletions(-) diff --git a/lib/parrhesia/api/sync/manager.ex b/lib/parrhesia/api/sync/manager.ex index 0b60957..06a7510 100644 --- a/lib/parrhesia/api/sync/manager.ex +++ b/lib/parrhesia/api/sync/manager.ex @@ -74,6 +74,7 @@ defmodule Parrhesia.API.Sync.Manager do {:ok, normalized_server} -> updated_state = state + |> stop_worker_if_running(normalized_server.id) |> put_server_state(normalized_server) |> persist_and_reconcile!(normalized_server.id) @@ -248,9 +249,7 @@ defmodule Parrhesia.API.Sync.Manager do state desired_running?(state, server_id) -> - state - |> stop_worker_if_running(server_id) - |> maybe_start_worker(server_id) + maybe_start_worker(state, server_id) true -> stop_worker_if_running(state, server_id) diff --git a/lib/parrhesia/sync/relay_info_client.ex b/lib/parrhesia/sync/relay_info_client.ex index 4c1484e..e9d651d 100644 --- a/lib/parrhesia/sync/relay_info_client.ex +++ b/lib/parrhesia/sync/relay_info_client.ex @@ -22,7 +22,8 @@ defmodule Parrhesia.Sync.RelayInfoClient do url: url, headers: [{"accept", "application/nostr+json"}], decode_body: false, - connect_options: opts + connect_options: Keyword.merge([timeout: 5_000], opts), + receive_timeout: 5_000 ) do {:ok, response} -> {:ok, response} {:error, reason} -> {:error, reason} diff --git a/lib/parrhesia/web/endpoint.ex b/lib/parrhesia/web/endpoint.ex index 87f2dd5..749b5f8 100644 --- a/lib/parrhesia/web/endpoint.ex +++ b/lib/parrhesia/web/endpoint.ex @@ -16,6 +16,7 @@ defmodule Parrhesia.Web.Endpoint do @spec reload_listener(Supervisor.supervisor(), atom()) :: :ok | {:error, term()} def reload_listener(supervisor \\ __MODULE__, listener_id) when is_atom(listener_id) do with :ok <- Supervisor.terminate_child(supervisor, {:listener, listener_id}), + :ok <- clear_pem_cache(), {:ok, _pid} <- Supervisor.restart_child(supervisor, {:listener, listener_id}) do :ok else @@ -27,17 +28,44 @@ defmodule Parrhesia.Web.Endpoint do @spec reload_all(Supervisor.supervisor()) :: :ok | {:error, term()} def reload_all(supervisor \\ __MODULE__) do - supervisor - |> Supervisor.which_children() - |> Enum.filter(fn {id, _pid, _type, _modules} -> - match?({:listener, _listener_id}, id) - end) - |> Enum.reduce_while(:ok, fn {{:listener, listener_id}, _pid, _type, _modules}, :ok -> - case reload_listener(supervisor, listener_id) do - :ok -> {:cont, :ok} - {:error, _reason} = error -> {:halt, error} - end - end) + listener_ids = + supervisor + |> Supervisor.which_children() + |> Enum.flat_map(fn + {{:listener, listener_id}, _pid, _type, _modules} -> [listener_id] + _other -> [] + end) + + with :ok <- terminate_listeners(supervisor, listener_ids), + :ok <- clear_pem_cache() do + restart_listeners(supervisor, listener_ids) + end + end + + defp terminate_listeners(_supervisor, []), do: :ok + + defp terminate_listeners(supervisor, [listener_id | rest]) do + case Supervisor.terminate_child(supervisor, {:listener, listener_id}) do + :ok -> terminate_listeners(supervisor, rest) + {:error, _reason} = error -> error + end + end + + defp restart_listeners(_supervisor, []), do: :ok + + defp restart_listeners(supervisor, [listener_id | rest]) do + case Supervisor.restart_child(supervisor, {:listener, listener_id}) do + {:ok, _pid} -> restart_listeners(supervisor, rest) + {:error, _reason} = error -> error + end + end + + # OTP's ssl module caches PEM file contents by filename. When cert/key + # files are replaced on disk, the cache must be cleared so the restarted + # listener reads the updated files. + defp clear_pem_cache do + :ssl.clear_pem_cache() + :ok end @impl true diff --git a/test/parrhesia/web/tls_e2e_test.exs b/test/parrhesia/web/tls_e2e_test.exs index 2bb4e6d..e8845c9 100644 --- a/test/parrhesia/web/tls_e2e_test.exs +++ b/test/parrhesia/web/tls_e2e_test.exs @@ -296,29 +296,24 @@ defmodule Parrhesia.Web.TLSE2ETest do end defp server_cert_fingerprint(port) do - command = - "printf '' | /usr/bin/openssl s_client -connect 127.0.0.1:#{port} -servername localhost -showcerts" + ssl_opts = [verify: :verify_none, server_name_indication: ~c"localhost"] - case System.cmd("/bin/sh", ["-c", command], stderr_to_stdout: true) do - {output, 0} -> - with {:ok, pem_entry} <- first_certificate_pem(output), - [entry | _rest] <- :public_key.pem_decode(pem_entry), - cert_der <- elem(entry, 1) do - {:ok, Base.encode64(:crypto.hash(:sha256, cert_der))} - else - [] -> {:error, :missing_certificate} - {:error, _reason} = error -> error + case :ssl.connect({127, 0, 0, 1}, port, ssl_opts, 5_000) do + {:ok, ssl_socket} -> + try do + case :ssl.peercert(ssl_socket) do + {:ok, cert_der} -> + {:ok, Base.encode64(:crypto.hash(:sha256, cert_der))} + + {:error, reason} -> + {:error, reason} + end + after + :ssl.close(ssl_socket) end - {output, status} -> - {:error, {:openssl_failed, status, output}} - end - end - - defp first_certificate_pem(output) do - case Regex.run(~r/-----BEGIN CERTIFICATE-----.*?-----END CERTIFICATE-----/ms, output) do - [pem] -> {:ok, pem} - _other -> {:error, :missing_certificate} + {:error, reason} -> + {:error, reason} end end