From 992ff7f5ef59c851b2b82c040d4fbde7aba66111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Floure?= Date: Mon, 4 Jul 2022 12:27:44 +0200 Subject: [PATCH] Fix eventual crash on failed DRBD backend, bump development version --- changelog.md | 4 +++ lib/ha_handler/drbd.ex | 29 +++++++++++++++----- lib/ha_handler/drbd/supervisor.ex | 4 ++- lib/ha_handler/drbd/watcher.ex | 44 ++++++++++++++++++++----------- mix.exs | 2 +- 5 files changed, 60 insertions(+), 23 deletions(-) diff --git a/changelog.md b/changelog.md index 296db5f..0f2e00c 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,7 @@ +# 2022-07-?? - v0.4.2 + +* Fix eventual crash on failed DRBD backend. + # 2022-06-13 - v0.4.1 * Fix crash on failed SSHEx / Postgrex connection failure. diff --git a/lib/ha_handler/drbd.ex b/lib/ha_handler/drbd.ex index c5b1e76..f5e92af 100644 --- a/lib/ha_handler/drbd.ex +++ b/lib/ha_handler/drbd.ex @@ -1,4 +1,6 @@ defmodule HAHandler.DRBD do + require Logger + @supervisor HAHandler.DRBD.Supervisor # There might be >1 resources configured in DRBD! @@ -18,6 +20,15 @@ defmodule HAHandler.DRBD do @id_extraction_regex ~r/\n\s(?<id>\d+)\:\s/ @data_extraction_regex ~r/cs:(?<cs>(\w|\/)+)\sro:(?<ro>(\w|\/)+)\sds:(?<ds>(\w|\/)+)\s/ + # Empty state, when backend is not queryable for some reason. 
+ @empty_state %{ + hostname: "unknown", + version: "", + mode: "", + status: "unknown", + data: "" + } + def get_instances() do watchers = Supervisor.which_children(@supervisor) @@ -32,6 +43,8 @@ defmodule HAHandler.DRBD do end def get_state({hostname, pid}) do + empty_reply = %{@empty_state | hostname: hostname} + case GenServer.call(pid, {:execute, @drbd_proc_cmd}) do {:ok, raw, 0} -> case Regex.named_captures(@block_regex, raw) do @@ -54,21 +67,25 @@ defmodule HAHandler.DRBD do |> Enum.filter(fn r -> r["id"] == @default_resource_id end) |> Enum.at(0) - %{ - hostname: hostname, + processed_reply = %{ version: Map.get(version, "full"), mode: Map.get(default_resource, "ro"), status: Map.get(default_resource, "ds"), data: resources } + Map.merge(empty_reply, processed_reply) end _ -> - {:error, "could not parse /proc/drbd"} + Logger.warning("Failed to query DRBD backend: could not parse /proc/drbd.") + end {:ok, _, posix_err} -> - {:error, posix_err} - {:error, _err} = reply -> - reply + Logger.warning("Failed to query DRBD backend: POSIX #{inspect(posix_err)}.") + empty_reply + + {:error, err} -> + Logger.warning("Failed to query DRBD backend: #{inspect(err)}.") + empty_reply end end end diff --git a/lib/ha_handler/drbd/supervisor.ex b/lib/ha_handler/drbd/supervisor.ex index 289488f..98b9f54 100644 --- a/lib/ha_handler/drbd/supervisor.ex +++ b/lib/ha_handler/drbd/supervisor.ex @@ -17,7 +17,9 @@ defmodule HAHandler.DRBD.Supervisor do } end) - opts = [strategy: :one_for_one] + opts = [ + strategy: :one_for_one + ] Supervisor.init(children, opts) end end diff --git a/lib/ha_handler/drbd/watcher.ex b/lib/ha_handler/drbd/watcher.ex index d6af8cd..76363b4 100644 --- a/lib/ha_handler/drbd/watcher.ex +++ b/lib/ha_handler/drbd/watcher.ex @@ -25,25 +25,32 @@ defmodule HAHandler.DRBD.Watcher do @impl true def init(opts) do - hostname = Keyword.get(opts, :hostname) - password = Keyword.get(opts, :password) + state = %{ + backend: nil, + last_reconnect: nil, + hostname: 
Keyword.get(opts, :hostname), + password: Keyword.get(opts, :password), + } + # This action will be processed once the GenServer is fully + # started/operational. This process handles connection failures by itself, + # as we don't want to crash-loop into supervisor logic (which is only there + # to handle unexpected failures). + send self(), :reconnect + + {:ok, state} + end + + @impl true + def handle_info(:reconnect, state = %{hostname: hostname, password: password}) do case connect(hostname, password) do {:ok, pid} -> - state = %{ - backend: pid, - hostname: hostname, - password: password - } - - {:ok, state} + {:noreply, %{state | backend: pid}} {:error, err} -> + # Nothing to do, as the next request will trigger the reconnect logic + # (see :execute call). - # Wait for 10 seconds so that the supervisor does not loop on dead node - # (and reach the max_restart threshold / stop trying). - Process.sleep(10_000) - - {:error, err} + {:noreply, state} end end @@ -51,9 +58,16 @@ defmodule HAHandler.DRBD.Watcher do def handle_call({:execute, cmd}, _from, %{backend: backend} = state) do case SSHEx.run(backend, cmd) do {:ok, _output, _status} = reply-> + {:reply, reply, state} + {:error, :closed} = reply -> + # Asynchronously try to reopen the connection to the backend. + send self(), :reconnect + {:reply, reply, state} {:error, _err} = reply -> - {:error, reply, state} + # Do not take action on unknown errors. + {:reply, reply, state} end end + end diff --git a/mix.exs b/mix.exs index 34cc5bf..142d5c9 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule HAHandler.MixProject do def project do [ app: :ha_handler, - version: "0.4.1", + version: "0.4.2", elixir: "~> 1.12", start_permanent: Mix.env() == :prod, deps: deps(),