Fix eventual crash on failed DRBD backend, bump development version

Timothée Floure 2022-07-04 12:27:44 +02:00
parent fb3338b4d9
commit 992ff7f5ef
Signed by: tfloure
GPG Key ID: 4502C902C00A1E12
5 changed files with 60 additions and 23 deletions

CHANGELOG.md

@@ -1,3 +1,7 @@
+# 2022-07-?? - v0.4.2
+
+* Fix eventual crash on failed DRBD backend.
+
 # 2022-06-13 - v0.4.1
 
 * Fix crash on failed SSHEx / Postgrex connection failure.

lib/ha_handler/drbd.ex

@@ -1,4 +1,6 @@
 defmodule HAHandler.DRBD do
+  require Logger
+
   @supervisor HAHandler.DRBD.Supervisor
 
   # There might be >1 resources configured in DRBD!
@@ -18,6 +20,15 @@ defmodule HAHandler.DRBD do
   @id_extraction_regex ~r/\n\s(?<id>\d+)\:\s/
   @data_extraction_regex ~r/cs:(?<cs>(\w|\/)+)\sro:(?<ro>(\w|\/)+)\sds:(?<ds>(\w|\/)+)\s/
 
+  # Empty state, when backend is not queryable for some reason.
+  @empty_state %{
+    hostname: "unknown",
+    version: "",
+    mode: "",
+    status: "unknown",
+    data: ""
+  }
+
   def get_instances() do
     watchers = Supervisor.which_children(@supervisor)
@@ -32,6 +43,8 @@ defmodule HAHandler.DRBD do
   end
 
   def get_state({hostname, pid}) do
+    empty_reply = %{@empty_state | hostname: hostname}
+
     case GenServer.call(pid, {:execute, @drbd_proc_cmd}) do
       {:ok, raw, 0} ->
         case Regex.named_captures(@block_regex, raw) do
@@ -54,21 +67,25 @@ defmodule HAHandler.DRBD do
               |> Enum.filter(fn r -> r["id"] == @default_resource_id end)
               |> Enum.at(0)
 
-            %{
-              hostname: hostname,
+            processed_reply = %{
               version: Map.get(version, "full"),
               mode: Map.get(default_resource, "ro"),
               status: Map.get(default_resource, "ds"),
               data: resources
             }
 
+            Map.merge(empty_reply, processed_reply)
+
           _ ->
-            {:error, "could not parse /proc/drbd"}
+            Logger.warning("Failed to query DRBD backend: could not parse /proc/drbd.")
+            empty_reply
         end
       {:ok, _, posix_err} ->
-        {:error, posix_err}
-      {:error, _err} = reply ->
-        reply
+        Logger.warning("Failed to query DRBD backend: POSIX #{inspect(posix_err)}.")
+        empty_reply
+      {:error, err} ->
+        Logger.warning("Failed to query DRBD backend: #{inspect(err)}.")
+        empty_reply
     end
   end
 end
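Taken together, these hunks make get_state/1 total: every branch now returns a map carrying the @empty_state keys, with Map.merge/2 layering the parsed fields on top, so a dead or unparsable backend degrades to status "unknown" instead of surfacing a bare {:error, ...} tuple. A minimal sketch of the calling side, assuming get_instances/0 maps get_state/1 over the supervised watchers as its first lines suggest (summarize/0 is hypothetical, not part of this commit):

  # Hypothetical consumer: safe to match on the fixed key set even when a
  # DRBD backend is down, since get_state/1 always returns the full map.
  def summarize() do
    HAHandler.DRBD.get_instances()
    |> Enum.map(fn %{hostname: hostname, status: status} ->
      "#{hostname}: #{status}"
    end)
  end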

lib/ha_handler/drbd/supervisor.ex

@@ -17,7 +17,9 @@ defmodule HAHandler.DRBD.Supervisor do
       }
     end)
 
-    opts = [strategy: :one_for_one]
+    opts = [
+      strategy: :one_for_one
+    ]
 
     Supervisor.init(children, opts)
   end
 end
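The reformatted opts still feed Supervisor.init/2 with one watcher child per configured DRBD host; under :one_for_one, a watcher that does crash is restarted alone, without disturbing its siblings. A sketch of the assumed shape of the children built by the Enum.map/2 just above this hunk (the instances variable and the id scheme are illustrative, not taken from this commit):

  # Assumed child-spec shape: one Watcher per configured DRBD host.
  children = Enum.map(instances, fn {hostname, password} ->
    %{
      id: "drbd_watcher_#{hostname}",
      start: {HAHandler.DRBD.Watcher, :start_link,
              [[hostname: hostname, password: password]]}
    }
  end)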

lib/ha_handler/drbd/watcher.ex

@@ -25,25 +25,32 @@ defmodule HAHandler.DRBD.Watcher do
   @impl true
   def init(opts) do
-    hostname = Keyword.get(opts, :hostname)
-    password = Keyword.get(opts, :password)
+    state = %{
+      backend: nil,
+      last_reconnect: nil,
+      hostname: Keyword.get(opts, :hostname),
+      password: Keyword.get(opts, :password)
+    }
+
+    # This action will be processed once the GenServer is fully
+    # started/operational. This process handles connection failures by
+    # itself, as we don't want to crash loop into the supervisor logic
+    # (which is only there to handle unexpected failures).
+    send self(), :reconnect
 
+    {:ok, state}
+  end
+
+  @impl true
+  def handle_info(:reconnect, state = %{hostname: hostname, password: password}) do
     case connect(hostname, password) do
       {:ok, pid} ->
-        state = %{
-          backend: pid,
-          hostname: hostname,
-          password: password
-        }
-        {:ok, state}
+        {:noreply, %{state | backend: pid}}
       {:error, err} ->
+        # Nothing to do, as the next request will trigger the reconnect logic
+        # (see the :execute call).
         # Wait for 10 seconds so that the supervisor does not loop on a dead
         # node (and reach the max_restart threshold / stop trying).
         Process.sleep(10_000)
-        {:error, err}
+        {:noreply, state}
     end
   end
@@ -51,9 +58,16 @@ defmodule HAHandler.DRBD.Watcher do
   def handle_call({:execute, cmd}, _from, %{backend: backend} = state) do
     case SSHEx.run(backend, cmd) do
       {:ok, _output, _status} = reply ->
         {:reply, reply, state}
+      {:error, :closed} = reply ->
+        # Asynchronously tries to reopen the connection to the backend.
+        send self(), :reconnect
+
+        {:reply, reply, state}
+
       {:error, _err} = reply ->
-        {:error, reply, state}
+        # Do not take action on unknown errors.
+        {:reply, reply, state}
     end
   end
 end
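This is the core of the fix: connecting inside init/1 made the watcher crash whenever a DRBD node was unreachable, eventually exhausting the supervisor's restart budget, so the fallible connection attempt now runs in handle_info(:reconnect, ...) after startup, and a closed SSH channel simply re-queues :reconnect. A generic, self-contained sketch of the pattern (the module name and connect/1 are illustrative placeholders, not this repo's API):

  # Deferred-connect GenServer sketch: init/1 cannot fail on connection
  # errors; the first :reconnect message does the fallible work instead.
  defmodule ReconnectSketch do
    use GenServer

    def start_link(opts), do: GenServer.start_link(__MODULE__, opts)

    @impl true
    def init(opts) do
      send(self(), :reconnect)  # queued, handled once init/1 has returned
      {:ok, %{backend: nil, opts: opts}}
    end

    @impl true
    def handle_info(:reconnect, state) do
      case connect(state.opts) do
        {:ok, pid} -> {:noreply, %{state | backend: pid}}
        {:error, _reason} -> {:noreply, state}  # stay alive; retry later
      end
    end

    # Placeholder for the real transport (SSHEx.connect/1 in this repo).
    defp connect(_opts), do: {:error, :placeholder}
  end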

mix.exs

@@ -4,7 +4,7 @@ defmodule HAHandler.MixProject do
   def project do
     [
       app: :ha_handler,
-      version: "0.4.1",
+      version: "0.4.2",
       elixir: "~> 1.12",
       start_permanent: Mix.env() == :prod,
       deps: deps(),