Fix eventual crash on failed DRBD backend, bump development version
Some checks reported errors
continuous-integration/drone/push Build encountered an error
Some checks reported errors
continuous-integration/drone/push Build encountered an error
This commit is contained in:
parent
fb3338b4d9
commit
992ff7f5ef
5 changed files with 60 additions and 23 deletions
|
@ -1,3 +1,7 @@
|
||||||
|
# 2022-07-?? - v0.4.2
|
||||||
|
|
||||||
|
* Fix eventual crash on failed DRBD backend.
|
||||||
|
|
||||||
# 2022-06-13 - v0.4.1
|
# 2022-06-13 - v0.4.1
|
||||||
|
|
||||||
* Fix crash on failed SSHEx / Postgrex connection failure.
|
* Fix crash on failed SSHEx / Postgrex connection failure.
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
defmodule HAHandler.DRBD do
|
defmodule HAHandler.DRBD do
|
||||||
|
require Logger
|
||||||
|
|
||||||
@supervisor HAHandler.DRBD.Supervisor
|
@supervisor HAHandler.DRBD.Supervisor
|
||||||
|
|
||||||
# There might be >1 resources configured in DRBD!
|
# There might be >1 resources configured in DRBD!
|
||||||
|
@ -18,6 +20,15 @@ defmodule HAHandler.DRBD do
|
||||||
@id_extraction_regex ~r/\n\s(?<id>\d+)\:\s/
|
@id_extraction_regex ~r/\n\s(?<id>\d+)\:\s/
|
||||||
@data_extraction_regex ~r/cs:(?<cs>(\w|\/)+)\sro:(?<ro>(\w|\/)+)\sds:(?<ds>(\w|\/)+)\s/
|
@data_extraction_regex ~r/cs:(?<cs>(\w|\/)+)\sro:(?<ro>(\w|\/)+)\sds:(?<ds>(\w|\/)+)\s/
|
||||||
|
|
||||||
|
# Empty state, when backend is not queryable for some reason.
|
||||||
|
@empty_state %{
|
||||||
|
hostname: "unknown",
|
||||||
|
version: "",
|
||||||
|
mode: "",
|
||||||
|
status: "unknown",
|
||||||
|
data: ""
|
||||||
|
}
|
||||||
|
|
||||||
def get_instances() do
|
def get_instances() do
|
||||||
watchers = Supervisor.which_children(@supervisor)
|
watchers = Supervisor.which_children(@supervisor)
|
||||||
|
|
||||||
|
@ -32,6 +43,8 @@ defmodule HAHandler.DRBD do
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_state({hostname, pid}) do
|
def get_state({hostname, pid}) do
|
||||||
|
empty_reply = %{@empty_state | hostname: hostname}
|
||||||
|
|
||||||
case GenServer.call(pid, {:execute, @drbd_proc_cmd}) do
|
case GenServer.call(pid, {:execute, @drbd_proc_cmd}) do
|
||||||
{:ok, raw, 0} ->
|
{:ok, raw, 0} ->
|
||||||
case Regex.named_captures(@block_regex, raw) do
|
case Regex.named_captures(@block_regex, raw) do
|
||||||
|
@ -54,21 +67,25 @@ defmodule HAHandler.DRBD do
|
||||||
|> Enum.filter(fn r -> r["id"] == @default_resource_id end)
|
|> Enum.filter(fn r -> r["id"] == @default_resource_id end)
|
||||||
|> Enum.at(0)
|
|> Enum.at(0)
|
||||||
|
|
||||||
%{
|
processed_reply = %{
|
||||||
hostname: hostname,
|
|
||||||
version: Map.get(version, "full"),
|
version: Map.get(version, "full"),
|
||||||
mode: Map.get(default_resource, "ro"),
|
mode: Map.get(default_resource, "ro"),
|
||||||
status: Map.get(default_resource, "ds"),
|
status: Map.get(default_resource, "ds"),
|
||||||
data: resources
|
data: resources
|
||||||
}
|
}
|
||||||
|
Map.merge(empty_reply, processed_reply)
|
||||||
end
|
end
|
||||||
_ ->
|
_ ->
|
||||||
{:error, "could not parse /proc/drbd"}
|
Logger.warning("Failed to query DRBD backend: could not parse /proc/drbd.")
|
||||||
|
|
||||||
end
|
end
|
||||||
{:ok, _, posix_err} ->
|
{:ok, _, posix_err} ->
|
||||||
{:error, posix_err}
|
Logger.warning("Failed to query DRBD backend: POSIX #{inspect(posix_err)}.")
|
||||||
{:error, _err} = reply ->
|
empty_reply
|
||||||
reply
|
|
||||||
|
{:error, err} ->
|
||||||
|
Logger.warning("Failed to query DRBD backend: #{inspect(err)}.")
|
||||||
|
empty_reply
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -17,7 +17,9 @@ defmodule HAHandler.DRBD.Supervisor do
|
||||||
}
|
}
|
||||||
end)
|
end)
|
||||||
|
|
||||||
opts = [strategy: :one_for_one]
|
opts = [
|
||||||
|
strategy: :one_for_one
|
||||||
|
]
|
||||||
Supervisor.init(children, opts)
|
Supervisor.init(children, opts)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -25,25 +25,32 @@ defmodule HAHandler.DRBD.Watcher do
|
||||||
|
|
||||||
@impl true
|
@impl true
|
||||||
def init(opts) do
|
def init(opts) do
|
||||||
hostname = Keyword.get(opts, :hostname)
|
|
||||||
password = Keyword.get(opts, :password)
|
|
||||||
|
|
||||||
case connect(hostname, password) do
|
|
||||||
{:ok, pid} ->
|
|
||||||
state = %{
|
state = %{
|
||||||
backend: pid,
|
backend: nil,
|
||||||
hostname: hostname,
|
last_reconnect: nil,
|
||||||
password: password
|
hostname: Keyword.get(opts, :hostname),
|
||||||
|
password: Keyword.get(opts, :password),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# This action will be processed once the GenServer is fully
|
||||||
|
# started/operational. This process handles connection failures by itself,
|
||||||
|
# as we don't want to crash loop into supervisor logic (which is only there
|
||||||
|
# to handle unexpected failure).
|
||||||
|
send self(), :reconnect
|
||||||
|
|
||||||
{:ok, state}
|
{:ok, state}
|
||||||
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
|
def handle_info(:reconnect, state = %{hostname: hostname, password: password}) do
|
||||||
|
case connect(hostname, password) do
|
||||||
|
{:ok, pid} ->
|
||||||
|
{:noreply, %{state | backend: pid}}
|
||||||
{:error, err} ->
|
{:error, err} ->
|
||||||
|
# Nothing to do, as the next request will trigger the reconnect logic
|
||||||
|
# (see :execute call).
|
||||||
|
|
||||||
# Wait for 10 seconds so that the supervisor does not loop on dead node
|
{:noreply, state}
|
||||||
# (and reach the max_restart threshold / stop trying).
|
|
||||||
Process.sleep(10_000)
|
|
||||||
|
|
||||||
{:error, err}
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -51,9 +58,16 @@ defmodule HAHandler.DRBD.Watcher do
|
||||||
def handle_call({:execute, cmd}, _from, %{backend: backend} = state) do
|
def handle_call({:execute, cmd}, _from, %{backend: backend} = state) do
|
||||||
case SSHEx.run(backend, cmd) do
|
case SSHEx.run(backend, cmd) do
|
||||||
{:ok, _output, _status} = reply->
|
{:ok, _output, _status} = reply->
|
||||||
|
{:reply, reply, state}
|
||||||
|
{:error, :closed} = reply ->
|
||||||
|
# Asynchronously try to reopen the connection to the backend.
|
||||||
|
send self(), :reconnect
|
||||||
|
|
||||||
{:reply, reply, state}
|
{:reply, reply, state}
|
||||||
{:error, _err} = reply ->
|
{:error, _err} = reply ->
|
||||||
{:error, reply, state}
|
# Do not take action on unknown error.
|
||||||
|
{:reply, reply, state}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
2
mix.exs
2
mix.exs
|
@ -4,7 +4,7 @@ defmodule HAHandler.MixProject do
|
||||||
def project do
|
def project do
|
||||||
[
|
[
|
||||||
app: :ha_handler,
|
app: :ha_handler,
|
||||||
version: "0.4.1",
|
version: "0.4.2",
|
||||||
elixir: "~> 1.12",
|
elixir: "~> 1.12",
|
||||||
start_permanent: Mix.env() == :prod,
|
start_permanent: Mix.env() == :prod,
|
||||||
deps: deps(),
|
deps: deps(),
|
||||||
|
|
Loading…
Reference in a new issue