ha-handler/lib/ha_handler/cluster.ex

109 lines
2.6 KiB
Elixir

defmodule HAHandler.Cluster do
  @moduledoc """
  Keeps this handler instance connected to the other configured handler
  instances and reports details about every known instance.

  The server state is the collection of configured instance node names,
  exactly as it was handed to `start_link/1`.
  """

  use GenServer
  require Logger

  # How long do we wait (ms) between each check/decision-making round?
  @refresh 30_000

  @doc """
  Starts the cluster server, registered locally under the module name.

  `opts` is passed to `init/1` unchanged and is used as the collection of
  configured handler instance node names.
  """
  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  @doc """
  Returns one details map per known handler instance.

  Known instances are the local node, the currently connected nodes and the
  configured instances (duplicates removed). Each entry has the keys `:node`,
  `:otp_app`, `:version`, `:uptime`, `:env` and `:status`.

  An instance whose server cannot be called, or which answers with something
  other than a map, is reported with `status: :down` and `:unknown` for every
  other field except `:node`.
  """
  @spec get_instance_details() :: [map()]
  def get_instance_details() do
    configured = GenServer.call(__MODULE__, :get_instances)

    ([Node.self() | Node.list()] ++ configured)
    |> Enum.uniq()
    |> Enum.map(&fetch_details/1)
  end

  @impl true
  def init(instances) do
    if Node.alive?() do
      Logger.info("Distribution/clustering is ENABLED.")
      Logger.info("Current handler instance is: #{Node.self()}")
      Logger.info("Configured handler instances: #{inspect(instances)}")

      # Subscribe to {:nodeup, node} / {:nodedown, node} messages.
      :net_kernel.monitor_nodes(true)

      # Kick off the first (self-rescheduling) sync round once init returns.
      send(self(), :sync)
    else
      Logger.warning("Distribution is DISABLED - skipping clustering logic")
    end

    {:ok, instances}
  end

  # Periodic round: connect to missing instances, then re-arm the timer.
  # This clause is the ONLY place that schedules the next :sync, so there is
  # always exactly one timer chain alive.
  @impl true
  def handle_info(:sync, instances) do
    connect_missing(instances)
    Process.send_after(self(), :sync, @refresh)
    {:noreply, instances}
  end

  def handle_info({:nodedown, node}, instances) do
    Logger.warning("Node #{node} went down.")
    {:noreply, instances}
  end

  def handle_info({:nodeup, node}, instances) do
    Logger.info("Node #{node} came up.")

    # Reconnect right away, but do NOT send ourselves :sync: every :sync
    # re-arms its own timer, so doing that here would start one additional
    # never-ending sync loop per node-up event.
    connect_missing(instances)
    {:noreply, instances}
  end

  # Defining handle_info/2 replaces the default provided by `use GenServer`;
  # without this clause any stray message would crash the server.
  def handle_info(message, instances) do
    Logger.warning("Ignoring unexpected message: #{inspect(message)}")
    {:noreply, instances}
  end

  # Replies with details about the local instance.
  @impl true
  def handle_call(:get_details, _from, instances) do
    {uptime_ms, _since_last_call} = :erlang.statistics(:wall_clock)

    local_details = %{
      node: Node.self(),
      otp_app: HAHandler.otp_app(),
      version: HAHandler.version(),
      # Wall-clock uptime converted from milliseconds to rounded minutes.
      uptime: round(uptime_ms / 1_000 / 60),
      env: HAHandler.env()
    }

    {:reply, local_details, instances}
  end

  # Replies with the configured instances (the server state).
  def handle_call(:get_instances, _from, instances) do
    {:reply, instances, instances}
  end

  # Tries to connect to every configured instance that is not already part of
  # our network/cluster, logging the outcome of each attempt.
  defp connect_missing(instances) do
    current_network = [Node.self() | Node.list()]

    instances
    |> Enum.reject(&(&1 in current_network))
    |> Enum.each(fn node_name ->
      case Node.connect(node_name) do
        true ->
          Logger.info("Connected to handler instance #{node_name}")

        _ ->
          Logger.warning("Could not connect to handler instance #{node_name}")
      end
    end)
  end

  # Asks the cluster server on `node` for its details and tags them with a
  # status. Falls back to a placeholder entry when the call exits or when the
  # remote side answers with something that is not a map.
  defp fetch_details(node) do
    case GenServer.call({__MODULE__, node}, :get_details) do
      %{} = details ->
        Map.put(details, :status, :up)

      other ->
        Logger.warning(
          "Handler instance #{node} returned unexpected details: #{inspect(other)}"
        )

        unknown_details(node)
    end
  catch
    :exit, _reason -> unknown_details(node)
  end

  # Placeholder entry for an instance we could not get valid details from.
  defp unknown_details(node) do
    %{
      node: node,
      otp_app: :unknown,
      version: :unknown,
      uptime: :unknown,
      env: :unknown,
      status: :down
    }
  end
end