From be41fd4e634a16c1c47ff44812a8870ae6771d3d Mon Sep 17 00:00:00 2001 From: Walton Hoops Date: Mon, 7 Jul 2025 14:50:27 -0600 Subject: [PATCH] chore: log max run_queue and don't fail health checks based on it --- apps/health/lib/health/checkers/run_queue.ex | 88 +++---------------- .../test/health/checkers/run_queue_test.exs | 50 ++--------- 2 files changed, 19 insertions(+), 119 deletions(-) diff --git a/apps/health/lib/health/checkers/run_queue.ex b/apps/health/lib/health/checkers/run_queue.ex index 3da23ae78..3eb44f6cc 100644 --- a/apps/health/lib/health/checkers/run_queue.ex +++ b/apps/health/lib/health/checkers/run_queue.ex @@ -1,90 +1,24 @@ defmodule Health.Checkers.RunQueue do @moduledoc """ - Health check which makes sure the Erlang [Run - Queue](http://erlang.org/doc/man/erlang.html#statistics-1) is reasonably - low. + Health check for monitoring the Erlang [Run + Queue](http://erlang.org/doc/man/erlang.html#statistics-1). + + This check always returns healthy as we don't want to kill tasks based on the run queue length. + Instead it logs the maximum run queue length across all schedulers for monitoring purposes. """ require Logger def current do - [run_queue: queue_size()] + [run_queue: max_queue_length()] end def healthy? do - h? = queue_size() <= max_run_queue_length() - - _ = log_processes(h?, Logger.level()) - - h? - end - - defp max_run_queue_length, do: 100 - - defp queue_size do - :erlang.statistics(:run_queue) - end - - def log_processes(false, level) when level in [:info, :debug] do - spawn(fn -> - for line <- log_lines() do - _ = Logger.info(line) - end - end) - - :logged - end - - def log_processes(_, _) do - :ignored - end - - def log_lines do - start_time = System.monotonic_time() - - for pid <- Process.list() do - # lt short for log time - "process_info pid=#{inspect(pid)} lt=#{start_time} #{log_info(pid)}" - end - end - - def log_info(pid) do - info = - Process.info( - pid, - ~w(current_function initial_call status message_queue_len priority total_heap_size heap_size stack_size reductions dictionary registered_name memory)a - ) - - log_info_iodata(info) - end - - defp log_info_iodata(info) when is_list(info) do - info = - if initial_call = info[:dictionary][:"$initial_call"] do - Keyword.put(info, :initial_call, initial_call) - else - info - end - - info = Keyword.delete(info, :dictionary) - - for {k, v} <- info do - [Atom.to_string(k), "=", pid_log(v), " "] - end - end - - defp log_info_iodata(nil) do - ["status=dead"] - end - - defp pid_log({m, f, a}) when is_atom(m) and is_atom(f) and a >= 0 do - [?", Atom.to_string(m), ?., Atom.to_string(f), ?/, Integer.to_string(a), ?"] - end - - defp pid_log(atom) when is_atom(atom) do - Atom.to_string(atom) + max_length = max_queue_length() + _ = Logger.info("run_queue_check max_run_queue_length=#{max_length}") + true end - defp pid_log(other) do - inspect(other) + defp max_queue_length do + Enum.max(:erlang.statistics(:run_queue_lengths)) end end diff --git a/apps/health/test/health/checkers/run_queue_test.exs b/apps/health/test/health/checkers/run_queue_test.exs index 7c245692a..a8ba6e346 100644 --- a/apps/health/test/health/checkers/run_queue_test.exs +++ b/apps/health/test/health/checkers/run_queue_test.exs @@ -2,51 +2,17 @@ defmodule Health.Checkers.RunQueueTest do use ExUnit.Case import Health.Checkers.RunQueue - describe "log_processes/2" do - test "logs if we're not healthy and the log level is low enough" do - assert log_processes(false, :info) == :logged - assert log_processes(false, :debug) == :logged - end - - test "does nothing when we're healthy" do - assert log_processes(true, :info) == :ignored - end - - test "does nothing when the log level is high" do - assert log_processes(false, :warning) == :ignored - end - end - - describe "log_lines/0" do - test "one line per alive process" do - lines = log_lines() - assert length(lines) >= length(Process.list()) + describe "healthy?/0" do + test "always returns true" do + assert healthy?() == true end end - describe "log_info/1" do - test "logs information about the process" do - binary = IO.iodata_to_binary(log_info(self())) - assert binary =~ ~s(current_function="Elixir.Process.info/2") - assert binary =~ ~s(initial_call="erlang.apply/2") - assert binary =~ ~s(message_queue_len=0) - assert binary =~ ~s(status=running) - end - - test "overrides initial call if present in process dictionary" do - # GenServers set this - {:ok, pid} = Agent.start_link(fn -> :ok end) - binary = IO.iodata_to_binary(log_info(pid)) - - assert binary =~ - ~s(initial_call="Elixir.Health.Checkers.RunQueueTest.-test log_info/1 overrides initial call if present in process dictionary/1-fun-0-/0") - end - - test "logs a dead process" do - {:ok, pid} = Agent.start_link(fn -> :ok end) - Agent.stop(pid) - binary = IO.iodata_to_binary(log_info(pid)) - assert binary =~ ~s(status=dead) + describe "current/0" do + test "returns the current run queue size" do + [run_queue: size] = current() + assert is_integer(size) + assert size >= 0 end end end