From 53481c99598314093c5f0f957032c36b22eba23c Mon Sep 17 00:00:00 2001 From: Reinaldy Rafli Date: Wed, 29 Apr 2026 09:32:47 +0700 Subject: [PATCH 1/4] fix: add more things to inspect failures --- action.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/action.yaml b/action.yaml index 601aaa13099..3877efc5bdb 100644 --- a/action.yaml +++ b/action.yaml @@ -264,3 +264,9 @@ runs: echo "::group::Inspect failure - docker compose logs" docker compose logs echo "::endgroup::" + echo "::group::Inspect failure - docker stats" + docker stats --no-stream + echo "::endgroup::" + echo "::group::Inspect failure - disk i/o stats" + iostat -x 1 + echo "::endgroup::" From 7abcbcdd049b79286d132962cb320ff5235e3297 Mon Sep 17 00:00:00 2001 From: Reinaldy Rafli Date: Wed, 29 Apr 2026 14:45:28 +0700 Subject: [PATCH 2/4] fix(ci): declare timeout minutes --- .github/workflows/test.yml | 2 ++ action.yaml | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e24bdca5251..4c6e02d4dc2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,6 +26,7 @@ jobs: unit-test: if: github.repository_owner == 'getsentry' runs-on: ${{ matrix.os }} + timeout-minutes: 30 strategy: matrix: os: [ubuntu-24.04, ubuntu-24.04-arm] @@ -40,6 +41,7 @@ jobs: integration-test: if: github.repository_owner == 'getsentry' runs-on: ${{ matrix.os }} + timeout-minutes: 30 strategy: fail-fast: false matrix: diff --git a/action.yaml b/action.yaml index 3877efc5bdb..5846f1f4f28 100644 --- a/action.yaml +++ b/action.yaml @@ -267,6 +267,3 @@ runs: echo "::group::Inspect failure - docker stats" docker stats --no-stream echo "::endgroup::" - echo "::group::Inspect failure - disk i/o stats" - iostat -x 1 - echo "::endgroup::" From 3cf271a560474c3f5a09d81326489e09ef5d0dcd Mon Sep 17 00:00:00 2001 From: Reinaldy Rafli Date: Tue, 5 May 2026 11:09:43 +0700 Subject: [PATCH 3/4] feat: longer kafka socket timeout duration --- 
sentry/sentry.conf.example.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sentry/sentry.conf.example.py b/sentry/sentry.conf.example.py index 3bdc3e1a19d..4af5e13744b 100644 --- a/sentry/sentry.conf.example.py +++ b/sentry/sentry.conf.example.py @@ -221,7 +221,12 @@ def get_internal_network(): DEFAULT_KAFKA_OPTIONS = { "bootstrap.servers": "kafka:9092", "message.max.bytes": 50000000, - "socket.timeout.ms": 1000, + "socket.timeout.ms": 10000, + "request.timeout.ms": 30000, + "retries": 5, + "retry.backoff.ms": 1000, + "reconnect.backoff.ms": 1000, + "reconnect.backoff.max.ms": 10000, } SENTRY_EVENTSTREAM = "sentry.eventstream.kafka.KafkaEventStream" From ebf12465ecd909a7be1c865ec826cdd0c1d5db76 Mon Sep 17 00:00:00 2001 From: Reinaldy Rafli Date: Tue, 5 May 2026 21:27:08 +0700 Subject: [PATCH 4/4] feat: tune kafka to be more lenient --- docker-compose.yml | 4 ++-- sentry/sentry.conf.example.py | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index b5d768f8ca0..d67949aa444 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -196,8 +196,8 @@ services: KAFKA_TOOLS_LOG4J_LOGLEVEL: "WARN" ulimits: nofile: - soft: 4096 - hard: 4096 + soft: 100000 + hard: 100000 volumes: - "sentry-kafka:/var/lib/kafka/data" - "sentry-kafka-log:/var/lib/kafka/log" diff --git a/sentry/sentry.conf.example.py b/sentry/sentry.conf.example.py index 4af5e13744b..b901986ef9a 100644 --- a/sentry/sentry.conf.example.py +++ b/sentry/sentry.conf.example.py @@ -221,12 +221,17 @@ def get_internal_network(): DEFAULT_KAFKA_OPTIONS = { "bootstrap.servers": "kafka:9092", "message.max.bytes": 50000000, - "socket.timeout.ms": 10000, - "request.timeout.ms": 30000, - "retries": 5, - "retry.backoff.ms": 1000, - "reconnect.backoff.ms": 1000, - "reconnect.backoff.max.ms": 10000, + "socket.timeout.ms": 10000, # Timeout for individual socket operations (send/recv) + "request.timeout.ms": 30000, # Max time 
to wait for a broker response before failing + "retries": 5, # Number of retries for transient/retriable request failures + "retry.backoff.ms": 1000, # Wait time between retry attempts + "reconnect.backoff.ms": 1000, # Initial wait before reconnecting after a lost connection + "reconnect.backoff.max.ms": 10000, # Upper bound for exponential backoff on reconnect attempts + # Session & heartbeat — must satisfy: + # heartbeat.interval.ms < session.timeout.ms < max.poll.interval.ms + "session.timeout.ms": 60000, # Grace period before broker evicts an unresponsive consumer (default: 45s) + "heartbeat.interval.ms": 20000, # How often the consumer sends a heartbeat — should be no more than 1/3 of session.timeout.ms + "max.poll.interval.ms": 600000, # Max allowed time between poll() calls before the consumer is considered dead } SENTRY_EVENTSTREAM = "sentry.eventstream.kafka.KafkaEventStream"