-
Notifications
You must be signed in to change notification settings - Fork 17.2k
feat: add scheduler observability metrics #68068
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
92507c2
01c7992
71c00e1
18648f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1223,12 +1223,16 @@ def _is_tracing_enabled(): | |
| return conf.getboolean("traces", "otel_on") | ||
|
|
||
| def _process_executor_events(self, executor: BaseExecutor, session: Session) -> int: | ||
| return SchedulerJobRunner.process_executor_events( | ||
| executor=executor, | ||
| job_id=self.job.id, | ||
| scheduler_dag_bag=self.scheduler_dag_bag, | ||
| session=session, | ||
| ) | ||
| try: | ||
| return SchedulerJobRunner.process_executor_events( | ||
| executor=executor, | ||
| job_id=self.job.id, | ||
| scheduler_dag_bag=self.scheduler_dag_bag, | ||
| session=session, | ||
| ) | ||
| except Exception as exc: | ||
| stats.incr("scheduler.executor_events.failed", tags={"reason": type(exc).__name__}) | ||
| raise | ||
|
|
||
| @classmethod | ||
| def process_executor_events( | ||
|
|
@@ -1266,6 +1270,7 @@ def process_executor_events( | |
| """ | ||
| ti_primary_key_to_try_number_map: dict[tuple[str, str, str, int], int] = {} | ||
| event_buffer = executor.get_event_buffer() | ||
| num_events = len(event_buffer) | ||
| tis_with_right_state: list[TaskInstanceKey] = [] | ||
| callback_keys_with_events: list[CallbackKey] = [] | ||
|
|
||
|
|
@@ -1327,11 +1332,15 @@ def process_executor_events( | |
|
|
||
| # Return if no finished tasks | ||
| if not tis_with_right_state: | ||
| stats.gauge("scheduler.executor_events.batch_size", num_events) | ||
| stats.incr("scheduler.executor_events.processed", num_events) | ||
| return len(event_buffer) | ||
|
|
||
| # Check state of finished tasks | ||
| filter_for_tis = TI.filter_for_tis(tis_with_right_state) | ||
| if filter_for_tis is None: | ||
| stats.gauge("scheduler.executor_events.batch_size", num_events) | ||
| stats.incr("scheduler.executor_events.processed", num_events) | ||
| return len(event_buffer) | ||
| asset_loader, alias_loader = _eager_load_dag_run_for_validation() | ||
| query = ( | ||
|
|
@@ -1548,6 +1557,8 @@ def process_executor_events( | |
| # Update task state - emails are handled by DAG processor now | ||
| ti.handle_failure(error=msg, session=session) | ||
|
|
||
| stats.gauge("scheduler.executor_events.batch_size", num_events) | ||
| stats.incr("scheduler.executor_events.processed", num_events) | ||
| return len(event_buffer) | ||
|
|
||
| def _execute(self) -> int | None: | ||
|
|
@@ -1583,7 +1594,8 @@ def _execute(self) -> int | None: | |
|
|
||
| if settings.Session is not None: | ||
| settings.Session.remove() | ||
| except Exception: | ||
| except Exception as exc: | ||
| stats.incr("scheduler.loop_exceptions", tags={"exception_class": type(exc).__name__}) | ||
| self.log.exception("Exception when executing SchedulerJob._run_scheduler_loop") | ||
| raise | ||
| finally: | ||
|
|
@@ -3221,8 +3233,12 @@ def adopt_or_reset_orphaned_tasks(self, *, session: Session = NEW_SESSION) -> in | |
|
|
||
| stats.incr("scheduler.orphaned_tasks.cleared", len(to_reset)) | ||
| stats.incr("scheduler.orphaned_tasks.adopted", len(tis_to_adopt_or_reset) - len(to_reset)) | ||
|
|
||
| if to_reset: | ||
| stats.incr( | ||
| "scheduler.zombies.detected", | ||
| len(to_reset), | ||
| tags={"reason": "adopt_failure"}, | ||
| ) | ||
|
Comment on lines
+3237
to
+3241
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe there is a distinction between a zombie task (a task that remains stuck in a running state even though its associated job is inactive) and an orphaned task (a task that has lost its executor). This metric appears to track orphaned tasks that the executor failed to adopt for any reason. Since this overlaps with |
||
| task_instance_str = "\n\t".join(reset_tis_message) | ||
|
Comment on lines
+3236
to
3242
Comment on lines
+3236
to
3242
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these can be merged in a single block
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in 79f3197 |
||
| self.log.info( | ||
| "Reset the following %s orphaned TaskInstances:\n\t%s", | ||
|
|
@@ -3369,6 +3385,11 @@ def _find_and_purge_task_instances_without_heartbeats(self) -> None: | |
| if task_instances_without_heartbeats := self._find_task_instances_without_heartbeats( | ||
| session=session | ||
| ): | ||
| stats.incr( | ||
| "scheduler.zombies.detected", | ||
| len(task_instances_without_heartbeats), | ||
| tags={"reason": "heartbeat_timeout"}, | ||
| ) | ||
| self._purge_task_instances_without_heartbeats( | ||
| task_instances_without_heartbeats, session=session | ||
| ) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why do you use two different tags for the same data between
scheduler.executor_events.failedandscheduler.loop_exceptions?