Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/impulse/docs/config/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ Maps the silver-layer input tables.
| `channels_uri` | `str` | Yes | Full Unity Catalog path. Time-series sample data. |
| `container_tags_table` | `str` | No | Full Unity Catalog path. Container EAV tags. |
| `channel_tags_table` | `str` | No | Full Unity Catalog path. Channel EAV tags. |
| `channel_mapping_table` | `str` | No | Full Unity Catalog path. Logical-to-physical channel alias table. Required when using `QueryBuilder.channel_with_alias()` (currently supported by `KeyValueStoreSolver`). |
| `channel_mapping_table` | `str` | No | Full Unity Catalog path. Logical-to-physical channel alias table. Required when using `QueryBuilder.channel_with_alias()` (currently supported by `KeyValueStoreSolver`). In reporting mode the resolved alias-to-physical-channel mapping is materialized to the gold-layer [`channel_mapping_resolution_dimension`](../data_model/gold_layer_event_normalized.md#dimension-tables). |
| `unit_conversion_table` | `str` | No | Full Unity Catalog path. Per-unit-family conversion factors. When configured together with a `channel_mapping_table` whose rows carry `source_unit` / `target_unit` columns, aliased selectors auto-convert values from source to target unit during `solve()` (currently supported by `KeyValueStoreSolver`). |

Tag tables are required for solvers that consume tag-based filters
Expand Down
22 changes: 22 additions & 0 deletions docs/impulse/docs/data_model/gold_layer_event_normalized.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,18 @@ histogram2d_fact {
timestamp _created_at
}

channel_mapping_resolution_dimension {
long container_id FK
long channel_id
string channel_name
string data_key
string channel_alias
string priority "nullable"
string source_unit "optional"
string target_unit "optional"
timestamp _created_at
}

histogram_fact }o--|| event_dimension: event_id
histogram2d_fact }o--|| event_dimension: event_id
stats_aggregator_fact }o--|| event_instance_fact: event_instance_id
Expand All @@ -155,6 +167,8 @@ measurement_dimension ||--o{ histogram_fact : container_id
measurement_dimension ||--o{ histogram2d_fact : container_id
measurement_dimension ||--o{ stats_aggregator_fact : container_id

measurement_dimension ||--o{ channel_mapping_resolution_dimension : container_id

event_instance_fact }o--|| event_dimension: event_id
```

Expand Down Expand Up @@ -186,3 +200,11 @@ guaranteed.
| `{prefix}_stats_aggregator_dimension` | `visual_id`, `report_id` | Statistics metadata (signals, aggregation labels). |
| `{prefix}_event_dimension` | `event_id`, `report_id` | Event definitions (name, expression, required channels). |
| `{prefix}_measurement_dimension` | `container_id` | Container metadata. Always carries `container_id`, `config_hash`, `_created_at`; additional columns are populated from [`config.measurement_dimensions`](../config/configuration.md#measurement_dimensions-optional). |
| `{prefix}_channel_mapping_resolution_dimension` | `container_id`, `channel_id`, `channel_alias` | Resolves each channel alias to its physical channel per container (physical join keys, alias `priority`). Written only when the report uses aliased selectors. The `source_unit` / `target_unit` columns are present only when a [`config.unit_conversion_table`](../config/configuration.md) is configured. |

The `{prefix}_channel_mapping_resolution_dimension` table lets BI consumers join
a fact back to the physical channel that an alias resolved to: join on
`(container_id, channel_id, channel_alias)`. The names of the join-key columns
and of the `channel_alias`, `priority`, and `source_unit` / `target_unit`
columns follow the [`channel_mapping` solver config](../config/configuration.md) —
see the column reference there for how each maps to the alias and metrics tables.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import abc
import operator
import zlib
from collections.abc import Callable
from collections.abc import Callable, Iterable
from typing import TYPE_CHECKING, Any

import pyspark.sql.types as T
Expand Down Expand Up @@ -197,6 +197,48 @@ def get_selectors(self) -> list["TimeSeriesSelector"]:
"""
pass

@staticmethod
def collect_selectors(
    expressions: Iterable[Any],
    uses_alias: bool | None = None,
) -> list["TimeSeriesSelector"]:
    """Gather the unique leaf selectors reachable from ``expressions``.

    Every item that is a :class:`TimeSeriesExpression` is asked for its
    leaf selectors via ``get_selectors()``; all other items are ignored.
    The leaves are optionally filtered by their ``uses_alias`` flag and
    then deduplicated on ``selector_id``, keeping the first occurrence so
    the result reflects discovery order.

    Parameters
    ----------
    expressions : Iterable[Any]
        Items to inspect. Entries that are not ``TimeSeriesExpression``
        instances are skipped without raising.
    uses_alias : bool or None, optional
        ``True`` keeps only selectors whose ``uses_alias`` is true,
        ``False`` keeps only those whose ``uses_alias`` is false, and
        ``None`` (default) disables the filter.

    Returns
    -------
    list of TimeSeriesSelector
        One selector per distinct ``selector_id``, in the order each id
        was first encountered.
    """
    # Lazily flatten expression -> leaf selectors, applying the optional
    # alias filter on the way through.
    matching_leaves = (
        leaf
        for item in expressions
        if isinstance(item, TimeSeriesExpression)
        for leaf in item.get_selectors()
        if uses_alias is None or leaf.uses_alias == uses_alias
    )

    # dicts keep insertion order and ``setdefault`` never overwrites, so
    # the first selector seen for each id wins.
    first_by_id: dict[Any, "TimeSeriesSelector"] = {}
    for leaf in matching_leaves:
        first_by_id.setdefault(leaf.selector_id, leaf)
    return list(first_by_id.values())

@abc.abstractmethod
def __str__(self) -> str:
"""
Expand Down
36 changes: 6 additions & 30 deletions src/impulse_query_engine/analyze/query/query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,34 +174,6 @@ def select(self, *args) -> Self:
self.selections = list(args)
return self

def _collect_time_series_selectors(self, uses_alias=None) -> list[TimeSeriesSelector]:
    """Return the unique leaf selectors used by ``self.selections``.

    Only selections that are ``TimeSeriesExpression`` instances are
    walked; their leaf selectors are optionally filtered by
    ``uses_alias`` and deduplicated on ``selector_id``.

    Parameters
    ----------
    uses_alias : bool or None, optional
        ``True`` restricts the result to alias selectors, ``False`` to
        direct selectors, and ``None`` (default) applies no restriction.

    Returns
    -------
    list of TimeSeriesSelector
        Deduplicated selectors, ordered by first discovery.
    """
    filter_active = uses_alias is not None
    # Insertion-ordered mapping: the first selector seen per id is kept.
    unique_by_id = {}
    for candidate in self.selections:
        if isinstance(candidate, TimeSeriesExpression):
            for leaf in candidate.get_selectors():
                if filter_active and leaf.uses_alias != uses_alias:
                    continue
                unique_by_id.setdefault(leaf.selector_id, leaf)
    return list(unique_by_id.values())

def _determine_result_objects_dtypes(self, default_dtype: T = T.DoubleType()):
"""
Determine result objects and their data types for the selections.
Expand Down Expand Up @@ -261,8 +233,12 @@ def solve(
) = self._determine_result_objects_dtypes()

# extract selectors upfront
direct_selectors = self._collect_time_series_selectors(uses_alias=False)
aliased_selectors = self._collect_time_series_selectors(uses_alias=True)
direct_selectors = TimeSeriesExpression.collect_selectors(
self.selections, uses_alias=False
)
aliased_selectors = TimeSeriesExpression.collect_selectors(
self.selections, uses_alias=True
)

# create Query
tags_df = solver.filter_container_tags(spark, self)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,15 @@ def filter_aliased_channel_metrics(
Returns
-------
pyspark.sql.DataFrame
DataFrame with ``(container_id, channel_id, selector_ids)``
where ``selector_ids`` is an array column. When unit conversion
is active (see above), also carries ``source_unit`` and
``target_unit`` columns.
DataFrame with
``(container_id, channel_id, <metrics-side join keys>,
channel_alias, alias_priority, selector_ids)`` where
``selector_ids`` is an array column. The metrics-side join key
columns come from ``effective_alias_join_keys`` (default:
``channel_name``, ``data_key``) and are deduplicated in case the
same physical column appears on both sides of a join-key tuple.
When unit conversion is active (see above), also carries
``source_unit`` and ``target_unit`` columns.
"""
container_id_col = self.config.container_id_col
channel_id_col = self.config.channel_id_col
Expand Down Expand Up @@ -488,7 +493,15 @@ def filter_aliased_channel_metrics(
resolved = resolved.withColumn(
"selector_ids", F.array(self._build_selector_id_expr(selectors))
)
out_cols = [container_id_col, channel_id_col, "selector_ids"]
join_key_metrics_cols = list(dict.fromkeys(metrics_col for _, metrics_col in join_keys))
out_cols = [
container_id_col,
channel_id_col,
*join_key_metrics_cols,
channel_alias_col,
alias_priority_col,
"selector_ids",
]
if has_unit_cols:
out_cols.extend([source_unit_col, target_unit_col])
return resolved.select(*out_cols)
Expand Down Expand Up @@ -543,8 +556,22 @@ def resolve_channel_selections(
and target_unit_col in aliased_channel_metrics_df.columns
)

# ``filter_aliased_channel_metrics`` emits extra columns
# (metrics-side join keys, channel_alias, alias_priority) for the
# channel mapping resolution dimension; the solve pipeline only
# consumes (container_id, channel_id, selector_ids[, source_unit,
# target_unit]) and unionByName requires matching schemas.
aliased_solve_cols = [
self.config.container_id_col,
self.config.channel_id_col,
"selector_ids",
]
if has_unit_cols:
aliased_solve_cols.extend([source_unit_col, target_unit_col])
aliased_for_union = aliased_channel_metrics_df.select(*aliased_solve_cols)

merged = channel_metrics_df.unionByName(
aliased_channel_metrics_df, allowMissingColumns=has_unit_cols
aliased_for_union, allowMissingColumns=has_unit_cols
)

agg_exprs = [F.flatten(F.collect_list("selector_ids")).alias("selector_ids")]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,12 @@ def filter_aliased_channel_metrics(
Returns
-------
pyspark.sql.DataFrame
DataFrame with ``(container_id, channel_id, selector_ids)``
where ``selector_ids`` is an array column.
DataFrame with
``(container_id, channel_id, <metrics-side join keys>,
channel_alias, alias_priority, selector_ids)`` where
``selector_ids`` is an array column. Implementations that
support unit conversion additionally include ``source_unit``
and ``target_unit`` columns.
"""
raise NotImplementedError(
f"{self.__class__.__name__} does not support aliased channel resolution"
Expand Down
54 changes: 53 additions & 1 deletion src/impulse_reporting/core/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@
from impulse_reporting.incremental.definition_hash_comparator import (
DefinitionHashComparator,
)
from impulse_reporting.meta.container_dimensions import ContainerDimension
from impulse_reporting.meta.container_dimensions import (
ChannelMappingResolutionDimension,
ContainerDimension,
)
from impulse_reporting.persist.report_storage import (
ReportEntityTransformer,
Sink,
Expand Down Expand Up @@ -96,6 +99,7 @@ def __init__(
self.aggregation_dfs = {}
self.aggregation_metadata_dfs = {}
self.container_dimension_df = None
self.channel_mapping_resolution_dimension_df = None
self._is_incremental = None

if config:
Expand Down Expand Up @@ -591,6 +595,12 @@ def _persist_full(self):
uri = writer.get_output_uri()
writer.write(self.container_dimension_df, uri=uri)

# persist channel mapping resolution dimension
if self.channel_mapping_resolution_dimension_df is not None:
writer = storage_factory.create_channel_mapping_resolution_dimension_writer()
uri = writer.get_output_uri()
writer.write(self.channel_mapping_resolution_dimension_df, uri=uri)

@telemetry_logger("report", "determine_report")
def _persist_incremental(
self,
Expand Down Expand Up @@ -738,6 +748,25 @@ def _persist_incremental(
df_enriched = self.container_dimension_df.transform(transformer.add_meta_information)
self.sink.upsert(df_enriched, uri, ["container_id"])

# Persist channel mapping resolution dimension
# (upsert by container_id, channel_id, channel_alias)
if self.channel_mapping_resolution_dimension_df is not None:
writer = storage_factory.create_channel_mapping_resolution_dimension_writer()
uri = writer.get_output_uri()
df_enriched = self.channel_mapping_resolution_dimension_df.transform(
transformer.add_meta_information
)
solver_cfg = self.solver.config
self.sink.upsert(
df_enriched,
uri,
[
solver_cfg.container_id_col,
solver_cfg.channel_id_col,
solver_cfg.channel_alias_col,
],
)

def _transform_for_persistence(
self,
df: DataFrame,
Expand Down Expand Up @@ -1001,6 +1030,29 @@ def determine_report(self, is_incremental: bool = None):
pre_filtered_containers_df=pre_filtered_containers_df,
)

# Determine channel mapping resolution dimension.
# Mirror the fact split: aliases from changed definitions resolve
# over all containers, aliases only in unchanged definitions stay
# scoped to the incrementally-detected containers.
changed_aliased_selectors = TimeSeriesExpression.collect_selectors(
all_changed_expressions,
uses_alias=True,
)
unchanged_aliased_selectors = TimeSeriesExpression.collect_selectors(
all_unchanged_expressions,
uses_alias=True,
)
self.channel_mapping_resolution_dimension_df = (
ChannelMappingResolutionDimension.get_dimension_for_scopes(
spark=self.spark,
query=self.query,
solver=self.solver,
changed_aliased_selectors=changed_aliased_selectors,
unchanged_aliased_selectors=unchanged_aliased_selectors,
pre_filtered_containers_df=pre_filtered_containers_df,
)
)

def _resolve_is_incremental(self, is_incremental: bool = None) -> bool:
"""
Resolve the processing mode considering signature, config, and gold layer.
Expand Down
Loading
Loading