From 8db392e3849feb38fa1cfe41a51c6c0e5840ed5e Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 11 Feb 2026 18:47:54 +0000 Subject: [PATCH 1/2] refactor: fix pull_up_select disorder the columns of readtable nodes --- bigframes/core/rewrite/select_pullup.py | 9 ++++----- .../snapshots/test_binary_compiler/test_corr/out.sql | 4 ++-- .../snapshots/test_binary_compiler/test_cov/out.sql | 4 ++-- .../snapshots/test_nullary_compiler/test_size/out.sql | 6 +++--- .../snapshots/test_unary_compiler/test_mean/out.sql | 2 +- .../snapshots/test_unary_compiler/test_std/out.sql | 2 +- .../test_compile_explode_dataframe/out.sql | 2 +- .../test_compile_explode_series/out.sql | 4 ++-- 8 files changed, 16 insertions(+), 17 deletions(-) diff --git a/bigframes/core/rewrite/select_pullup.py b/bigframes/core/rewrite/select_pullup.py index 415182f8840..a15aba7663f 100644 --- a/bigframes/core/rewrite/select_pullup.py +++ b/bigframes/core/rewrite/select_pullup.py @@ -54,13 +54,12 @@ def pull_up_source_ids(node: nodes.ReadTableNode) -> nodes.BigFrameNode: if all(id.sql == source_id for id, source_id in node.scan_list.items): return node else: - source_ids = sorted( - set(scan_item.source_id for scan_item in node.scan_list.items) - ) new_scan_list = nodes.ScanList.from_items( [ - nodes.ScanItem(identifiers.ColumnId(source_id), source_id) - for source_id in source_ids + nodes.ScanItem( + identifiers.ColumnId(scan_item.source_id), scan_item.source_id + ) + for scan_item in node.scan_list.items ] ) new_source = dataclasses.replace(node, scan_list=new_scan_list) diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_corr/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_corr/out.sql index 5c838f48827..08272882e6b 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_corr/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_corr/out.sql @@ -1,7 +1,7 @@ WITH `bfcte_0` AS ( SELECT - `float64_col`, - `int64_col` + `int64_col`, + `float64_col` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_cov/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_cov/out.sql index eda082250a6..7f4463e3b8e 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_cov/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_binary_compiler/test_cov/out.sql @@ -1,7 +1,7 @@ WITH `bfcte_0` AS ( SELECT - `float64_col`, - `int64_col` + `int64_col`, + `float64_col` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql index ed8e0c7619d..d5f599b5da7 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql @@ -4,17 +4,17 @@ WITH `bfcte_0` AS ( `bytes_col`, `date_col`, `datetime_col`, - `duration_col`, - `float64_col`, `geography_col`, `int64_col`, `int64_too`, `numeric_col`, + `float64_col`, `rowindex`, `rowindex_2`, `string_col`, `time_col`, - `timestamp_col` + `timestamp_col`, + `duration_col` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_mean/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_mean/out.sql index 2f9d540776f..74319b646f2 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_mean/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_mean/out.sql @@ -1,8 +1,8 @@ WITH `bfcte_0` AS ( SELECT `bool_col`, - `duration_col`, `int64_col`, + `duration_col`, `int64_col` AS `bfcol_6`, `bool_col` AS `bfcol_7`, `duration_col` AS `bfcol_8` diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql index bc744258913..c57abdba4b5 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql @@ -1,8 +1,8 @@ WITH `bfcte_0` AS ( SELECT `bool_col`, - `duration_col`, `int64_col`, + `duration_col`, `int64_col` AS `bfcol_6`, `bool_col` AS `bfcol_7`, `duration_col` AS `bfcol_8` diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql index 5d9019439f2..4f05929e0c7 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql @@ -1,7 +1,7 @@ WITH `bfcte_0` AS ( SELECT - `int_list_col`, `rowindex`, + `int_list_col`, `string_list_col` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` ), `bfcte_1` AS ( diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql index 8ba4559da83..d5b42741d31 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql @@ -1,7 +1,7 @@ WITH `bfcte_0` AS ( SELECT - `int_list_col`, - `rowindex` + `rowindex`, + `int_list_col` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` ), `bfcte_1` AS ( SELECT From a62598e1673f1197ec4726ef7218b58294b965d4 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 11 Feb 2026 19:42:58 +0000 Subject: [PATCH 2/2] refactor: enable SELECT * optimizations in sqlglot compiler --- bigframes/core/compile/sqlglot/compiler.py | 16 +++++++----- bigframes/core/compile/sqlglot/sqlglot_ir.py | 25 +++++++++++-------- bigframes/core/sql_nodes.py | 4 +++ .../test_nullary_compiler/test_size/out.sql | 16 +----------- .../out.sql | 4 +-- .../out.sql | 3 +-- .../out.sql | 16 +----------- 7 files changed, 32 insertions(+), 52 deletions(-) diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 786c5a1ed1f..d74c1b38696 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -153,13 +153,17 @@ def compile_sql_select(node: sql_nodes.SqlSelectNode, child: ir.SQLGlotIR): for ordering in node.sorting ) - projected_cols: tuple[tuple[str, sge.Expression], ...] = tuple( - ( - cdef.id.sql, - expression_compiler.expression_compiler.compile_expression(cdef.expression), + projected_cols: tuple[tuple[str, sge.Expression], ...] = tuple() + if not node.is_star_selection: + projected_cols = tuple( + ( + cdef.id.sql, + expression_compiler.expression_compiler.compile_expression( + cdef.expression + ), + ) + for cdef in node.selections ) - for cdef in node.selections - ) sge_predicates = tuple( expression_compiler.expression_compiler.compile_expression(expression) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index d0bd32697c4..efe5e09aff2 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -150,7 +150,7 @@ def from_table( if sql_predicate: select_expr = sge.Select().select(sge.Star()).from_(table_expr) select_expr = select_expr.where( - sg.parse_one(sql_predicate, dialect="bigquery"), append=False + sg.parse_one(sql_predicate, dialect=cls.dialect), append=False ) return cls(expr=select_expr, uid_gen=uid_gen) @@ -172,16 +172,19 @@ def select( if len(sorting) > 0: new_expr = new_expr.order_by(*sorting) - to_select = [ - sge.Alias( - this=expr, - alias=sge.to_identifier(id, quoted=self.quoted), - ) - if expr.alias_or_name != id - else expr - for id, expr in selections - ] - new_expr = new_expr.select(*to_select, append=False) + if len(selections) > 0: + to_select = [ + sge.Alias( + this=expr, + alias=sge.to_identifier(id, quoted=self.quoted), + ) + if expr.alias_or_name != id + else expr + for id, expr in selections + ] + new_expr = new_expr.select(*to_select, append=False) + else: + new_expr = new_expr.select(sge.Star(), append=False) if len(predicates) > 0: condition = _and(predicates) diff --git a/bigframes/core/sql_nodes.py b/bigframes/core/sql_nodes.py index a1624a10217..5d921de7aeb 100644 --- a/bigframes/core/sql_nodes.py +++ b/bigframes/core/sql_nodes.py @@ -142,6 +142,10 @@ def consumed_ids(self): def _node_expressions(self): raise NotImplementedError() + @property + def is_star_selection(self) -> bool: + return tuple(self.ids) == tuple(self.child.ids) + @functools.cache def get_id_mapping(self) -> dict[identifiers.ColumnId, ex.Expression]: return {cdef.id: cdef.expression for cdef in self.selections} diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql index d5f599b5da7..7a4393f8133 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_size/out.sql @@ -1,20 +1,6 @@ WITH `bfcte_0` AS ( SELECT - `bool_col`, - `bytes_col`, - `date_col`, - `datetime_col`, - `geography_col`, - `int64_col`, - `int64_too`, - `numeric_col`, - `float64_col`, - `rowindex`, - `rowindex_2`, - `string_col`, - `time_col`, - `timestamp_col`, - `duration_col` + * FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_columns_filters/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_columns_filters/out.sql index c9a42b73f1a..2dae14b556e 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_columns_filters/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_columns_filters/out.sql @@ -6,7 +6,5 @@ WITH `bfcte_0` AS ( `rowindex` > 0 AND `string_col` IN ('Hello, World!') ) SELECT - `rowindex`, - `int64_col`, - `string_col` + * FROM `bfcte_0` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql index f65f3a10f0f..77a17ec893d 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_json_types/out.sql @@ -1,4 +1,3 @@ SELECT - `rowindex`, - `json_col` + * FROM `bigframes-dev`.`sqlglot_test`.`json_types` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_system_time/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_system_time/out.sql index d188899e7c2..b579e3a6fed 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_system_time/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readtable/test_compile_readtable_w_system_time/out.sql @@ -1,17 +1,3 @@ SELECT - `bool_col`, - `bytes_col`, - `date_col`, - `datetime_col`, - `geography_col`, - `int64_col`, - `int64_too`, - `numeric_col`, - `float64_col`, - `rowindex`, - `rowindex_2`, - `string_col`, - `time_col`, - `timestamp_col`, - `duration_col` + * FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` FOR SYSTEM_TIME AS OF '2025-11-09T03:04:05.678901+00:00' \ No newline at end of file