From 8b02e90dc983a0a70f69568e3adf7f5b88904726 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Wed, 3 Jun 2026 23:04:14 +0800 Subject: [PATCH 1/2] perf(logical-plan): box CreateExternalTable / CreateFunction in DdlStatement These two variants dominate the LogicalPlan enum size: CreateExternalTable is 312 bytes and CreateFunction is 288 bytes, while every non-DDL variant is at most 176 bytes (Join). The DdlStatement variant therefore forces LogicalPlan to 320 bytes, even though no SELECT-path workload ever materializes a CreateExternalTable / CreateFunction. Boxing both shrinks LogicalPlan from 320 bytes to 176 bytes (-45%) by making Join the new size-determining variant (the enum discriminant fits inside Join's alignment padding). Every mem::take / mem::swap / Arc store on the planning hot path now moves a smaller payload. Local sql_planner --quick: optimizer_tpch_all: 8.61ms -> 8.18ms (-5.0%) optimizer_tpcds_all: 168.0ms -> 163.5ms (-2.7%) Profiling shows ~13% of sql_planner CPU in libsystem_platform.dylib (memcpy / memmove) -- this change directly reduces that pool. A test_size_of_logical_plan unit test pins the new size and asserts DdlStatement stays smaller than Join, so future variant growth that would re-balloon the enum trips the test rather than silently regressing the planning hot path. 711 optimizer + 218 expr + 86 sql + 7 proto unit tests pass. SLT create_function / create_external_table / ddl pass. clippy clean. --- datafusion/core/src/execution/context/mod.rs | 2 +- datafusion/expr/src/logical_plan/ddl.rs | 28 +++++------ datafusion/expr/src/logical_plan/plan.rs | 39 +++++++++++++++ datafusion/ffi/src/table_provider_factory.rs | 4 +- datafusion/proto/src/logical_plan/mod.rs | 51 +++++++++++--------- datafusion/sql/src/statement.rs | 30 ++++++------ 6 files changed, 99 insertions(+), 55 deletions(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index b2ad5c7d7ada0..189206a711c5d 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -714,7 +714,7 @@ impl SessionContext { Box::pin(self.drop_schema(cmd)).await } DdlStatement::CreateFunction(cmd) => { - Box::pin(self.create_function(cmd)).await + Box::pin(self.create_function(*cmd)).await } DdlStatement::DropFunction(cmd) => { Box::pin(self.drop_function(cmd)).await diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs index 5779fb0c4ea5b..1990a31edb95f 100644 --- a/datafusion/expr/src/logical_plan/ddl.rs +++ b/datafusion/expr/src/logical_plan/ddl.rs @@ -38,8 +38,10 @@ use sqlparser::ast::Ident; /// Various types of DDL (CREATE / DROP) catalog manipulation #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum DdlStatement { - /// Creates an external table. - CreateExternalTable(CreateExternalTable), + /// Creates an external table. Boxed to keep `LogicalPlan` enum size down + /// — `CreateExternalTable` is ~312 bytes, dwarfing every other variant + /// in the plan tree and forcing the whole enum to that width. + CreateExternalTable(Box), /// Creates an in memory table. CreateMemoryTable(CreateMemoryTable), /// Creates a new view. @@ -56,8 +58,9 @@ pub enum DdlStatement { DropView(DropView), /// Drops a catalog schema DropCatalogSchema(DropCatalogSchema), - /// Create function statement - CreateFunction(CreateFunction), + /// Create function statement. Boxed for the same reason as + /// [`Self::CreateExternalTable`] (~288 bytes). + CreateFunction(Box), /// Drop function statement DropFunction(DropFunction), } @@ -66,9 +69,7 @@ impl DdlStatement { /// Get a reference to the logical plan's schema pub fn schema(&self) -> &DFSchemaRef { match self { - DdlStatement::CreateExternalTable(CreateExternalTable { schema, .. }) => { - schema - } + DdlStatement::CreateExternalTable(ce) => &ce.schema, DdlStatement::CreateMemoryTable(CreateMemoryTable { input, .. }) | DdlStatement::CreateView(CreateView { input, .. }) => input.schema(), DdlStatement::CreateCatalogSchema(CreateCatalogSchema { schema, .. }) => { @@ -79,7 +80,7 @@ impl DdlStatement { DdlStatement::DropTable(DropTable { schema, .. }) => schema, DdlStatement::DropView(DropView { schema, .. }) => schema, DdlStatement::DropCatalogSchema(DropCatalogSchema { schema, .. }) => schema, - DdlStatement::CreateFunction(CreateFunction { schema, .. }) => schema, + DdlStatement::CreateFunction(cf) => &cf.schema, DdlStatement::DropFunction(DropFunction { schema, .. }) => schema, } } @@ -131,11 +132,9 @@ impl DdlStatement { impl Display for Wrapper<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.0 { - DdlStatement::CreateExternalTable(CreateExternalTable { - name, - constraints, - .. - }) => { + DdlStatement::CreateExternalTable(ce) => { + let name = &ce.name; + let constraints = &ce.constraints; if constraints.is_empty() { write!(f, "CreateExternalTable: {name:?}") } else { @@ -191,7 +190,8 @@ impl DdlStatement { "DropCatalogSchema: {name:?} if not exist:={if_exists} cascade:={cascade}" ) } - DdlStatement::CreateFunction(CreateFunction { name, .. }) => { + DdlStatement::CreateFunction(cf) => { + let name = &cf.name; write!(f, "CreateFunction: name {name:?}") } DdlStatement::DropFunction(DropFunction { name, .. }) => { diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 1bfecd06c2228..b8843953865d2 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -4740,6 +4740,45 @@ mod tests { use insta::{assert_debug_snapshot, assert_snapshot}; use std::hash::DefaultHasher; + /// `LogicalPlan` is moved/swapped on every step of the planning hot path + /// (every `mem::take` in an in-place rewriter, every `Arc` + /// write, every owned `map_*` traversal). Its size is set by the largest + /// variant, so an oversized variant balloons cost for every other variant. + /// + /// Today the size-setter should be `Join` (~176 bytes); `DdlStatement` is + /// boxed precisely so it does not dominate. If you grow a variant, please + /// box the new large fields rather than letting this number creep up — + /// see the analogous `test_size_of_expr` in `expr.rs`. + #[test] + fn test_size_of_logical_plan() { + // `LogicalPlan` enum on aarch64 / x86_64. Today this matches + // `Join`'s 176 bytes (the enum discriminant fits in `Join`'s + // alignment padding); if `Join` grows or another variant overtakes + // it, this number will move with the new size-setter. + assert_eq!(size_of::(), 176); + // `DdlStatement` is `Ddl(DdlStatement)`'s payload; keep it below the + // `Join` ceiling so it never re-becomes the size-setter. + assert!( + size_of::() < size_of::(), + "DdlStatement ({} bytes) should stay smaller than Join ({} bytes); \ + box the new large variant rather than letting it dominate `LogicalPlan`.", + size_of::(), + size_of::(), + ); + // Sanity check the two boxed variants stay boxed (so the payload + // sits on the heap, not in the enum). + assert_eq!( + size_of::>(), + 8, + "CreateExternalTable should be Box'd inside DdlStatement" + ); + assert_eq!( + size_of::>(), + 8, + "CreateFunction should be Box'd inside DdlStatement" + ); + } + fn employee_schema() -> Schema { Schema::new(vec![ Field::new("id", DataType::Int32, false), diff --git a/datafusion/ffi/src/table_provider_factory.rs b/datafusion/ffi/src/table_provider_factory.rs index 3ce8841614bc0..466b56806d879 100644 --- a/datafusion/ffi/src/table_provider_factory.rs +++ b/datafusion/ffi/src/table_provider_factory.rs @@ -152,7 +152,7 @@ impl FFI_TableProviderFactory { let plan = LogicalPlanNode::decode(cmd_serialized.as_ref()) .map_err(|e| DataFusionError::Internal(format!("{e:?}")))?; match plan.try_into_logical_plan(&task_ctx, logical_codec.as_ref())? { - LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) => Ok(cmd), + LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) => Ok(*cmd), _ => Err(DataFusionError::Internal( "Invalid logical plan in FFI_TableProviderFactory.".to_owned(), )), @@ -272,7 +272,7 @@ impl ForeignTableProviderFactory { let logical_codec: Arc = (&self.0.logical_codec).into(); - let plan = LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)); + let plan = LogicalPlan::Ddl(DdlStatement::CreateExternalTable(Box::new(cmd))); let plan: LogicalPlanNode = AsLogicalPlan::try_from_logical_plan(&plan, logical_codec.as_ref())?; diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 49593a6c6a56a..b691441e95a97 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -790,26 +790,30 @@ impl AsLogicalPlan for LogicalPlanNode { } Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable( - CreateExternalTable::builder( - from_table_reference( - create_extern_table.name.as_ref(), - "CreateExternalTable", - )?, - create_extern_table.location.clone(), - create_extern_table.file_type.clone(), - pb_schema.try_into()?, - ) - .with_partition_cols(create_extern_table.table_partition_cols.clone()) - .with_order_exprs(order_exprs) - .with_if_not_exists(create_extern_table.if_not_exists) - .with_or_replace(create_extern_table.or_replace) - .with_temporary(create_extern_table.temporary) - .with_definition(definition) - .with_unbounded(create_extern_table.unbounded) - .with_options(create_extern_table.options.clone()) - .with_constraints(constraints.into()) - .with_column_defaults(column_defaults) - .build(), + Box::new( + CreateExternalTable::builder( + from_table_reference( + create_extern_table.name.as_ref(), + "CreateExternalTable", + )?, + create_extern_table.location.clone(), + create_extern_table.file_type.clone(), + pb_schema.try_into()?, + ) + .with_partition_cols( + create_extern_table.table_partition_cols.clone(), + ) + .with_order_exprs(order_exprs) + .with_if_not_exists(create_extern_table.if_not_exists) + .with_or_replace(create_extern_table.or_replace) + .with_temporary(create_extern_table.temporary) + .with_definition(definition) + .with_unbounded(create_extern_table.unbounded) + .with_options(create_extern_table.options.clone()) + .with_constraints(constraints.into()) + .with_column_defaults(column_defaults) + .build(), + ), ))) } LogicalPlanType::CreateView(create_view) => { @@ -1774,8 +1778,8 @@ impl AsLogicalPlan for LogicalPlanNode { }, )), }), - LogicalPlan::Ddl(DdlStatement::CreateExternalTable( - CreateExternalTable { + LogicalPlan::Ddl(DdlStatement::CreateExternalTable(ce)) => { + let CreateExternalTable { name, location, file_type, @@ -1790,8 +1794,7 @@ impl AsLogicalPlan for LogicalPlanNode { constraints, column_defaults, temporary, - }, - )) => { + } = ce.as_ref(); let mut converted_order_exprs: Vec = vec![]; for order in order_exprs { let temp = SortExprNodeCollection { diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 7da1c061cd4c3..401313f9d396c 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1468,7 +1468,7 @@ impl SqlToRel<'_, S> { function_body, }; - let statement = DdlStatement::CreateFunction(CreateFunction { + let statement = DdlStatement::CreateFunction(Box::new(CreateFunction { or_replace, temporary, name, @@ -1476,7 +1476,7 @@ impl SqlToRel<'_, S> { args, params, schema: DFSchemaRef::new(DFSchema::empty()), - }); + })); Ok(LogicalPlan::Ddl(statement)) } @@ -1855,18 +1855,20 @@ impl SqlToRel<'_, S> { let constraints = self.new_constraint_from_table_constraints(&all_constraints, &df_schema)?; Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable( - PlanCreateExternalTable::builder(name, location, file_type, df_schema) - .with_partition_cols(table_partition_cols) - .with_if_not_exists(if_not_exists) - .with_or_replace(or_replace) - .with_temporary(temporary) - .with_definition(definition) - .with_order_exprs(ordered_exprs) - .with_unbounded(unbounded) - .with_options(options_map) - .with_constraints(constraints) - .with_column_defaults(column_defaults) - .build(), + Box::new( + PlanCreateExternalTable::builder(name, location, file_type, df_schema) + .with_partition_cols(table_partition_cols) + .with_if_not_exists(if_not_exists) + .with_or_replace(or_replace) + .with_temporary(temporary) + .with_definition(definition) + .with_order_exprs(ordered_exprs) + .with_unbounded(unbounded) + .with_options(options_map) + .with_constraints(constraints) + .with_column_defaults(column_defaults) + .build(), + ), ))) } From e23fcab461eb583d02be4ccffe97af36bf6a1af9 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Fri, 5 Jun 2026 14:10:11 +0800 Subject: [PATCH 2/2] docs: add 55.0.0 upgrade guide note for boxed DdlStatement variants --- .../library-user-guide/upgrading/55.0.0.md | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/docs/source/library-user-guide/upgrading/55.0.0.md b/docs/source/library-user-guide/upgrading/55.0.0.md index d3988c33a41b2..ad50a37cb93f3 100644 --- a/docs/source/library-user-guide/upgrading/55.0.0.md +++ b/docs/source/library-user-guide/upgrading/55.0.0.md @@ -97,6 +97,112 @@ as a supertrait: + pub trait QueryPlanner: Any + Debug ``` +### `DdlStatement::CreateExternalTable` and `CreateFunction` are now boxed + +The two largest variants of `datafusion_expr::DdlStatement` are now +`Box`ed: + +```rust,ignore +// Before +pub enum DdlStatement { + CreateExternalTable(CreateExternalTable), + // ... + CreateFunction(CreateFunction), + // ... +} + +// After +pub enum DdlStatement { + CreateExternalTable(Box), + // ... + CreateFunction(Box), + // ... +} +``` + +`CreateExternalTable` is 312 bytes and `CreateFunction` is 288 bytes, so +without boxing they forced the entire `LogicalPlan` enum to 320 bytes +even on SELECT-only query paths that never instantiate them. Boxing +shrinks `LogicalPlan` from 320 → 176 bytes (−45%), making every +`mem::take` / `mem::swap` / `Arc` store on the planning +hot path move a smaller payload. + +**Who is affected:** + +- Users who construct `DdlStatement::CreateExternalTable(...)` or + `DdlStatement::CreateFunction(...)` from an owned struct. +- Users who pattern-match these variants and destructure the inner + struct in the same pattern (e.g. + `DdlStatement::CreateExternalTable(CreateExternalTable { name, .. })`). +- Code that consumes the inner struct out of these variants (e.g. to + pass `CreateExternalTable` by value to another function). + +**Migration guide:** + +When constructing the variants, wrap the inner struct in `Box::new`: + +```rust,ignore +// Before +let stmt = DdlStatement::CreateFunction(CreateFunction { name, args, .. }); + +// After +let stmt = DdlStatement::CreateFunction(Box::new(CreateFunction { + name, + args, + .. +})); +``` + +When pattern-matching, bind the boxed value and either access fields +through it (Rust auto-derefs the `Box`) or destructure via `.as_ref()`: + +```rust,ignore +// Before +match ddl { + DdlStatement::CreateExternalTable(CreateExternalTable { + name, location, .. + }) => { /* use name, location */ } +} + +// After — access fields through the box +match ddl { + DdlStatement::CreateExternalTable(ce) => { + let name = &ce.name; + let location = &ce.location; + /* ... */ + } +} + +// After — destructure the dereferenced struct +match ddl { + DdlStatement::CreateExternalTable(ce) => { + let CreateExternalTable { name, location, .. } = ce.as_ref(); + /* ... */ + } +} +``` + +When you need an owned `CreateExternalTable` / `CreateFunction` out of +the variant, dereference the box with `*`: + +```rust,ignore +// Before +match plan { + LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) => Ok(cmd), + _ => { /* ... */ } +} + +// After +match plan { + LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) => Ok(*cmd), + _ => { /* ... */ } +} +``` + +See [PR #22733](https://github.com/apache/datafusion/pull/22733) for +details, including the per-variant size breakdown and benchmark +results. + ### Spark map functions now reject duplicate keys by default The Spark-compatibility map-construction functions (`map_from_arrays`,