MariaDB · spetrunia · Jun 25, 2026 · mariadb-OlegSmirnov · Jun 27, 2026 · DaveGosselin-MariaDB
diff --git a/mysql-test/main/opt_hints_split_materialized.result b/mysql-test/main/opt_hints_split_materialized.result
@@ -779,3 +779,59 @@ drop table one_k, t1000;
 #
 # End 12.1 tests
 #
+#
+# MDEV-39005: Assertion failure on hint-forced Split-Materialized plan
+#
+create table t1 (
+groups_20 int not null,
+groups_20_2 int not null,
+b int,
+primary key (groups_20, groups_20_2)
+) engine=innodb;
+insert into t1 select 0, seq, seq from seq_1_to_10;
+create table t2 (a int, b int, index(a));
+insert into t2 select seq, seq from seq_0_to_10;
+analyze table t1, t2;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	OK
+test.t2	analyze	status	Engine-independent statistics collected
+test.t2	analyze	status	Table is already up to date
+# Query plan without the hint:
+analyze
+select a, sum(b)
+from
+(
+select groups_20 from t1
+group by groups_20
+having count(*) != 1
+) dt
+join
+t2 on a = groups_20
+group by a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	PRIMARY	t2	ALL	a	NULL	NULL	NULL	11	11.00	100.00	100.00	Using where; Using temporary; Using filesort
+1	PRIMARY	<derived2>	ref	key0	key0	4	test.t2.a	1	0.09	100.00	100.00	
+2	DERIVED	t1	index	PRIMARY	PRIMARY	8	NULL	10	10.00	100.00	100.00	
+# Query plan with the hint.  Note that in this case, the
+# hint doesn't force the split because of the clamping
+# behavior introduced in this patch.
+analyze
+select /*+ split_materialized(dt) */ a, sum(b)
+from
+(
+select groups_20 from t1
+group by groups_20
+having count(*) != 1
+) dt
+join
+t2 on a = groups_20
+group by a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	PRIMARY	<derived2>	ALL	NULL	NULL	NULL	NULL	2	1.00	100.00	100.00	Using temporary; Using filesort
+1	PRIMARY	t2	ref	a	a	5	dt.groups_20	1	1.00	100.00	100.00	
+2	DERIVED	t1	index	PRIMARY	PRIMARY	8	NULL	10	10.00	100.00	100.00	
+drop table t1, t2;
+#
+# End 12.3 tests
+#
diff --git a/mysql-test/main/opt_hints_split_materialized.test b/mysql-test/main/opt_hints_split_materialized.test
@@ -523,3 +523,54 @@ drop table one_k, t1000;
 --echo #
 --echo # End 12.1 tests
 --echo #
+
+--echo #
+--echo # MDEV-39005: Assertion failure on hint-forced Split-Materialized plan
+--echo #
+create table t1 (
+  groups_20 int not null,
+  groups_20_2 int not null,
+  b int,
+  primary key (groups_20, groups_20_2)
+) engine=innodb;
+insert into t1 select 0, seq, seq from seq_1_to_10;
+
+create table t2 (a int, b int, index(a));
+insert into t2 select seq, seq from seq_0_to_10;
+
+analyze table t1, t2;
+
+--echo # Query plan without the hint:
+analyze
+select a, sum(b)
+from
+  (
+    select groups_20 from t1
+    group by groups_20
+    having count(*) != 1
+  ) dt
+  join
+  t2 on a = groups_20
+group by a;
+
+
+--echo # Query plan with the hint.  Note that in this case, the
+--echo # hint doesn't force the split because of the clamping
+--echo # behavior introduced in this patch.
+analyze
+select /*+ split_materialized(dt) */ a, sum(b)
+from
+  (
+    select groups_20 from t1
+    group by groups_20
+    having count(*) != 1
+  ) dt
+  join
+  t2 on a = groups_20
+group by a;
+
+drop table t1, t2;
+
+--echo #
+--echo # End 12.3 tests
+--echo #
diff --git a/sql/opt_split.cc b/sql/opt_split.cc
@@ -290,7 +290,9 @@ class SplM_opt_info : public Sql_alloc
   double unsplit_cost;
   /* Split operation cost (result form spl_postjoin_oper_cost()) */
   double unsplit_oper_cost;
-  /* Cardinality of T when nothing is pushed */
+  /*
+    Cardinality of T when nothing is pushed, BEFORE the GROUP BY operation is done
+  */
   double unsplit_card;
   /* True when SPLIT_MATERIALIZED hint present and forces this split. */
   bool hint_forced_split{false};
@@ -847,13 +849,12 @@ void JOIN::add_keyuses_for_splitting()
   bzero((char*) &keyuse_ext_end, sizeof(keyuse_ext_end));
   if (ext_keyuses_for_splitting->push(keyuse_ext_end))
     goto err;
-  // psergey-todo: trace anything here?
   /*
-    Use the number of rows that was computed by
-    TABLE_LIST::fetch_number_of_rows():
+    This is the number of rows before the GROUP BY operation.
+    (Split Materialized is allowed only when derived table has a single
+    SELECT, so we know that *this is the only SELECT inside the derived table)
   */
-  spl_opt_info->unsplit_card=
-    rows2double(select_lex->master_unit()->derived->table->stat_records());
+  spl_opt_info->unsplit_card= join_record_count;
 
   rec_len= table->s->rec_buff_length;
 
@@ -1195,6 +1196,7 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx,
       double split_card= spl_opt_info->unsplit_card * spl_plan->split_sel;
       double oper_cost= (split_card *
                          spl_postjoin_oper_cost(thd, split_card, rec_len));
+      // TODO: why do we just take the costs from the last table?
       spl_plan->cost= (join->best_positions[join->table_count-1].read_time +
                        oper_cost);
 
@@ -1248,7 +1250,41 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx,
       the plan without splitting
     */
     startup_cost= refills * spl_plan->cost;
-    records= (ha_rows) (spl_opt_info->unsplit_card * spl_plan->split_sel);
+
+    /*
+      How many records will be in the split-materialized table?
+      The number doesn't matter a lot, as typically:
+       - The derived table is accessed through an index generated by
+         derived_with_keys optimization.
+       - When all GROUP BY columns are bound, we will always get just one
+         record (this is inferred by infer_derived_key_statistics()).
+
+      However when we have index lookup on a subset of GROUP BY columns
+
+         t1
+         JOIN (SELECT a,b FROM t2,t3 WHERE ... GROUP BY a,b) dt
+         ON dt.a=t1.a
+
+      then it does matter, as best_access_path() will use the code path for
+      unavailable index statistics, and that one takes into account how many
+      rows are in the table we're making the lookup in.
+      Alas, it is not that easy to estimate how many different values of dt.b
+      we will get for some fixed value of dt.a.
+
+      We have these estimates available:
+      * table->used_stat_records has the number of rows in derived table after
+        the GROUP BY operation
+      * Child join's join->join_record_count is the number of records in the
+        derived table for a fixed value of t1.a.
+
+      So, we take the minimum of these two and clip it up to avoid the multiply-by
+      zero problem.
+    */
+    records= (ha_rows)MY_MIN(join->join_record_count,
+                             table->used_stat_records);
+    if (records < 1)
+      records= 1;
+
     if (unlikely(thd->trace_started()) && ! already_printed)
     {
       Json_writer_object trace(thd, "split_materialized");
@@ -1262,7 +1298,8 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx,
   {
     /* Restore original values */
     startup_cost=      spl_opt_info->unsplit_cost;
-    records= (ha_rows) spl_opt_info->unsplit_card;
+    /* Number of records in the derived table *after* GROUP BY was applied */
+    records= table->used_stat_records;
     spl_plan= 0;
   }