diff --git a/mysql-test/main/opt_hints_split_materialized.result b/mysql-test/main/opt_hints_split_materialized.result index b1e656b3e606d..78f744ce6a3a9 100644 --- a/mysql-test/main/opt_hints_split_materialized.result +++ b/mysql-test/main/opt_hints_split_materialized.result @@ -779,3 +779,59 @@ drop table one_k, t1000; # # End 12.1 tests # +# +# MDEV-39005: Assertion failure on hint-forced Split-Materialized plan +# +create table t1 ( +groups_20 int not null, +groups_20_2 int not null, +b int, +primary key (groups_20, groups_20_2) +) engine=innodb; +insert into t1 select 0, seq, seq from seq_1_to_10; +create table t2 (a int, b int, index(a)); +insert into t2 select seq, seq from seq_0_to_10; +analyze table t1, t2; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +test.t2 analyze status Engine-independent statistics collected +test.t2 analyze status Table is already up to date +# Query plan without the hint: +analyze +select a, sum(b) +from +( +select groups_20 from t1 +group by groups_20 +having count(*) != 1 +) dt +join +t2 on a = groups_20 +group by a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 PRIMARY t2 ALL a NULL NULL NULL 11 11.00 100.00 100.00 Using where; Using temporary; Using filesort +1 PRIMARY ref key0 key0 4 test.t2.a 1 0.09 100.00 100.00 +2 DERIVED t1 index PRIMARY PRIMARY 8 NULL 10 10.00 100.00 100.00 +# Query plan with the hint. Note that in this case, the +# hint doesn't force the split because of the clamping +# behavior introduced in this patch. 
+analyze +select /*+ split_materialized(dt) */ a, sum(b) +from +( +select groups_20 from t1 +group by groups_20 +having count(*) != 1 +) dt +join +t2 on a = groups_20 +group by a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 PRIMARY ALL NULL NULL NULL NULL 2 1.00 100.00 100.00 Using temporary; Using filesort +1 PRIMARY t2 ref a a 5 dt.groups_20 1 1.00 100.00 100.00 +2 DERIVED t1 index PRIMARY PRIMARY 8 NULL 10 10.00 100.00 100.00 +drop table t1, t2; +# +# End 12.3 tests +# diff --git a/mysql-test/main/opt_hints_split_materialized.test b/mysql-test/main/opt_hints_split_materialized.test index 2b5899968e787..9cff055b9382a 100644 --- a/mysql-test/main/opt_hints_split_materialized.test +++ b/mysql-test/main/opt_hints_split_materialized.test @@ -523,3 +523,54 @@ drop table one_k, t1000; --echo # --echo # End 12.1 tests --echo # + +--echo # +--echo # MDEV-39005: Assertion failure on hint-forced Split-Materialized plan +--echo # +create table t1 ( + groups_20 int not null, + groups_20_2 int not null, + b int, + primary key (groups_20, groups_20_2) +) engine=innodb; +insert into t1 select 0, seq, seq from seq_1_to_10; + +create table t2 (a int, b int, index(a)); +insert into t2 select seq, seq from seq_0_to_10; + +analyze table t1, t2; + +--echo # Query plan without the hint: +analyze +select a, sum(b) +from + ( + select groups_20 from t1 + group by groups_20 + having count(*) != 1 + ) dt + join + t2 on a = groups_20 +group by a; + + +--echo # Query plan with the hint. Note that in this case, the +--echo # hint doesn't force the split because of the clamping +--echo # behavior introduced in this patch. 
+analyze +select /*+ split_materialized(dt) */ a, sum(b) +from + ( + select groups_20 from t1 + group by groups_20 + having count(*) != 1 + ) dt + join + t2 on a = groups_20 +group by a; + +drop table t1, t2; + +--echo # +--echo # End 12.3 tests +--echo # diff --git a/sql/opt_split.cc b/sql/opt_split.cc index e63f9dc5b2d37..bdad85f41a1ae 100644 --- a/sql/opt_split.cc +++ b/sql/opt_split.cc @@ -290,7 +290,9 @@ class SplM_opt_info : public Sql_alloc double unsplit_cost; /* Split operation cost (result form spl_postjoin_oper_cost()) */ double unsplit_oper_cost; - /* Cardinality of T when nothing is pushed */ + /* + Cardinality of T when nothing is pushed, BEFORE the GROUP BY operation is done + */ double unsplit_card; /* True when SPLIT_MATERIALIZED hint present and forces this split. */ bool hint_forced_split{false}; @@ -847,13 +849,12 @@ void JOIN::add_keyuses_for_splitting() bzero((char*) &keyuse_ext_end, sizeof(keyuse_ext_end)); if (ext_keyuses_for_splitting->push(keyuse_ext_end)) goto err; - // psergey-todo: trace anything here? /* - Use the number of rows that was computed by - TABLE_LIST::fetch_number_of_rows(): + This is the number of rows before the GROUP BY operation. + (Split Materialized is allowed only when derived table has a single + SELECT, so we know that *this is the only SELECT inside the derived table) */ - spl_opt_info->unsplit_card= - rows2double(select_lex->master_unit()->derived->table->stat_records()); + spl_opt_info->unsplit_card= join_record_count; rec_len= table->s->rec_buff_length; @@ -1195,6 +1196,7 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx, double split_card= spl_opt_info->unsplit_card * spl_plan->split_sel; double oper_cost= (split_card * spl_postjoin_oper_cost(thd, split_card, rec_len)); + // TODO: why do we just take the costs from the last table? 
spl_plan->cost= (join->best_positions[join->table_count-1].read_time + oper_cost); @@ -1248,7 +1250,41 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx, the plan without splitting */ startup_cost= refills * spl_plan->cost; - records= (ha_rows) (spl_opt_info->unsplit_card * spl_plan->split_sel); + + /* + How many records will be in the split-materialized table? + The number doesn't matter a lot, as typically: + - The derived table is accessed through an index generated by + derived_with_keys optimization. + - When all GROUP BY columns are bound, we will always get just one + record (this is inferred by infer_derived_key_statistics()). + + However, when we have an index lookup on a subset of GROUP BY columns + + t1 + JOIN (SELECT a,b FROM t2,t3 WHERE ... GROUP BY a,b) dt + ON dt.a=t1.a + + then it does matter, as best_access_path() will use the code path for + unavailable index statistics, and that one takes into account how many + rows are in the table we're making the lookup in. + Alas, it is not that easy to estimate how many different values of dt.b + we will get for some fixed value of dt.a. + + We have these estimates available: + * table->used_stat_records has the number of rows in derived table after + the GROUP BY operation + * Child join's join->join_record_count is the number of records in the + derived table for a fixed value of t1.a. + + So, we take the minimum of these two and clamp it to at least 1 to + avoid the multiply-by-zero problem. + */ + records= (ha_rows)MY_MIN(join->join_record_count, + table->used_stat_records); + if (records < 1) + records= 1; + if (unlikely(thd->trace_started()) && ! 
already_printed) { Json_writer_object trace(thd, "split_materialized"); @@ -1262,7 +1298,8 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx, { /* Restore original values */ startup_cost= spl_opt_info->unsplit_cost; - records= (ha_rows) spl_opt_info->unsplit_card; + /* Number of records in the derived table *after* GROUP BY was applied */ + records= table->used_stat_records; spl_plan= 0; }