Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions mysql-test/main/opt_hints_split_materialized.result
Original file line number Diff line number Diff line change
Expand Up @@ -779,3 +779,59 @@ drop table one_k, t1000;
#
# End 12.1 tests
#
#
# MDEV-39005: Assertion failure on hint-forced Split-Materialized plan
#
create table t1 (
groups_20 int not null,
groups_20_2 int not null,
b int,
primary key (groups_20, groups_20_2)
) engine=innodb;
insert into t1 select 0, seq, seq from seq_1_to_10;
create table t2 (a int, b int, index(a));
insert into t2 select seq, seq from seq_0_to_10;
analyze table t1, t2;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
test.t2 analyze status Engine-independent statistics collected
test.t2 analyze status Table is already up to date
# Query plan without the hint:
analyze
select a, sum(b)
from
(
select groups_20 from t1
group by groups_20
having count(*) != 1
) dt
join
t2 on a = groups_20
group by a;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 PRIMARY t2 ALL a NULL NULL NULL 11 11.00 100.00 100.00 Using where; Using temporary; Using filesort
1 PRIMARY <derived2> ref key0 key0 4 test.t2.a 1 0.09 100.00 100.00
2 DERIVED t1 index PRIMARY PRIMARY 8 NULL 10 10.00 100.00 100.00
# Query plan with the hint. Note that in this case, the
# hint doesn't force the split because of the clamping
# behavior introduced in this patch.
analyze
select /*+ split_materialized(dt) */ a, sum(b)
from
(
select groups_20 from t1
group by groups_20
having count(*) != 1
) dt
join
t2 on a = groups_20
group by a;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 PRIMARY <derived2> ALL NULL NULL NULL NULL 2 1.00 100.00 100.00 Using temporary; Using filesort
1 PRIMARY t2 ref a a 5 dt.groups_20 1 1.00 100.00 100.00
2 DERIVED t1 index PRIMARY PRIMARY 8 NULL 10 10.00 100.00 100.00
drop table t1, t2;
#
# End 12.3 tests
#
51 changes: 51 additions & 0 deletions mysql-test/main/opt_hints_split_materialized.test
Original file line number Diff line number Diff line change
Expand Up @@ -523,3 +523,54 @@ drop table one_k, t1000;
--echo #
--echo # End 12.1 tests
--echo #

--echo #
--echo # MDEV-39005: Assertion failure on hint-forced Split-Materialized plan
--echo #
create table t1 (
groups_20 int not null,
groups_20_2 int not null,
b int,
primary key (groups_20, groups_20_2)
) engine=innodb;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can reproduce the bug on MyISAM, do we really need InnoDB here?

insert into t1 select 0, seq, seq from seq_1_to_10;

create table t2 (a int, b int, index(a));
insert into t2 select seq, seq from seq_0_to_10;

analyze table t1, t2;

--echo # Query plan without the hint:
analyze
select a, sum(b)
from
(
select groups_20 from t1
group by groups_20
having count(*) != 1
) dt
join
t2 on a = groups_20
group by a;


--echo # Query plan with the hint. Note that in this case, the
--echo # hint doesn't force the split because of the clamping

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is not quite right anymore, the clamping behavior in my patch had an effect only when the hint was provided. There's a different kind of clamping in this new variant and it is hint-agnostic.

--echo # behavior introduced in this patch.
analyze
select /*+ split_materialized(dt) */ a, sum(b)
from
(
select groups_20 from t1
group by groups_20
having count(*) != 1
) dt
join
t2 on a = groups_20
group by a;

drop table t1, t2;

--echo #
--echo # End 12.3 tests
--echo #
53 changes: 45 additions & 8 deletions sql/opt_split.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,9 @@ class SplM_opt_info : public Sql_alloc
double unsplit_cost;
/* Split operation cost (result form spl_postjoin_oper_cost()) */
double unsplit_oper_cost;
/* Cardinality of T when nothing is pushed */
/*
Cardinality of T when nothing is pushed, BEFORE the GROUP BY operation is done
*/
double unsplit_card;
/* True when SPLIT_MATERIALIZED hint present and forces this split. */
bool hint_forced_split{false};
Expand Down Expand Up @@ -847,13 +849,12 @@ void JOIN::add_keyuses_for_splitting()
bzero((char*) &keyuse_ext_end, sizeof(keyuse_ext_end));
if (ext_keyuses_for_splitting->push(keyuse_ext_end))
goto err;
// psergey-todo: trace anything here?
/*
Use the number of rows that was computed by
TABLE_LIST::fetch_number_of_rows():
This is the number of rows before the GROUP BY operation.
(Split Materialized is allowed only when derived table has a single
SELECT, so we know that *this is the only SELECT inside the derived table)
*/
spl_opt_info->unsplit_card=
rows2double(select_lex->master_unit()->derived->table->stat_records());
spl_opt_info->unsplit_card= join_record_count;

rec_len= table->s->rec_buff_length;

Expand Down Expand Up @@ -1195,6 +1196,7 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx,
double split_card= spl_opt_info->unsplit_card * spl_plan->split_sel;
double oper_cost= (split_card *
spl_postjoin_oper_cost(thd, split_card, rec_len));
// TODO: why do we just take the costs from the last table?
spl_plan->cost= (join->best_positions[join->table_count-1].read_time +
oper_cost);

Expand Down Expand Up @@ -1248,7 +1250,41 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx,
the plan without splitting
*/
startup_cost= refills * spl_plan->cost;
records= (ha_rows) (spl_opt_info->unsplit_card * spl_plan->split_sel);

/*
How many records will be in the split-materialized table?
The number doesn't matter a lot, as typically:
- The derived table is accessed through an index generated by
derived_with_keys optimization.
- When all GROUP BY columns are bound, we will always get just one
record (this is inferred by infer_derived_key_statistics()).

However when we have index lookup on a subset of GROUP BY columns

t1
JOIN (SELECT a,b FROM t2,t3 WHERE ... GROUP BY a,b) dt
ON dt.a=t1.a

then it does matter, as best_access_path() will use the code path for
unavailable index statistics, and that one takes into account how many
rows are in the table we're making the lookup in.
Alas, it is not that easy to estimate how many different values of dt.b
we will get for some fixed value of dt.a.

We have these estimates available:
* table->used_stat_records has the number of rows in derived table after
the GROUP BY operation
* Child join's join->join_record_count is the number of records in the
derived table for a fixed value of t1.a.

So, we take the minimum of these two and clip it up to avoid the multiply-by
zero problem.
*/
records= (ha_rows)MY_MIN(join->join_record_count,
table->used_stat_records);
Comment on lines +1283 to +1284

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The comment above states that we want to use the child join's join_record_count (the number of records in the derived table before the GROUP BY operation). However, in choose_best_splitting, join refers to the parent join object, meaning join->join_record_count evaluates to the parent join's record count instead of the child join's.

To correctly access the child join's record count, we should use spl_opt_info->unsplit_card, which was populated with the child join's join_record_count in add_keyuses_for_splitting().

    records= (ha_rows)MY_MIN(spl_opt_info->unsplit_card,
                             table->used_stat_records);

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/gemini, look at the first lines in

{
  SplM_opt_info *spl_opt_info= table->spl_opt_info;
  DBUG_ASSERT(spl_opt_info != NULL);
  JOIN *join= spl_opt_info->join;

The join comes from spl_opt_info->join.
This one is set in bool JOIN::check_for_splittable_materialized() :

  spl_opt_info->join= this;

so it refers to the child join, right?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't answer if there is a comma after "/gemini", it does for messages starting with "/gemini " (space instead of comma)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/gemini look at the first lines in

{
SplM_opt_info *spl_opt_info= table->spl_opt_info;
DBUG_ASSERT(spl_opt_info != NULL);
JOIN *join= spl_opt_info->join;
The join comes from spl_opt_info->join.
This one is set in bool JOIN::check_for_splittable_materialized() :

spl_opt_info->join= this;
so it refers to the child join, right?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are absolutely right. I apologize for the confusion in my previous comment. Since spl_opt_info->join is explicitly set to this (the child join) in JOIN::check_for_splittable_materialized(), join->join_record_count does indeed refer to the child join's record count. Thank you for the clarification.

if (records < 1)
records= 1;

if (unlikely(thd->trace_started()) && ! already_printed)
{
Json_writer_object trace(thd, "split_materialized");
Expand All @@ -1262,7 +1298,8 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(uint idx,
{
/* Restore original values */
startup_cost= spl_opt_info->unsplit_cost;
records= (ha_rows) spl_opt_info->unsplit_card;
/* Number of records in the derived table *after* GROUP BY was applied */
records= table->used_stat_records;
spl_plan= 0;
}

Expand Down