diff --git a/src/support/delta_debugging.h b/src/support/delta_debugging.h new file mode 100644 index 00000000000..9b45bd6e1ca --- /dev/null +++ b/src/support/delta_debugging.h @@ -0,0 +1,121 @@ +/* + * Copyright 2026 WebAssembly Community Group participants + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef wasm_support_delta_debugging_h +#define wasm_support_delta_debugging_h + +#include +#include +#include + +namespace wasm { + +// Use the delta debugging algorithm (Zeller 1999, +// https://dl.acm.org/doi/10.1109/32.988498) to find the minimal set of +// items necessary to preserve some property. Returns that minimal set of +// items, preserving their input order. `tryPartition` should have this +// signature: +// +// bool tryPartition(size_t partitionIndex, +// size_t numPartitions, +// const std::vector& partition) +// +// It should return true iff the property is preserved while keeping only +// `partition` items. +template +std::vector deltaDebugging(std::vector items, F&& tryPartition) { + if (items.empty()) { + return items; + } + // First try removing everything. + if (tryPartition(0, 1, {})) { + return {}; + } + size_t numPartitions = 2; + while (numPartitions <= items.size()) { + // Partition the items. + std::vector> partitions; + size_t size = items.size(); + size_t basePartitionSize = size / numPartitions; + size_t rem = size % numPartitions; + size_t idx = 0; + for (size_t i = 0; i < numPartitions; ++i) { + size_t partitionSize = basePartitionSize + (i < rem ? 1 : 0); + if (partitionSize > 0) { + std::vector partition; + partition.reserve(partitionSize); + for (size_t j = 0; j < partitionSize; ++j) { + partition.push_back(items[idx++]); + } + partitions.emplace_back(std::move(partition)); + } + } + assert(numPartitions == partitions.size()); + + bool reduced = false; + + // Try keeping only one partition. Try each partition in turn. + for (size_t i = 0; i < numPartitions; ++i) { + if (tryPartition(i, numPartitions, partitions[i])) { + items = std::move(partitions[i]); + numPartitions = 2; + reduced = true; + break; + } + } + if (reduced) { + continue; + } + + // Otherwise, try keeping the complement of a partition. Do not do this with + // only two partitions because that would be no different from what we + // already tried. + if (numPartitions > 2) { + for (size_t i = 0; i < numPartitions; ++i) { + std::vector complement; + complement.reserve(items.size() - partitions[i].size()); + for (size_t j = 0; j < numPartitions; ++j) { + if (j != i) { + complement.insert( + complement.end(), partitions[j].begin(), partitions[j].end()); + } + } + if (tryPartition(i, numPartitions, complement)) { + items = std::move(complement); + numPartitions = std::max(numPartitions - 1, size_t(2)); + reduced = true; + break; + } + } + if (reduced) { + continue; + } + } + + if (numPartitions == items.size()) { + // Cannot further refine the partitions. We're done. + break; + } + + // Otherwise, make the partitions finer grained. + numPartitions = std::min(items.size(), 2 * numPartitions); + } + return items; +} + +} // namespace wasm + +#endif // wasm_support_delta_debugging_h diff --git a/src/tools/wasm-reduce/wasm-reduce.cpp b/src/tools/wasm-reduce/wasm-reduce.cpp index e002a499545..6722cb45a03 100644 --- a/src/tools/wasm-reduce/wasm-reduce.cpp +++ b/src/tools/wasm-reduce/wasm-reduce.cpp @@ -29,12 +29,12 @@ #include "ir/branch-utils.h" #include "ir/iteration.h" -#include "ir/literal-utils.h" #include "ir/properties.h" #include "ir/utils.h" #include "pass.h" #include "support/colors.h" #include "support/command-line.h" +#include "support/delta_debugging.h" #include "support/file.h" #include "support/hash.h" #include "support/path.h" @@ -894,8 +894,105 @@ struct Reducer } } - // Reduces entire functions at a time. Returns whether we did a significant - // amount of reduction that justifies doing even more. + bool isEmptyBody(Expression* body) { + if (body->is() || body->is()) { + return true; + } + if (auto* block = body->dynCast()) { + return block->list.empty(); + } + return false; + } + + void reduceFunctionBodies() { + std::cerr << "| try to remove function bodies\n"; + // Use function indices to speed up finding the complement of the kept + // partition. + std::vector nontrivialFuncIndices; + nontrivialFuncIndices.reserve(module->functions.size()); + for (Index i = 0; i < module->functions.size(); ++i) { + auto& func = module->functions[i]; + // Skip functions that already have trivial bodies. + if (func->imported() || isEmptyBody(func->body)) { + continue; + } + nontrivialFuncIndices.push_back(i); + } + // TODO: Use something other than an exception to implement early return. + struct EarlyReturn {}; + try { + deltaDebugging( + nontrivialFuncIndices, + [&](Index partitionIndex, + Index numPartitions, + const std::vector& partition) { + // Stop early if the partition size is less than the square root of + // the remaining set. We don't want to waste time on very fine-grained + // partitions when we could switch to another reduction strategy + // instead. + if (size_t sqrtRemaining = std::sqrt(nontrivialFuncIndices.size()); + partition.size() > 0 && partition.size() < sqrtRemaining) { + throw EarlyReturn{}; + } + + std::cerr << "| try partition " << partitionIndex + 1 << " / " + << numPartitions << " (size " << partition.size() << ")\n"; + Index removedSize = nontrivialFuncIndices.size() - partition.size(); + std::vector oldBodies(removedSize); + + // We first need to remove each non-kept function body, and later we + // might need to restore the same function bodies. Abstract the logic + // for iterating over these function bodies. `f` takes a Function* and + // Expression*& for the stashed body. + auto forEachRemovedFuncBody = [&](auto f) { + Index bodyIndex = 0; + Index nontrivialIndex = 0; + Index partitionIndex = 0; + while (nontrivialIndex < nontrivialFuncIndices.size()) { + if (partitionIndex < partition.size() && + nontrivialFuncIndices[nontrivialIndex] == + partition[partitionIndex]) { + // Kept, skip it. + nontrivialIndex++; + partitionIndex++; + } else { + // Removed, process it + Index funcIndex = nontrivialFuncIndices[nontrivialIndex++]; + f(module->functions[funcIndex].get(), oldBodies[bodyIndex++]); + } + } + assert(bodyIndex == removedSize); + assert(partitionIndex == partition.size()); + }; + + // Stash the bodies. + forEachRemovedFuncBody([&](Function* func, Expression*& oldBody) { + oldBody = func->body; + Builder builder(*module); + if (func->getResults() == Type::none) { + func->body = builder.makeNop(); + } else { + func->body = builder.makeUnreachable(); + } + }); + + if (!writeAndTestReduction()) { + // Failure. Restore the bodies. + forEachRemovedFuncBody([](Function* func, Expression*& oldBody) { + func->body = oldBody; + }); + return false; + } + + // Success! + noteReduction(removedSize); + nontrivialFuncIndices = partition; + return true; + }); + } catch (EarlyReturn) { + } + } + bool reduceFunctions() { // try to remove functions std::vector functionNames; @@ -936,11 +1033,9 @@ struct Reducer } std::cerr << "| trying at i=" << i << " of size " << names.size() << "\n"; - // Try to remove functions and/or empty them. Note that - // tryToRemoveFunctions() will reload the module if it fails, which means - // function names may change - for that reason, run it second. - justReduced = tryToEmptyFunctions(names) || tryToRemoveFunctions(names); - if (justReduced) { + // Note that tryToRemoveFunctions() will reload the module if it fails, + // which means function names may change. + if (tryToRemoveFunctions(names)) { noteReduction(names.size()); // Subtract 1 since the loop increments us anyhow by one: we want to // skip over the skipped functions, and not any more. @@ -967,8 +1062,11 @@ struct Reducer assert(curr == module.get()); curr = nullptr; + reduceFunctionBodies(); + // Reduction of entire functions at a time is very effective, and we do it // with exponential growth and backoff, so keep doing it while it works. + // TODO: Figure out how to use delta debugging for this as well. while (reduceFunctions()) { } @@ -1047,41 +1145,6 @@ struct Reducer } } - // Try to empty out the bodies of some functions. - bool tryToEmptyFunctions(std::vector names) { - std::vector oldBodies; - size_t actuallyEmptied = 0; - for (auto name : names) { - auto* func = module->getFunction(name); - auto* oldBody = func->body; - oldBodies.push_back(oldBody); - // Nothing to do for imported functions (body is nullptr) or for bodies - // that have already been as reduced as we can make them. - if (func->imported() || oldBody->is() || - oldBody->is()) { - continue; - } - actuallyEmptied++; - bool useUnreachable = func->getResults() != Type::none; - if (useUnreachable) { - func->body = builder->makeUnreachable(); - } else { - func->body = builder->makeNop(); - } - } - if (actuallyEmptied > 0 && writeAndTestReduction()) { - std::cerr << "| emptied " << actuallyEmptied << " / " - << names.size() << " functions\n"; - return true; - } else { - // Restore the bodies. - for (size_t i = 0; i < names.size(); i++) { - module->getFunction(names[i])->body = oldBodies[i]; - } - return false; - } - } - // Try to actually remove functions. If they are somehow referred to, we will // get a validation error and undo it. bool tryToRemoveFunctions(std::vector names) { @@ -1504,10 +1567,20 @@ More documentation can be found at bool stopping = false; + bool first = true; while (1) { Reducer reducer( command, test, working, binary, deNan, verbose, debugInfo, options); + // For extremely large modules with slow reproduction commands, reducing + // function bodies first can be more effective than running passes. TODO: + // clean this up and reconsider the order of reducers. + if (first) { + reducer.loadWorking(); + reducer.reduceFunctionBodies(); + first = false; + } + // run binaryen optimization passes to reduce. passes are fast to run // and can often reduce large amounts of code efficiently, as opposed // to detructive reduction (i.e., that doesn't preserve correctness as diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 41d16f28e92..3a42fb1de74 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -10,6 +10,7 @@ set(unittest_SOURCES cast-check.cpp cfg.cpp dataflow.cpp + delta_debugging.cpp dfa_minimization.cpp disjoint_sets.cpp leaves.cpp diff --git a/test/gtest/delta_debugging.cpp b/test/gtest/delta_debugging.cpp new file mode 100644 index 00000000000..7e4c8ad4db5 --- /dev/null +++ b/test/gtest/delta_debugging.cpp @@ -0,0 +1,97 @@ +#include "support/delta_debugging.h" +#include "gtest/gtest.h" +#include +#include +#include + +using namespace wasm; + +TEST(DeltaDebuggingTest, EmptyInput) { + std::vector items; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector&) { return false; }); + EXPECT_TRUE(result.empty()); +} + +TEST(DeltaDebuggingTest, SingleItem) { + std::vector items = {0, 1, 2, 3, 4, 5, 6, 7}; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector& partition) { + return std::find(partition.begin(), partition.end(), 3) != + partition.end(); + }); + std::vector expected = {3}; + EXPECT_EQ(result, expected); +} + +TEST(DeltaDebuggingTest, MultipleItemsAdjacent) { + std::vector items = {0, 1, 2, 3, 4, 5, 6, 7}; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector& partition) { + bool has2 = + std::find(partition.begin(), partition.end(), 2) != partition.end(); + bool has3 = + std::find(partition.begin(), partition.end(), 3) != partition.end(); + return has2 && has3; + }); + std::vector expected = {2, 3}; + EXPECT_EQ(result, expected); +} + +TEST(DeltaDebuggingTest, MultipleItemsNonAdjacent) { + std::vector items = {0, 1, 2, 3, 4, 5, 6, 7}; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector& partition) { + bool has2 = + std::find(partition.begin(), partition.end(), 2) != partition.end(); + bool has5 = + std::find(partition.begin(), partition.end(), 5) != partition.end(); + return has2 && has5; + }); + std::vector expected = {2, 5}; + EXPECT_EQ(result, expected); +} + +TEST(DeltaDebuggingTest, OrderMaintained) { + std::vector items = {3, 1, 4, 2}; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector& partition) { + bool has3 = + std::find(partition.begin(), partition.end(), 3) != partition.end(); + bool has2 = + std::find(partition.begin(), partition.end(), 2) != partition.end(); + return has3 && has2; + }); + std::vector expected = {3, 2}; + EXPECT_EQ(result, expected); +} + +TEST(DeltaDebuggingTest, DifferentTypes) { + std::vector items = {"apple", "banana", "cherry", "date"}; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector& partition) { + bool hasBanana = + std::find(partition.begin(), partition.end(), "banana") != + partition.end(); + bool hasDate = std::find(partition.begin(), partition.end(), "date") != + partition.end(); + return hasBanana && hasDate; + }); + std::vector expected = {"banana", "date"}; + EXPECT_EQ(result, expected); +} + +TEST(DeltaDebuggingTest, UnconditionallyTrue) { + std::vector items = {0, 1, 2, 3}; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector&) { return true; }); + EXPECT_TRUE(result.empty()); +} + +TEST(DeltaDebuggingTest, UnconditionallyFalse) { + std::vector items = {0, 1, 2, 3}; + auto result = deltaDebugging( + items, [](size_t, size_t, const std::vector&) { return false; }); + std::vector expected = {0, 1, 2, 3}; + EXPECT_EQ(result, expected); +}