From 3b1e2dc0db88a975888e244b1c5f44753b5c0b1a Mon Sep 17 00:00:00 2001 From: Jacky Li <86073892+JPL11@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:43:37 -0700 Subject: [PATCH 1/3] MemoryPacking: optimize trampled data instead of giving up When active segments overlap, a later segment overwrites ("tramples") the data of an earlier one. Since active segments are applied in order during instantiation, before any code can run, only the final contents of memory are observable, so we can zero out all trampled bytes and let the normal optimization of zeros remove them. We only do this when the memory is defined in the module itself: with an imported memory, an out-of-bounds segment traps mid-instantiation and the partially-applied state remains visible in the imported memory, which outlives the failed instantiation, so there we keep the existing behavior of not optimizing. Fixes #3244 --- src/passes/MemoryPacking.cpp | 76 ++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/src/passes/MemoryPacking.cpp b/src/passes/MemoryPacking.cpp index b43abac2787..cd7d7af3b2e 100644 --- a/src/passes/MemoryPacking.cpp +++ b/src/passes/MemoryPacking.cpp @@ -105,6 +105,8 @@ struct MemoryPacking : public Pass { void run(Module* module) override; bool canOptimize(std::vector>& memories, std::vector>& dataSegments); + void + zeroOutTrampledData(std::vector>& dataSegments); void optimizeSegmentOps(Module* module); void getSegmentReferrers(Module* module, ReferrersMap& referrers); void dropUnusedSegments(Module* module, @@ -247,7 +249,6 @@ bool MemoryPacking::canOptimize( // All active segments have constant offsets, known at this time, so we may be // able to optimize, but must still check for the trampling problem mentioned // earlier.
- // TODO: optimize in the trampling case DisjointSpans space; for (auto& segment : dataSegments) { if (segment->isActive()) { @@ -255,15 +256,82 @@ bool MemoryPacking::canOptimize( Address start = c->value.getUnsigned(); DisjointSpans::Span span{start, start + segment->data.size()}; if (space.addAndCheckOverlap(span)) { - std::cerr << "warning: active memory segments have overlap, which " - << "prevents some optimizations.\n"; - return false; + // Some segments overlap, that is, a later segment tramples the data of + // an earlier one. If the memory is imported then we cannot optimize + // here: if a later segment is out of bounds then instantiation traps + // partway, leaving the data written so far visible in the imported + // memory (which outlives the failed instantiation), so even trampled + // data matters. + if (memory->imported()) { + std::cerr << "warning: active memory segments have overlap, which " + << "prevents some optimizations.\n"; + return false; + } + // The memory is defined in this module, so partially-applied segments + // can never be observed: either instantiation completes and all the + // segments are applied in order, or it traps and the memory is never + // exposed. We can therefore zero out the trampled data, which the + // normal optimization of zeros will then remove. + zeroOutTrampledData(dataSegments); + break; } } } return true; } +void MemoryPacking::zeroOutTrampledData( + std::vector>& dataSegments) { + // Active segments are applied in order at instantiation, before any code can + // run, so when segments overlap only the last write to each byte is ever + // observable. Zero out all bytes that a later segment overwrites. This + // assumes all active segments have constant offsets, which canOptimize + // verifies before calling us. + // + // Iterate in reverse, tracking the disjoint regions of memory covered by the + // segments seen so far as a map from a region's start address to its end. 
+ std::map covered; + for (auto it = dataSegments.rbegin(); it != dataSegments.rend(); ++it) { + auto& segment = *it; + if (!segment->isActive() || segment->data.empty()) { + continue; + } + uint64_t start = segment->offset->cast()->value.getUnsigned(); + uint64_t end = start + segment->data.size(); + // Zero out our bytes that later segments cover. Look for overlapping + // regions starting from the last one beginning at or before us. + auto covering = covered.upper_bound(start); + if (covering != covered.begin()) { + --covering; + } + for (; covering != covered.end() && covering->first < end; ++covering) { + uint64_t overlapStart = std::max(start, covering->first); + uint64_t overlapEnd = std::min(end, covering->second); + if (overlapStart < overlapEnd) { + std::fill(segment->data.begin() + (overlapStart - start), + segment->data.begin() + (overlapEnd - start), + 0); + } + } + // Add our span to the covered regions, merging with any regions it + // touches. + auto next = covered.upper_bound(start); + if (next != covered.begin()) { + auto prev = std::prev(next); + if (prev->second >= start) { + start = prev->first; + end = std::max(end, prev->second); + next = prev; + } + } + while (next != covered.end() && next->first <= end) { + end = std::max(end, next->second); + next = covered.erase(next); + } + covered[start] = end; + } +} + bool MemoryPacking::canSplit(const std::unique_ptr& segment, const Referrers& referrers) { // Don't mess with segments related to llvm coverage tools such as From a90c6fe9aeed549d6185bb4c6cd76243fd0a4956 Mon Sep 17 00:00:00 2001 From: Jacky Li <86073892+JPL11@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:52:25 -0700 Subject: [PATCH 2/3] Add and update tests for trampled data optimization Update the existing trampling tests, which asserted that we give up on any overlap, to the new optimized output, and add coverage for: full trampling by a non-zero byte, partial trampling in the middle of a segment, chained trampling, one segment 
trampling multiple earlier ones, passive segments being unaffected, and the imported-memory case where we still do not optimize. --- .../passes/memory-packing_all-features.wast | 85 +++++++++++++++++-- .../memory-packing_zero-filled-memory.wast | 15 +++- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/test/lit/passes/memory-packing_all-features.wast b/test/lit/passes/memory-packing_all-features.wast index 8d5089e6fbe..8905a65eb39 100644 --- a/test/lit/passes/memory-packing_all-features.wast +++ b/test/lit/passes/memory-packing_all-features.wast @@ -2196,12 +2196,11 @@ (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) + ;; the zero tramples the "x", so the final memory contents are all zeros, and + ;; both segments can be removed entirely (data (i32.const 1024) "x") - (data (i32.const 1024) "\00") ;; this tramples the "x", and so must be kept. + (data (i32.const 1024) "\00") ) -;; CHECK: (data $0 (i32.const 1024) "x") - -;; CHECK: (data $1 (i32.const 1024) "\00") (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) @@ -2219,15 +2218,85 @@ (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) + ;; trampling in one place does not prevent optimizing elsewhere: everything + ;; here is zeros in the final memory contents, and can be removed (data (i32.const 1024) "x") - (data (i32.const 1024) "\00") ;; when we see one bad thing, we give up + (data (i32.const 1024) "\00") (data (i32.const 4096) "\00") ) -;; CHECK: (data $0 (i32.const 1024) "x") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; the "y" fully tramples the "x", so only the "y" remains + (data (i32.const 1024) "x") + (data (i32.const 1024) "y") +) +;; CHECK: (data $1 (i32.const 1024) "y") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; partial trampling: the "A" overwrites the "y" in the middle of "xyz". 
the + ;; trampled byte is zeroed out, and as the segments are applied in order, the + ;; final memory contents are "x", "A", "z" + (data (i32.const 1024) "xyz") + (data (i32.const 1025) "A") +) +;; CHECK: (data $0 (i32.const 1024) "x\00z") -;; CHECK: (data $1 (i32.const 1024) "\00") +;; CHECK: (data $1 (i32.const 1025) "A") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; chained trampling, where the tramplers are themselves trampled: the final + ;; memory contents are "f", "e", "c" + (data (i32.const 1024) "abc") + (data (i32.const 1024) "de") + (data (i32.const 1024) "f") +) +;; CHECK: (data $0 (i32.const 1026) "c") + +;; CHECK: (data $1 (i32.const 1025) "e") -;; CHECK: (data $2 (i32.const 4096) "\00") +;; CHECK: (data $2 (i32.const 1024) "f") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; one segment tramples multiple earlier ones: "WXYZ" covers all of "ab" and + ;; the "c" of "cd", so only "WXYZ" and the "d" remain + (data (i32.const 1024) "ab") + (data (i32.const 1026) "cd") + (data (i32.const 1023) "WXYZ") +) +;; CHECK: (data $1 (i32.const 1027) "d") + +;; CHECK: (data $2 (i32.const 1023) "WXYZ") +(module + ;; CHECK: (type $0 (func)) + + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; a passive segment is not applied at instantiation, so it neither tramples + ;; nor is trampled: the active segments cancel out as usual, and the passive + ;; segment is untouched + (data (i32.const 1024) "x") + ;; CHECK: (data $passive "ppp") + (data $passive "ppp") + (data (i32.const 1024) "\00") + ;; CHECK: (func $init (type $0) + ;; CHECK-NEXT: (memory.init $passive + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: (i32.const 3) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $init + (memory.init $passive + (i32.const 0) + (i32.const 0) + (i32.const 3) + ) + ) +) (module ;; CHECK: (import "env" "memoryBase" (global $memoryBase i32)) (import "env" "memoryBase" (global $memoryBase i32)) diff --git 
a/test/lit/passes/memory-packing_zero-filled-memory.wast b/test/lit/passes/memory-packing_zero-filled-memory.wast index 58ac799f5c5..a1288d5ce68 100644 --- a/test/lit/passes/memory-packing_zero-filled-memory.wast +++ b/test/lit/passes/memory-packing_zero-filled-memory.wast @@ -1,6 +1,6 @@ ;; NOTE: Assertions have been generated by update_lit_checks.py --all-items and should not be edited. -;; RUN: wasm-opt %s --memory-packing -all --zero-filled-memory -S -o - | filecheck %s +;; RUN: foreach %s %t wasm-opt --memory-packing -all --zero-filled-memory -S -o - | filecheck %s (module ;; we can optimize on an imported memory with zeroFilledMemory being set. @@ -11,3 +11,16 @@ (data (i32.const 1023) "\00") ) ;; CHECK: (data $0 (i32.const 1024) "x") +(module + ;; but we cannot optimize trampling on an imported memory: if a later segment + ;; were to trap during instantiation, the data written before it remains + ;; visible in the imported memory, so even the trampled "x" must be kept + ;; CHECK: (import "env" "memory" (memory $0 1 1)) + (import "env" "memory" (memory $0 1 1)) + + (data (i32.const 1024) "x") + (data (i32.const 1024) "\00") +) +;; CHECK: (data $0 (i32.const 1024) "x") + +;; CHECK: (data $1 (i32.const 1024) "\00") From 2738c3896317a1c421e1a9624101900863fc4cd9 Mon Sep 17 00:00:00 2001 From: Jacky Li <86073892+JPL11@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:20:05 -0700 Subject: [PATCH 3/3] Address review comments - Add a TODO about optimizing trampled segments on imported memories when the relevant segments can be proven in-bounds. - Add blank lines between modules in the memory-packing lit tests. - Add a test that depends on merging the covered regions: without merging, looking up the region covering a segment could find a small later segment and miss a larger overlapping one. 
--- src/passes/MemoryPacking.cpp | 5 ++++ .../passes/memory-packing_all-features.wast | 26 +++++++++++++++++++ .../memory-packing_zero-filled-memory.wast | 1 + 3 files changed, 32 insertions(+) diff --git a/src/passes/MemoryPacking.cpp b/src/passes/MemoryPacking.cpp index cd7d7af3b2e..c1c0297f6d0 100644 --- a/src/passes/MemoryPacking.cpp +++ b/src/passes/MemoryPacking.cpp @@ -262,6 +262,11 @@ bool MemoryPacking::canOptimize( // partway, leaving the data written so far visible in the imported // memory (which outlives the failed instantiation), so even trampled // data matters. + // TODO: We could optimize anyway if we can check that all the segments + // after the trampled segment, up to and including the trampling + // segment, will be in-bounds for the imported memory, as then no + // trap can occur between the trampled write and the trampling + // one. if (memory->imported()) { std::cerr << "warning: active memory segments have overlap, which " << "prevents some optimizations.\n"; diff --git a/test/lit/passes/memory-packing_all-features.wast b/test/lit/passes/memory-packing_all-features.wast index 8905a65eb39..82e4009ffd9 100644 --- a/test/lit/passes/memory-packing_all-features.wast +++ b/test/lit/passes/memory-packing_all-features.wast @@ -2193,6 +2193,7 @@ (data.drop 0) ) ) + (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) @@ -2201,12 +2202,14 @@ (data (i32.const 1024) "x") (data (i32.const 1024) "\00") ) + (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) (data (i32.const 1024) "x") (data (i32.const 1025) "\00") ) + ;; CHECK: (data $0 (i32.const 1024) "x") (module ;; CHECK: (memory $0 1 1) @@ -2214,6 +2217,7 @@ (data (i32.const 1024) "x") (data (i32.const 1023) "\00") ) + ;; CHECK: (data $0 (i32.const 1024) "x") (module ;; CHECK: (memory $0 1 1) @@ -2224,6 +2228,7 @@ (data (i32.const 1024) "\00") (data (i32.const 4096) "\00") ) + (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) @@ -2231,6 +2236,7 @@ (data (i32.const 1024) "x") (data (i32.const 1024) "y") ) 
+ ;; CHECK: (data $1 (i32.const 1024) "y") (module ;; CHECK: (memory $0 1 1) @@ -2241,6 +2247,7 @@ (data (i32.const 1024) "xyz") (data (i32.const 1025) "A") ) + ;; CHECK: (data $0 (i32.const 1024) "x\00z") ;; CHECK: (data $1 (i32.const 1025) "A") @@ -2253,6 +2260,7 @@ (data (i32.const 1024) "de") (data (i32.const 1024) "f") ) + ;; CHECK: (data $0 (i32.const 1026) "c") ;; CHECK: (data $1 (i32.const 1025) "e") @@ -2267,9 +2275,26 @@ (data (i32.const 1026) "cd") (data (i32.const 1023) "WXYZ") ) + ;; CHECK: (data $1 (i32.const 1027) "d") ;; CHECK: (data $2 (i32.const 1023) "WXYZ") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; the regions covered by later segments must be merged as they accumulate: + ;; walking the segments in reverse we see "fghij" [1024, 1029), then "B" + ;; [1025, 1026), then "abcde" [1027, 1032). if the region for "B" were not + ;; merged into the one for "fghij", then looking up the region covering + ;; "abcde" would find "B" and miss that "fghij" tramples the "ab" + (data (i32.const 1027) "abcde") + (data (i32.const 1025) "B") + (data (i32.const 1024) "fghij") +) + +;; CHECK: (data $0 (i32.const 1029) "cde") + +;; CHECK: (data $2 (i32.const 1024) "fghij") (module ;; CHECK: (type $0 (func)) @@ -2297,6 +2322,7 @@ ) ) ) + (module ;; CHECK: (import "env" "memoryBase" (global $memoryBase i32)) (import "env" "memoryBase" (global $memoryBase i32)) diff --git a/test/lit/passes/memory-packing_zero-filled-memory.wast b/test/lit/passes/memory-packing_zero-filled-memory.wast index a1288d5ce68..c4b9e48f602 100644 --- a/test/lit/passes/memory-packing_zero-filled-memory.wast +++ b/test/lit/passes/memory-packing_zero-filled-memory.wast @@ -10,6 +10,7 @@ (data (i32.const 1024) "x") (data (i32.const 1023) "\00") ) + ;; CHECK: (data $0 (i32.const 1024) "x") (module ;; but we cannot optimize trampling on an imported memory: if a later segment