diff --git a/src/passes/MemoryPacking.cpp b/src/passes/MemoryPacking.cpp
index b43abac2787..c1c0297f6d0 100644
--- a/src/passes/MemoryPacking.cpp
+++ b/src/passes/MemoryPacking.cpp
@@ -105,6 +105,8 @@ struct MemoryPacking : public Pass {
   void run(Module* module) override;
   bool canOptimize(std::vector<std::unique_ptr<Memory>>& memories,
                    std::vector<std::unique_ptr<DataSegment>>& dataSegments);
+  void
+  zeroOutTrampledData(std::vector<std::unique_ptr<DataSegment>>& dataSegments);
   void optimizeSegmentOps(Module* module);
   void getSegmentReferrers(Module* module, ReferrersMap& referrers);
   void dropUnusedSegments(Module* module,
@@ -247,7 +249,6 @@ bool MemoryPacking::canOptimize(
   // All active segments have constant offsets, known at this time, so we may be
   // able to optimize, but must still check for the trampling problem mentioned
   // earlier.
-  // TODO: optimize in the trampling case
   DisjointSpans space;
   for (auto& segment : dataSegments) {
     if (segment->isActive()) {
@@ -255,15 +256,87 @@ bool MemoryPacking::canOptimize(
       Address start = c->value.getUnsigned();
       DisjointSpans::Span span{start, start + segment->data.size()};
       if (space.addAndCheckOverlap(span)) {
-        std::cerr << "warning: active memory segments have overlap, which "
-                  << "prevents some optimizations.\n";
-        return false;
+        // Some segments overlap, that is, a later segment tramples the data of
+        // an earlier one. If the memory is imported then we cannot optimize
+        // here: if a later segment is out of bounds then instantiation traps
+        // partway, leaving the data written so far visible in the imported
+        // memory (which outlives the failed instantiation), so even trampled
+        // data matters.
+        // TODO: We could optimize anyway if we can check that all the segments
+        //       after the trampled segment, up to and including the trampling
+        //       segment, will be in-bounds for the imported memory, as then no
+        //       trap can occur between the trampled write and the trampling
+        //       one.
+        if (memory->imported()) {
+          std::cerr << "warning: active memory segments have overlap, which "
+                    << "prevents some optimizations.\n";
+          return false;
+        }
+        // The memory is defined in this module, so partially-applied segments
+        // can never be observed: either instantiation completes and all the
+        // segments are applied in order, or it traps and the memory is never
+        // exposed. We can therefore zero out the trampled data, which the
+        // normal optimization of zeros will then remove.
+        zeroOutTrampledData(dataSegments);
+        break;
       }
     }
   }
   return true;
 }
 
+void MemoryPacking::zeroOutTrampledData(
+  std::vector<std::unique_ptr<DataSegment>>& dataSegments) {
+  // Active segments are applied in order at instantiation, before any code can
+  // run, so when segments overlap only the last write to each byte is ever
+  // observable. Zero out all bytes that a later segment overwrites. This
+  // assumes all active segments have constant offsets, which canOptimize
+  // verifies before calling us.
+  //
+  // Iterate in reverse, tracking the disjoint regions of memory covered by the
+  // segments seen so far as a map from a region's start address to its end.
+  std::map<uint64_t, uint64_t> covered;
+  for (auto it = dataSegments.rbegin(); it != dataSegments.rend(); ++it) {
+    auto& segment = *it;
+    if (!segment->isActive() || segment->data.empty()) {
+      continue;
+    }
+    uint64_t start = segment->offset->cast<Const>()->value.getUnsigned();
+    uint64_t end = start + segment->data.size();
+    // Zero out our bytes that later segments cover. Look for overlapping
+    // regions starting from the last one beginning at or before us.
+    auto covering = covered.upper_bound(start);
+    if (covering != covered.begin()) {
+      --covering;
+    }
+    for (; covering != covered.end() && covering->first < end; ++covering) {
+      uint64_t overlapStart = std::max(start, covering->first);
+      uint64_t overlapEnd = std::min(end, covering->second);
+      if (overlapStart < overlapEnd) {
+        std::fill(segment->data.begin() + (overlapStart - start),
+                  segment->data.begin() + (overlapEnd - start),
+                  0);
+      }
+    }
+    // Add our span to the covered regions, merging with any regions it
+    // touches.
+    auto next = covered.upper_bound(start);
+    if (next != covered.begin()) {
+      auto prev = std::prev(next);
+      if (prev->second >= start) {
+        start = prev->first;
+        end = std::max(end, prev->second);
+        next = prev;
+      }
+    }
+    while (next != covered.end() && next->first <= end) {
+      end = std::max(end, next->second);
+      next = covered.erase(next);
+    }
+    covered[start] = end;
+  }
+}
+
 bool MemoryPacking::canSplit(const std::unique_ptr<DataSegment>& segment,
                              const Referrers& referrers) {
   // Don't mess with segments related to llvm coverage tools such as
diff --git a/test/lit/passes/memory-packing_all-features.wast b/test/lit/passes/memory-packing_all-features.wast index 8d5089e6fbe..82e4009ffd9 100644 --- a/test/lit/passes/memory-packing_all-features.wast +++ b/test/lit/passes/memory-packing_all-features.wast @@ -2193,21 +2193,23 @@ (data.drop 0) ) ) + (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) + ;; the zero tramples the "x", so the final memory contents are all zeros, and + ;; both segments can be removed entirely (data (i32.const 1024) "x") - (data (i32.const 1024) "\00") ;; this tramples the "x", and so must be kept. 
+ (data (i32.const 1024) "\00") ) -;; CHECK: (data $0 (i32.const 1024) "x") -;; CHECK: (data $1 (i32.const 1024) "\00") (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) (data (i32.const 1024) "x") (data (i32.const 1025) "\00") ) + ;; CHECK: (data $0 (i32.const 1024) "x") (module ;; CHECK: (memory $0 1 1) @@ -2215,19 +2217,112 @@ (data (i32.const 1024) "x") (data (i32.const 1023) "\00") ) + ;; CHECK: (data $0 (i32.const 1024) "x") (module ;; CHECK: (memory $0 1 1) (memory $0 1 1) + ;; trampling in one place does not prevent optimizing elsewhere: everything + ;; here is zeros in the final memory contents, and can be removed (data (i32.const 1024) "x") - (data (i32.const 1024) "\00") ;; when we see one bad thing, we give up + (data (i32.const 1024) "\00") (data (i32.const 4096) "\00") ) -;; CHECK: (data $0 (i32.const 1024) "x") -;; CHECK: (data $1 (i32.const 1024) "\00") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; the "y" fully tramples the "x", so only the "y" remains + (data (i32.const 1024) "x") + (data (i32.const 1024) "y") +) + +;; CHECK: (data $1 (i32.const 1024) "y") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; partial trampling: the "A" overwrites the "y" in the middle of "xyz". 
the + ;; trampled byte is zeroed out, and as the segments are applied in order, the + ;; final memory contents are "x", "A", "z" + (data (i32.const 1024) "xyz") + (data (i32.const 1025) "A") +) + +;; CHECK: (data $0 (i32.const 1024) "x\00z") + +;; CHECK: (data $1 (i32.const 1025) "A") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; chained trampling, where the tramplers are themselves trampled: the final + ;; memory contents are "f", "e", "c" + (data (i32.const 1024) "abc") + (data (i32.const 1024) "de") + (data (i32.const 1024) "f") +) + +;; CHECK: (data $0 (i32.const 1026) "c") + +;; CHECK: (data $1 (i32.const 1025) "e") + +;; CHECK: (data $2 (i32.const 1024) "f") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; one segment tramples multiple earlier ones: "WXYZ" covers all of "ab" and + ;; the "c" of "cd", so only "WXYZ" and the "d" remain + (data (i32.const 1024) "ab") + (data (i32.const 1026) "cd") + (data (i32.const 1023) "WXYZ") +) + +;; CHECK: (data $1 (i32.const 1027) "d") + +;; CHECK: (data $2 (i32.const 1023) "WXYZ") +(module + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; the regions covered by later segments must be merged as they accumulate: + ;; walking the segments in reverse we see "fghij" [1024, 1029), then "B" + ;; [1025, 1026), then "abcde" [1027, 1032). 
if the region for "B" were not + ;; merged into the one for "fghij", then looking up the region covering + ;; "abcde" would find "B" and miss that "fghij" tramples the "ab" + (data (i32.const 1027) "abcde") + (data (i32.const 1025) "B") + (data (i32.const 1024) "fghij") +) + +;; CHECK: (data $0 (i32.const 1029) "cde") + +;; CHECK: (data $2 (i32.const 1024) "fghij") +(module + ;; CHECK: (type $0 (func)) + + ;; CHECK: (memory $0 1 1) + (memory $0 1 1) + ;; a passive segment is not applied at instantiation, so it neither tramples + ;; nor is trampled: the active segments cancel out as usual, and the passive + ;; segment is untouched + (data (i32.const 1024) "x") + ;; CHECK: (data $passive "ppp") + (data $passive "ppp") + (data (i32.const 1024) "\00") + ;; CHECK: (func $init (type $0) + ;; CHECK-NEXT: (memory.init $passive + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: (i32.const 3) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $init + (memory.init $passive + (i32.const 0) + (i32.const 0) + (i32.const 3) + ) + ) +) -;; CHECK: (data $2 (i32.const 4096) "\00") (module ;; CHECK: (import "env" "memoryBase" (global $memoryBase i32)) (import "env" "memoryBase" (global $memoryBase i32)) diff --git a/test/lit/passes/memory-packing_zero-filled-memory.wast b/test/lit/passes/memory-packing_zero-filled-memory.wast index 58ac799f5c5..c4b9e48f602 100644 --- a/test/lit/passes/memory-packing_zero-filled-memory.wast +++ b/test/lit/passes/memory-packing_zero-filled-memory.wast @@ -1,6 +1,6 @@ ;; NOTE: Assertions have been generated by update_lit_checks.py --all-items and should not be edited. -;; RUN: wasm-opt %s --memory-packing -all --zero-filled-memory -S -o - | filecheck %s +;; RUN: foreach %s %t wasm-opt --memory-packing -all --zero-filled-memory -S -o - | filecheck %s (module ;; we can optimize on an imported memory with zeroFilledMemory being set. 
@@ -10,4 +10,18 @@ (data (i32.const 1024) "x") (data (i32.const 1023) "\00") ) + ;; CHECK: (data $0 (i32.const 1024) "x") +(module + ;; but we cannot optimize trampling on an imported memory: if a later segment + ;; were to trap during instantiation, the data written before it remains + ;; visible in the imported memory, so even the trampled "x" must be kept + ;; CHECK: (import "env" "memory" (memory $0 1 1)) + (import "env" "memory" (memory $0 1 1)) + + (data (i32.const 1024) "x") + (data (i32.const 1024) "\00") +) +;; CHECK: (data $0 (i32.const 1024) "x") + +;; CHECK: (data $1 (i32.const 1024) "\00")