From e12e9f24bb6a39bfb0757f8cbfde42416e38dba5 Mon Sep 17 00:00:00 2001
From: Jie Yao <jyao3@ebaychina.com>
Date: Mon, 8 Jun 2026 15:11:06 +0800
Subject: [PATCH 1/2] SDSTOR-21939: index/wb_cache: Fix recovery corruption
 when a node is created and freed in the same crashed CP

During index CP recovery, if a btree node was both created (e.g. via a
split) and freed (e.g. via a subsequent merge) within the same CP that
crashed before any data or bitmap was flushed, the recovery path would
incorrectly call free_blk on the node's blkid. If the repair phase had
already reused that blkid for a new live interior node, the free_blk
would silently corrupt it.

The first recovery pass processes all journal-recorded nodes and decides
which blkids to release via deleted_bufs. For freed nodes the code first
checked was_node_committed, then inside the "not committed" branch checked
m_created_cp_id:

    if (buf->m_node_freed) {
        if (was_node_committed(buf)) {
            deleted_bufs.push_back(buf);        // correct for prev-CP nodes
        } else {
            if (buf->m_created_cp_id != icp_ctx->id()) {
                m_vdev->commit_blk(buf->m_blkid);  // correct
            } else {
                deleted_bufs.push_back(buf);        // BUG
            }
            prune_from_up_buffer(buf);
        }
    }

A node that was created in the crashed CP (m_created_cp_id == icp_ctx->id())
and later freed in the same CP (m_node_freed == true) always landed in
deleted_bufs, triggering free_blk during recovery cleanup.

The same bug also existed in the was_node_committed == true branch: a
same-CP created+freed node whose parent had already been flushed to disk
would also be added to deleted_bufs via line 624.

A node created within the crashed CP only had its blkid reserved via
reserve_on_disk; the blkid was never committed to the allocator bitmap.
After restart the allocator reloads the bitmap from the last fully
committed CP and already treats that blkid as free. During the repair
phase (repair_links -> alloc_interior_node), the allocator may reassign
that exact blkid to a new live interior node. Recovery then calls
free_blk(blkid) from deleted_bufs, releasing a node that is still in
active use. Any subsequent allocation of that blkid overwrites the
repair node's data, silently corrupting the index.

    Committed CP:  Parent P(100) -> Leaf L(200) with keys [10,20,30,40]

    Crashed CP (journal flushed, no node data or bitmap flush):
      Step 1: insert 25 -> L splits, new node N(300) created
              N: m_created_cp_id=current_cp, blkid 300 reserve_on_disk only
      Step 2: delete 25,30,40 -> N empties, merge frees N
              N: m_node_freed=true
      Crash

    Recovery:
      - N: m_node_freed=true AND m_created_cp_id==icp_ctx->id()
      - Old code puts N in deleted_bufs
      - repair_links calls alloc_interior_node() -> allocator picks blkid 300
        (free in the committed bitmap)
      - Cleanup: free_blk(300) from deleted_bufs -> live repair node released
      - Next allocation at blkid 300 overwrites repair node -> index corrupted

Check m_created_cp_id == icp_ctx->id() before was_node_committed so that
same-CP created+freed nodes are handled as a distinct case and always
bypass deleted_bufs:

    if (buf->m_node_freed) {
        if (buf->m_created_cp_id == icp_ctx->id()) {
            // blkid never reached the bitmap; repair may already hold it
            if (was_node_committed(buf))
                pending_bufs.push_back(buf->m_up_buffer);
            else
                prune_from_up_buffer(buf);
            continue;                   // never touches deleted_bufs
        }
        if (was_node_committed(buf)) {
            // prev-CP node flushed to disk: schedule blkid release
            deleted_bufs.push_back(buf);
            pending_bufs.push_back(buf->m_up_buffer);
        } else {
            // prev-CP node freed but bitmap not flushed: recommit blkid
            m_vdev->commit_blk(buf->m_blkid);
            prune_from_up_buffer(buf);
        }
    }

For same-CP nodes whose parent was already flushed (was_node_committed
true), the parent still needs repair to remove the stale child link, so
its up_buffer is pushed to pending_bufs. When the parent was not flushed,
the in-memory child link is simply pruned. In both cases the blkid is
never freed.

Also extracted the repeated three-step prune pattern (remove_down_buffer
+ prune_up_buffers + clear m_up_buffer) into a prune_from_up_buffer
lambda to eliminate duplication in the same block.
---
 conanfile.py                            |   2 +-
 src/lib/index/wb_cache.cpp              |  79 ++---
 src/tests/test_index_crash_recovery.cpp | 393 ++++++++++++++++++++++++
 3 files changed, 438 insertions(+), 36 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 87a52abba..4ec340dfb 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@
 
 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "7.5.10"
+    version = "7.5.11"
 
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index 439e454d1..08cab1a8b 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -432,11 +432,13 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
     down_buf->m_up_buffer = real_up_buf;
 
     if (down_buf->state() == index_buf_state_t::CLEAN) {
-        //In root collapse case, down_buf is written before up_buf, but in root split case, down_buf is written after up_buf.
+        // In root collapse case, down_buf is written before up_buf, but in root split case, down_buf is written after
+        // up_buf.
         LOGDEBUGMOD(wbcache,
-                   "CLEAN_BUF_DEBUG: Adding CLEAN down_buf {} to up_buf {}, up_buf wait_count before={}, might be dirtied later",
-                   down_buf->blkid().to_integer(), real_up_buf->blkid().to_integer(),
-                   real_up_buf->m_wait_for_down_buffers.get());
+                    "CLEAN_BUF_DEBUG: Adding CLEAN down_buf {} to up_buf {}, up_buf wait_count before={}, might be "
+                    "dirtied later",
+                    down_buf->blkid().to_integer(), real_up_buf->blkid().to_integer(),
+                    real_up_buf->m_wait_for_down_buffers.get());
     }
 
     real_up_buf->add_down_buffer(down_buf);
@@ -609,48 +611,55 @@ void IndexWBCache::recover(sisl::byte_view sb) {
 
     std::vector< IndexBufferPtr > pruned_bufs_to_repair;
     std::set< IndexBufferPtr > bufs_to_skip_sanity_check;
+
+    auto prune_from_up_buffer = [&](IndexBufferPtr const& buf) {
+        if (!buf->m_up_buffer) { return; }
+        LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(),
+                    buf->m_up_buffer->to_string());
+        buf->m_up_buffer->remove_down_buffer(buf);
+        prune_up_buffers(buf, pruned_bufs_to_repair);
+        buf->m_up_buffer = nullptr;
+    };
+
     LOGTRACEMOD(wbcache, "\n\n\nRecovery processing begins\n\n\n");
     for (auto const& [_, buf] : bufs) {
         load_buf(buf);
 
         if (buf->m_node_freed) {
             LOGTRACEMOD(wbcache, "recovering free buf {}", buf->to_string());
+            if (buf->m_created_cp_id == icp_ctx->id()) {
+                // Created and freed within the same crashed CP: the blkid was only reserve_on_disk'd, never
+                // flushed to the allocator bitmap. After restart the allocator already treats it as free,
+                // so recovery repair may reuse it. Never add to deleted_bufs — free_blk would corrupt a
+                // live repair node that reused the same blkid.
+                LOGINFO("Ignoring same-CP created+freed buf {} for allocator free", buf->to_string());
+                if (was_node_committed(buf)) {
+                    pending_bufs.push_back(buf->m_up_buffer);
+                } else {
+                    prune_from_up_buffer(buf);
+                }
+                continue;
+            }
             if (was_node_committed(buf)) {
-                // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link
+                // Node data is on disk from a previous CP; mark it deleted so repair skips its stale
+                // parent link, schedule the blkid for release, and queue the parent for repair.
                 r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true;
-                write_buf(nullptr, buf, icp_ctx); // no need to write it here !!
+                write_buf(nullptr, buf, icp_ctx);
                 deleted_bufs.push_back(buf);
                 pending_bufs.push_back(buf->m_up_buffer);
                 LOGINFOMOD(wbcache, "Freeing deleted buf {} and adding up buffer to pending {}", buf->to_string(),
                            buf->m_up_buffer->to_string());
             } else {
-                // (Up) buffer is not committed, node need to be kept and (potentially) repaired later
-                if (buf->m_created_cp_id != icp_ctx->id()) {
-                    LOGTRACEMOD(wbcache,
-                                "NOT FREE committing buffer {} node deleted is false reason: node commited?= {} "
-                                "up committed? {}",
-                                buf->to_string(), was_node_committed(buf), was_node_committed(buf->m_up_buffer));
-                    buf->m_node_freed = false;
-                    r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = false;
-                    m_vdev->commit_blk(buf->m_blkid);
-                    // it can happen when children moved to one of right parent sibling and then the previous node is
-                    // deleted but not commited during crash (upbuffer is not committed). but its children already
-                    // committed. and freed (or changed)
-                    if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); }
-                } else {
-                    LOGINFO("deleting and creating new buf {}", buf->to_string());
-                    deleted_bufs.push_back(buf);
-                }
-                //  1- upbuffer was dirtied by the same cp, so it is not commited, so we don't need to repair it.
-                //  remove it from down_waiting list (probably recursively going up) 2- upbuffer was created and
-                //  freed at the same cp, so it is not commited, so we don't need to repair it.
-                if (buf->m_up_buffer) {
-                    LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(),
-                                buf->m_up_buffer->to_string());
-                    buf->m_up_buffer->remove_down_buffer(buf);
-                    prune_up_buffers(buf, pruned_bufs_to_repair);
-                    buf->m_up_buffer = nullptr;
-                }
+                // Freed from a previous CP but the free was not committed (crash between journal write
+                // and bitmap flush). Recommit the blkid to keep the bitmap consistent, then prune the
+                // stale child link from the uncommitted parent.
+                LOGTRACEMOD(wbcache, "Recommitting freed-but-uncommitted buf {} (up_committed={})", buf->to_string(),
+                            was_node_committed(buf->m_up_buffer));
+                buf->m_node_freed = false;
+                r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = false;
+                m_vdev->commit_blk(buf->m_blkid);
+                if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); }
+                prune_from_up_buffer(buf);
             }
         } else if (buf->m_created_cp_id == icp_ctx->id()) {
             LOGTRACEMOD(wbcache, "recovering new buf {}", buf->to_string());
@@ -828,8 +837,8 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) {
 //////////////////// CP Related API section /////////////////////////////////
 folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
     LOGINFOMOD(wbcache, "Starting Index CP Flush with cp {}, dirty_buf_count={}, nodes_added={}, nodes_removed={}",
-           cp_ctx->id(), cp_ctx->m_dirty_buf_count.get(), cp_ctx->m_num_nodes_added.load(),
-           cp_ctx->m_num_nodes_removed.load());
+               cp_ctx->id(), cp_ctx->m_dirty_buf_count.get(), cp_ctx->m_num_nodes_added.load(),
+               cp_ctx->m_num_nodes_removed.load());
     LOGTRACEMOD(wbcache, "Index CP Flush with cp {}, \ndag={}", cp_ctx->id(), cp_ctx->to_string_with_dags());
     // #ifdef _PRERELEASE
     //     static int id = 0;
diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp
index 0d90cfa9d..2ac8e0322 100644
--- a/src/tests/test_index_crash_recovery.cpp
+++ b/src/tests/test_index_crash_recovery.cpp
@@ -16,9 +16,13 @@
 #include <gtest/gtest.h>
 #include <boost/uuid/random_generator.hpp>
 
+#include <set>
+#include <vector>
 #include <sisl/utility/enum.hpp>
 #include "common/homestore_config.hpp"
 #include "common/resource_mgr.hpp"
+#include "device/chunk.h"
+#include "device/virtual_dev.hpp"
 #include "test_common/homestore_test_common.hpp"
 #include "test_common/range_scheduler.hpp"
 #include "btree_helpers/btree_test_helper.hpp"
@@ -1416,6 +1420,395 @@ TEST_F(IndexCrashTestTwoTables, MultiTableMetaBufOrdinalCollisionOnRecovery) {
     ASSERT_EQ(hs()->index_service().num_tables(), 2) << "Both tables should be recovered";
 }
 
+// Regression reproducer for the recovery ordering bug where a node that is
+// created and freed in the crashed CP is put into deleted_bufs even though its
+// blkid was never persisted in the allocator bitmap.
+//
+// This test does not add or depend on any new test flip. It uses the existing
+// crash_flush_on_split_at_parent flip only to stop the CP after the index journal
+// is persisted. The first current-CP split is immediately merged away so that
+// its newly allocated blkid is also freed in the same CP; because that allocation
+// is the first one after the clean baseline CP, the blkid is small and appears
+// near the head of fixed_blk_allocator::m_free_blk_q after restart. The later
+// split storm creates enough child links for recovery repair to allocate parent
+// repair nodes normally; when repair reuses that small created+freed blkid,
+// deleted_bufs later frees the live repair node. Subsequent normal inserts then
+// either crash in IndexWBCache::alloc_buf() at the duplicate cache insert assert,
+// or corrupt a shared-blkid btree node and crash during validation/use.
+struct IndexCrashCreatedFreedReuseTest : public test_common::HSTestHelper,
+                                         BtreeTestHelper< FixedLenBtree >,
+                                         public ::testing::Test {
+    using T = FixedLenBtree;
+    using K = T::KeyType;
+    using V = T::ValueType;
+
+    struct InspectableIndexTable : public T::BtreeType {
+        InspectableIndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, BtreeConfig const& cfg,
+                              bool* flush_recovery_free_list, std::set< bnodeid_t >* recovery_freed_node_ids) :
+                T::BtreeType{uuid, parent_uuid, user_sb_size, cfg},
+                m_flush_recovery_free_list{flush_recovery_free_list},
+                m_recovery_freed_node_ids{recovery_freed_node_ids} {}
+
+        InspectableIndexTable(superblk< index_table_sb >&& sb, BtreeConfig const& cfg, bool* flush_recovery_free_list,
+                              std::set< bnodeid_t >* recovery_freed_node_ids) :
+                T::BtreeType{std::move(sb), cfg},
+                m_flush_recovery_free_list{flush_recovery_free_list},
+                m_recovery_freed_node_ids{recovery_freed_node_ids} {}
+
+        void recovery_completed() override {
+            T::BtreeType::recovery_completed();
+            if (m_flush_recovery_free_list && *m_flush_recovery_free_list) {
+                auto cpg = hs()->cp_mgr().cp_guard();
+                auto* cp_ctx = s_cast< VDevCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC));
+                if (m_recovery_freed_node_ids) {
+                    cp_ctx->m_free_blkid_list.foreach_entry(
+                        [this](BlkId bid) { m_recovery_freed_node_ids->insert(bid.to_integer()); });
+                }
+                auto const root_blk = BlkId{this->root_node_id()};
+                auto* chunk = hs()->device_mgr()->get_chunk_mutable(root_blk.chunk_num());
+                RELEASE_ASSERT(chunk != nullptr, "Index chunk not found for root blkid {}", root_blk.to_string());
+                auto* vdev = hs()->device_mgr()->get_vdev_mutable(chunk->vdev_id());
+                RELEASE_ASSERT(vdev != nullptr, "Index vdev not found for root blkid {}", root_blk.to_string());
+                vdev->cp_flush(cp_ctx);
+                *m_flush_recovery_free_list = false;
+            }
+        }
+
+        std::set< bnodeid_t > collect_node_ids() const {
+            std::set< bnodeid_t > ids;
+            collect_node_ids_recurse(this->root_node_id(), ids);
+            return ids;
+        }
+
+    private:
+        void collect_node_ids_recurse(bnodeid_t node_id, std::set< bnodeid_t >& ids) const {
+            if ((node_id == empty_bnodeid) || ids.contains(node_id)) { return; }
+
+            BtreeNodePtr node;
+            if ((this->read_node_impl(node_id, node) != btree_status_t::success) || node->is_node_deleted()) { return; }
+            ids.insert(node_id);
+
+            if (node->is_leaf()) { return; }
+
+            for (uint32_t i = 0; i < node->total_entries(); ++i) {
+                BtreeLinkInfo child_info;
+                node->get_nth_value(i, &child_info, false /* copy */);
+                collect_node_ids_recurse(child_info.bnode_id(), ids);
+            }
+
+            if (node->has_valid_edge()) { collect_node_ids_recurse(node->get_edge_value().bnode_id(), ids); }
+        }
+
+        bool* m_flush_recovery_free_list{nullptr};
+        std::set< bnodeid_t >* m_recovery_freed_node_ids{nullptr};
+    };
+
+    using BtType = InspectableIndexTable;
+
+    class TestIndexServiceCallbacks : public IndexServiceCallbacks {
+    public:
+        TestIndexServiceCallbacks(IndexCrashCreatedFreedReuseTest* test) : m_test{test} {}
+
+        std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override {
+            LOGINFO("Index table recovered, root bnode_id {} uuid {} ordinal {} version {}",
+                    static_cast< uint64_t >(sb->root_node), boost::uuids::to_string(sb->uuid), sb->ordinal,
+                    sb->root_link_version);
+            m_test->init_cfg();
+            m_test->m_bt = std::make_shared< BtType >(std::move(sb), m_test->m_cfg, &m_test->m_flush_recovery_free_list,
+                                                      &m_test->m_recovery_freed_node_ids);
+            return m_test->m_bt;
+        }
+
+    private:
+        IndexCrashCreatedFreedReuseTest* m_test;
+    };
+
+    IndexCrashCreatedFreedReuseTest() : testing::Test() { this->m_is_multi_threaded = false; }
+
+    bool m_flush_recovery_free_list{false};
+    std::set< bnodeid_t > m_recovery_freed_node_ids;
+
+    void init_cfg() {
+        this->m_cfg = BtreeConfig(hs()->index_service().node_size());
+        this->m_cfg.m_leaf_node_type = T::leaf_node_type;
+        this->m_cfg.m_int_node_type = T::interior_node_type;
+        this->m_cfg.m_max_keys_in_node = 20;
+        this->m_cfg.m_min_keys_in_node = 6;
+        this->m_cfg.m_max_merge_level = 1;
+    }
+
+    void SetUp() override {
+        HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
+            s.generic.cache_max_throttle_cnt = 10000;
+            s.generic.cp_timer_us = 0x8000000000000000;
+            s.resource_limits.dirty_buf_percent = 100;
+            HS_SETTINGS_FACTORY().save();
+        });
+
+        this->start_homestore(
+            "test_index_crash_recovery",
+            {{HS_SERVICE::META, {.size_pct = 10.0}},
+             {HS_SERVICE::INDEX, {.size_pct = 10.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}},
+            nullptr, {}, true /* init_device */);
+
+        BtreeTestHelper< FixedLenBtree >::SetUp();
+        init_cfg();
+
+        auto uuid = boost::uuids::random_generator()();
+        auto parent_uuid = boost::uuids::random_generator()();
+        this->m_bt = std::make_shared< BtType >(uuid, parent_uuid, 0, this->m_cfg, &m_flush_recovery_free_list,
+                                                &m_recovery_freed_node_ids);
+        hs()->index_service().add_index_table(this->m_bt);
+    }
+
+    void restart_homestore(uint32_t shutdown_delay_sec = 3) override {
+        this->params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this);
+        test_common::HSTestHelper::restart_homestore(shutdown_delay_sec);
+    }
+
+    void TearDown() override {
+        BtreeTestHelper< FixedLenBtree >::TearDown();
+        this->shutdown_homestore(false);
+    }
+
+    void insert_key(uint32_t key_num) {
+        K key{key_num};
+        V value{V::generate_rand()};
+        auto req = BtreeSinglePutRequest{&key, &value, btree_put_type::INSERT};
+        req.enable_route_tracing();
+        auto const ret = this->m_bt->put(req);
+        ASSERT_EQ(ret, btree_status_t::success) << "insert key=" << key_num << " failed with " << enum_name(ret);
+    }
+
+    void update_key(uint32_t key_num) {
+        K key{key_num};
+        V value{V::generate_rand()};
+        auto req = BtreeSinglePutRequest{&key, &value, btree_put_type::UPDATE};
+        req.enable_route_tracing();
+        auto const ret = this->m_bt->put(req);
+        ASSERT_EQ(ret, btree_status_t::success) << "update key=" << key_num << " failed with " << enum_name(ret);
+    }
+
+    bool remove_key(uint32_t key_num) {
+        auto existing_v = std::make_unique< V >();
+        K key{key_num};
+        auto req = BtreeSingleRemoveRequest{&key, existing_v.get()};
+        req.enable_route_tracing();
+        auto const ret = this->m_bt->remove(req);
+        if ((ret != btree_status_t::success) && (ret != btree_status_t::not_found)) {
+            ADD_FAILURE() << "remove key=" << key_num << " failed with " << enum_name(ret);
+        }
+        return ret == btree_status_t::success;
+    }
+
+    void insert_sparse_multiples(uint32_t begin, uint32_t end, uint32_t step) {
+        for (auto k = begin; k <= end; k += step) {
+            insert_key(k);
+        }
+    }
+
+    void insert_range_skip_multiples(uint32_t begin, uint32_t end, uint32_t step) {
+        for (auto k = begin; k < end; ++k) {
+            if ((k % step) == 0) { continue; }
+            insert_key(k);
+        }
+    }
+
+    void remove_range_skip_multiples(uint32_t begin, uint32_t end, uint32_t step) {
+        uint32_t removed{0};
+        for (auto k = begin; k < end; ++k) {
+            if ((k % step) == 0) { continue; }
+            if (remove_key(k)) { ++removed; }
+        }
+        ASSERT_GT(removed, 0u) << "expected at least one key to be removed in [" << begin << ", " << end << ")";
+    }
+
+    void verify_existing_range(uint32_t begin, uint32_t end) {
+        for (auto k = begin; k < end; ++k) {
+            auto out_v = std::make_unique< V >();
+            K key{k};
+            auto req = BtreeSingleGetRequest{&key, out_v.get()};
+            req.enable_route_tracing();
+            auto const ret = this->m_bt->get(req);
+            ASSERT_EQ(ret, btree_status_t::success) << "get key=" << k << " failed with " << enum_name(ret);
+        }
+    }
+
+    std::shared_ptr< BtType > inspectable_bt() const { return std::static_pointer_cast< BtType >(this->m_bt); }
+
+    bnodeid_t first_added_node_id(std::set< bnodeid_t > const& before, std::set< bnodeid_t > const& after) const {
+        for (auto const id : after) {
+            if (!before.contains(id)) { return id; }
+        }
+        return empty_bnodeid;
+    }
+
+    std::vector< bnodeid_t > added_then_freed_node_ids(std::set< bnodeid_t > const& before,
+                                                       std::set< bnodeid_t > const& after_add,
+                                                       std::set< bnodeid_t > const& after_free) const {
+        std::vector< bnodeid_t > ids;
+        for (auto const id : after_add) {
+            if (!before.contains(id) && !after_free.contains(id)) { ids.push_back(id); }
+        }
+        return ids;
+    }
+
+    VirtualDev* index_vdev() const {
+        auto const root_blk = BlkId{this->m_bt->root_node_id()};
+        auto* chunk = hs()->device_mgr()->get_chunk_mutable(root_blk.chunk_num());
+        RELEASE_ASSERT(chunk != nullptr, "Index chunk not found for root blkid {}", root_blk.to_string());
+        auto* vdev = hs()->device_mgr()->get_vdev_mutable(chunk->vdev_id());
+        RELEASE_ASSERT(vdev != nullptr, "Index vdev not found for root blkid {}", root_blk.to_string());
+        return vdev;
+    }
+
+    void commit_index_free_queue_until(uint64_t remaining_blks) {
+        auto* vdev = index_vdev();
+        blk_alloc_hints hints;
+        hints.application_hint = this->m_bt->ordinal();
+
+        uint64_t consumed{0};
+        while (vdev->available_blks() > remaining_blks) {
+            BlkId blkid;
+            auto status = vdev->alloc_contiguous_blks(1, hints, blkid);
+            ASSERT_EQ(status, BlkAllocStatus::SUCCESS)
+                << "failed to consume index free queue after " << consumed << " allocations";
+            status = vdev->commit_blk(blkid);
+            ASSERT_EQ(status, BlkAllocStatus::SUCCESS) << "failed to commit consumed blk " << blkid.to_string();
+            ++consumed;
+        }
+        LOGINFO("Persistently consumed {} index free blks; remaining free blks={}", consumed, vdev->available_blks());
+    }
+
+    bool prepare_target_as_next_btree_alloc(bnodeid_t target_node_id) {
+        auto* vdev = index_vdev();
+        auto const target_blkid = BlkId{target_node_id};
+
+        blk_alloc_hints hints;
+        hints.application_hint = this->m_bt->ordinal();
+
+        uint64_t consumed{0};
+        while (vdev->available_blks() > 1) {
+            BlkId blkid;
+            auto const status = vdev->alloc_contiguous_blks(1, hints, blkid);
+            if (status != BlkAllocStatus::SUCCESS) {
+                ADD_FAILURE() << "failed to consume index free queue after " << consumed << " allocations";
+                return false;
+            }
+            if (blkid == target_blkid) {
+                vdev->free_blk(blkid, nullptr /* vctx */, true /* free_now */);
+                LOGINFO("Moved target blkid {} to allocator tail after {} consumptions", target_blkid.to_string(),
+                        consumed);
+                continue;
+            }
+            ++consumed;
+        }
+
+        BlkId last_blkid;
+        auto const status = vdev->alloc_contiguous_blks(1, hints, last_blkid);
+        if (status != BlkAllocStatus::SUCCESS) {
+            ADD_FAILURE() << "failed to allocate final free blk after " << consumed << " consumptions";
+            return false;
+        }
+        if (last_blkid != target_blkid) {
+            LOGINFO("Final free blk {} did not match target blkid {}", last_blkid.to_string(),
+                    target_blkid.to_string());
+            vdev->free_blk(last_blkid, nullptr /* vctx */, true /* free_now */);
+            return false;
+        }
+
+        vdev->free_blk(last_blkid, nullptr /* vctx */, true /* free_now */);
+        LOGINFO("Prepared target blkid {} as next btree allocation candidate", target_blkid.to_string());
+        return true;
+    }
+};
+
+TEST_F(IndexCrashCreatedFreedReuseTest, CreatedAndFreedBlkReusedByRecoveryRepair) {
+    constexpr uint32_t sparse_step = 100;
+    constexpr uint32_t preload_last_key = 2000;
+    constexpr uint64_t crash_cp_free_queue_blks = 512;
+
+    LOGINFO("Step 1: preload sparse keys [0, {}] step {} and flush the baseline CP", preload_last_key, sparse_step);
+    insert_sparse_multiples(0, preload_last_key, sparse_step);
+    test_common::HSTestHelper::trigger_cp(true);
+
+    LOGINFO("Step 1b: persistently consume index free queue so crash-CP created nodes are near recovery queue head");
+
+    // simulate a case that a lot of blk is consumed by other btree node allocation. for example, we have 2 btrees. one
+    // is used for reproduce this issue(the current one) , the other is used for consuming blks(receives lots of put
+    // request and lead to a lot of  blk allocation). commit_index_free_queue_until is used to consume free blks , just
+    // like what the second btree does.
+    commit_index_free_queue_until(crash_cp_free_queue_blks);
+    update_key(0);
+    test_common::HSTestHelper::trigger_cp(true);
+
+    LOGINFO("Step 2: set existing crash flip to crash after journal persistence but before parent flush");
+    this->set_basic_flip("crash_flush_on_split_at_parent");
+
+    LOGINFO("Step 3: observe the first current-CP split node, then make that exact blkid created+freed");
+    auto const before_first_split = inspectable_bt()->collect_node_ids();
+    insert_range_skip_multiples(1, 160, sparse_step);
+    auto const after_first_split = inspectable_bt()->collect_node_ids();
+    ASSERT_NE(first_added_node_id(before_first_split, after_first_split), empty_bnodeid)
+        << "expected first current-CP split to add a node";
+
+    remove_range_skip_multiples(1, 160, sparse_step);
+    auto const after_target_merge = inspectable_bt()->collect_node_ids();
+    auto const created_freed_node_ids =
+        added_then_freed_node_ids(before_first_split, after_first_split, after_target_merge);
+    ASSERT_FALSE(created_freed_node_ids.empty()) << "expected a current-CP split node to be freed in the same CP";
+    LOGINFO("Found {} created+freed node candidates in the crash CP", created_freed_node_ids.size());
+
+    LOGINFO("Step 4: create many more split records in the same unflushed CP to force recovery repair allocations");
+    insert_range_skip_multiples(1, preload_last_key, sparse_step);
+
+    LOGINFO("Step 5: crash and recover through the normal recovery/repair path");
+    m_flush_recovery_free_list = true;
+    m_recovery_freed_node_ids.clear();
+    test_common::HSTestHelper::trigger_cp(false);
+    this->wait_for_crash_recovery(true);
+
+    bnodeid_t target_node_id{empty_bnodeid};
+    for (auto const id : created_freed_node_ids) {
+        if (m_recovery_freed_node_ids.contains(id)) {
+            target_node_id = id;
+            break;
+        }
+    }
+
+    LOGINFO("Step 6: check whether recovery incorrectly freed the live repair blk");
+    if (target_node_id != empty_bnodeid) {
+        auto const recovered_nodes = inspectable_bt()->collect_node_ids();
+        target_node_id = empty_bnodeid;
+        for (auto const id : created_freed_node_ids) {
+            if (m_recovery_freed_node_ids.contains(id) && recovered_nodes.contains(id)) {
+                target_node_id = id;
+                break;
+            }
+        }
+        ASSERT_NE(target_node_id, empty_bnodeid)
+            << "recovery freed same-CP created+freed candidates but none was reused as a live repair node";
+        ASSERT_TRUE(prepare_target_as_next_btree_alloc(target_node_id))
+            << "target blk was not available as the next btree allocation";
+
+        LOGINFO("Buggy recovery detected: live repair node {} was freed; following inserts should crash at "
+                "duplicate/shared blkid",
+                target_node_id);
+        for (auto k = 100000u; k < 100300u; ++k) {
+            insert_key(k);
+        }
+    }
+
+    LOGINFO("Step 7: fixed recovery detected; no same-CP created+freed blkid was freed, verify writes and reads");
+    for (auto k = 100000u; k < 100300u; ++k) {
+        insert_key(k);
+    }
+
+    for (auto k = 0u; k <= preload_last_key; k += sparse_step) {
+        verify_existing_range(k, k + 1);
+    }
+}
+
 #endif
 
 int main(int argc, char* argv[]) {

From 27aed5f47ba5ff4636309fdced2fd1aa67823f97 Mon Sep 17 00:00:00 2001
From: Jie Yao <jyao3@ebaychina.com>
Date: Thu, 11 Jun 2026 10:47:40 +0800
Subject: [PATCH 2/2] update

---
 src/tests/test_index_crash_recovery.cpp | 87 ++++---------------------
 1 file changed, 13 insertions(+), 74 deletions(-)

diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp
index 2ac8e0322..e8572ea34 100644
--- a/src/tests/test_index_crash_recovery.cpp
+++ b/src/tests/test_index_crash_recovery.cpp
@@ -1653,17 +1653,13 @@ struct IndexCrashCreatedFreedReuseTest : public test_common::HSTestHelper,
         return ids;
     }
 
-    VirtualDev* index_vdev() const {
+    void commit_index_free_queue_until(uint64_t remaining_blks) {
         auto const root_blk = BlkId{this->m_bt->root_node_id()};
         auto* chunk = hs()->device_mgr()->get_chunk_mutable(root_blk.chunk_num());
         RELEASE_ASSERT(chunk != nullptr, "Index chunk not found for root blkid {}", root_blk.to_string());
         auto* vdev = hs()->device_mgr()->get_vdev_mutable(chunk->vdev_id());
         RELEASE_ASSERT(vdev != nullptr, "Index vdev not found for root blkid {}", root_blk.to_string());
-        return vdev;
-    }
 
-    void commit_index_free_queue_until(uint64_t remaining_blks) {
-        auto* vdev = index_vdev();
         blk_alloc_hints hints;
         hints.application_hint = this->m_bt->ordinal();
 
@@ -1679,48 +1675,6 @@ struct IndexCrashCreatedFreedReuseTest : public test_common::HSTestHelper,
         }
         LOGINFO("Persistently consumed {} index free blks; remaining free blks={}", consumed, vdev->available_blks());
     }
-
-    bool prepare_target_as_next_btree_alloc(bnodeid_t target_node_id) {
-        auto* vdev = index_vdev();
-        auto const target_blkid = BlkId{target_node_id};
-
-        blk_alloc_hints hints;
-        hints.application_hint = this->m_bt->ordinal();
-
-        uint64_t consumed{0};
-        while (vdev->available_blks() > 1) {
-            BlkId blkid;
-            auto const status = vdev->alloc_contiguous_blks(1, hints, blkid);
-            if (status != BlkAllocStatus::SUCCESS) {
-                ADD_FAILURE() << "failed to consume index free queue after " << consumed << " allocations";
-                return false;
-            }
-            if (blkid == target_blkid) {
-                vdev->free_blk(blkid, nullptr /* vctx */, true /* free_now */);
-                LOGINFO("Moved target blkid {} to allocator tail after {} consumptions", target_blkid.to_string(),
-                        consumed);
-                continue;
-            }
-            ++consumed;
-        }
-
-        BlkId last_blkid;
-        auto const status = vdev->alloc_contiguous_blks(1, hints, last_blkid);
-        if (status != BlkAllocStatus::SUCCESS) {
-            ADD_FAILURE() << "failed to allocate final free blk after " << consumed << " consumptions";
-            return false;
-        }
-        if (last_blkid != target_blkid) {
-            LOGINFO("Final free blk {} did not match target blkid {}", last_blkid.to_string(),
-                    target_blkid.to_string());
-            vdev->free_blk(last_blkid, nullptr /* vctx */, true /* free_now */);
-            return false;
-        }
-
-        vdev->free_blk(last_blkid, nullptr /* vctx */, true /* free_now */);
-        LOGINFO("Prepared target blkid {} as next btree allocation candidate", target_blkid.to_string());
-        return true;
-    }
 };
 
 TEST_F(IndexCrashCreatedFreedReuseTest, CreatedAndFreedBlkReusedByRecoveryRepair) {
@@ -1768,35 +1722,20 @@ TEST_F(IndexCrashCreatedFreedReuseTest, CreatedAndFreedBlkReusedByRecoveryRepair
     test_common::HSTestHelper::trigger_cp(false);
     this->wait_for_crash_recovery(true);
 
-    bnodeid_t target_node_id{empty_bnodeid};
-    for (auto const id : created_freed_node_ids) {
-        if (m_recovery_freed_node_ids.contains(id)) {
-            target_node_id = id;
-            break;
-        }
+    LOGINFO("Step 6: do sanity check");
+    // 1 all the recovered nodes should not exist in m_recovery_freed_node_ids either ( should not be freed during
+    // recovery)
+    auto const recovered_nodes = inspectable_bt()->collect_node_ids();
+    for (const auto id : m_recovery_freed_node_ids) {
+        ASSERT_FALSE(recovered_nodes.contains(id))
+            << "created_freed_node " << id << " was freed and also recovered as a live node";
     }
 
-    LOGINFO("Step 6: check whether recovery incorrectly freed the live repair blk");
-    if (target_node_id != empty_bnodeid) {
-        auto const recovered_nodes = inspectable_bt()->collect_node_ids();
-        target_node_id = empty_bnodeid;
-        for (auto const id : created_freed_node_ids) {
-            if (m_recovery_freed_node_ids.contains(id) && recovered_nodes.contains(id)) {
-                target_node_id = id;
-                break;
-            }
-        }
-        ASSERT_NE(target_node_id, empty_bnodeid)
-            << "recovery freed same-CP created+freed candidates but none was reused as a live repair node";
-        ASSERT_TRUE(prepare_target_as_next_btree_alloc(target_node_id))
-            << "target blk was not available as the next btree allocation";
-
-        LOGINFO("Buggy recovery detected: live repair node {} was freed; following inserts should crash at "
-                "duplicate/shared blkid",
-                target_node_id);
-        for (auto k = 100000u; k < 100300u; ++k) {
-            insert_key(k);
-        }
+    // 2 all the created+freed blkids should not be freed during recovery ( should not appear in
+    // m_recovery_freed_node_ids)
+    for (auto const id : created_freed_node_ids) {
+        ASSERT_FALSE(m_recovery_freed_node_ids.contains(id))
+            << "created_freed_node " << id << " was freed during recovery";
     }
 
     LOGINFO("Step 7: fixed recovery detected; no same-CP created+freed blkid was freed, verify writes and reads");