diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 7a50b2a87a5305..1ebf64d99c7c6f 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -116,8 +116,13 @@ Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t s segment->_fs = fs; segment->_file_reader = std::move(file_reader); st = segment->_open(stats); - } else if (st.is() && - reader_options.cache_type == io::FileCachePolicy::FILE_BLOCK_CACHE) { + } + + // Three-tier retry for CORRUPTION errors when file cache is enabled. + // This handles CORRUPTION from both open_file() and _parse_footer() (via _open()). + if (st.is() && + reader_options.cache_type == io::FileCachePolicy::FILE_BLOCK_CACHE) { + // Tier 1: Clear file cache and retry with cache support (re-downloads from remote). LOG(WARNING) << "bad segment file may be read from file cache, try to read remote source " "file directly, file path: " << path << " cache_key: " << file_cache_key_str(path); @@ -133,6 +138,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t s } TEST_INJECTION_POINT_CALLBACK("Segment::open:corruption1", &st); if (st.is()) { // corrupt again + // Tier 2: Bypass cache entirely and read directly from remote storage. LOG(WARNING) << "failed to try to read remote source file again with cache support," << " try to read from remote directly, " << " file path: " << path << " cache_key: " << file_cache_key_str(path); @@ -146,6 +152,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t s segment->_file_reader = std::move(file_reader); st = segment->_open(stats); if (!st.ok()) { + // Tier 3: Remote source itself is corrupt. LOG(WARNING) << "failed to try to read remote source file directly," << " file path: " << path << " cache_key: " << file_cache_key_str(path); diff --git a/be/test/olap/rowset/segment_v2/segment_corruption_test.cpp b/be/test/olap/rowset/segment_v2/segment_corruption_test.cpp index c133691233231f..4fbe7d69a2b30f 100644 --- a/be/test/olap/rowset/segment_v2/segment_corruption_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_corruption_test.cpp @@ -336,6 +336,77 @@ TEST_F(SegmentCorruptionTest, TestFsSetInCorruptionRetryPath) { // If we reach here, _fs was correctly set } +// Test that CORRUPTION from _parse_footer() (after successful open_file()) triggers the +// three-tier retry logic. Before the fix, the retry was in an else-if branch that was +// only reachable when open_file() itself returned CORRUPTION, not when _parse_footer() +// did. This test verifies the fix by: +// 1. Creating a valid segment +// 2. Corrupting the footer magic number on the first parse attempt via sync point +// 3. Verifying the segment opens successfully after cache-bypass retry +TEST_F(SegmentCorruptionTest, TestFooterCorruptionTriggersRetry) { + auto schema = create_schema_with_inverted_index(); + RowsetId rowset_id; + rowset_id.init(3); + + auto path = create_segment_with_inverted_index(schema, 0, rowset_id); + auto fs = io::global_local_filesystem(); + + // Use sync point to corrupt the magic number on the first _parse_footer() call. + // This simulates reading corrupt data from file cache while the remote file is fine. + auto* sp = SyncPoint::get_instance(); + sp->enable_processing(); + + int parse_footer_count = 0; + SyncPoint::CallbackGuard guard; + sp->set_call_back( + "Segment::parse_footer:magic_number_corruption", + [&parse_footer_count](auto&& args) { + // Corrupt magic number only on the first attempt to simulate cache corruption. + // Subsequent retries (which bypass or clear cache) will read correct data. + if (parse_footer_count == 0) { + auto* buf = try_any_cast(args[0]); + // Corrupt the magic number (last 4 bytes of the 12-byte trailer) + buf[8] = 0xFF; + buf[9] = 0xFF; + buf[10] = 0xFF; + buf[11] = 0xFF; + parse_footer_count++; + } + }, + &guard); + + std::shared_ptr segment; + // Use FILE_BLOCK_CACHE to enable the corruption retry path + io::FileReaderOptions reader_options; + reader_options.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; + + auto st = Segment::open(fs, path, /*tablet_id=*/100, /*segment_id=*/0, rowset_id, schema, + reader_options, &segment); + + sp->disable_processing(); + + // Verify that the magic number was corrupted on the first attempt + ASSERT_EQ(parse_footer_count, 1) << "Footer corruption should have been injected once"; + + // The segment should open successfully after retry (tier 1: clear cache + retry) + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_NE(segment, nullptr); + + // Verify that _fs was correctly set through the retry path + OlapReaderStatistics stats; + StorageReadOptions read_options; + read_options.stats = &stats; + + auto indexes = schema->inverted_indexs(schema->column(1)); + ASSERT_FALSE(indexes.empty()); + const TabletIndex* idx_meta = indexes[0]; + std::unique_ptr iter; + st = segment->new_index_iterator(schema->column(1), idx_meta, read_options, &iter); + st = segment->_index_file_reader->init(config::inverted_index_read_buffer_size, + &read_options.io_ctx); + ASSERT_TRUE(st.ok()) << st.to_string(); +} + // Test normal segment open path TEST_F(SegmentCorruptionTest, TestFsSetInNormalPath) { auto schema = create_schema_with_inverted_index();