From 2b11a1272a305029c353e3e484404fc659d78d2e Mon Sep 17 00:00:00 2001
From: danielob
Date: Wed, 19 Oct 2022 21:42:45 +0200
Subject: [PATCH 1/4] rolling latency added

---
 bdm/allocator/BackgroundMover-inl.h | 113 +
 bdm/allocator/BackgroundMover.h | 103 +
 bdm/allocator/BackgroundMoverStrategy.h | 34 +
 bdm/allocator/CCacheAllocator.cpp | 121 +
 bdm/allocator/CCacheAllocator.h | 121 +
 bdm/allocator/CCacheManager.cpp | 90 +
 bdm/allocator/CCacheManager.h | 82 +
 bdm/allocator/CMakeLists.txt | 141 +
 bdm/allocator/Cache.cpp | 70 +
 bdm/allocator/Cache.h | 309 +
 bdm/allocator/CacheAllocator-inl.h | 4116 ++
 bdm/allocator/CacheAllocator.cpp | 26 +
 bdm/allocator/CacheAllocator.h | 2499 +
 bdm/allocator/CacheAllocatorConfig.h | 1332 +
 bdm/allocator/CacheChainedItemIterator.h | 120 +
 bdm/allocator/CacheDetails.cpp | 29 +
 bdm/allocator/CacheDetails.h | 42 +
 bdm/allocator/CacheItem-inl.h | 512 +
 bdm/allocator/CacheItem.h | 573 +
 bdm/allocator/CacheStats.cpp | 340 +
 bdm/allocator/CacheStats.h | 734 +
 bdm/allocator/CacheStatsInternal.h | 264 +
 bdm/allocator/CacheTraits.h | 59 +
 bdm/allocator/CacheVersion.h | 59 +
 bdm/allocator/ChainedAllocs.h | 108 +
 bdm/allocator/ChainedHashTable-inl.h | 624 +
 bdm/allocator/ChainedHashTable.h | 698 +
 bdm/allocator/ContainerTypes.cpp | 32 +
 bdm/allocator/FreeMemStrategy.cpp | 70 +
 bdm/allocator/FreeMemStrategy.h | 62 +
 bdm/allocator/FreeThresholdStrategy.cpp | 126 +
 bdm/allocator/FreeThresholdStrategy.h | 55 +
 bdm/allocator/Handle.h | 644 +
 bdm/allocator/HitsPerSlabStrategy.cpp | 207 +
 bdm/allocator/HitsPerSlabStrategy.h | 120 +
 bdm/allocator/ICompactCache.h | 52 +
 bdm/allocator/KAllocation.h | 156 +
 bdm/allocator/LruTailAgeStrategy.cpp | 200 +
 bdm/allocator/LruTailAgeStrategy.h | 134 +
 bdm/allocator/MM2Q-inl.h | 477 +
 bdm/allocator/MM2Q.h | 637 +
 bdm/allocator/MMLru-inl.h | 385 +
 bdm/allocator/MMLru.h | 470 +
 bdm/allocator/MMTinyLFU-inl.h | 376 +
 bdm/allocator/MMTinyLFU.h | 641 +
 .../MarginalHitsOptimizeStrategy.cpp | 153 +
 bdm/allocator/MarginalHitsOptimizeStrategy.h | 123 +
 bdm/allocator/MarginalHitsState-inl.h | 70 +
 bdm/allocator/MarginalHitsState.h | 81 +
 bdm/allocator/MarginalHitsStrategy.cpp | 109 +
 bdm/allocator/MarginalHitsStrategy.h | 96 +
 bdm/allocator/MemoryMonitor.cpp | 322 +
 bdm/allocator/MemoryMonitor.h | 334 +
 bdm/allocator/MemoryTierCacheConfig.h | 101 +
 bdm/allocator/NvmAdmissionPolicy.h | 217 +
 bdm/allocator/NvmCacheState.cpp | 193 +
 bdm/allocator/NvmCacheState.h | 112 +
 bdm/allocator/PoolOptimizeStrategy.cpp | 26 +
 bdm/allocator/PoolOptimizeStrategy.h | 90 +
 bdm/allocator/PoolOptimizer.cpp | 92 +
 bdm/allocator/PoolOptimizer.h | 85 +
 bdm/allocator/PoolRebalancer.cpp | 129 +
 bdm/allocator/PoolRebalancer.h | 94 +
 bdm/allocator/PoolResizeStrategy.h | 73 +
 bdm/allocator/PoolResizer.cpp | 100 +
 bdm/allocator/PoolResizer.h | 84 +
 bdm/allocator/PromotionStrategy.h | 88 +
 bdm/allocator/RandomStrategy.h | 70 +
 bdm/allocator/ReadOnlySharedCacheView.h | 92 +
 bdm/allocator/Reaper-inl.h | 143 +
 bdm/allocator/Reaper.h | 109 +
 bdm/allocator/RebalanceInfo.h | 185 +
 bdm/allocator/RebalanceStrategy.cpp | 288 +
 bdm/allocator/RebalanceStrategy.h | 187 +
 bdm/allocator/Refcount.h | 404 +
 bdm/allocator/SlabReleaseStats.cpp | 72 +
 bdm/allocator/SlabReleaseStats.h | 83 +
 bdm/allocator/TempShmMapping.cpp | 92 +
 bdm/allocator/TempShmMapping.h | 67 +
 bdm/allocator/TlsActiveItemRing.h | 113 +
 bdm/allocator/TypedHandle.h | 152 +
 bdm/allocator/Util.h | 200 +
 bdm/allocator/datastruct/DList-inl.h | 214 +
 bdm/allocator/datastruct/DList.h | 268 +
 bdm/allocator/datastruct/MultiDList-inl.h |
153 + bdm/allocator/datastruct/MultiDList.h | 210 + bdm/allocator/datastruct/SList-inl.h | 154 + bdm/allocator/datastruct/SList.h | 248 + .../datastruct/serialize/objects.thrift | 40 + bdm/allocator/datastruct/tests/DListTest.cpp | 156 + .../datastruct/tests/MultiDListTest.cpp | 104 + bdm/allocator/datastruct/tests/SListTest.cpp | 508 + .../datastruct/tests/test_objects.thrift | 25 + bdm/allocator/memory/AllocationClass.cpp | 747 + bdm/allocator/memory/AllocationClass.h | 511 + bdm/allocator/memory/CompressedPtr.h | 227 + bdm/allocator/memory/MemoryAllocator.cpp | 267 + bdm/allocator/memory/MemoryAllocator.h | 684 + bdm/allocator/memory/MemoryAllocatorStats.h | 113 + bdm/allocator/memory/MemoryPool.cpp | 525 + bdm/allocator/memory/MemoryPool.h | 418 + bdm/allocator/memory/MemoryPoolManager.cpp | 328 + bdm/allocator/memory/MemoryPoolManager.h | 236 + bdm/allocator/memory/Slab.cpp | 25 + bdm/allocator/memory/Slab.h | 312 + bdm/allocator/memory/SlabAllocator.cpp | 602 + bdm/allocator/memory/SlabAllocator.h | 457 + bdm/allocator/memory/serialize/objects.thrift | 74 + .../memory/tests/AllocationClassTest.cpp | 1367 + .../memory/tests/MemoryAllocatorTest.cpp | 818 + .../memory/tests/MemoryPoolManagerTest.cpp | 349 + bdm/allocator/memory/tests/MemoryPoolTest.cpp | 946 + .../memory/tests/SlabAllocatorTest.cpp | 782 + bdm/allocator/memory/tests/TestBase.cpp | 156 + bdm/allocator/memory/tests/TestBase.h | 131 + .../nvmcache/BlockCacheReinsertionPolicy.h | 41 + bdm/allocator/nvmcache/CacheApiWrapper.h | 133 + bdm/allocator/nvmcache/InFlightPuts.h | 169 + bdm/allocator/nvmcache/NavyConfig.cpp | 226 + bdm/allocator/nvmcache/NavyConfig.h | 573 + bdm/allocator/nvmcache/NavySetup.cpp | 292 + bdm/allocator/nvmcache/NavySetup.h | 39 + bdm/allocator/nvmcache/NvmCache-inl.h | 851 + bdm/allocator/nvmcache/NvmCache.h | 456 + bdm/allocator/nvmcache/NvmItem.cpp | 125 + bdm/allocator/nvmcache/NvmItem.h | 174 + bdm/allocator/nvmcache/ReqContexts.h | 143 + bdm/allocator/nvmcache/TombStones.h | 128 + bdm/allocator/nvmcache/WaitContext.h | 40 + .../nvmcache/tests/InFlightPutsTest.cpp | 159 + .../nvmcache/tests/NavyConfigTest.cpp | 376 + .../nvmcache/tests/NavySetupTest.cpp | 113 + .../nvmcache/tests/NvmCacheTests.cpp | 2525 + bdm/allocator/nvmcache/tests/NvmItemTests.cpp | 131 + bdm/allocator/nvmcache/tests/NvmTestBase.cpp | 154 + bdm/allocator/nvmcache/tests/NvmTestBase.h | 184 + .../nvmcache/tests/TombStoneTests.cpp | 125 + bdm/allocator/serialize/objects.thrift | 158 + bdm/allocator/tests/AccessTypeTest.h | 590 + bdm/allocator/tests/AllocatorHitStatsTest.h | 434 + .../tests/AllocatorHitStatsTypeTest.cpp | 35 + .../tests/AllocatorMemoryTiersTest.cpp | 35 + .../tests/AllocatorMemoryTiersTest.h | 187 + bdm/allocator/tests/AllocatorResizeTest.h | 1172 + .../tests/AllocatorResizeTypeTest.cpp | 58 + bdm/allocator/tests/AllocatorTestUtils.h | 47 + bdm/allocator/tests/AllocatorTypeTest.cpp | 535 + .../tests/AllocatorTypeTestDeathStyle.cpp | 37 + bdm/allocator/tests/BaseAllocatorTest.h | 6310 ++ .../tests/BaseAllocatorTestDeathStyle.h | 83 + bdm/allocator/tests/Cache.h | 64 + .../tests/CacheAllocatorConfigTest.cpp | 72 + .../tests/CacheAllocatorTestWrapper.h | 64 + bdm/allocator/tests/CacheBaseTest.cpp | 117 + bdm/allocator/tests/ChainedHashTest.cpp | 269 + bdm/allocator/tests/ItemHandleTest.cpp | 348 + bdm/allocator/tests/ItemTest.cpp | 206 + bdm/allocator/tests/MM2QTest.cpp | 731 + bdm/allocator/tests/MMLruTest.cpp | 545 + bdm/allocator/tests/MMTinyLFUTest.cpp | 433 + bdm/allocator/tests/MMTypeTest.h | 349 + 
bdm/allocator/tests/MarginalHitsStateTest.cpp | 130 + bdm/allocator/tests/MemoryMonitorTest.cpp | 94 + bdm/allocator/tests/MemoryTiersTest.cpp | 285 + bdm/allocator/tests/MultiAllocatorTest.cpp | 38 + bdm/allocator/tests/MultiAllocatorTest.h | 69 + .../tests/NvmAdmissionPolicyTest.cpp | 214 + bdm/allocator/tests/NvmCacheStateTest.cpp | 334 + bdm/allocator/tests/NvmTestUtils.h | 45 + .../tests/PoolOptimizeStrategyTest.cpp | 207 + bdm/allocator/tests/RebalanceStrategyTest.cpp | 623 + bdm/allocator/tests/RefCountTest.cpp | 177 + .../tests/SimplePoolOptimizationTest.cpp | 34 + .../tests/SimplePoolOptimizationTest.h | 178 + bdm/allocator/tests/SimpleRebalancingTest.cpp | 34 + bdm/allocator/tests/SimpleRebalancingTest.h | 175 + bdm/allocator/tests/TestBase-inl.h | 336 + bdm/allocator/tests/TestBase.h | 132 + bdm/benchmarks/BenchmarkUtils.h | 77 + .../BinarySearchVsHashTableBench.cpp | 249 + bdm/benchmarks/BucketMutexBench.cpp | 352 + bdm/benchmarks/BytesEqualBenchmark.cpp | 205 + bdm/benchmarks/CMakeLists.txt | 61 + .../CacheAllocatorOpsMicroBench.cpp | 564 + bdm/benchmarks/CachelibMapOperationBench.cpp | 213 + bdm/benchmarks/CachelibMapWorkloadBench.cpp | 227 + .../CachelibRangeMapWorkloadBench.cpp | 236 + bdm/benchmarks/CachelibTickerClockBench.cpp | 148 + bdm/benchmarks/CompactCacheBench.cpp | 147 + bdm/benchmarks/DataTypeBench.thrift | 27 + bdm/benchmarks/EventTrackerPerf.cpp | 139 + bdm/benchmarks/HashMapBenchmark.cpp | 219 + bdm/benchmarks/ItemsReaperBench.cpp | 158 + bdm/benchmarks/MMTypeAccessBench.cpp | 115 + bdm/benchmarks/MMTypeBench.cpp | 283 + bdm/benchmarks/MMTypeBench.h | 261 + bdm/benchmarks/MutexBench.cpp | 461 + bdm/benchmarks/PtrCompressionBench.cpp | 157 + bdm/benchmarks/SListBench.cpp | 212 + bdm/benchmarks/SmallOperationMicroBench.cpp | 287 + .../SpeedUpExistenceCheckBenchmark.cpp | 596 + .../StrictAliasingSafeReadBench.cpp | 109 + bdm/benchmarks/ThreadLocalBench.cpp | 125 + bdm/benchmarks/tl-bench/main.cpp | 174 + bdm/cachebench/CMakeLists.txt | 94 + bdm/cachebench/cache/Cache-inl.h | 849 + bdm/cachebench/cache/Cache.cpp | 34 + bdm/cachebench/cache/Cache.h | 488 + bdm/cachebench/cache/CacheStats.h | 572 + bdm/cachebench/cache/CacheValue.h | 91 + bdm/cachebench/cache/ItemRecords.h | 192 + bdm/cachebench/cache/TimeStampTicker.cpp | 121 + bdm/cachebench/cache/TimeStampTicker.h | 112 + .../cache/tests/TimeStampTickerTest.cpp | 56 + bdm/cachebench/consistency/LogEventStream.cpp | 70 + bdm/cachebench/consistency/LogEventStream.h | 56 + bdm/cachebench/consistency/RingBuffer.h | 100 + bdm/cachebench/consistency/ShortThreadId.cpp | 51 + bdm/cachebench/consistency/ShortThreadId.h | 43 + bdm/cachebench/consistency/ValueHistory.cpp | 312 + bdm/cachebench/consistency/ValueHistory.h | 201 + bdm/cachebench/consistency/ValueTracker.cpp | 99 + bdm/cachebench/consistency/ValueTracker.h | 96 + .../consistency/tests/RingBufferTest.cpp | 80 + .../consistency/tests/ShortThreadIdTest.cpp | 59 + .../consistency/tests/ValueHistoryTest.cpp | 262 + .../consistency/tests/ValueTrackerTest.cpp | 41 + bdm/cachebench/main.cpp | 159 + bdm/cachebench/runner/AsyncCacheStressor.h | 648 + bdm/cachebench/runner/CacheStressor.h | 556 + bdm/cachebench/runner/FastShutdown.cpp | 144 + bdm/cachebench/runner/FastShutdown.h | 74 + bdm/cachebench/runner/IntegrationStressor.cpp | 372 + bdm/cachebench/runner/IntegrationStressor.h | 237 + bdm/cachebench/runner/ProgressTracker.cpp | 81 + bdm/cachebench/runner/ProgressTracker.h | 47 + bdm/cachebench/runner/Runner.cpp | 84 + bdm/cachebench/runner/Runner.h | 66 + 
bdm/cachebench/runner/Stressor.cpp | 201 + bdm/cachebench/runner/Stressor.h | 116 + .../chained-items-stress-moving.json | 38 + .../chained-items-stress-no-moving.json | 38 + .../navy-with-deletes-inmem-buf.json | 43 + ...vy-with-deletes-stack-alloc-inmem-buf.json | 43 + ...ith-deletes-with-truncated-alloc-size.json | 42 + .../consistency/navy-with-deletes.json | 42 + .../navy-with-encryption-inmem-buf.json | 46 + .../consistency/navy-with-encryption.json | 45 + .../navy-with-reinsertion-inmem-buf.json | 32 + .../consistency/navy-with-reinsertion.json | 31 + .../test_configs/consistency/navy.json | 42 + .../consistency/ram-with-deletes.json | 42 + .../test_configs/consistency/ram.json | 42 + .../feature_stress/dram_cache.json | 26 + .../feature_stress/dynamic_refresh_time.json | 27 + .../feature_stress/free_list.json | 28 + .../feature_stress/item_destructor/dram.json | 35 + .../feature_stress/item_destructor/navy.json | 36 + .../item_destructor/navy_bighash.json | 36 + .../feature_stress/navy/bc_fifo.json | 31 + .../test_configs/feature_stress/navy/bh.json | 39 + .../feature_stress/navy/default.json | 44 + .../slab_release/chained_items.json | 36 + .../slab_release/chained_items_moving.json | 37 + .../feature_stress/slab_release/default.json | 28 + .../feature_stress/slab_release/moving.json | 28 + .../test_configs/hit_ratio/cdn/config.json | 31 + .../test_configs/hit_ratio/cdn/pop.json | 813 + .../test_configs/hit_ratio/cdn/sizes.json | 53634 ++++++++++++++++ .../graph_cache_follower_assocs/config.json | 31 + .../graph_cache_follower_assocs/pop.json | 1527 + .../graph_cache_follower_assocs/sizes.json | 4560 ++ .../graph_cache_follower_fbobj/config.json | 31 + .../graph_cache_follower_fbobj/pop.json | 1625 + .../graph_cache_follower_fbobj/sizes.json | 5022 ++ .../graph_cache_leader_assocs/config.json | 31 + .../graph_cache_leader_assocs/pop.json | 1133 + .../graph_cache_leader_assocs/sizes.json | 5918 ++ ...g-4GB-DRAM-4GB-PMEM-default_new_test4.json | 51 + .../config-4GB-DRAM-4GB-PMEM.json | 42 + .../config-8GB-DRAM.json | 33 + .../config-8GB-PMEM.json | 39 + .../graph_cache_leader_fbobj/config.json | 31 + .../graph_cache_leader_fbobj/pop.json | 599 + .../graph_cache_leader_fbobj/sizes.json | 5354 ++ .../hit_ratio/kvcache_reg/config.json | 30 + .../hit_ratio/kvcache_reg/pop.json | 1617 + .../hit_ratio/kvcache_reg/sizes.json | 15046 +++++ .../integration_tests/cachelib_map.json | 16 + .../integration_tests/cachelib_map_asan.json | 16 + .../integration_tests/cachelib_range_map.json | 16 + .../cachelib_range_map_asan.json | 16 + .../integration_tests/fast_shutdown.json | 17 + .../integration_tests/high_refcount.json | 16 + bdm/cachebench/test_configs/simple_test.json | 29 + .../simple_test_with_persistence.json | 35 + .../test_configs/simple_tiers_test.json | 36 + .../ssd_perf/flat_kvcache_reg/config.json | 54 + .../ssd_perf/flat_kvcache_reg/pop.json | 1617 + .../ssd_perf/flat_kvcache_reg/sizes.json | 15046 +++++ .../graph_cache_leader/assoc_pop.json | 1133 + .../graph_cache_leader/assoc_sizes.json | 5918 ++ .../ssd_perf/graph_cache_leader/config.json | 77 + .../graph_cache_leader/fbobj_pop.json | 599 + .../graph_cache_leader/fbobj_sizes.json | 5354 ++ .../ssd_perf/kvcache_l2_reg/config.json | 50 + .../ssd_perf/kvcache_l2_reg/pop.json | 1419 + .../ssd_perf/kvcache_l2_reg/sizes.json | 28804 +++++++++ .../ssd_perf/kvcache_l2_wc/config.json | 54 + .../ssd_perf/kvcache_l2_wc/pop.json | 1491 + .../ssd_perf/kvcache_l2_wc/sizes.json | 31778 +++++++++ ...ybrid_cache_get_throughput_all_misses.json | 
40 + ...block_cache_get_throughput_all_misses.json | 40 + .../ram_cache_get_throughput_all_misses.json | 32 + .../navy_small_size_class.json | 42 + .../navy_small_stacked_alloc.json | 41 + ...small_stacked_alloc_and_in_mem_buffer.json | 41 + ...ked_alloc_and_in_mem_buffer_throttled.json | 46 + ..._value_get_throughput_high_contention.json | 31 + ...t_throughput_high_contention_try_lock.json | 30 + ...me_value_get_throughput_test_20_pools.json | 31 + ...value_get_throughput_test_single_pool.json | 27 + .../simple_lone_get_throughput.json | 27 + .../simple_navy_delete_throughput.json | 27 + .../throughput/simple_online_throughput.json | 27 + .../throughput/simple_set_throughput.json | 26 + ...ple_set_throughput_with_chained_items.json | 32 + .../throughput/simple_trace_throughput.json | 16 + .../throughput/small_cache_chained_items.json | 30 + .../uniform_values_get_throughput_test.json | 31 + ...alues_get_throughput_test_single_pool.json | 26 + bdm/cachebench/util/CacheConfig.cpp | 226 + bdm/cachebench/util/CacheConfig.h | 320 + bdm/cachebench/util/Config.cpp | 234 + bdm/cachebench/util/Config.h | 303 + bdm/cachebench/util/Exceptions.h | 31 + bdm/cachebench/util/JSONConfig.h | 156 + bdm/cachebench/util/NandWrites.cpp | 445 + bdm/cachebench/util/NandWrites.h | 70 + bdm/cachebench/util/Parallel.h | 72 + bdm/cachebench/util/Request.h | 128 + .../util/tests/MemoryTierConfigTest.cpp | 86 + bdm/cachebench/util/tests/NandWritesTest.cpp | 539 + bdm/cachebench/vizualize/extract_latency.sh | 146 + bdm/cachebench/vizualize/gnuplot_latency.plt | 54 + .../workload/AmplifiedReplayGenerator.h | 175 + bdm/cachebench/workload/FastDiscrete.h | 170 + bdm/cachebench/workload/GeneratorBase.h | 87 + bdm/cachebench/workload/OnlineGenerator.cpp | 154 + bdm/cachebench/workload/OnlineGenerator.h | 102 + bdm/cachebench/workload/PieceWiseCache.cpp | 584 + bdm/cachebench/workload/PieceWiseCache.h | 356 + .../workload/PieceWiseReplayGenerator.cpp | 293 + .../workload/PieceWiseReplayGenerator.h | 188 + bdm/cachebench/workload/ReplayGenerator.h | 87 + bdm/cachebench/workload/ReplayGeneratorBase.h | 95 + .../workload/WorkloadDistribution.h | 141 + bdm/cachebench/workload/WorkloadGenerator.cpp | 188 + bdm/cachebench/workload/WorkloadGenerator.h | 75 + .../workload/tests/PieceWiseCacheTest.cpp | 340 + .../workload/tests/WorkloadGeneratorTest.cpp | 104 + 361 files changed, 269492 insertions(+) create mode 100644 bdm/allocator/BackgroundMover-inl.h create mode 100644 bdm/allocator/BackgroundMover.h create mode 100644 bdm/allocator/BackgroundMoverStrategy.h create mode 100644 bdm/allocator/CCacheAllocator.cpp create mode 100644 bdm/allocator/CCacheAllocator.h create mode 100644 bdm/allocator/CCacheManager.cpp create mode 100644 bdm/allocator/CCacheManager.h create mode 100644 bdm/allocator/CMakeLists.txt create mode 100644 bdm/allocator/Cache.cpp create mode 100644 bdm/allocator/Cache.h create mode 100644 bdm/allocator/CacheAllocator-inl.h create mode 100644 bdm/allocator/CacheAllocator.cpp create mode 100644 bdm/allocator/CacheAllocator.h create mode 100644 bdm/allocator/CacheAllocatorConfig.h create mode 100644 bdm/allocator/CacheChainedItemIterator.h create mode 100644 bdm/allocator/CacheDetails.cpp create mode 100644 bdm/allocator/CacheDetails.h create mode 100644 bdm/allocator/CacheItem-inl.h create mode 100644 bdm/allocator/CacheItem.h create mode 100644 bdm/allocator/CacheStats.cpp create mode 100644 bdm/allocator/CacheStats.h create mode 100644 bdm/allocator/CacheStatsInternal.h create mode 100644 
bdm/allocator/CacheTraits.h create mode 100644 bdm/allocator/CacheVersion.h create mode 100644 bdm/allocator/ChainedAllocs.h create mode 100644 bdm/allocator/ChainedHashTable-inl.h create mode 100644 bdm/allocator/ChainedHashTable.h create mode 100644 bdm/allocator/ContainerTypes.cpp create mode 100644 bdm/allocator/FreeMemStrategy.cpp create mode 100644 bdm/allocator/FreeMemStrategy.h create mode 100644 bdm/allocator/FreeThresholdStrategy.cpp create mode 100644 bdm/allocator/FreeThresholdStrategy.h create mode 100644 bdm/allocator/Handle.h create mode 100644 bdm/allocator/HitsPerSlabStrategy.cpp create mode 100644 bdm/allocator/HitsPerSlabStrategy.h create mode 100644 bdm/allocator/ICompactCache.h create mode 100644 bdm/allocator/KAllocation.h create mode 100644 bdm/allocator/LruTailAgeStrategy.cpp create mode 100644 bdm/allocator/LruTailAgeStrategy.h create mode 100644 bdm/allocator/MM2Q-inl.h create mode 100644 bdm/allocator/MM2Q.h create mode 100644 bdm/allocator/MMLru-inl.h create mode 100644 bdm/allocator/MMLru.h create mode 100644 bdm/allocator/MMTinyLFU-inl.h create mode 100644 bdm/allocator/MMTinyLFU.h create mode 100644 bdm/allocator/MarginalHitsOptimizeStrategy.cpp create mode 100644 bdm/allocator/MarginalHitsOptimizeStrategy.h create mode 100644 bdm/allocator/MarginalHitsState-inl.h create mode 100644 bdm/allocator/MarginalHitsState.h create mode 100644 bdm/allocator/MarginalHitsStrategy.cpp create mode 100644 bdm/allocator/MarginalHitsStrategy.h create mode 100644 bdm/allocator/MemoryMonitor.cpp create mode 100644 bdm/allocator/MemoryMonitor.h create mode 100644 bdm/allocator/MemoryTierCacheConfig.h create mode 100644 bdm/allocator/NvmAdmissionPolicy.h create mode 100644 bdm/allocator/NvmCacheState.cpp create mode 100644 bdm/allocator/NvmCacheState.h create mode 100644 bdm/allocator/PoolOptimizeStrategy.cpp create mode 100644 bdm/allocator/PoolOptimizeStrategy.h create mode 100644 bdm/allocator/PoolOptimizer.cpp create mode 100644 bdm/allocator/PoolOptimizer.h create mode 100644 bdm/allocator/PoolRebalancer.cpp create mode 100644 bdm/allocator/PoolRebalancer.h create mode 100644 bdm/allocator/PoolResizeStrategy.h create mode 100644 bdm/allocator/PoolResizer.cpp create mode 100644 bdm/allocator/PoolResizer.h create mode 100644 bdm/allocator/PromotionStrategy.h create mode 100644 bdm/allocator/RandomStrategy.h create mode 100644 bdm/allocator/ReadOnlySharedCacheView.h create mode 100644 bdm/allocator/Reaper-inl.h create mode 100644 bdm/allocator/Reaper.h create mode 100644 bdm/allocator/RebalanceInfo.h create mode 100644 bdm/allocator/RebalanceStrategy.cpp create mode 100644 bdm/allocator/RebalanceStrategy.h create mode 100644 bdm/allocator/Refcount.h create mode 100644 bdm/allocator/SlabReleaseStats.cpp create mode 100644 bdm/allocator/SlabReleaseStats.h create mode 100644 bdm/allocator/TempShmMapping.cpp create mode 100644 bdm/allocator/TempShmMapping.h create mode 100644 bdm/allocator/TlsActiveItemRing.h create mode 100644 bdm/allocator/TypedHandle.h create mode 100644 bdm/allocator/Util.h create mode 100644 bdm/allocator/datastruct/DList-inl.h create mode 100644 bdm/allocator/datastruct/DList.h create mode 100644 bdm/allocator/datastruct/MultiDList-inl.h create mode 100644 bdm/allocator/datastruct/MultiDList.h create mode 100644 bdm/allocator/datastruct/SList-inl.h create mode 100644 bdm/allocator/datastruct/SList.h create mode 100644 bdm/allocator/datastruct/serialize/objects.thrift create mode 100644 bdm/allocator/datastruct/tests/DListTest.cpp create mode 100644 
bdm/allocator/datastruct/tests/MultiDListTest.cpp create mode 100644 bdm/allocator/datastruct/tests/SListTest.cpp create mode 100644 bdm/allocator/datastruct/tests/test_objects.thrift create mode 100644 bdm/allocator/memory/AllocationClass.cpp create mode 100644 bdm/allocator/memory/AllocationClass.h create mode 100644 bdm/allocator/memory/CompressedPtr.h create mode 100644 bdm/allocator/memory/MemoryAllocator.cpp create mode 100644 bdm/allocator/memory/MemoryAllocator.h create mode 100644 bdm/allocator/memory/MemoryAllocatorStats.h create mode 100644 bdm/allocator/memory/MemoryPool.cpp create mode 100644 bdm/allocator/memory/MemoryPool.h create mode 100644 bdm/allocator/memory/MemoryPoolManager.cpp create mode 100644 bdm/allocator/memory/MemoryPoolManager.h create mode 100644 bdm/allocator/memory/Slab.cpp create mode 100644 bdm/allocator/memory/Slab.h create mode 100644 bdm/allocator/memory/SlabAllocator.cpp create mode 100644 bdm/allocator/memory/SlabAllocator.h create mode 100644 bdm/allocator/memory/serialize/objects.thrift create mode 100644 bdm/allocator/memory/tests/AllocationClassTest.cpp create mode 100644 bdm/allocator/memory/tests/MemoryAllocatorTest.cpp create mode 100644 bdm/allocator/memory/tests/MemoryPoolManagerTest.cpp create mode 100644 bdm/allocator/memory/tests/MemoryPoolTest.cpp create mode 100644 bdm/allocator/memory/tests/SlabAllocatorTest.cpp create mode 100644 bdm/allocator/memory/tests/TestBase.cpp create mode 100644 bdm/allocator/memory/tests/TestBase.h create mode 100644 bdm/allocator/nvmcache/BlockCacheReinsertionPolicy.h create mode 100644 bdm/allocator/nvmcache/CacheApiWrapper.h create mode 100644 bdm/allocator/nvmcache/InFlightPuts.h create mode 100644 bdm/allocator/nvmcache/NavyConfig.cpp create mode 100644 bdm/allocator/nvmcache/NavyConfig.h create mode 100644 bdm/allocator/nvmcache/NavySetup.cpp create mode 100644 bdm/allocator/nvmcache/NavySetup.h create mode 100644 bdm/allocator/nvmcache/NvmCache-inl.h create mode 100644 bdm/allocator/nvmcache/NvmCache.h create mode 100644 bdm/allocator/nvmcache/NvmItem.cpp create mode 100644 bdm/allocator/nvmcache/NvmItem.h create mode 100644 bdm/allocator/nvmcache/ReqContexts.h create mode 100644 bdm/allocator/nvmcache/TombStones.h create mode 100644 bdm/allocator/nvmcache/WaitContext.h create mode 100644 bdm/allocator/nvmcache/tests/InFlightPutsTest.cpp create mode 100644 bdm/allocator/nvmcache/tests/NavyConfigTest.cpp create mode 100644 bdm/allocator/nvmcache/tests/NavySetupTest.cpp create mode 100644 bdm/allocator/nvmcache/tests/NvmCacheTests.cpp create mode 100644 bdm/allocator/nvmcache/tests/NvmItemTests.cpp create mode 100644 bdm/allocator/nvmcache/tests/NvmTestBase.cpp create mode 100644 bdm/allocator/nvmcache/tests/NvmTestBase.h create mode 100644 bdm/allocator/nvmcache/tests/TombStoneTests.cpp create mode 100644 bdm/allocator/serialize/objects.thrift create mode 100644 bdm/allocator/tests/AccessTypeTest.h create mode 100644 bdm/allocator/tests/AllocatorHitStatsTest.h create mode 100644 bdm/allocator/tests/AllocatorHitStatsTypeTest.cpp create mode 100644 bdm/allocator/tests/AllocatorMemoryTiersTest.cpp create mode 100644 bdm/allocator/tests/AllocatorMemoryTiersTest.h create mode 100644 bdm/allocator/tests/AllocatorResizeTest.h create mode 100644 bdm/allocator/tests/AllocatorResizeTypeTest.cpp create mode 100644 bdm/allocator/tests/AllocatorTestUtils.h create mode 100644 bdm/allocator/tests/AllocatorTypeTest.cpp create mode 100644 bdm/allocator/tests/AllocatorTypeTestDeathStyle.cpp create mode 100644 
bdm/allocator/tests/BaseAllocatorTest.h create mode 100644 bdm/allocator/tests/BaseAllocatorTestDeathStyle.h create mode 100644 bdm/allocator/tests/Cache.h create mode 100644 bdm/allocator/tests/CacheAllocatorConfigTest.cpp create mode 100644 bdm/allocator/tests/CacheAllocatorTestWrapper.h create mode 100644 bdm/allocator/tests/CacheBaseTest.cpp create mode 100644 bdm/allocator/tests/ChainedHashTest.cpp create mode 100644 bdm/allocator/tests/ItemHandleTest.cpp create mode 100644 bdm/allocator/tests/ItemTest.cpp create mode 100644 bdm/allocator/tests/MM2QTest.cpp create mode 100644 bdm/allocator/tests/MMLruTest.cpp create mode 100644 bdm/allocator/tests/MMTinyLFUTest.cpp create mode 100644 bdm/allocator/tests/MMTypeTest.h create mode 100644 bdm/allocator/tests/MarginalHitsStateTest.cpp create mode 100644 bdm/allocator/tests/MemoryMonitorTest.cpp create mode 100644 bdm/allocator/tests/MemoryTiersTest.cpp create mode 100644 bdm/allocator/tests/MultiAllocatorTest.cpp create mode 100644 bdm/allocator/tests/MultiAllocatorTest.h create mode 100644 bdm/allocator/tests/NvmAdmissionPolicyTest.cpp create mode 100644 bdm/allocator/tests/NvmCacheStateTest.cpp create mode 100644 bdm/allocator/tests/NvmTestUtils.h create mode 100644 bdm/allocator/tests/PoolOptimizeStrategyTest.cpp create mode 100644 bdm/allocator/tests/RebalanceStrategyTest.cpp create mode 100644 bdm/allocator/tests/RefCountTest.cpp create mode 100644 bdm/allocator/tests/SimplePoolOptimizationTest.cpp create mode 100644 bdm/allocator/tests/SimplePoolOptimizationTest.h create mode 100644 bdm/allocator/tests/SimpleRebalancingTest.cpp create mode 100644 bdm/allocator/tests/SimpleRebalancingTest.h create mode 100644 bdm/allocator/tests/TestBase-inl.h create mode 100644 bdm/allocator/tests/TestBase.h create mode 100644 bdm/benchmarks/BenchmarkUtils.h create mode 100644 bdm/benchmarks/BinarySearchVsHashTableBench.cpp create mode 100644 bdm/benchmarks/BucketMutexBench.cpp create mode 100644 bdm/benchmarks/BytesEqualBenchmark.cpp create mode 100644 bdm/benchmarks/CMakeLists.txt create mode 100644 bdm/benchmarks/CacheAllocatorOpsMicroBench.cpp create mode 100644 bdm/benchmarks/CachelibMapOperationBench.cpp create mode 100644 bdm/benchmarks/CachelibMapWorkloadBench.cpp create mode 100644 bdm/benchmarks/CachelibRangeMapWorkloadBench.cpp create mode 100644 bdm/benchmarks/CachelibTickerClockBench.cpp create mode 100644 bdm/benchmarks/CompactCacheBench.cpp create mode 100644 bdm/benchmarks/DataTypeBench.thrift create mode 100644 bdm/benchmarks/EventTrackerPerf.cpp create mode 100644 bdm/benchmarks/HashMapBenchmark.cpp create mode 100644 bdm/benchmarks/ItemsReaperBench.cpp create mode 100644 bdm/benchmarks/MMTypeAccessBench.cpp create mode 100644 bdm/benchmarks/MMTypeBench.cpp create mode 100644 bdm/benchmarks/MMTypeBench.h create mode 100644 bdm/benchmarks/MutexBench.cpp create mode 100644 bdm/benchmarks/PtrCompressionBench.cpp create mode 100644 bdm/benchmarks/SListBench.cpp create mode 100644 bdm/benchmarks/SmallOperationMicroBench.cpp create mode 100644 bdm/benchmarks/SpeedUpExistenceCheckBenchmark.cpp create mode 100644 bdm/benchmarks/StrictAliasingSafeReadBench.cpp create mode 100644 bdm/benchmarks/ThreadLocalBench.cpp create mode 100644 bdm/benchmarks/tl-bench/main.cpp create mode 100644 bdm/cachebench/CMakeLists.txt create mode 100644 bdm/cachebench/cache/Cache-inl.h create mode 100644 bdm/cachebench/cache/Cache.cpp create mode 100644 bdm/cachebench/cache/Cache.h create mode 100644 bdm/cachebench/cache/CacheStats.h create mode 100644 
bdm/cachebench/cache/CacheValue.h create mode 100644 bdm/cachebench/cache/ItemRecords.h create mode 100644 bdm/cachebench/cache/TimeStampTicker.cpp create mode 100644 bdm/cachebench/cache/TimeStampTicker.h create mode 100644 bdm/cachebench/cache/tests/TimeStampTickerTest.cpp create mode 100644 bdm/cachebench/consistency/LogEventStream.cpp create mode 100644 bdm/cachebench/consistency/LogEventStream.h create mode 100644 bdm/cachebench/consistency/RingBuffer.h create mode 100644 bdm/cachebench/consistency/ShortThreadId.cpp create mode 100644 bdm/cachebench/consistency/ShortThreadId.h create mode 100644 bdm/cachebench/consistency/ValueHistory.cpp create mode 100644 bdm/cachebench/consistency/ValueHistory.h create mode 100644 bdm/cachebench/consistency/ValueTracker.cpp create mode 100644 bdm/cachebench/consistency/ValueTracker.h create mode 100644 bdm/cachebench/consistency/tests/RingBufferTest.cpp create mode 100644 bdm/cachebench/consistency/tests/ShortThreadIdTest.cpp create mode 100644 bdm/cachebench/consistency/tests/ValueHistoryTest.cpp create mode 100644 bdm/cachebench/consistency/tests/ValueTrackerTest.cpp create mode 100644 bdm/cachebench/main.cpp create mode 100644 bdm/cachebench/runner/AsyncCacheStressor.h create mode 100644 bdm/cachebench/runner/CacheStressor.h create mode 100644 bdm/cachebench/runner/FastShutdown.cpp create mode 100644 bdm/cachebench/runner/FastShutdown.h create mode 100644 bdm/cachebench/runner/IntegrationStressor.cpp create mode 100644 bdm/cachebench/runner/IntegrationStressor.h create mode 100644 bdm/cachebench/runner/ProgressTracker.cpp create mode 100644 bdm/cachebench/runner/ProgressTracker.h create mode 100644 bdm/cachebench/runner/Runner.cpp create mode 100644 bdm/cachebench/runner/Runner.h create mode 100644 bdm/cachebench/runner/Stressor.cpp create mode 100644 bdm/cachebench/runner/Stressor.h create mode 100644 bdm/cachebench/test_configs/consistency/chained-items-stress-moving.json create mode 100644 bdm/cachebench/test_configs/consistency/chained-items-stress-no-moving.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-deletes-inmem-buf.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-deletes-stack-alloc-inmem-buf.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-deletes-with-truncated-alloc-size.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-deletes.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-encryption-inmem-buf.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-encryption.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-reinsertion-inmem-buf.json create mode 100644 bdm/cachebench/test_configs/consistency/navy-with-reinsertion.json create mode 100644 bdm/cachebench/test_configs/consistency/navy.json create mode 100644 bdm/cachebench/test_configs/consistency/ram-with-deletes.json create mode 100644 bdm/cachebench/test_configs/consistency/ram.json create mode 100644 bdm/cachebench/test_configs/feature_stress/dram_cache.json create mode 100644 bdm/cachebench/test_configs/feature_stress/dynamic_refresh_time.json create mode 100644 bdm/cachebench/test_configs/feature_stress/free_list.json create mode 100644 bdm/cachebench/test_configs/feature_stress/item_destructor/dram.json create mode 100644 bdm/cachebench/test_configs/feature_stress/item_destructor/navy.json create mode 100644 bdm/cachebench/test_configs/feature_stress/item_destructor/navy_bighash.json create mode 100644 
bdm/cachebench/test_configs/feature_stress/navy/bc_fifo.json create mode 100644 bdm/cachebench/test_configs/feature_stress/navy/bh.json create mode 100644 bdm/cachebench/test_configs/feature_stress/navy/default.json create mode 100644 bdm/cachebench/test_configs/feature_stress/slab_release/chained_items.json create mode 100644 bdm/cachebench/test_configs/feature_stress/slab_release/chained_items_moving.json create mode 100644 bdm/cachebench/test_configs/feature_stress/slab_release/default.json create mode 100644 bdm/cachebench/test_configs/feature_stress/slab_release/moving.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/cdn/config.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/cdn/pop.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/cdn/sizes.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_follower_assocs/config.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_follower_assocs/pop.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_follower_assocs/sizes.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_follower_fbobj/config.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_follower_fbobj/pop.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_follower_fbobj/sizes.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_assocs/config.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_assocs/pop.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_assocs/sizes.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM-default_new_test4.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/pop.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/sizes.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/kvcache_reg/config.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/kvcache_reg/pop.json create mode 100644 bdm/cachebench/test_configs/hit_ratio/kvcache_reg/sizes.json create mode 100644 bdm/cachebench/test_configs/integration_tests/cachelib_map.json create mode 100644 bdm/cachebench/test_configs/integration_tests/cachelib_map_asan.json create mode 100644 bdm/cachebench/test_configs/integration_tests/cachelib_range_map.json create mode 100644 bdm/cachebench/test_configs/integration_tests/cachelib_range_map_asan.json create mode 100644 bdm/cachebench/test_configs/integration_tests/fast_shutdown.json create mode 100644 bdm/cachebench/test_configs/integration_tests/high_refcount.json create mode 100644 bdm/cachebench/test_configs/simple_test.json create mode 100644 bdm/cachebench/test_configs/simple_test_with_persistence.json create mode 100644 bdm/cachebench/test_configs/simple_tiers_test.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/flat_kvcache_reg/config.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/flat_kvcache_reg/pop.json create mode 100644 
bdm/cachebench/test_configs/ssd_perf/flat_kvcache_reg/sizes.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/graph_cache_leader/assoc_pop.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/graph_cache_leader/assoc_sizes.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/graph_cache_leader/config.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/graph_cache_leader/fbobj_pop.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/graph_cache_leader/fbobj_sizes.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/kvcache_l2_reg/config.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/kvcache_l2_reg/pop.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/kvcache_l2_reg/sizes.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/kvcache_l2_wc/config.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/kvcache_l2_wc/pop.json create mode 100644 bdm/cachebench/test_configs/ssd_perf/kvcache_l2_wc/sizes.json create mode 100644 bdm/cachebench/test_configs/throughput/miss-workload/hybrid_cache_get_throughput_all_misses.json create mode 100644 bdm/cachebench/test_configs/throughput/miss-workload/hybrid_cache_only_block_cache_get_throughput_all_misses.json create mode 100644 bdm/cachebench/test_configs/throughput/miss-workload/ram_cache_get_throughput_all_misses.json create mode 100644 bdm/cachebench/test_configs/throughput/navy_block_cache/navy_small_size_class.json create mode 100644 bdm/cachebench/test_configs/throughput/navy_block_cache/navy_small_stacked_alloc.json create mode 100644 bdm/cachebench/test_configs/throughput/navy_block_cache/navy_small_stacked_alloc_and_in_mem_buffer.json create mode 100644 bdm/cachebench/test_configs/throughput/navy_block_cache/navy_small_stacked_alloc_and_in_mem_buffer_throttled.json create mode 100644 bdm/cachebench/test_configs/throughput/same_value_get_throughput_high_contention.json create mode 100644 bdm/cachebench/test_configs/throughput/same_value_get_throughput_high_contention_try_lock.json create mode 100644 bdm/cachebench/test_configs/throughput/same_value_get_throughput_test_20_pools.json create mode 100644 bdm/cachebench/test_configs/throughput/same_value_get_throughput_test_single_pool.json create mode 100644 bdm/cachebench/test_configs/throughput/simple_lone_get_throughput.json create mode 100644 bdm/cachebench/test_configs/throughput/simple_navy_delete_throughput.json create mode 100644 bdm/cachebench/test_configs/throughput/simple_online_throughput.json create mode 100644 bdm/cachebench/test_configs/throughput/simple_set_throughput.json create mode 100644 bdm/cachebench/test_configs/throughput/simple_set_throughput_with_chained_items.json create mode 100644 bdm/cachebench/test_configs/throughput/simple_trace_throughput.json create mode 100644 bdm/cachebench/test_configs/throughput/small_cache_chained_items.json create mode 100644 bdm/cachebench/test_configs/throughput/uniform_values_get_throughput_test.json create mode 100644 bdm/cachebench/test_configs/throughput/uniform_values_get_throughput_test_single_pool.json create mode 100644 bdm/cachebench/util/CacheConfig.cpp create mode 100644 bdm/cachebench/util/CacheConfig.h create mode 100644 bdm/cachebench/util/Config.cpp create mode 100644 bdm/cachebench/util/Config.h create mode 100644 bdm/cachebench/util/Exceptions.h create mode 100644 bdm/cachebench/util/JSONConfig.h create mode 100644 bdm/cachebench/util/NandWrites.cpp create mode 100644 bdm/cachebench/util/NandWrites.h create mode 100644 
bdm/cachebench/util/Parallel.h create mode 100644 bdm/cachebench/util/Request.h create mode 100644 bdm/cachebench/util/tests/MemoryTierConfigTest.cpp create mode 100644 bdm/cachebench/util/tests/NandWritesTest.cpp create mode 100644 bdm/cachebench/vizualize/extract_latency.sh create mode 100644 bdm/cachebench/vizualize/gnuplot_latency.plt create mode 100644 bdm/cachebench/workload/AmplifiedReplayGenerator.h create mode 100644 bdm/cachebench/workload/FastDiscrete.h create mode 100644 bdm/cachebench/workload/GeneratorBase.h create mode 100644 bdm/cachebench/workload/OnlineGenerator.cpp create mode 100644 bdm/cachebench/workload/OnlineGenerator.h create mode 100644 bdm/cachebench/workload/PieceWiseCache.cpp create mode 100644 bdm/cachebench/workload/PieceWiseCache.h create mode 100644 bdm/cachebench/workload/PieceWiseReplayGenerator.cpp create mode 100644 bdm/cachebench/workload/PieceWiseReplayGenerator.h create mode 100644 bdm/cachebench/workload/ReplayGenerator.h create mode 100644 bdm/cachebench/workload/ReplayGeneratorBase.h create mode 100644 bdm/cachebench/workload/WorkloadDistribution.h create mode 100644 bdm/cachebench/workload/WorkloadGenerator.cpp create mode 100644 bdm/cachebench/workload/WorkloadGenerator.h create mode 100644 bdm/cachebench/workload/tests/PieceWiseCacheTest.cpp create mode 100644 bdm/cachebench/workload/tests/WorkloadGeneratorTest.cpp diff --git a/bdm/allocator/BackgroundMover-inl.h b/bdm/allocator/BackgroundMover-inl.h new file mode 100644 index 0000000000..d65e732202 --- /dev/null +++ b/bdm/allocator/BackgroundMover-inl.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +namespace facebook { +namespace cachelib { + +template +BackgroundMover::BackgroundMover( + Cache& cache, + std::shared_ptr strategy, + MoverDir direction) + : cache_(cache), strategy_(strategy), direction_(direction) { + if (direction_ == MoverDir::Evict) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndEvictItems; + + } else if (direction_ == MoverDir::Promote) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndPromoteItems; + } +} + +template +BackgroundMover::~BackgroundMover() { + stop(std::chrono::seconds(0)); +} + +template +void BackgroundMover::work() { + try { + checkAndRun(); + } catch (const std::exception& ex) { + XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what()); + } +} + +template +void BackgroundMover::setAssignedMemory( + std::vector>&& assignedMemory) { + XLOG(INFO, "Class assigned to background worker:"); + for (auto [tid, pid, cid] : assignedMemory) { + XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid); + } + + mutex.lock_combine([this, &assignedMemory] { + this->assignedMemory_ = std::move(assignedMemory); + }); +} + +// Look for classes that exceed the target memory capacity +// and return those for eviction +template +void BackgroundMover::checkAndRun() { + auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; }); + + unsigned int moves = 0; + std::set classes{}; + auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); + + for (size_t i = 0; i < batches.size(); i++) { + const auto [tid, pid, cid] = assignedMemory[i]; + const auto batch = batches[i]; + + classes.insert(cid); + const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats(); + + if (!batch) { + continue; + } + + // try moving BATCH items from the class in order to reach free target + auto moved = moverFunc(cache_, tid, pid, cid, batch); + moves += moved; + moves_per_class_[tid][pid][cid] += moved; + totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize); + } + + numTraversals.inc(); + numMovedItems.add(moves); + totalClasses.add(classes.size()); +} + +template +BackgroundMoverStats BackgroundMover::getStats() const noexcept { + BackgroundMoverStats stats; + stats.numMovedItems = numMovedItems.get(); + stats.runCount = numTraversals.get(); + stats.totalBytesMoved = totalBytesMoved.get(); + stats.totalClasses = totalClasses.get(); + stats.strategyStats = strategy_->getStats(); + + return stats; +} + +template +std::map>> +BackgroundMover::getClassStats() const noexcept { + return moves_per_class_; +} + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/BackgroundMover.h b/bdm/allocator/BackgroundMover.h new file mode 100644 index 0000000000..5538561e11 --- /dev/null +++ b/bdm/allocator/BackgroundMover.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
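The checkAndRun() loop above asks the strategy for one batch size per assigned (tid, pid, cid) tuple, skips classes whose batch is zero, and accumulates per-class move counts and totals. The following is a minimal standalone sketch of that control flow, not part of the patch; MemoryDescriptor, FixedBatchStrategy, and fakeEvict are hypothetical stand-ins for the cachelib types.

// Standalone sketch of the BackgroundMover::checkAndRun() control flow.
// All types here are simplified stand-ins, not the real cachelib classes.
#include <cstddef>
#include <iostream>
#include <map>
#include <tuple>
#include <vector>

using MemoryDescriptor = std::tuple<unsigned, unsigned, unsigned>; // (tid, pid, cid)

// Stand-in for BackgroundMoverStrategy: one batch size per assigned class.
struct FixedBatchStrategy {
  size_t batch;
  std::vector<size_t> calculateBatchSizes(
      const std::vector<MemoryDescriptor>& acVec) const {
    return std::vector<size_t>(acVec.size(), batch);
  }
};

// Stand-in for the traverseAndEvictItems call made through the API wrapper.
size_t fakeEvict(unsigned /*tid*/, unsigned /*pid*/, unsigned /*cid*/, size_t batch) {
  return batch; // pretend every requested item was evicted
}

int main() {
  std::vector<MemoryDescriptor> assigned{{0, 0, 1}, {0, 0, 2}, {1, 0, 1}};
  FixedBatchStrategy strategy{10};

  auto batches = strategy.calculateBatchSizes(assigned);
  size_t moves = 0;
  std::map<unsigned, size_t> movesPerClass;
  for (size_t i = 0; i < batches.size(); ++i) {
    const auto [tid, pid, cid] = assigned[i];
    if (batches[i] == 0) {
      continue; // nothing to do for this class in this traversal
    }
    const size_t moved = fakeEvict(tid, pid, cid, batches[i]);
    moves += moved;
    movesPerClass[cid] += moved;
  }
  std::cout << "moved " << moves << " items across " << movesPerClass.size()
            << " classes\n";
}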
+ */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/CacheStats.h" +#include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/PeriodicWorker.h" + +namespace facebook { +namespace cachelib { + +// wrapper that exposes the private APIs of CacheType that are specifically +// needed for the cache api +template +struct BackgroundMoverAPIWrapper { + static size_t traverseAndEvictItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndEvictItems(tid, pid, cid, batch); + } + + static size_t traverseAndPromoteItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndPromoteItems(tid, pid, cid, batch); + } +}; + +enum class MoverDir { Evict = 0, Promote }; + +// Periodic worker that evicts items from tiers in batches +// The primary aim is to reduce insertion times for new items in the +// cache +template +class BackgroundMover : public PeriodicWorker { + public: + using Cache = CacheT; + // @param cache the cache interface + // @param strategy the stragey class that defines how objects are + // moved, + // (promoted vs. evicted and how much) + BackgroundMover(Cache& cache, + std::shared_ptr strategy, + MoverDir direction_); + + ~BackgroundMover() override; + + BackgroundMoverStats getStats() const noexcept; + std::map>> + getClassStats() const noexcept; + + void setAssignedMemory( + std::vector>&& assignedMemory); + + private: + std::map>> + moves_per_class_; + // cache allocator's interface for evicting + using Item = typename Cache::Item; + + Cache& cache_; + std::shared_ptr strategy_; + MoverDir direction_; + + std::function + moverFunc; + + // implements the actual logic of running the background evictor + void work() override final; + void checkAndRun(); + + AtomicCounter numMovedItems{0}; + AtomicCounter numTraversals{0}; + AtomicCounter totalClasses{0}; + AtomicCounter totalBytesMoved{0}; + + std::vector> assignedMemory_; + folly::DistributedMutex mutex; +}; +} // namespace cachelib +} // namespace facebook + +#include "cachelib/allocator/BackgroundMover-inl.h" diff --git a/bdm/allocator/BackgroundMoverStrategy.h b/bdm/allocator/BackgroundMoverStrategy.h new file mode 100644 index 0000000000..1d586277ab --- /dev/null +++ b/bdm/allocator/BackgroundMoverStrategy.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/Cache.h" + +namespace facebook { +namespace cachelib { + +// Base class for background eviction strategy. 
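The header above binds the mover callback once at construction time: MoverDir selects either traverseAndEvictItems or traverseAndPromoteItems, both exposed through the static BackgroundMoverAPIWrapper struct. Below is a small self-contained sketch of that dispatch pattern using assumed toy names (ToyCache, ToyCacheApi, Direction, selectMover); it is illustrative only and does not call the real cachelib API. The strategy interface that supplies the batch sizes follows next.

// Sketch of the direction-based callback selection used by BackgroundMover.
#include <cstddef>
#include <functional>
#include <iostream>

struct ToyCache {
  size_t evict(size_t batch) { return batch; }       // pretend eviction
  size_t promote(size_t batch) { return batch / 2; } // pretend promotion
};

// Analogous to BackgroundMoverAPIWrapper: static forwarding functions.
struct ToyCacheApi {
  static size_t evictItems(ToyCache& c, size_t batch) { return c.evict(batch); }
  static size_t promoteItems(ToyCache& c, size_t batch) { return c.promote(batch); }
};

enum class Direction { Evict, Promote };

std::function<size_t(ToyCache&, size_t)> selectMover(Direction d) {
  // Bind the callback once, as the BackgroundMover constructor does.
  if (d == Direction::Evict) {
    return &ToyCacheApi::evictItems;
  }
  return &ToyCacheApi::promoteItems;
}

int main() {
  ToyCache cache;
  auto mover = selectMover(Direction::Evict);
  std::cout << "moved " << mover(cache, 8) << " items\n";
}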
+class BackgroundMoverStrategy { + public: + virtual std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector> acVec) = 0; + virtual BackgroundStrategyStats getStats() = 0; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CCacheAllocator.cpp b/bdm/allocator/CCacheAllocator.cpp new file mode 100644 index 0000000000..cff4bded4b --- /dev/null +++ b/bdm/allocator/CCacheAllocator.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/CCacheAllocator.h" + +#include + +namespace facebook { +namespace cachelib { + +CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator, PoolId poolId) + : allocator_(allocator), poolId_(poolId), currentChunksIndex_(0) { + XDCHECK_EQ(0u, getNumChunks()); + resize(); +} + +CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator, + PoolId poolId, + const SerializationType& object) + : CCacheAllocatorBase(*object.ccMetadata()), + allocator_(allocator), + poolId_(poolId), + currentChunksIndex_(0) { + auto& currentChunks = chunks_[currentChunksIndex_]; + for (auto chunk : *object.chunks()) { + currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk))); + } +} + +size_t CCacheAllocator::getConfiguredSize() const { + return allocator_.getPool(poolId_).getPoolSize(); +} + +std::string CCacheAllocator::getName() const { + return allocator_.getPoolName(poolId_); +} + +size_t CCacheAllocator::resize() { + auto chunks = chunks_[currentChunksIndex_]; + + const size_t currChunks = chunks.size(); + const size_t curSize = currChunks * getChunkSize(); + + /* Round size down to nearest even chunk_size multiple. */ + const size_t newSize = getConfiguredSize(); + const size_t numNewChunks = newSize / getChunkSize(); + const size_t newSizeWanted = numNewChunks * getChunkSize(); + + if (numNewChunks < currChunks) { + /* Shrink cache. Simply release the last N chunks. */ + while (numNewChunks < chunks.size()) { + XDCHECK(chunks.back() != nullptr); + release(chunks.back()); + chunks.pop_back(); + } + } else if (numNewChunks > currChunks) { + size_t i; + for (i = currChunks; i < numNewChunks; i++) { + void* chunk = allocate(); + if (chunk == nullptr) { + break; + } + chunks.push_back(chunk); + } + + if (chunks.size() != numNewChunks) { + XLOGF(CRITICAL, + "Unable to fully increase memory size for pool {}. Wanted to " + "allocate {} new chunks. 
Allocated {} chunks increasing arena size " + "from {} to {} bytes.", + poolId_, numNewChunks - currChunks, i - currChunks, curSize, + newSizeWanted); + } + } + + chunks_[currentChunksIndex_ ^ 1] = chunks; + currentChunksIndex_ ^= 1; + return chunks.size(); +} + +CCacheAllocator::SerializationType CCacheAllocator::saveState() { + CCacheAllocator::SerializationType object; + *object.ccMetadata() = ccType_.saveState(); + + std::lock_guard guard(resizeLock_); + for (auto chunk : getCurrentChunks()) { + object.chunks()->push_back(allocator_.compress(chunk).saveState()); + } + return object; +} + +void* CCacheAllocator::allocate() { + return allocator_.allocateZeroedSlab(poolId_); +} + +void CCacheAllocator::release(void* chunk) { + auto context = allocator_.startSlabRelease( + poolId_, + allocator_.getAllocationClassId(poolId_, + static_cast(getChunkSize())), + Slab::kInvalidClassId, SlabReleaseMode::kResize, chunk); + XDCHECK_EQ(1u, context.getActiveAllocations().size()); + allocator_.free(chunk); + allocator_.completeSlabRelease(context); +} + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CCacheAllocator.h b/bdm/allocator/CCacheAllocator.h new file mode 100644 index 0000000000..a8814c8409 --- /dev/null +++ b/bdm/allocator/CCacheAllocator.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/memory/MemoryAllocator.h" +#include "cachelib/compact_cache/allocators/CCacheAllocatorBase.h" + +// This is the allocator for compact cache provided by cache allocator for one +// pool +// It helps attached compact cache allocate/release memory from the +// specified memory pool in memory allocator +namespace facebook { +namespace cachelib { + +class CCacheAllocator : public CCacheAllocatorBase { + public: + using SerializationType = serialization::CompactCacheAllocatorObject; + + // initialize the compact cache allocator from scratch or from a previous + // state. + CCacheAllocator(MemoryAllocator& allocator, PoolId poolId); + CCacheAllocator(MemoryAllocator& allocator, + PoolId poolId, + const SerializationType& object); + + CCacheAllocator(const CCacheAllocator&) = delete; + CCacheAllocator& operator=(const CCacheAllocator&) = delete; + + // Expand or shrink the number of chunks to getConfiguredSize()/getChunkSize() + // + // Here we assume that compact cache implementation is doing the right thing: + // For expandsion, ccache will rehash after calling this resize() + // For shrink, ccache will rehash before calling this resize() + // + // @return the number of chunks we have resized to + size_t resize() override; + + // Return name assocaited with this allocator. This is the name associated + // with the pool when user has created it via CacheAllocator::addPool() + std::string getName() const; + + // This allocator uses CacheAllocator underneath and each chunk is an + // entire cachelib slab. 
+ size_t getChunkSize() const { return Slab::kSize; } + + // return the configured size of the allocator + size_t getConfiguredSize() const override; + + // return the pointer to the chunk with index _chunkNum_ + void* getChunk(size_t chunkNum) { + if (chunkNum >= getNumChunks()) { + throw std::invalid_argument(folly::sformat( + "Trying to get chunk #{} but only {} chunks are available", chunkNum, + getNumChunks())); + } + return getCurrentChunks()[chunkNum]; + } + + // return number of chunks + size_t getNumChunks() const noexcept { return getCurrentChunks().size(); } + + bool overSized() const { + return getNumChunks() * getChunkSize() > getConfiguredSize(); + } + + // return the pool id for the compact cache + PoolId getPoolId() const { return poolId_; } + + // store allocate state into an object for warm roll + SerializationType saveState(); + + private: + // return the current chunks vector + const std::vector& getCurrentChunks() const { + return chunks_[currentChunksIndex_]; + } + + // allocate a chunk of memory with size Slab::kSize + // + // @ return the pointer to the new memory + // nullptr if we run out of memory from MemoryAllocator + // + // @throw std::logic_error if compact cache is not enabled + // @throw std::invalid_argument if the poolId_ is invalid + void* allocate(); + + // release the passed in chunk back in memory allocator + // + // @throw std::invalid_argument if the chunk or poolId_ is invalid OR + // if the chunk does not belong to any active + // allocation handed out by this allocator. + void release(void* chunk); + + // MemoryAllocator reference and corresponding memory pool id + MemoryAllocator& allocator_; + const PoolId poolId_; + + // two chunks vectors, one is in use while the other is spare + // according to currentChunksIndex_ + std::vector chunks_[2]; + + // index of the currently used chunks vector + std::atomic currentChunksIndex_{0}; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CCacheManager.cpp b/bdm/allocator/CCacheManager.cpp new file mode 100644 index 0000000000..6750139ff0 --- /dev/null +++ b/bdm/allocator/CCacheManager.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
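CCacheAllocator::resize() above rounds the configured pool size down to a whole number of slab-sized chunks and then releases or allocates chunks until the vector matches that count. A standalone sketch of that arithmetic follows; kChunkSize and the vector of placeholder chunks are assumptions standing in for Slab::kSize and real slab allocations.

// Sketch of the chunk-count arithmetic in CCacheAllocator::resize().
#include <cstddef>
#include <iostream>
#include <vector>

constexpr size_t kChunkSize = 4 * 1024 * 1024; // stand-in for Slab::kSize

// Grow or shrink `chunks` so it holds configuredSize / kChunkSize entries.
size_t resizeChunks(std::vector<int>& chunks, size_t configuredSize) {
  const size_t wantedChunks = configuredSize / kChunkSize; // round down
  while (chunks.size() > wantedChunks) {
    chunks.pop_back(); // shrink: release the last chunks
  }
  while (chunks.size() < wantedChunks) {
    chunks.push_back(0); // grow: allocate a new (placeholder) chunk
  }
  return chunks.size();
}

int main() {
  std::vector<int> chunks;
  std::cout << resizeChunks(chunks, 10 * kChunkSize + 123) << " chunks\n"; // 10
  std::cout << resizeChunks(chunks, 3 * kChunkSize) << " chunks\n";        // 3
}

In the patch itself the updated vector is written into the spare slot of the two-element chunks_ array and currentChunksIndex_ is flipped, so readers of getCurrentChunks() never observe a partially updated vector.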
+ */ + +#include "cachelib/allocator/CCacheManager.h" + +namespace facebook { +namespace cachelib { + +CCacheManager::CCacheManager(const SerializationType& object, + MemoryAllocator& memoryAllocator) + : memoryAllocator_(memoryAllocator) { + std::lock_guard guard(lock_); + + for (const auto& allocator : *object.allocators()) { + auto id = memoryAllocator_.getPoolId(allocator.first); + allocators_.emplace( + std::piecewise_construct, + std::forward_as_tuple(allocator.first), + std::forward_as_tuple(memoryAllocator_, id, allocator.second)); + } +} + +CCacheAllocator& CCacheManager::addAllocator(const std::string& name, + PoolId poolId) { + std::lock_guard guard(lock_); + auto result = + allocators_.emplace(std::piecewise_construct, + std::forward_as_tuple(name), + std::forward_as_tuple(memoryAllocator_, poolId)); + if (!result.second) { + throw std::invalid_argument( + folly::sformat("Duplicate allocator named {}", name)); + } + return result.first->second; +} + +CCacheAllocator& CCacheManager::getAllocator(const std::string& name) { + std::lock_guard guard(lock_); + return allocators_.at(name); +} + +void CCacheManager::resizeAll() { + // put all allocator pointers into a vector Before starting resizing because + // resizing compact cache involves rehashing which can take a long time, and + // we don't want to hold the lock for the entire duration + std::vector allAllocators; + { + std::lock_guard guard(lock_); + for (auto& allocator : allocators_) { + allAllocators.push_back(&allocator.second); + } + } + + for (auto& allocator : allAllocators) { + // shrink oversized compact caches first + if (allocator->overSized()) { + allocator->resizeCompactCache(); + } + } + for (auto& allocator : allAllocators) { + // resize all compact cache + allocator->resizeCompactCache(); + } +} + +CCacheManager::SerializationType CCacheManager::saveState() { + std::lock_guard guard(lock_); + + SerializationType object; + for (auto& allocator : allocators_) { + object.allocators()->emplace(allocator.first, allocator.second.saveState()); + } + return object; +} + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CCacheManager.h b/bdm/allocator/CCacheManager.h new file mode 100644 index 0000000000..372754331d --- /dev/null +++ b/bdm/allocator/CCacheManager.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/CCacheAllocator.h" +#include "cachelib/allocator/memory/serialize/gen-cpp2/objects_types.h" + +namespace facebook { +namespace cachelib { + +// manager for compact cache +// It manages the multiple pools of compact caches (CCacheAllocator) and +// interacts with cache allocator. +class CCacheManager { + public: + using SerializationType = serialization::CompactCacheAllocatorManagerObject; + /** + * Restores the state of all compact cache allocators using information + * in an object. 
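+ *
+ * A minimal warm-roll sketch (illustrative only; variable names are
+ * hypothetical):
+ *
+ *   // before shutdown
+ *   auto state = manager.saveState();
+ *   // after restart, re-attach using the same memory allocator
+ *   CCacheManager restored(state, memoryAllocator);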
+ * + * @param object state object to be restored + * @param allocator the memory allocator to restore pools from + * @return true on success, false on failure. On failure errno is set to + * EINVAL if some allocator can not be found + * + * @note This function does not clean up on errors. If an error is returned + * the cache is likely left in a partially initialized state. All errors + * are assumed to be fatal. + */ + CCacheManager(const SerializationType& object, + MemoryAllocator& memoryAllocator); + + explicit CCacheManager(MemoryAllocator& memoryAllocator) + : memoryAllocator_(memoryAllocator) {} + + // Add a new allocator with given name and poolId + CCacheAllocator& addAllocator(const std::string& name, PoolId poolId); + + // Get the allocator with given name + CCacheAllocator& getAllocator(const std::string& name); + + /** + * Resize all compact caches attached to the allocators + */ + void resizeAll(); + + /** + * Save the state of all compact cache allocators in an object + * + * @return object that contains the state + */ + SerializationType saveState(); + + private: + std::mutex lock_; + MemoryAllocator& memoryAllocator_; + + // Mapping from pool names to allocators + // Compact cache will have direct reference of the allocator, which is fine + // according to http://en.cppreference.com/w/cpp/container/unordered_map + // "References and pointers to either key or data stored in the container are + // only invalidated by erasing that element, even when the corresponding + // iterator is invalidated." + std::unordered_map allocators_; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CMakeLists.txt b/bdm/allocator/CMakeLists.txt new file mode 100644 index 0000000000..87643ff006 --- /dev/null +++ b/bdm/allocator/CMakeLists.txt @@ -0,0 +1,141 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +add_thrift_file(SERIALIZE serialize/objects.thrift frozen2) + +add_thrift_file(DATASTRUCT_SERIALIZE + datastruct/serialize/objects.thrift frozen2) + +add_thrift_file(DATASTRUCT_TESTS + datastruct/tests/test_objects.thrift frozen2) + +add_thrift_file(MEMORY_SERIALIZE + memory/serialize/objects.thrift frozen2) + +add_library (cachelib_allocator + ${SERIALIZE_THRIFT_FILES} + ${DATASTRUCT_SERIALIZE_THRIFT_FILES} + ${MEMORY_SERIALIZE_THRIFT_FILES} + CacheAllocator.cpp + Cache.cpp + CacheDetails.cpp + CacheStats.cpp + CCacheAllocator.cpp + CCacheManager.cpp + ContainerTypes.cpp + FreeMemStrategy.cpp + FreeThresholdStrategy.cpp + HitsPerSlabStrategy.cpp + LruTailAgeStrategy.cpp + MarginalHitsOptimizeStrategy.cpp + MarginalHitsStrategy.cpp + memory/AllocationClass.cpp + memory/MemoryAllocator.cpp + memory/MemoryPool.cpp + memory/MemoryPoolManager.cpp + MemoryMonitor.cpp + memory/SlabAllocator.cpp + memory/Slab.cpp + nvmcache/NvmItem.cpp + nvmcache/NavyConfig.cpp + nvmcache/NavySetup.cpp + NvmCacheState.cpp + PoolOptimizer.cpp + PoolOptimizeStrategy.cpp + PoolRebalancer.cpp + PoolResizer.cpp + RebalanceStrategy.cpp + SlabReleaseStats.cpp + TempShmMapping.cpp +) +add_dependencies(cachelib_allocator thrift_generated_files) +target_link_libraries(cachelib_allocator PUBLIC + cachelib_navy + cachelib_common + cachelib_shm + ) + +if ((CMAKE_SYSTEM_NAME STREQUAL Linux) AND + (CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64)) +else() + target_compile_definitions(cachelib_allocator PRIVATE SKIP_SIZE_VERIFY) +endif() + + +install(TARGETS cachelib_allocator + EXPORT cachelib-exports + DESTINATION ${LIB_INSTALL_DIR} ) + +if (BUILD_TESTS) + add_library (allocator_test_support OBJECT + ${DATASTRUCT_TESTS_THRIFT_FILES} + ./nvmcache/tests/NvmTestBase.cpp + ./memory/tests/TestBase.cpp + ) + add_dependencies(allocator_test_support thrift_generated_files) + target_link_libraries (allocator_test_support PUBLIC + cachelib_allocator + common_test_utils + glog::glog + gflags + GTest::gtest + GTest::gtest_main + GTest::gmock + ) + + + function (ADD_TEST SOURCE_FILE) + generic_add_test("allocator-test" "${SOURCE_FILE}" + allocator_test_support "${ARGN}") + endfunction() + + + add_test (tests/CacheBaseTest.cpp) + add_test (tests/ItemHandleTest.cpp) + add_test (tests/ItemTest.cpp) + add_test (tests/MarginalHitsStateTest.cpp) + add_test (tests/MM2QTest.cpp) + add_test (tests/MMLruTest.cpp) + add_test (tests/MMTinyLFUTest.cpp) + add_test (tests/NvmCacheStateTest.cpp) + add_test (tests/RefCountTest.cpp) + add_test (tests/SimplePoolOptimizationTest.cpp) + add_test (tests/SimpleRebalancingTest.cpp) + add_test (tests/PoolOptimizeStrategyTest.cpp) + add_test (tests/RebalanceStrategyTest.cpp) + add_test (tests/AllocatorTypeTest.cpp) + add_test (tests/ChainedHashTest.cpp) + add_test (tests/AllocatorResizeTypeTest.cpp) + add_test (tests/AllocatorHitStatsTypeTest.cpp) + add_test (tests/AllocatorMemoryTiersTest.cpp) + add_test (tests/MemoryTiersTest.cpp) + add_test (tests/MultiAllocatorTest.cpp) + add_test (tests/NvmAdmissionPolicyTest.cpp) + add_test (tests/CacheAllocatorConfigTest.cpp) + add_test (nvmcache/tests/NvmItemTests.cpp) + add_test (nvmcache/tests/InFlightPutsTest.cpp) + add_test (nvmcache/tests/TombStoneTests.cpp) + add_test (nvmcache/tests/NavySetupTest.cpp) + add_test (nvmcache/tests/NvmCacheTests.cpp) + add_test (nvmcache/tests/NavyConfigTest.cpp) + add_test (memory/tests/AllocationClassTest.cpp ) + add_test (memory/tests/MemoryAllocatorTest.cpp ) + add_test (memory/tests/MemoryPoolManagerTest.cpp ) + add_test 
(memory/tests/MemoryPoolTest.cpp ) + add_test (memory/tests/SlabAllocatorTest.cpp ) + add_test (datastruct/tests/DListTest.cpp ) + add_test (datastruct/tests/MultiDListTest.cpp ) + add_test (datastruct/tests/SListTest.cpp ) + +endif() diff --git a/bdm/allocator/Cache.cpp b/bdm/allocator/Cache.cpp new file mode 100644 index 0000000000..0e812fb10e --- /dev/null +++ b/bdm/allocator/Cache.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/Cache.h" + +#include + +#include "cachelib/allocator/RebalanceStrategy.h" + +namespace facebook { +namespace cachelib { + +void CacheBase::setRebalanceStrategy( + PoolId pid, std::shared_ptr strategy) { + std::unique_lock l(lock_); + poolRebalanceStrategies_[pid] = std::move(strategy); +} + +std::shared_ptr CacheBase::getRebalanceStrategy( + PoolId pid) const { + std::unique_lock l(lock_); + auto it = poolRebalanceStrategies_.find(pid); + if (it != poolRebalanceStrategies_.end() && it->second) { + return it->second; + } + return nullptr; +} + +void CacheBase::setResizeStrategy(PoolId pid, + std::shared_ptr strategy) { + std::unique_lock l(lock_); + poolResizeStrategies_[pid] = std::move(strategy); +} + +std::shared_ptr CacheBase::getResizeStrategy( + PoolId pid) const { + std::unique_lock l(lock_); + auto it = poolResizeStrategies_.find(pid); + if (it != poolResizeStrategies_.end() && it->second) { + return it->second; + } + return nullptr; +} + +void CacheBase::setPoolOptimizeStrategy( + std::shared_ptr strategy) { + std::unique_lock l(lock_); + poolOptimizeStrategy_ = std::move(strategy); +} + +std::shared_ptr CacheBase::getPoolOptimizeStrategy() + const { + std::unique_lock l(lock_); + return poolOptimizeStrategy_; +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/Cache.h b/bdm/allocator/Cache.h new file mode 100644 index 0000000000..589614ee3b --- /dev/null +++ b/bdm/allocator/Cache.h @@ -0,0 +1,309 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +#include "cachelib/allocator/CacheDetails.h" +#include "cachelib/allocator/CacheStats.h" +#include "cachelib/allocator/ICompactCache.h" +#include "cachelib/allocator/memory/MemoryAllocator.h" +#include "cachelib/common/Hash.h" + +namespace facebook { +namespace cachelib { + +class PoolResizer; +class PoolRebalancer; +class PoolOptimizer; +class MemoryMonitor; + +// Forward declaration. +class RebalanceStrategy; +class PoolOptimizeStrategy; +class MarginalHitsOptimizeStrategy; +class AllocatorConfigExporter; + +namespace tests { +struct SimplePoolOptimizeStrategy; +} + +// The mode in which the cache was accessed. This can be used by containers +// to differentiate between the access modes and do appropriate action. +enum class AccessMode { kRead, kWrite }; + +// used by RemoveCB, indicating if the removal from the MMContainer was an +// eviction or not. +enum class RemoveContext { kEviction, kNormal }; +// used by ItemDestructor, indicating how the item is destructed +enum class DestructorContext { + // item was in dram and evicted from dram. it could have + // been present in nvm as well. + kEvictedFromRAM, + + // item was only in nvm and evicted from nvm + kEvictedFromNVM, + + // item was present in dram and removed by user calling + // remove()/insertOrReplace, or removed due to expired. + // it could have been present in nvm as well. + kRemovedFromRAM, + + // item was present only in nvm and removed by user calling + // remove()/insertOrReplace. + kRemovedFromNVM +}; + +// A base class of cache exposing members and status agnostic of template type. +class CacheBase { + public: + CacheBase() = default; + virtual ~CacheBase() = default; + + // Movable but not copyable + CacheBase(const CacheBase&) = delete; + CacheBase& operator=(const CacheBase&) = delete; + CacheBase(CacheBase&&) = default; + CacheBase& operator=(CacheBase&&) = default; + + // TODO: come up with some reasonable number + static constexpr unsigned kMaxTiers = 2; + + // Get a string referring to the cache name for this cache + virtual const std::string getCacheName() const = 0; + + // Returns true for ObjectCacheBase, false for CacheAllocator. + virtual bool isObjectCache() const = 0; + + // Get the reference to a memory pool, for stats purposes + // + // @param poolId The pool id to query + virtual const MemoryPool& getPool(PoolId poolId) const = 0; + + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; + + // Get Pool specific stats (regular pools). This includes stats from the + // Memory Pool and also the cache. + // + // @param poolId the pool id + virtual PoolStats getPoolStats(PoolId poolId) const = 0; + + virtual AllocationClassBaseStat getAllocationClassStats( + TierId, PoolId pid, ClassId cid) const = 0; + + // @param poolId the pool id + virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; + + // This can be expensive so it is not part of PoolStats + // + // @param pid pool id + // @param slabProjectionLength number of slabs worth of items to visit to + // estimate the projected age. If 0, returns + // tail age for projection age. 
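+  //
+  // e.g. (illustrative): getPoolEvictionAgeStats(pid, 1) projects the age an
+  // item at the tail of each class would reach after roughly one more slab's
+  // worth of evictions, whereas a projection length of 0 simply reports the
+  // current tail age.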
+ // + // @return PoolEvictionAgeStats see CacheStats.h + virtual PoolEvictionAgeStats getPoolEvictionAgeStats( + PoolId pid, unsigned int slabProjectionLength) const = 0; + + // @return a map of stat value> representation for all the nvm + // cache stats. This is useful for our monitoring to directly upload them. + virtual std::unordered_map getNvmCacheStatsMap() + const = 0; + + // @return a map of stat value> representation for all the event + // tracker stats. If no event tracker exists, this will be empty + virtual std::unordered_map getEventTrackerStatsMap() + const = 0; + + // @return the Cache metadata + virtual CacheMetadata getCacheMetadata() const noexcept = 0; + + // @return cache's memory usage stats + virtual CacheMemoryStats getCacheMemoryStats() const = 0; + + // @return the overall cache stats + virtual GlobalCacheStats getGlobalCacheStats() const = 0; + + // @return the slab release stats. + virtual SlabReleaseStats getSlabReleaseStats() const = 0; + + // Set rebalancing strategy + // + // @param pid Pool id of the pool to set this strategy on. + // @param strategy The strategy + void setRebalanceStrategy(PoolId pid, + std::shared_ptr strategy); + + // @param pid The pool id. + // + // @return The rebalancing strategy of the specifid pool. + std::shared_ptr getRebalanceStrategy(PoolId pid) const; + + // Set resizing strategy + // + // @param pid Pool id of the pool to set this strategy on. + // @param strategy The strategy + void setResizeStrategy(PoolId pid, + std::shared_ptr strategy); + + // @param pid The pool id. + // + // @return The resizing strategy of the specifid pool. + std::shared_ptr getResizeStrategy(PoolId pid) const; + + // Set optimizing strategy + // + // @param pid Pool id of the pool to set this strategy on. + // @param strategy The strategy + void setPoolOptimizeStrategy(std::shared_ptr strategy); + + // @param pid The pool id. + // + // @return The optimizing strategy of the specifid pool. + std::shared_ptr getPoolOptimizeStrategy() const; + + // return the list of currently active pools that are oversized + virtual std::set getRegularPoolIdsForResize() const = 0; + + // return a list of all valid pool ids. + virtual std::set getPoolIds() const = 0; + + // returns a list of pools excluding compact cache pools + virtual std::set getRegularPoolIds() const = 0; + + // returns a list of pools created for ccache. + virtual std::set getCCachePoolIds() const = 0; + + // return whether a pool participates in auto-resizing + virtual bool autoResizeEnabledForPool(PoolId) const = 0; + + // return the virtual interface of an attached compact cache for a particular + // pool id + virtual const ICompactCache& getCompactCache(PoolId pid) const = 0; + + // return object cache stats + virtual void getObjectCacheCounters( + std::function) const {} + + protected: + // move bytes from one pool to another. The source pool should be at least + // _bytes_ in size. + // + // + // @param src the pool to be sized down and giving the memory. + // @param dest the pool receiving the memory. + // @param bytes the number of bytes to move from src to dest. + // @param true if the resize succeeded. false if src does does not have + // correct size to do the transfer. 
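+  //
+  // Illustrative call: resizePools(srcId, dstId, 16 * Slab::kSize) moves
+  // sixteen slabs' worth of bytes from src to dest, provided src currently
+  // owns at least that much memory.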
+ // @throw std::invalid_argument if src or dest is invalid pool + virtual bool resizePools(PoolId /* src */, + PoolId /* dest */, + size_t /* bytes */) = 0; + + // resize all of the attached and disabled compact caches + virtual void resizeCompactCaches() = 0; + + // serialize cache allocator config for exporting to Scuba + virtual std::map serializeConfigParams() const = 0; + + // Releases a slab from a pool into its corresponding memory pool + // or back to the slab allocator, depending on SlabReleaseMode. + // SlabReleaseMode::kRebalance -> back to the pool + // SlabReleaseMode::kResize -> back to the slab allocator + // + // @param pid Pool that will make make one slab available + // @param cid Class that will release a slab back to its pool + // or slab allocator + // @param mode the mode for slab release (rebalance/resize) + // @param hint hint referring to the slab. this can be an allocation + // that the user knows to exist in the slab. If this is + // nullptr, a random slab is selected from the pool and + // allocation class. + // + // @throw std::invalid_argument if the hint is invalid or if the pid or cid + // is invalid. + virtual void releaseSlab(PoolId pid, + ClassId cid, + SlabReleaseMode mode, + const void* hint = nullptr) = 0; + + // update the number of slabs to be advised + virtual void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) = 0; + + // calculate the number of slabs to be advised/reclaimed in each pool + virtual PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() = 0; + + // Releasing a slab from this allocation class id and pool id. The release + // could be for a pool resizing or allocation class rebalancing. + // + // All active allocations will be evicted in the process. + // + // This function will be blocked until a slab is freed + // + // @param pid Pool that will make one slab available + // @param victim Class that will release a slab back to its pool + // or slab allocator or a receiver if defined + // @param receiver Class that will receive + // @param mode the mode for slab release (rebalance/resize) + // the user knows to exist in the slab. If this is nullptr, + // a random slab is selected from the pool and allocation + // class. + // @param hint hint referring to the slab. this can be an allocation + // that the user knows to exist in the slab. If this is + // nullptr, a random slab is selected from the pool and + // allocation class. + // + // @throw std::invalid_argument if the hint is invalid or if the pid or cid + // is invalid. + virtual void releaseSlab(PoolId pid, + ClassId victim, + ClassId receiver, + SlabReleaseMode mode, + const void* hint = nullptr) = 0; + + // Reclaim slabs from the slab allocator that were advised away using + // releaseSlab in SlabReleaseMode::kAdvise mode. 
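+  //
+  // e.g. (illustrative): if the memory monitor previously advised away ten
+  // slabs from a pool, reclaimSlabs(pid, 4) asks for four of them back and
+  // returns how many were actually reclaimed.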
+ // + // @param pid The pool for which to recliam slabs + // @param numSlabs The number of slabs to reclaim for the pool + // @return The number of slabs that were actually reclaimed (<= numSlabs) + virtual unsigned int reclaimSlabs(PoolId id, size_t numSlabs) = 0; + + // Protect 'poolRebalanceStragtegies_' and `poolResizeStrategies_` + // and `poolOptimizeStrategy_` + mutable std::mutex lock_; + std::unordered_map> + poolRebalanceStrategies_; + std::unordered_map> + poolResizeStrategies_; + std::shared_ptr poolOptimizeStrategy_; + + friend PoolResizer; + friend PoolRebalancer; + friend PoolOptimizer; + friend MemoryMonitor; + friend AllocatorConfigExporter; +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheAllocator-inl.h b/bdm/allocator/CacheAllocator-inl.h new file mode 100644 index 0000000000..52f8a9412d --- /dev/null +++ b/bdm/allocator/CacheAllocator-inl.h @@ -0,0 +1,4116 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing fmm permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace facebook { +namespace cachelib { + +template +CacheAllocator::CacheAllocator(Config config) + : CacheAllocator(InitMemType::kNone, config) { + // TODO(MEMORY_TIER) + if (getNumTiers() > 1 || std::holds_alternative( + memoryTierConfigs[0].getShmTypeOpts())) { + throw std::runtime_error( + "Using custom memory tier or using more than one tier is only " + "supported for Shared Memory."); + } + initCommon(false); +} + +template +CacheAllocator::CacheAllocator(SharedMemNewT, Config config) + : CacheAllocator(InitMemType::kMemNew, config) { + initCommon(false); + shmManager_->removeShm(detail::kShmInfoName, + PosixSysVSegmentOpts(config_.isUsingPosixShm())); +} + +template +CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) + : CacheAllocator(InitMemType::kMemAttach, config) { + /* TODO - per tier? */ + for (auto pid : *metadata_.compactCachePools()) { + isCompactCachePool_[pid] = true; + } + + initCommon(true); + + // We will create a new info shm segment on shutDown(). If we don't remove + // this info shm segment here and the new info shm segment's size is larger + // than this one, creating new one will fail. + shmManager_->removeShm(detail::kShmInfoName, + PosixSysVSegmentOpts(config_.isUsingPosixShm())); +} + +template +CacheAllocator::CacheAllocator( + typename CacheAllocator::InitMemType type, Config config) + : isOnShm_{type != InitMemType::kNone ? true + : config.memMonitoringEnabled()}, + config_(config.validate()), + memoryTierConfigs(config.getMemoryTierConfigs()), + tempShm_(type == InitMemType::kNone && isOnShm_ + ? std::make_unique(config_.getCacheSize()) + : nullptr), + shmManager_(type != InitMemType::kNone + ? std::make_unique(config_.cacheDir, + config_.isUsingPosixShm()) + : nullptr), + deserializer_(type == InitMemType::kMemAttach ? createDeserializer() + : nullptr), + metadata_{type == InitMemType::kMemAttach + ? 
deserializeCacheAllocatorMetadata(*deserializer_) + : serialization::CacheAllocatorMetadata{}}, + allocator_(initAllocator(type)), + compactCacheManager_(type != InitMemType::kMemAttach + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), + compressor_(createPtrCompressor()), + mmContainers_(type == InitMemType::kMemAttach + ? deserializeMMContainers(*deserializer_, compressor_) + : MMContainers{getNumTiers()}), + accessContainer_(initAccessContainer( + type, detail::kShmHashTableName, config.accessConfig, config_.isUsingPosixShm())), + chainedItemAccessContainer_( + initAccessContainer(type, + detail::kShmChainedItemHashTableName, + config.chainedItemAccessConfig, + config_.isUsingPosixShm())), + chainedItemLocks_(config_.chainedItemsLockPower, + std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), + cacheCreationTime_{ + type != InitMemType::kMemAttach + ? util::getCurrentTimeSec() + : static_cast(*metadata_.cacheCreationTime())}, + cacheInstanceCreationTime_{type != InitMemType::kMemAttach + ? cacheCreationTime_ + : util::getCurrentTimeSec()}, + // Pass in cacheInstnaceCreationTime_ as the current time to keep + // nvmCacheState's current time in sync + nvmCacheState_{cacheInstanceCreationTime_, config_.cacheDir, + config_.isNvmCacheEncryptionEnabled(), + config_.isNvmCacheTruncateAllocSizeEnabled()} {} + +template +CacheAllocator::~CacheAllocator() { + XLOG(DBG, "destructing CacheAllocator"); + // Stop all workers. In case user didn't call shutDown, we want to + // terminate all background workers and nvmCache before member variables + // go out of scope. + stopWorkers(); + nvmCache_.reset(); +} + +template +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { + ShmSegmentOpts opts; + opts.alignment = sizeof(Slab); + opts.typeOpts = memoryTierConfigs[tid].getShmTypeOpts(); + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); + if (auto *v = std::get_if(&opts.typeOpts)) { + v->usePosix = config_.usePosixShm; + } + + return opts; +} + +template +size_t CacheAllocator::memoryTierSize(TierId tid) const +{ + auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + +template +std::vector> +CacheAllocator::createPrivateAllocator() { + std::vector> allocators; + + if (isOnShm_) + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize())); + else + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + config_.getCacheSize())); + + return allocators; +} + +template +std::unique_ptr +CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); + return std::make_unique( + getAllocatorConfig(config_), + shmManager_ + ->createShm(detail::kShmCacheName + std::to_string(tid), + tierSize, config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) + .addr, + tierSize); +} + +template +std::unique_ptr +CacheAllocator::restoreMemoryAllocator(TierId tid) { + return std::make_unique( + deserializer_->deserialize(), + shmManager_ + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, + memoryTierSize(tid), + config_.disableFullCoredump); +} + +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> 
allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + +template +std::unique_ptr +CacheAllocator::restoreCCacheManager(TierId tid) { + return std::make_unique( + deserializer_->deserialize(), + *allocator_[tid]); +} + +template +void CacheAllocator::initCommon(bool dramCacheAttached) { + if (config_.nvmConfig.has_value()) { + if (config_.nvmCacheAP) { + nvmAdmissionPolicy_ = config_.nvmCacheAP; + } else if (config_.rejectFirstAPNumEntries) { + nvmAdmissionPolicy_ = std::make_shared>( + config_.rejectFirstAPNumEntries, config_.rejectFirstAPNumSplits, + config_.rejectFirstSuffixIgnoreLength, + config_.rejectFirstUseDramHitSignal); + } + if (config_.nvmAdmissionMinTTL > 0) { + if (!nvmAdmissionPolicy_) { + nvmAdmissionPolicy_ = std::make_shared>(); + } + nvmAdmissionPolicy_->initMinTTL(config_.nvmAdmissionMinTTL); + } + } + initStats(); + initNvmCache(dramCacheAttached); + + if (!config_.delayCacheWorkersStart) { + initWorkers(); + } +} + +template +void CacheAllocator::initNvmCache(bool dramCacheAttached) { + if (!config_.nvmConfig.has_value()) { + return; + } + + // for some usecases that create pools, restoring nvmcache when dram cache + // is not persisted is not supported. + const bool shouldDrop = config_.dropNvmCacheOnShmNew && !dramCacheAttached; + + // if we are dealing with persistency, cache directory should be enabled + const bool truncate = config_.cacheDir.empty() || + nvmCacheState_.shouldStartFresh() || shouldDrop; + if (truncate) { + nvmCacheState_.markTruncated(); + } + + nvmCache_ = std::make_unique(*this, *config_.nvmConfig, truncate, + config_.itemDestructor); + if (!config_.cacheDir.empty()) { + nvmCacheState_.clearPrevState(); + } +} + +template +void CacheAllocator::initWorkers() { + if (config_.poolResizingEnabled() && !poolResizer_) { + startNewPoolResizer(config_.poolResizeInterval, + config_.poolResizeSlabsPerIter, + config_.poolResizeStrategy); + } + + if (config_.poolRebalancingEnabled() && !poolRebalancer_) { + startNewPoolRebalancer(config_.poolRebalanceInterval, + config_.defaultPoolRebalanceStrategy, + config_.poolRebalancerFreeAllocThreshold); + } + + if (config_.memMonitoringEnabled() && !memMonitor_) { + if (!isOnShm_) { + throw std::invalid_argument( + "Memory monitoring is not supported for cache on heap. 
It is " + "supported " + "for cache on a shared memory segment only."); + } + startNewMemMonitor(config_.memMonitorInterval, + config_.memMonitorConfig, + config_.poolAdviseStrategy); + } + + if (config_.itemsReaperEnabled() && !reaper_) { + startNewReaper(config_.reaperInterval, config_.reaperConfig); + } + + if (config_.poolOptimizerEnabled() && !poolOptimizer_) { + startNewPoolOptimizer(config_.regularPoolOptimizeInterval, + config_.compactCacheOptimizeInterval, + config_.poolOptimizeStrategy, + config_.ccacheOptimizeStepSizePercent); + } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + config_.backgroundPromoterThreads); + } +} + +template +std::vector> +CacheAllocator::initAllocator( + InitMemType type) { + if (type == InitMemType::kNone) { + return createPrivateAllocator(); + } else if (type == InitMemType::kMemNew) { + return createAllocators(); + } else if (type == InitMemType::kMemAttach) { + return restoreAllocators(); + } + + // Invalid type + throw std::runtime_error(folly::sformat( + "Cannot initialize memory allocator, unknown InitMemType: {}.", + static_cast(type))); +} + +template +std::unique_ptr::AccessContainer> +CacheAllocator::initAccessContainer(InitMemType type, + const std::string name, + AccessConfig config, + bool usePosixShm) { + if (type == InitMemType::kNone) { + return std::make_unique( + config, compressor_, + [this](Item* it) -> WriteHandle { return acquire(it); }); + } else if (type == InitMemType::kMemNew) { + return std::make_unique( + config, + shmManager_ + ->createShm( + name, + AccessContainer::getRequiredSize(config.getNumBuckets()), + nullptr, + ShmSegmentOpts(config.getPageSize(), false, usePosixShm)) + .addr, + compressor_, + [this](Item* it) -> WriteHandle { return acquire(it); }); + } else if (type == InitMemType::kMemAttach) { + return std::make_unique( + deserializer_->deserialize(), + config, + shmManager_->attachShm(name, nullptr, + ShmSegmentOpts(config.getPageSize(), false, usePosixShm)), + compressor_, + [this](Item* it) -> WriteHandle { return acquire(it); }); + } + + // Invalid type + throw std::runtime_error(folly::sformat( + "Cannot initialize access container, unknown InitMemType: {}.", + static_cast(type))); +} + +template +std::unique_ptr CacheAllocator::createDeserializer() { + auto infoAddr = shmManager_->attachShm(detail::kShmInfoName, nullptr, + ShmSegmentOpts(PageSizeT::NORMAL, false, config_.isUsingPosixShm())); + return std::make_unique( + reinterpret_cast(infoAddr.addr), + reinterpret_cast(infoAddr.addr) + infoAddr.size); +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocate(PoolId poolId, + typename Item::Key key, + uint32_t size, + uint32_t ttlSecs, + uint32_t creationTime) { + if (creationTime == 0) { + creationTime = util::getCurrentTimeSec(); + } + return allocateInternal(poolId, key, size, creationTime, + ttlSecs == 0 ? 0 : creationTime + ttlSecs); +} + +template +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? 
+ if (tid == 1) return false; + return getAllocationClassStats(tid, pid, cid).approxFreePercent <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) { + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; +} + + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { + util::LatencyTracker tracker{stats().allocateLatency_}; + + SCOPE_FAIL { stats_.invalidAllocs.inc(); }; + + // number of bytes required for this item + const auto requiredSize = Item::getRequiredSize(key, size); + + // the allocation class in our memory allocator. + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]}; + + // TODO: per-tier + (*stats_.allocAttempts)[pid][cid].inc(); + + void* memory = allocator_[tid]->allocate(pid, requiredSize); + + if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + // TODO: Today isEvictionDisabled means do not evict from memory (DRAM). + // Should we support eviction between memory tiers (e.g. from DRAM to PMEM)? + if (memory == nullptr && !config_.isEvictionDisabled()) { + memory = findEviction(tid, pid, cid); + } + + WriteHandle handle; + if (memory != nullptr) { + // At this point, we have a valid memory allocation that is ready for use. + // Ensure that when we abort from here under any circumstances, we free up + // the memory. Item's handle could throw because the key size was invalid + // for example. + SCOPE_FAIL { + // free back the memory to the allocator since we failed. + allocator_[tid]->free(memory); + }; + + handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); + if (handle) { + handle.markNascent(); + (*stats_.fragmentationSize)[pid][cid].add( + util::getFragmentation(*this, *handle)); + } + + } else { // failed to allocate memory. + (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier + // wake up rebalancer + if (poolRebalancer_) { + poolRebalancer_->wakeUp(); + } + } + + if (auto eventTracker = getEventTracker()) { + const auto result = + handle ? AllocatorApiResult::ALLOCATED : AllocatorApiResult::FAILED; + eventTracker->record(AllocatorApiEvent::ALLOCATE, key, result, size, + expiryTime ? 
expiryTime - creationTime : 0); + } + + return handle; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread); + if (handle) return handle; + } + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateChainedItem(const ReadHandle& parent, + uint32_t size) { + if (!parent) { + throw std::invalid_argument( + "Cannot call allocate chained item with a empty parent handle!"); + } + + auto it = allocateChainedItemInternal(parent, size); + if (auto eventTracker = getEventTracker()) { + const auto result = + it ? AllocatorApiResult::ALLOCATED : AllocatorApiResult::FAILED; + eventTracker->record(AllocatorApiEvent::ALLOCATE_CHAINED, parent->getKey(), + result, size, parent->getConfiguredTTL().count()); + } + return it; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateChainedItemInternal( + const ReadHandle& parent, uint32_t size) { + util::LatencyTracker tracker{stats().allocateLatency_}; + + SCOPE_FAIL { stats_.invalidAllocs.inc(); }; + + // number of bytes required for this item + const auto requiredSize = ChainedItem::getRequiredSize(size); + + // TODO: is this correct? + auto tid = getTierId(*parent); + + const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + + util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]}; + + // TODO: per-tier? Right now stats_ are not used in any public periodic + // worker + (*stats_.allocAttempts)[pid][cid].inc(); + + void* memory = allocator_[tid]->allocate(pid, requiredSize); + if (memory == nullptr) { + memory = findEviction(tid, pid, cid); + } + if (memory == nullptr) { + (*stats_.allocFailures)[pid][cid].inc(); + return WriteHandle{}; + } + + SCOPE_FAIL { allocator_[tid]->free(memory); }; + + auto child = acquire( + new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, + util::getCurrentTimeSec())); + + if (child) { + child.markNascent(); + (*stats_.fragmentationSize)[pid][cid].add( + util::getFragmentation(*this, *child)); + } + + return child; +} + +template +void CacheAllocator::addChainedItem(WriteHandle& parent, + WriteHandle child) { + if (!parent || !child || !child->isChainedItem()) { + throw std::invalid_argument( + folly::sformat("Invalid parent or child. parent: {}, child: {}", + parent ? parent->toString() : "nullptr", + child ? 
child->toString() : "nullptr")); + } + + auto l = chainedItemLocks_.lockExclusive(parent->getKey()); + + // Insert into secondary lookup table for chained allocation + auto oldHead = chainedItemAccessContainer_->insertOrReplace(*child); + if (oldHead) { + child->asChainedItem().appendChain(oldHead->asChainedItem(), compressor_); + } + + // Count an item that just became a new parent + if (!parent->hasChainedItem()) { + stats_.numChainedParentItems.inc(); + } + // Parent needs to be marked before inserting child into MM container + // so the parent-child relationship is established before an eviction + // can be triggered from the child + parent->markHasChainedItem(); + // Count a new child + stats_.numChainedChildItems.inc(); + + insertInMMContainer(*child); + + // Increment refcount since this chained item is now owned by the parent + // Parent will decrement the refcount upon release. Since this is an + // internal refcount, we dont include it in active handle tracking. + child->incRef(); + XDCHECK_EQ(2u, child->getRefCount()); + + invalidateNvm(*parent); + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(), + AllocatorApiResult::INSERTED, child->getSize(), + child->getConfiguredTTL().count()); + } +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::popChainedItem(WriteHandle& parent) { + if (!parent || !parent->hasChainedItem()) { + throw std::invalid_argument(folly::sformat( + "Invalid parent {}", parent ? parent->toString() : nullptr)); + } + + WriteHandle head; + { // scope of chained item lock. + auto l = chainedItemLocks_.lockExclusive(parent->getKey()); + + head = findChainedItem(*parent); + if (head->asChainedItem().getNext(compressor_) != nullptr) { + chainedItemAccessContainer_->insertOrReplace( + *head->asChainedItem().getNext(compressor_)); + } else { + chainedItemAccessContainer_->remove(*head); + parent->unmarkHasChainedItem(); + stats_.numChainedParentItems.dec(); + } + head->asChainedItem().setNext(nullptr, compressor_); + + invalidateNvm(*parent); + } + const auto res = removeFromMMContainer(*head); + XDCHECK(res == true); + + // decrement the refcount to indicate this item is unlinked from its parent + head->decRef(); + stats_.numChainedChildItems.dec(); + + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::POP_CHAINED, parent->getKey(), + AllocatorApiResult::REMOVED, head->getSize(), + head->getConfiguredTTL().count()); + } + + return head; +} + +template +typename CacheAllocator::Key +CacheAllocator::getParentKey(const Item& chainedItem) { + XDCHECK(chainedItem.isChainedItem()); + if (!chainedItem.isChainedItem()) { + throw std::invalid_argument(folly::sformat( + "Item must be chained item! Item: {}", chainedItem.toString())); + } + return reinterpret_cast(chainedItem) + .getParentItem(compressor_) + .getKey(); +} + +template +void CacheAllocator::transferChainLocked(WriteHandle& parent, + WriteHandle& newParent) { + // parent must be in a state to not have concurrent readers. Eviction code + // paths rely on holding the last item handle. Since we hold on to an item + // handle here, the chain will not be touched by any eviction code path. 
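+  //
+  // Illustrative net effect (assuming the usual caller,
+  // transferChainAndReplace): every chained item that pointed at `parent` is
+  // re-keyed to point at `newParent`, `newParent` is marked as having a
+  // chain, and `parent` is unmarked; swapping the parents in the access
+  // container remains the caller's job.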
+ XDCHECK(parent); + XDCHECK(newParent); + XDCHECK_EQ(parent->getKey(), newParent->getKey()); + XDCHECK(parent->hasChainedItem()); + + if (newParent->hasChainedItem()) { + throw std::invalid_argument(folly::sformat( + "New Parent {} has invalid state", newParent->toString())); + } + + auto headHandle = findChainedItem(*parent); + XDCHECK(headHandle); + + // remove from the access container since we are changing the key + chainedItemAccessContainer_->remove(*headHandle); + + // change the key of the chain to have them belong to the new parent. + ChainedItem* curr = &headHandle->asChainedItem(); + const auto newParentPtr = compressor_.compress(newParent.get()); + while (curr) { + XDCHECK_EQ(curr == headHandle.get() ? 2u : 1u, curr->getRefCount()); + XDCHECK(curr->isInMMContainer()); + curr->changeKey(newParentPtr); + curr = curr->getNext(compressor_); + } + + newParent->markHasChainedItem(); + auto oldHead = chainedItemAccessContainer_->insertOrReplace(*headHandle); + if (oldHead) { + throw std::logic_error( + folly::sformat("Did not expect to find an existing chain for {}", + newParent->toString(), oldHead->toString())); + } + parent->unmarkHasChainedItem(); +} + +template +void CacheAllocator::transferChainAndReplace( + WriteHandle& parent, WriteHandle& newParent) { + if (!parent || !newParent) { + throw std::invalid_argument("invalid parent or new parent"); + } + { // scope for chained item lock + auto l = chainedItemLocks_.lockExclusive(parent->getKey()); + transferChainLocked(parent, newParent); + } + + if (replaceIfAccessible(*parent, *newParent)) { + newParent.unmarkNascent(); + } + invalidateNvm(*parent); +} + +template +bool CacheAllocator::replaceIfAccessible(Item& oldItem, + Item& newItem) { + XDCHECK(!newItem.isAccessible()); + + // Inside the access container's lock, this checks if the old item is + // accessible, and only in that case replaces it. If the old item is not + // accessible anymore, it may have been replaced or removed earlier and there + // is no point in proceeding with a move. + if (!accessContainer_->replaceIfAccessible(oldItem, newItem)) { + return false; + } + + // Inside the MM container's lock, this checks if the old item exists to + // make sure that no other thread removed it, and only then replaces it. + if (!replaceInMMContainer(oldItem, newItem)) { + accessContainer_->remove(newItem); + return false; + } + + // Replacing into the MM container was successful, but someone could have + // called insertOrReplace() or remove() before or after the + // replaceInMMContainer() operation, which would invalidate newItem. + if (!newItem.isAccessible()) { + removeFromMMContainer(newItem); + return false; + } + return true; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::replaceChainedItem(Item& oldItem, + WriteHandle newItemHandle, + Item& parent) { + if (!newItemHandle) { + throw std::invalid_argument("Empty handle for newItem"); + } + auto l = chainedItemLocks_.lockExclusive(parent.getKey()); + + if (!oldItem.isChainedItem() || !newItemHandle->isChainedItem() || + &oldItem.asChainedItem().getParentItem(compressor_) != + &newItemHandle->asChainedItem().getParentItem(compressor_) || + &oldItem.asChainedItem().getParentItem(compressor_) != &parent || + newItemHandle->isInMMContainer() || !oldItem.isInMMContainer()) { + throw std::invalid_argument(folly::sformat( + "Invalid args for replaceChainedItem. 
oldItem={}, newItem={}, " + "parent={}", + oldItem.toString(), newItemHandle->toString(), parent.toString())); + } + + auto oldItemHdl = + replaceChainedItemLocked(oldItem, std::move(newItemHandle), parent); + invalidateNvm(parent); + return oldItemHdl; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::replaceChainedItemLocked(Item& oldItem, + WriteHandle newItemHdl, + const Item& parent) { + XDCHECK(newItemHdl != nullptr); + XDCHECK_GE(1u, oldItem.getRefCount()); + + // grab the handle to the old item so that we can return this. Also, we need + // to drop the refcount the parent holds on oldItem by manually calling + // decRef. To do that safely we need to have a proper outstanding handle. + auto oldItemHdl = acquire(&oldItem); + + // Replace the old chained item with new item in the MMContainer before we + // actually replace the old item in the chain + + if (!replaceChainedItemInMMContainer(oldItem, *newItemHdl)) { + // This should never happen since we currently hold an valid + // parent handle. None of its chained items can be removed + throw std::runtime_error(folly::sformat( + "chained item cannot be replaced in MM container, oldItem={}, " + "newItem={}, parent={}", + oldItem.toString(), newItemHdl->toString(), parent.toString())); + } + + XDCHECK(!oldItem.isInMMContainer()); + XDCHECK(newItemHdl->isInMMContainer()); + + auto head = findChainedItem(parent); + XDCHECK(head != nullptr); + XDCHECK_EQ(reinterpret_cast( + &head->asChainedItem().getParentItem(compressor_)), + reinterpret_cast(&parent)); + + // if old item is the head, replace the head in the chain and insert into + // the access container and append its chain. + if (head.get() == &oldItem) { + chainedItemAccessContainer_->insertOrReplace(*newItemHdl); + } else { + // oldItem is in the middle of the chain, find its previous and fix the + // links + auto* prev = &head->asChainedItem(); + auto* curr = prev->getNext(compressor_); + while (curr != nullptr && curr != &oldItem) { + prev = curr; + curr = curr->getNext(compressor_); + } + + XDCHECK(curr != nullptr); + prev->setNext(&newItemHdl->asChainedItem(), compressor_); + } + + newItemHdl->asChainedItem().setNext( + oldItem.asChainedItem().getNext(compressor_), compressor_); + oldItem.asChainedItem().setNext(nullptr, compressor_); + + // this should not result in 0 refcount. We are bumping down the internal + // refcount. If it did, we would leak an item. + oldItem.decRef(); + XDCHECK_LT(0u, oldItem.getRefCount()) << oldItem.toString(); + + // increment refcount to indicate parent owns this similar to addChainedItem + // Since this is an internal refcount, we dont include it in active handle + // tracking. 
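+  //
+  // Illustrative accounting: oldItem stays alive through oldItemHdl even
+  // though the parent's internal reference was dropped above, while the
+  // incRef() below gives newItem the parent-owned reference on top of the
+  // one held by newItemHdl.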
+ + newItemHdl->incRef(); + return oldItemHdl; +} + +template +typename CacheAllocator::ReleaseRes +CacheAllocator::releaseBackToAllocator(Item& it, + RemoveContext ctx, + bool nascent, + const Item* toRecycle) { + if (!it.isDrained()) { + throw std::runtime_error( + folly::sformat("cannot release this item: {}", it.toString())); + } + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); + + if (ctx == RemoveContext::kEviction) { + const auto timeNow = util::getCurrentTimeSec(); + const auto refreshTime = timeNow - it.getLastAccessTime(); + const auto lifeTime = timeNow - it.getCreationTime(); + stats_.ramEvictionAgeSecs_.trackValue(refreshTime); + stats_.ramItemLifeTimeSecs_.trackValue(lifeTime); + stats_.perPoolEvictionAgeSecs_[allocInfo.poolId].trackValue(refreshTime); + } + + (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + util::getFragmentation(*this, it)); + + // Chained items can only end up in this place if the user has allocated + // memory for a chained item but has decided not to insert the chained item + // to a parent item and instead drop the chained item handle. In this case, + // we free the chained item directly without calling remove callback. + if (it.isChainedItem()) { + if (toRecycle) { + throw std::runtime_error( + folly::sformat("Can not recycle a chained item {}, toRecyle", + it.toString(), toRecycle->toString())); + } + allocator_[tid]->free(&it); + return ReleaseRes::kReleased; + } + + // nascent items represent items that were allocated but never inserted into + // the cache. We should not be executing removeCB for them since they were + // not initialized from the user perspective and never part of the cache. + if (!nascent && config_.removeCb) { + config_.removeCb(RemoveCbData{ctx, it, viewAsChainedAllocsRange(it)}); + } + + // only skip destructor for evicted items that are either in the queue to put + // into nvm or already in nvm + if (!nascent && config_.itemDestructor && + (ctx != RemoveContext::kEviction || !it.isNvmClean() || + it.isNvmEvicted())) { + try { + config_.itemDestructor(DestructorData{ + ctx, it, viewAsChainedAllocsRange(it), allocInfo.poolId}); + stats().numRamDestructorCalls.inc(); + } catch (const std::exception& e) { + stats().numDestructorExceptions.inc(); + XLOG_EVERY_N(INFO, 100) + << "Catch exception from user's item destructor: " << e.what(); + } + } + + // If no `toRecycle` is set, then the result is kReleased + // Because this function cannot fail to release "it" + ReleaseRes res = + toRecycle == nullptr ? ReleaseRes::kReleased : ReleaseRes::kNotRecycled; + + // Free chained allocs if there are any + if (it.hasChainedItem()) { + // At this point, the parent is only accessible within this thread + // and thus no one else can add or remove any chained items associated + // with this parent. So we're free to go through the list and free + // chained items one by one. + auto headHandle = findChainedItem(it); + ChainedItem* head = &headHandle.get()->asChainedItem(); + headHandle.reset(); + + if (head == nullptr || &head->getParentItem(compressor_) != &it) { + throw std::runtime_error(folly::sformat( + "Mismatch parent pointer. This should not happen. 
Key: {}", + it.getKey())); + } + + if (!chainedItemAccessContainer_->remove(*head)) { + throw std::runtime_error(folly::sformat( + "Chained item associated with {} cannot be removed from hash table " + "This should not happen here.", + it.getKey())); + } + + while (head) { + auto next = head->getNext(compressor_); + + const auto childInfo = + allocator_[tid]->getAllocInfo(static_cast(head)); + (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( + util::getFragmentation(*this, *head)); + + removeFromMMContainer(*head); + + // If this chained item is marked as moving, we will not free it. + // We must capture the moving state before we do the decRef when + // we know the item must still be valid + const bool wasMoving = head->isMoving(); + + // Decref and check if we were the last reference. Now if the item + // was marked moving, after decRef, it will be free to be released + // by slab release thread + const auto childRef = head->decRef(); + + // If the item is already moving and we already decremented the + // refcount, we don't need to free this item. We'll let the slab + // release thread take care of that + if (!wasMoving) { + if (childRef != 0) { + throw std::runtime_error(folly::sformat( + "chained item refcount is not zero. We cannot proceed! " + "Ref: {}, Chained Item: {}", + childRef, head->toString())); + } + + // Item is not moving and refcount is 0, we can proceed to + // free it or recylce the memory + if (head == toRecycle) { + XDCHECK(ReleaseRes::kReleased != res); + res = ReleaseRes::kRecycled; + } else { + allocator_[tid]->free(head); + } + } + + stats_.numChainedChildItems.dec(); + head = next; + } + stats_.numChainedParentItems.dec(); + } + + if (&it == toRecycle) { + XDCHECK(ReleaseRes::kReleased != res); + res = ReleaseRes::kRecycled; + } else { + XDCHECK(it.isDrained()); + allocator_[tid]->free(&it); + } + + return res; +} + +template +void CacheAllocator::incRef(Item& it) { + it.incRef(); + ++handleCount_.tlStats(); +} + +template +RefcountWithFlags::Value CacheAllocator::decRef(Item& it) { + const auto ret = it.decRef(); + // do this after we ensured that we incremented a reference. + --handleCount_.tlStats(); + return ret; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::acquire(Item* it) { + if (UNLIKELY(!it)) { + return WriteHandle{}; + } + + SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; + + incRef(*it); + return WriteHandle{it, *this}; +} + +template +void CacheAllocator::release(Item* it, bool isNascent) { + // decrement the reference and if it drops to 0, release it back to the + // memory allocator, and invoke the removal callback if there is one. + if (UNLIKELY(!it)) { + return; + } + + const auto ref = decRef(*it); + + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*it, RemoveContext::kNormal, isNascent); + XDCHECK(res == ReleaseRes::kReleased); + } +} + +template +bool CacheAllocator::removeFromMMContainer(Item& item) { + // remove it from the mm container. 
+ if (item.isInMMContainer()) { + auto& mmContainer = getMMContainer(item); + return mmContainer.remove(item); + } + return false; +} + +template +bool CacheAllocator::replaceInMMContainer(Item& oldItem, + Item& newItem) { + auto& oldContainer = getMMContainer(oldItem); + auto& newContainer = getMMContainer(newItem); + if (&oldContainer == &newContainer) { + return oldContainer.replace(oldItem, newItem); + } else { + return oldContainer.remove(oldItem) && newContainer.add(newItem); + } +} + +template +bool CacheAllocator::replaceInMMContainer(Item* oldItem, + Item& newItem) { + return replaceInMMContainer(*oldItem, newItem); +} + +template +bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt, + Item& newItem) { + auto& oldContainer = getMMContainer(*oldItemIt); + auto& newContainer = getMMContainer(newItem); + + // This function is used for eviction across tiers + XDCHECK(&oldContainer != &newContainer); + oldContainer.remove(oldItemIt); + + return newContainer.add(newItem); +} + +template +bool CacheAllocator::replaceChainedItemInMMContainer( + Item& oldItem, Item& newItem) { + auto& oldMMContainer = getMMContainer(oldItem); + auto& newMMContainer = getMMContainer(newItem); + if (&oldMMContainer == &newMMContainer) { + return oldMMContainer.replace(oldItem, newItem); + } else { + if (!oldMMContainer.remove(oldItem)) { + return false; + } + + // This cannot fail because a new item should not have been inserted + const auto newRes = newMMContainer.add(newItem); + XDCHECK(newRes); + return true; + } +} + +template +void CacheAllocator::insertInMMContainer(Item& item) { + XDCHECK(!item.isInMMContainer()); + auto& mmContainer = getMMContainer(item); + if (!mmContainer.add(item)) { + throw std::runtime_error(folly::sformat( + "Invalid state. Node {} was already in the container.", &item)); + } +} + +/** + * There is a potential race with inserts and removes that. While T1 inserts + * the key, there is T2 that removes the key. There can be an interleaving of + * inserts and removes into the MM and Access Conatainers.It does not matter + * what the outcome of this race is (ie key can be present or not present). + * However, if the key is not accessible, it should also not be in + * MMContainer. To ensure that, we always add to MMContainer on inserts before + * adding to the AccessContainer. Both the code paths on success/failure, + * preserve the appropriate state in the MMContainer. Note that this insert + * will also race with the removes we do in SlabRebalancing code paths. + */ + +template +bool CacheAllocator::insert(const WriteHandle& handle) { + return insertImpl(handle, AllocatorApiEvent::INSERT); +} + +template +bool CacheAllocator::insertImpl(const WriteHandle& handle, + AllocatorApiEvent event) { + XDCHECK(handle); + XDCHECK(event == AllocatorApiEvent::INSERT || + event == AllocatorApiEvent::INSERT_FROM_NVM); + if (handle->isAccessible()) { + throw std::invalid_argument("Handle is already accessible"); + } + + if (nvmCache_ != nullptr && !handle->isNvmClean()) { + throw std::invalid_argument("Can't use insert API with nvmCache enabled"); + } + + // insert into the MM container before we make it accessible. Find will + // return this item as soon as it is accessible. + insertInMMContainer(*(handle.getInternal())); + + AllocatorApiResult result; + if (!accessContainer_->insert(*(handle.getInternal()))) { + // this should destroy the handle and release it back to the allocator. 
+ removeFromMMContainer(*(handle.getInternal())); + result = AllocatorApiResult::FAILED; + } else { + handle.unmarkNascent(); + result = AllocatorApiResult::INSERTED; + } + + if (auto eventTracker = getEventTracker()) { + eventTracker->record(event, handle->getKey(), result, handle->getSize(), + handle->getConfiguredTTL().count()); + } + + return result == AllocatorApiResult::INSERTED; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::insertOrReplace(const WriteHandle& handle) { + XDCHECK(handle); + if (handle->isAccessible()) { + throw std::invalid_argument("Handle is already accessible"); + } + + HashedKey hk{handle->getKey()}; + + insertInMMContainer(*(handle.getInternal())); + WriteHandle replaced; + try { + auto lock = nvmCache_ ? nvmCache_->getItemDestructorLock(hk) + : std::unique_lock(); + + replaced = accessContainer_->insertOrReplace(*(handle.getInternal())); + + if (replaced && replaced->isNvmClean() && !replaced->isNvmEvicted()) { + // item is to be replaced and the destructor will be executed + // upon memory released, mark it in nvm to avoid destructor + // executed from nvm + nvmCache_->markNvmItemRemovedLocked(hk); + } + } catch (const std::exception&) { + removeFromMMContainer(*(handle.getInternal())); + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::INSERT_OR_REPLACE, + handle->getKey(), + AllocatorApiResult::FAILED, + handle->getSize(), + handle->getConfiguredTTL().count()); + } + throw; + } + + // Remove from LRU as well if we do have a handle of old item + if (replaced) { + removeFromMMContainer(*replaced); + } + + if (UNLIKELY(nvmCache_ != nullptr)) { + // We can avoid nvm delete only if we have non nvm clean item in cache. + // In all other cases we must enqueue delete. + if (!replaced || replaced->isNvmClean()) { + nvmCache_->remove(hk, nvmCache_->createDeleteTombStone(hk)); + } + } + + handle.unmarkNascent(); + + if (auto eventTracker = getEventTracker()) { + XDCHECK(handle); + const auto result = + replaced ? AllocatorApiResult::REPLACED : AllocatorApiResult::INSERTED; + eventTracker->record(AllocatorApiEvent::INSERT_OR_REPLACE, handle->getKey(), + result, handle->getSize(), + handle->getConfiguredTTL().count()); + } + + return replaced; +} + +/* Next two methods are used to asynchronously move Item between memory tiers. + * + * The thread, which moves Item, allocates new Item in the tier we are moving to + * and calls moveRegularItemWithSync() method. This method does the following: + * 1. Create MoveCtx and put it to the movesMap. + * 2. Update the access container with the new item from the tier we are + * moving to. This Item has kIncomplete flag set. + * 3. Copy data from the old Item to the new one. + * 4. Unset the kIncomplete flag and Notify MoveCtx + * + * Concurrent threads which are getting handle to the same key: + * 1. When a handle is created it checks if the kIncomplete flag is set + * 2. If so, Handle implementation creates waitContext and adds it to the + * MoveCtx by calling addWaitContextForMovingItem() method. + * 3. Wait until the moving thread will complete its job. 
+ */ +template +bool CacheAllocator::addWaitContextForMovingItem( + folly::StringPiece key, std::shared_ptr> waiter) { + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + auto lock = getMoveLockForShard(shard); + auto it = movesMap.find(key); + if (it == movesMap.end()) { + return false; + } + auto ctx = it->second.get(); + ctx->addWaiter(std::move(waiter)); + return true; +} + +template +template +typename CacheAllocator::WriteHandle +CacheAllocator::moveRegularItemWithSync( + Item& oldItem, WriteHandle& newItemHdl, P&& predicate) { + XDCHECK(oldItem.isMoving()); + // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ + // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; + + if (!oldItem.isAccessible() || oldItem.isExpired()) { + return {}; + } + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + XDCHECK_NE(getTierId(oldItem), getTierId(*newItemHdl)); + + // take care of the flags before we expose the item to be accessed. this + // will ensure that when another thread removes the item from RAM, we issue + // a delete accordingly. See D7859775 for an example + if (oldItem.isNvmClean()) { + newItemHdl->markNvmClean(); + } + + folly::StringPiece key(oldItem.getKey()); + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + MoveCtx* ctx(nullptr); + { + auto lock = getMoveLockForShard(shard); + auto res = movesMap.try_emplace(key, std::make_unique()); + if (!res.second) { + return {}; + } + ctx = res.first->second.get(); + } + + auto resHdl = WriteHandle{}; + auto guard = folly::makeGuard([key, this, ctx, shard, &resHdl]() { + auto& movesMap = getMoveMapForShard(shard); + if (resHdl) + resHdl->unmarkIncomplete(); + auto lock = getMoveLockForShard(shard); + ctx->setItemHandle(std::move(resHdl)); + movesMap.erase(key); + }); + + // TODO: Possibly we can use markMoving() instead. But today + // moveOnSlabRelease logic assume that we mark as moving old Item + // and than do copy and replace old Item with the new one in access + // container. Furthermore, Item can be marked as Moving only + // if it is linked to MM container. In our case we mark the new Item + // and update access container before the new Item is ready (content is + // copied). + newItemHdl->markIncomplete(); + + // Inside the access container's lock, this checks if the old item is + // accessible and its refcount is zero. If the item is not accessible, + // there is no point to replace it since it had already been removed + // or in the process of being removed. If the item is in cache but the + // refcount is non-zero, it means user could be attempting to remove + // this item through an API such as remove(ItemHandle). In this case, + // it is unsafe to replace the old item with a new one, so we should + // also abort. + if (!accessContainer_->replaceIf(oldItem, *newItemHdl, + predicate)) { + return {}; + } + + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. 
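The wait-context protocol described in the comment above (register a MoveCtx in the per-shard movesMap, let racing readers attach waiters via addWaitContextForMovingItem(), publish the finished handle once the copy completes) can be illustrated with a stripped-down, self-contained sketch. MoveCtx and movesMap mirror names used in this patch, but publish(), wait(), shardOf() and the string payload are invented stand-ins for the real handle and waiter machinery:

#include <condition_variable>
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>

// Simplified stand-in for the patch's MoveCtx: the mover publishes a result
// once the copy is done; readers that raced with the move block on it instead
// of observing a half-copied item.
struct MoveCtx {
  std::mutex m;
  std::condition_variable cv;
  bool done = false;
  std::string result;  // stands in for the WriteHandle handed to waiters

  void publish(std::string value) {
    {
      std::lock_guard<std::mutex> g(m);
      result = std::move(value);
      done = true;
    }
    cv.notify_all();
  }
  std::string wait() {
    std::unique_lock<std::mutex> g(m);
    cv.wait(g, [this] { return done; });
    return result;
  }
};

constexpr size_t kShards = 8;
std::mutex shardLock[kShards];
std::map<std::string, std::shared_ptr<MoveCtx>> movesMap[kShards];

size_t shardOf(const std::string& key) {
  return std::hash<std::string>{}(key) % kShards;
}

// Reader path (cf. addWaitContextForMovingItem): if a move is in flight for
// this key, attach to its context and wait for the published result.
std::optional<std::string> waitIfMoving(const std::string& key) {
  std::shared_ptr<MoveCtx> ctx;
  {
    auto s = shardOf(key);
    std::lock_guard<std::mutex> g(shardLock[s]);
    auto it = movesMap[s].find(key);
    if (it == movesMap[s].end()) return std::nullopt;
    ctx = it->second;
  }
  return ctx->wait();
}

// Mover path: register the context under the shard lock, copy outside it,
// then publish and erase; readers that find no entry simply fall back to a
// normal lookup.
void moveItem(const std::string& key) {
  auto s = shardOf(key);
  auto ctx = std::make_shared<MoveCtx>();
  {
    std::lock_guard<std::mutex> g(shardLock[s]);
    movesMap[s].emplace(key, ctx);
  }
  std::string copied = "copied-payload-of-" + key;  // the memcpy/moveCb step
  ctx->publish(std::move(copied));
  std::lock_guard<std::mutex> g(shardLock[s]);
  movesMap[s].erase(key);
}

int main() {
  std::thread mover([] { moveItem("key1"); });
  std::thread reader([] {
    if (auto v = waitIfMoving("key1")) {
      std::printf("waited and got: %s\n", v->c_str());
    } else {
      std::printf("no move in flight; do a normal find\n");
    }
  });
  mover.join();
  reader.join();
}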
+ config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } + + // Inside the MM container's lock, this checks if the old item exists to + // make sure that no other thread removed it, and only then replaces it. + if (!replaceInMMContainer(oldItem, *newItemHdl)) { + accessContainer_->remove(*newItemHdl); + return {}; + } + + // Replacing into the MM container was successful, but someone could have + // called insertOrReplace() or remove() before or after the + // replaceInMMContainer() operation, which would invalidate newItemHdl. + if (!newItemHdl->isAccessible()) { + removeFromMMContainer(*newItemHdl); + return {}; + } + + // no one can add or remove chained items at this point + if (oldItem.hasChainedItem()) { + // safe to acquire handle for a moving Item + auto oldHandle = acquire(&oldItem); + XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); + try { + auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); + transferChainLocked(oldHandle, newItemHdl); + } catch (const std::exception& e) { + // this should never happen because we drained all the handles. + XLOGF(DFATAL, "{}", e.what()); + throw; + } + + XDCHECK(!oldItem.hasChainedItem()); + XDCHECK(newItemHdl->hasChainedItem()); + } + newItemHdl.unmarkNascent(); + resHdl = std::move(newItemHdl); // guard will assign it to ctx under lock + return acquire(&oldItem); +} + +template +bool CacheAllocator::moveRegularItem(Item& oldItem, + WriteHandle& newItemHdl) { + XDCHECK(config_.moveCb); + util::LatencyTracker tracker{stats_.moveRegularLatency_}; + + if (!oldItem.isAccessible() || oldItem.isExpired()) { + return false; + } + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + XDCHECK_EQ(reinterpret_cast(&getMMContainer(oldItem)), + reinterpret_cast(&getMMContainer(*newItemHdl))); + + // take care of the flags before we expose the item to be accessed. this + // will ensure that when another thread removes the item from RAM, we issue + // a delete accordingly. See D7859775 for an example + if (oldItem.isNvmClean()) { + newItemHdl->markNvmClean(); + } + + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + + // Inside the access container's lock, this checks if the old item is + // accessible and its refcount is zero. If the item is not accessible, + // there is no point to replace it since it had already been removed + // or in the process of being removed. If the item is in cache but the + // refcount is non-zero, it means user could be attempting to remove + // this item through an API such as remove(itemHandle). In this case, + // it is unsafe to replace the old item with a new one, so we should + // also abort. + if (!accessContainer_->replaceIf(oldItem, *newItemHdl, itemMovingPredicate)) { + return false; + } + + // Inside the MM container's lock, this checks if the old item exists to + // make sure that no other thread removed it, and only then replaces it. 
+ if (!replaceInMMContainer(oldItem, *newItemHdl)) { + accessContainer_->remove(*newItemHdl); + return false; + } + + // Replacing into the MM container was successful, but someone could have + // called insertOrReplace() or remove() before or after the + // replaceInMMContainer() operation, which would invalidate newItemHdl. + if (!newItemHdl->isAccessible()) { + removeFromMMContainer(*newItemHdl); + return false; + } + + // no one can add or remove chained items at this point + if (oldItem.hasChainedItem()) { + // safe to acquire handle for a moving Item + auto oldHandle = acquire(&oldItem); + XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); + try { + auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); + transferChainLocked(oldHandle, newItemHdl); + } catch (const std::exception& e) { + // this should never happen because we drained all the handles. + XLOGF(DFATAL, "{}", e.what()); + throw; + } + + XDCHECK(!oldItem.hasChainedItem()); + XDCHECK(newItemHdl->hasChainedItem()); + } + newItemHdl.unmarkNascent(); + return true; +} + +template +bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, + WriteHandle& newItemHdl) { + XDCHECK(config_.moveCb); + util::LatencyTracker tracker{stats_.moveChainedLatency_}; + + // This item has been unlinked from its parent and we're the only + // owner of it, so we're done here + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { + return false; + } + + const auto parentKey = oldItem.getParentItem(compressor_).getKey(); + + // Grab lock to prevent anyone else from modifying the chain + auto l = chainedItemLocks_.lockExclusive(parentKey); + + auto parentHandle = + validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); + if (!parentHandle) { + return false; + } + + // once we have the moving sync and valid parent for the old item, check if + // the original allocation was made correctly. If not, we destroy the + // allocation to indicate a retry to moving logic above. + if (reinterpret_cast( + &newItemHdl->asChainedItem().getParentItem(compressor_)) != + reinterpret_cast(&parentHandle->asChainedItem())) { + newItemHdl.reset(); + return false; + } + + XDCHECK_EQ(reinterpret_cast( + &newItemHdl->asChainedItem().getParentItem(compressor_)), + reinterpret_cast(&parentHandle->asChainedItem())); + + // In case someone else had removed this chained item from its parent by now + // So we check again to see if the it has been unlinked from its parent + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { + return false; + } + + auto parentPtr = parentHandle.getInternal(); + + XDCHECK_EQ(reinterpret_cast(parentPtr), + reinterpret_cast(&oldItem.getParentItem(compressor_))); + + // Invoke the move callback to fix up any user data related to the chain + config_.moveCb(oldItem, *newItemHdl, parentPtr); + + // Replace the new item in the position of the old one before both in the + // parent's chain and the MMContainer. 
+ auto oldItemHandle = + replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); + XDCHECK(oldItemHandle->isMoving()); + XDCHECK(!oldItemHandle->isInMMContainer()); + + return true; +} + +template +typename CacheAllocator::Item* +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { + auto& mmContainer = getMMContainer(tid, pid, cid); + + // Keep searching for a candidate until we were able to evict it + // or until the search limit has been exhausted + unsigned int searchTries = 0; + while ((config_.evictionSearchTries == 0 || + config_.evictionSearchTries > searchTries)) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + + Item* toRecycle = nullptr; + Item* candidate = nullptr; + + mmContainer.withEvictionIterator([this, &candidate, &toRecycle, &searchTries](auto &&itr){ + while ((config_.evictionSearchTries == 0 || + config_.evictionSearchTries > searchTries) && itr) { + ++searchTries; + + auto *toRecycle_ = itr.get(); + auto *candidate_ = toRecycle_->isChainedItem() + ? &toRecycle_->asChainedItem().getParentItem(compressor_) + : toRecycle_; + + // make sure no other thead is evicting the item + if (candidate_->getRefCount() == 0 && candidate_->markMoving()) { + toRecycle = toRecycle_; + candidate = candidate_; + return; + } + + ++itr; + } + }); + + if (!toRecycle) + continue; + + XDCHECK(toRecycle); + XDCHECK(candidate); + + // for chained items, the ownership of the parent can change. We try to + // evict what we think as parent and see if the eviction of parent + // recycles the child we intend to. + auto toReleaseHandle = + evictNormalItem(*candidate, true /* skipIfTokenInvalid */); + auto ref = candidate->unmarkMoving(); + + if (toReleaseHandle || ref == 0u) { + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[pid][cid].inc(); + } + } else { + if (candidate->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } + } + + if (toReleaseHandle) { + if (auto eventTracker = getEventTracker()) { + eventTracker->record( + AllocatorApiEvent::DRAM_EVICT, toReleaseHandle->getKey(), + AllocatorApiResult::EVICTED, toReleaseHandle->getSize(), + toReleaseHandle->getConfiguredTTL().count()); + } + + XDCHECK(toReleaseHandle.get() == candidate); + XDCHECK(toRecycle == candidate || toRecycle->isChainedItem()); + XDCHECK_EQ(1u, toReleaseHandle->getRefCount()); + + // We manually release the item here because we don't want to + // invoke the Item Handle's destructor which will be decrementing + // an already zero refcount, which will throw exception + auto& itemToRelease = *toReleaseHandle.release(); + + // Decrementing the refcount because we want to recycle the item + const auto ref = decRef(itemToRelease); + XDCHECK_EQ(0u, ref); + + // check if by releasing the item we intend to, we actually + // recycle the candidate. + if (ReleaseRes::kRecycled == + releaseBackToAllocator(itemToRelease, RemoveContext::kEviction, + /* isNascent */ false, toRecycle)) { + return toRecycle; + } + } else if (ref == 0u) { + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. 
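The candidate search inside findEviction() above reduces to: walk the MM container from its eviction end, skip anything that still holds references, and claim the first idle entry with markMoving() so no other thread can free or evict it concurrently. A toy, self-contained version of that walk; ToyItem, tryMarkMoving and findEvictionCandidate are invented names, and the real code additionally redirects chained items to their parent and bounds the walk with config_.evictionSearchTries:

#include <atomic>
#include <cstdio>
#include <list>
#include <string>

struct ToyItem {
  std::string key;
  std::atomic<int> refCount{0};
  std::atomic<bool> moving{false};

  // Rough stand-in for "refcount is zero and markMoving() succeeded".
  bool tryMarkMoving() {
    if (refCount.load() != 0) return false;
    bool expected = false;
    return moving.compare_exchange_strong(expected, true);
  }
};

ToyItem* findEvictionCandidate(std::list<ToyItem>& lru, unsigned maxTries) {
  unsigned tries = 0;
  // rbegin() is the least recently used end of the list.
  for (auto it = lru.rbegin(); it != lru.rend() && tries < maxTries;
       ++it, ++tries) {
    if (it->tryMarkMoving()) return &*it;
  }
  return nullptr;  // everything inspected was pinned by readers
}

int main() {
  std::list<ToyItem> lru;
  auto add = [&lru](const char* key, int refs) {
    lru.emplace_back();
    lru.back().key = key;
    lru.back().refCount = refs;
  };
  add("hot", 2);   // most recently used, pinned
  add("warm", 0);  // idle
  add("cold", 1);  // least recently used, but pinned

  if (auto* victim = findEvictionCandidate(lru, 10)) {
    std::printf("evicting %s\n", victim->key.c_str());  // prints "warm"
  }
}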
+ if (ReleaseRes::kRecycled == + releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle)) { + return toRecycle; + } + } + } + return nullptr; +} + +template +folly::Range::ChainedItemIter> +CacheAllocator::viewAsChainedAllocsRange(const Item& parent) const { + return parent.hasChainedItem() + ? folly::Range{ChainedItemIter{ + findChainedItem(parent).get(), + compressor_}, + ChainedItemIter{}} + : folly::Range{}; +} + +template +bool CacheAllocator::shouldWriteToNvmCache(const Item& item) { + // write to nvmcache when it is enabled and the item says that it is not + // nvmclean or evicted by nvm while present in DRAM. + bool doWrite = nvmCache_ && nvmCache_->isEnabled(); + if (!doWrite) { + return false; + } + + doWrite = !item.isExpired(); + if (!doWrite) { + stats_.numNvmRejectsByExpiry.inc(); + return false; + } + + doWrite = (!item.isNvmClean() || item.isNvmEvicted()); + if (!doWrite) { + stats_.numNvmRejectsByClean.inc(); + return false; + } + return true; +} + +template +bool CacheAllocator::shouldWriteToNvmCacheExclusive( + const Item& item) { + auto chainedItemRange = viewAsChainedAllocsRange(item); + + if (nvmAdmissionPolicy_ && + !nvmAdmissionPolicy_->accept(item, chainedItemRange)) { + stats_.numNvmRejectsByAP.inc(); + return false; + } + + return true; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + if(item.isChainedItem()) return {}; // TODO: We do not support ChainedItem yet + if(item.isExpired()) return acquire(&item); + + TierId nextTier = tid; // TODO - calculate this based on some admission policy + while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(nextTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + return moveRegularItemWithSync(item, newItemHdl, itemMovingPredicate); + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread); +} + +template +bool +CacheAllocator::tryPromoteToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + TierId nextTier = tid; + while (nextTier > 0) { // try to evict down to the next memory tiers + auto toPromoteTier = nextTier - 1; + --nextTier; + + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(toPromoteTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + auto predicate = [&](const Item& item){ + return item.getRefCount() == 0 || config_.numDuplicateElements > 0; + }; + if (moveRegularItemWithSync(item, newItemHdl, predicate)) { + return true; + } + } + } + + return false; +} + +template +bool +CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread); +} + + +template +typename CacheAllocator::RemoveRes 
+CacheAllocator::remove(typename Item::Key key) { + // While we issue this delete, there can be potential races that change the + // state of the cache between ram and nvm. If we find the item in RAM and + // obtain a handle, the situation is simpler. The complicated ones are the + // following scenarios where when the delete checks RAM, we don't find + // anything in RAM. The end scenario is that in the absence of any + // concurrent inserts, after delete, there should be nothing in nvm and ram. + // + // == Racing async fill from nvm with delete == + // 1. T1 finds nothing in ram and issues a nvmcache look that is async. We + // enqueue the get holding the fill lock and drop it. + // 2. T2 finds nothing in ram, enqueues delete to nvmcache. + // 3. T1's async fetch finishes and fills the item in cache, but right + // before the delete is enqueued above + // + // To deal with this race, we first enqueue the nvmcache delete tombstone + // and when we finish the async fetch, we check if a tombstone was enqueued + // meanwhile and cancel the fill. + // + // == Racing async fill from nvm with delete == + // there is a key in nvmcache and nothing in RAM. + // 1. T1 issues delete while nothing is in RAM and enqueues nvm cache + // remove + // 2. before the nvmcache remove gets enqueued, T2 does a find() that + // fetches from nvm. + // 3. T2 inserts in cache from nvmcache and T1 observes that item and tries + // to remove it only from RAM. + // + // to fix this, we do the nvmcache remove always the last thing and enqueue + // a tombstone to avoid concurrent fills while we are in the process of + // doing the nvmcache remove. + // + // == Racing eviction with delete == + // 1. T1 is evicting an item, trying to remove from the hashtable and is in + // the process of enqueing the put to nvmcache. + // 2. T2 is removing and finds nothing in ram, enqueue the nvmcache delete. + // The delete to nvmcache gets enqueued after T1 fills in ram. + // + // If T2 finds the item in ram, eviction can not proceed and the race does + // not exist. If T2 does not find anything in RAM, it is likely that T1 is + // in the process of issuing an nvmcache put. In this case, T1's nvmcache + // put will check if there was a delete enqueued while the eviction was in + // flight after removing from the hashtable. + // + stats_.numCacheRemoves.inc(); + HashedKey hk{key}; + + using Guard = typename NvmCacheT::DeleteTombStoneGuard; + auto tombStone = nvmCache_ ? 
nvmCache_->createDeleteTombStone(hk) : Guard{}; + + auto handle = findInternal(key); + if (!handle) { + if (nvmCache_) { + nvmCache_->remove(hk, std::move(tombStone)); + } + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::REMOVE, key, + AllocatorApiResult::NOT_FOUND); + } + return RemoveRes::kNotFoundInRam; + } + + return removeImpl(hk, *handle, std::move(tombStone)); +} + +template +bool CacheAllocator::removeFromRamForTesting( + typename Item::Key key) { + return removeImpl(HashedKey{key}, *findInternal(key), DeleteTombStoneGuard{}, + false /* removeFromNvm */) == RemoveRes::kSuccess; +} + +template +void CacheAllocator::removeFromNvmForTesting( + typename Item::Key key) { + if (nvmCache_) { + HashedKey hk{key}; + nvmCache_->remove(hk, nvmCache_->createDeleteTombStone(hk)); + } +} + +template +bool CacheAllocator::pushToNvmCacheFromRamForTesting( + typename Item::Key key) { + auto handle = findInternal(key); + + if (handle && nvmCache_ && shouldWriteToNvmCache(*handle) && + shouldWriteToNvmCacheExclusive(*handle)) { + nvmCache_->put(handle, nvmCache_->createPutToken(handle->getKey())); + return true; + } + return false; +} + +template +void CacheAllocator::flushNvmCache() { + if (nvmCache_) { + nvmCache_->flushPendingOps(); + } +} + +template +typename CacheAllocator::RemoveRes +CacheAllocator::remove(AccessIterator& it) { + stats_.numCacheRemoves.inc(); + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::REMOVE, it->getKey(), + AllocatorApiResult::REMOVED, it->getSize(), + it->getConfiguredTTL().count()); + } + HashedKey hk{it->getKey()}; + auto tombstone = + nvmCache_ ? nvmCache_->createDeleteTombStone(hk) : DeleteTombStoneGuard{}; + return removeImpl(hk, *it, std::move(tombstone)); +} + +template +typename CacheAllocator::RemoveRes +CacheAllocator::remove(const ReadHandle& it) { + stats_.numCacheRemoves.inc(); + if (!it) { + throw std::invalid_argument("Trying to remove a null item handle"); + } + HashedKey hk{it->getKey()}; + auto tombstone = + nvmCache_ ? nvmCache_->createDeleteTombStone(hk) : DeleteTombStoneGuard{}; + return removeImpl(hk, *(it.getInternal()), std::move(tombstone)); +} + +template +typename CacheAllocator::RemoveRes +CacheAllocator::removeImpl(HashedKey hk, + Item& item, + DeleteTombStoneGuard tombstone, + bool removeFromNvm, + bool recordApiEvent) { + bool success = false; + { + auto lock = nvmCache_ ? nvmCache_->getItemDestructorLock(hk) + : std::unique_lock(); + + success = accessContainer_->remove(item); + + if (removeFromNvm && success && item.isNvmClean() && !item.isNvmEvicted()) { + // item is to be removed and the destructor will be executed + // upon memory released, mark it in nvm to avoid destructor + // executed from nvm + nvmCache_->markNvmItemRemovedLocked(hk); + } + } + XDCHECK(!item.isAccessible()); + + // remove it from the mm container. this will be no-op if it is already + // removed. + removeFromMMContainer(item); + + // Enqueue delete to nvmCache if we know from the item that it was pulled in + // from NVM. If the item was not pulled in from NVM, it is not possible to + // have it be written to NVM. + if (removeFromNvm && item.isNvmClean()) { + XDCHECK(tombstone); + nvmCache_->remove(hk, std::move(tombstone)); + } + + auto eventTracker = getEventTracker(); + if (recordApiEvent && eventTracker) { + const auto result = + success ? 
AllocatorApiResult::REMOVED : AllocatorApiResult::NOT_FOUND; + eventTracker->record(AllocatorApiEvent::REMOVE, item.getKey(), result, + item.getSize(), item.getConfiguredTTL().count()); + } + + // the last guy with reference to the item will release it back to the + // allocator. + if (success) { + stats_.numCacheRemoveRamHits.inc(); + return RemoveRes::kSuccess; + } + return RemoveRes::kNotFoundInRam; +} + +template +void CacheAllocator::invalidateNvm(Item& item) { + if (nvmCache_ != nullptr && item.isAccessible() && item.isNvmClean()) { + HashedKey hk{item.getKey()}; + { + auto lock = nvmCache_->getItemDestructorLock(hk); + if (!item.isNvmEvicted() && item.isNvmClean() && item.isAccessible()) { + // item is being updated and invalidated in nvm. Mark the item to avoid + // destructor to be executed from nvm + nvmCache_->markNvmItemRemovedLocked(hk); + } + item.unmarkNvmClean(); + } + nvmCache_->remove(hk, nvmCache_->createDeleteTombStone(hk)); + } +} + +template +TierId +CacheAllocator::getTierId(const Item& item) const { + return getTierId(item.getMemory()); +} + +template +TierId +CacheAllocator::getTierId(const void* ptr) const { + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if (allocator_[tid]->isMemoryInAllocator(ptr)) + return tid; + } + + throw std::invalid_argument("Item does not belong to any tier!"); +} + +template +typename CacheAllocator::MMContainer& +CacheAllocator::getMMContainer(const Item& item) const noexcept { + const auto tid = getTierId(item); + const auto allocInfo = + allocator_[tid]->getAllocInfo(static_cast(&item)); + return getMMContainer(tid, allocInfo.poolId, allocInfo.classId); +} + +template +typename CacheAllocator::MMContainer& +CacheAllocator::getMMContainer(TierId tid, + PoolId pid, + ClassId cid) const noexcept { + XDCHECK_LT(static_cast(tid), mmContainers_.size()); + XDCHECK_LT(static_cast(pid), mmContainers_[tid].size()); + XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size()); + return *mmContainers_[tid][pid][cid]; +} + +template +MMContainerStat CacheAllocator::getMMContainerStat( + TierId tid, PoolId pid, ClassId cid) const noexcept { + if(static_cast(tid) >= mmContainers_.size()) { + return MMContainerStat{}; + } + if (static_cast(pid) >= mmContainers_[tid].size()) { + return MMContainerStat{}; + } + if (static_cast(cid) >= mmContainers_[tid][pid].size()) { + return MMContainerStat{}; + } + return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats() + : MMContainerStat{}; +} + +template +typename CacheAllocator::ReadHandle +CacheAllocator::peek(typename Item::Key key) { + return findInternal(key); +} + +template +std::pair::ReadHandle, + typename CacheAllocator::ReadHandle> +CacheAllocator::inspectCache(typename Item::Key key) { + std::pair res; + res.first = findInternal(key); + res.second = nvmCache_ ? nvmCache_->peek(key) : nullptr; + return res; +} + +// findFast and find() are the most performance critical parts of +// CacheAllocator. Hence the sprinkling of UNLIKELY/LIKELY to tell the +// compiler which executions we don't want to optimize on. 
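As a minimal illustration of the hinting mentioned in the comment above: folly's UNLIKELY/LIKELY macros on the find path play the same role as the standard C++20 [[likely]]/[[unlikely]] attributes, steering code layout so the hit path is the fall-through case. A small self-contained sketch, with a made-up map and keys:

#include <cstdio>
#include <optional>
#include <string>
#include <unordered_map>

std::unordered_map<std::string, std::string> table = {{"k", "v"}};

std::optional<std::string> findValue(const std::string& key) {
  auto it = table.find(key);
  if (it == table.end()) [[unlikely]] {
    // Miss path: marked unlikely, kept out of the hot layout.
    return std::nullopt;
  }
  return it->second;  // Hit path: the common, fall-through case.
}

int main() {
  if (auto v = findValue("k")) std::printf("hit: %s\n", v->c_str());
  if (!findValue("missing")) std::printf("miss\n");
}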
+template +typename CacheAllocator::WriteHandle +CacheAllocator::findFastInternal(typename Item::Key key, + AccessMode mode) { + auto handle = findInternal(key); + + stats_.numCacheGets.inc(); + if (UNLIKELY(!handle)) { + stats_.numCacheGetMiss.inc(); + return handle; + } + + markUseful(handle, mode); + return handle; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::findFastImpl(typename Item::Key key, + AccessMode mode) { + auto handle = findFastInternal(key, mode); + auto eventTracker = getEventTracker(); + if (UNLIKELY(eventTracker != nullptr)) { + if (handle) { + eventTracker->record(AllocatorApiEvent::FIND_FAST, key, + AllocatorApiResult::FOUND, + folly::Optional(handle->getSize()), + handle->getConfiguredTTL().count()); + } else { + eventTracker->record(AllocatorApiEvent::FIND_FAST, key, + AllocatorApiResult::NOT_FOUND); + } + } + return handle; +} + +template +typename CacheAllocator::ReadHandle +CacheAllocator::findFast(typename Item::Key key) { + return findFastImpl(key, AccessMode::kRead); +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::findFastToWrite(typename Item::Key key, + bool doNvmInvalidation) { + auto handle = findFastImpl(key, AccessMode::kWrite); + if (handle == nullptr) { + return nullptr; + } + if (doNvmInvalidation) { + invalidateNvm(*handle); + } + return handle; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::findImpl(typename Item::Key key, AccessMode mode) { + auto handle = findFastInternal(key, mode); + + if (handle) { + if (UNLIKELY(handle->isExpired())) { + // update cache miss stats if the item has already been expired. + stats_.numCacheGetMiss.inc(); + stats_.numCacheGetExpiries.inc(); + auto eventTracker = getEventTracker(); + if (UNLIKELY(eventTracker != nullptr)) { + eventTracker->record(AllocatorApiEvent::FIND, key, + AllocatorApiResult::NOT_FOUND); + } + WriteHandle ret; + ret.markExpired(); + return ret; + } + + auto eventTracker = getEventTracker(); + if (UNLIKELY(eventTracker != nullptr)) { + eventTracker->record(AllocatorApiEvent::FIND, key, + AllocatorApiResult::FOUND, handle->getSize(), + handle->getConfiguredTTL().count()); + } + return handle; + } + + auto eventResult = AllocatorApiResult::NOT_FOUND; + + if (nvmCache_) { + handle = nvmCache_->find(HashedKey{key}); + eventResult = AllocatorApiResult::NOT_FOUND_IN_MEMORY; + } + + auto eventTracker = getEventTracker(); + if (UNLIKELY(eventTracker != nullptr)) { + eventTracker->record(AllocatorApiEvent::FIND, key, eventResult); + } + + return handle; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::findToWrite(typename Item::Key key, + bool doNvmInvalidation) { + auto handle = findImpl(key, AccessMode::kWrite); + if (handle == nullptr) { + return nullptr; + } + if (doNvmInvalidation) { + invalidateNvm(*handle); + } + return handle; +} + +template +typename CacheAllocator::ReadHandle +CacheAllocator::find(typename Item::Key key) { + return findImpl(key, AccessMode::kRead); +} + +template +void CacheAllocator::markUseful(const ReadHandle& handle, + AccessMode mode) { + if (!handle) { + return; + } + + auto& item = *(handle.getInternal()); + bool recorded = recordAccessInMMContainer(item, mode); + + // if parent is not recorded, skip children as well when the config is set + if (LIKELY(!item.hasChainedItem() || + (!recorded && config_.isSkipPromoteChildrenWhenParentFailed()))) { + return; + } + + forEachChainedItem(item, [this, mode](ChainedItem& chainedItem) { + recordAccessInMMContainer(chainedItem, mode); + }); +} 
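markUseful() above forwards the access to the item's MM container (and, unless configured to skip it, to each chained child). The essential effect of recordAccess() in an LRU-style container is to promote the entry toward the MRU end; here is a toy sketch of that, assuming a plain LRU policy rather than the patch's MM2Q/TinyLFU variants, with invented ToyLru names:

#include <cstdio>
#include <list>
#include <string>
#include <unordered_map>

class ToyLru {
 public:
  void add(std::string key) {
    order_.push_front(key);
    pos_[std::move(key)] = order_.begin();
  }
  // Promote the accessed entry to the MRU end without reallocating it.
  bool recordAccess(const std::string& key) {
    auto it = pos_.find(key);
    if (it == pos_.end()) return false;
    order_.splice(order_.begin(), order_, it->second);
    return true;
  }
  const std::string& lruVictim() const { return order_.back(); }

 private:
  std::list<std::string> order_;  // front = most recently used
  std::unordered_map<std::string, std::list<std::string>::iterator> pos_;
};

int main() {
  ToyLru lru;
  lru.add("a");
  lru.add("b");
  lru.add("c");           // order: c b a, victim = a
  lru.recordAccess("a");  // order: a c b, victim = b
  std::printf("victim after access: %s\n", lru.lruVictim().c_str());
}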
+ +template +bool CacheAllocator::recordAccessInMMContainer(Item& item, + AccessMode mode) { + const auto tid = getTierId(item); + const auto allocInfo = + allocator_[tid]->getAllocInfo(static_cast(&item)); + (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); + + // track recently accessed items if needed + if (UNLIKELY(config_.trackRecentItemsForDump)) { + ring_->trackItem(reinterpret_cast(&item), item.getSize()); + } + + auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId); + return mmContainer.recordAccess(item, mode); +} + +template +uint32_t CacheAllocator::getUsableSize(const Item& item) const { + const auto tid = getTierId(item); + const auto allocSize = + allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize; + return item.isChainedItem() + ? allocSize - ChainedItem::getRequiredSize(0) + : allocSize - Item::getRequiredSize(item.getKey(), 0); +} + +template +typename CacheAllocator::ReadHandle +CacheAllocator::getSampleItem() { + // TODO: is using random tier a good idea? + auto tid = folly::Random::rand32() % getNumTiers(); + + const auto* item = + reinterpret_cast(allocator_[tid]->getRandomAlloc()); + if (!item) { + return ReadHandle{}; + } + + ReadHandle handle = findInternal(item->getKey()); + // Check that item returned is the same that was sampled + if (handle.get() == item) { + return handle; + } + return ReadHandle{}; +} + +template +std::vector CacheAllocator::dumpEvictionIterator( + PoolId pid, ClassId cid, size_t numItems) { + if (numItems == 0) { + return {}; + } + + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { + throw std::invalid_argument( + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); + } + + std::vector content; + + size_t i = 0; + while (i < numItems && tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + auto evictItr = mm.getEvictionIterator(); + while (evictItr && i < numItems) { + content.push_back(evictItr->toString()); + ++evictItr; + ++i; + } + + --tid; + } + + return content; +} + +template +template +folly::IOBuf CacheAllocator::convertToIOBufT(Handle& handle) { + if (!handle) { + throw std::invalid_argument("null item handle for converting to IOBUf"); + } + + Item* item = handle.getInternal(); + const uint32_t dataOffset = item->getOffsetForMemory(); + + using ConvertChainedItem = std::function( + Item * item, ChainedItem & chainedItem)>; + folly::IOBuf iobuf; + ConvertChainedItem converter; + + // based on current refcount and threshold from config + // determine to use a new Item Handle for each chain items + // or use shared Item Handle for all chain items + if (item->getRefCount() > config_.thresholdForConvertingToIOBuf) { + auto sharedHdl = std::make_shared(std::move(handle)); + + iobuf = folly::IOBuf{ + folly::IOBuf::TAKE_OWNERSHIP, item, + + // Since we'll be moving the IOBuf data pointer forward + // by dataOffset, we need to adjust the IOBuf length + // accordingly + dataOffset + item->getSize(), + + [](void*, void* userData) { + auto* hdl = reinterpret_cast*>(userData); + delete hdl; + } /* freeFunc */, + new std::shared_ptr{sharedHdl} /* userData for freeFunc */}; + + if (item->hasChainedItem()) { + converter = [sharedHdl](Item*, ChainedItem& chainedItem) { + const uint32_t chainedItemDataOffset = chainedItem.getOffsetForMemory(); + + return folly::IOBuf::takeOwnership( + 
&chainedItem, + + // Since we'll be moving the IOBuf data pointer forward by + // dataOffset, + // we need to adjust the IOBuf length accordingly + chainedItemDataOffset + chainedItem.getSize(), + + [](void*, void* userData) { + auto* hdl = reinterpret_cast*>(userData); + delete hdl; + } /* freeFunc */, + new std::shared_ptr{sharedHdl} /* userData for freeFunc */); + }; + } + + } else { + // following IOBuf will take the item's ownership and trigger freeFunc to + // release the reference count. + handle.release(); + iobuf = folly::IOBuf{folly::IOBuf::TAKE_OWNERSHIP, item, + + // Since we'll be moving the IOBuf data pointer forward + // by dataOffset, we need to adjust the IOBuf length + // accordingly + dataOffset + item->getSize(), + + [](void* buf, void* userData) { + Handle{reinterpret_cast(buf), + *reinterpret_cast(userData)} + .reset(); + } /* freeFunc */, + this /* userData for freeFunc */}; + + if (item->hasChainedItem()) { + converter = [this](Item* parentItem, ChainedItem& chainedItem) { + const uint32_t chainedItemDataOffset = chainedItem.getOffsetForMemory(); + + // Each IOBuf converted from a child item will hold one additional + // refcount on the parent item. This ensures that as long as the user + // holds any IOBuf pointing anywhere in the chain, the whole chain + // will not be evicted from cache. + // + // We can safely bump the refcount on the parent here only because + // we already have an item handle on the parent (which has just been + // moved into the IOBuf above). Normally, the only place we can + // bump an item handle safely is through the AccessContainer. + acquire(parentItem).release(); + + return folly::IOBuf::takeOwnership( + &chainedItem, + + // Since we'll be moving the IOBuf data pointer forward by + // dataOffset, + // we need to adjust the IOBuf length accordingly + chainedItemDataOffset + chainedItem.getSize(), + + [](void* buf, void* userData) { + auto* cache = reinterpret_cast(userData); + auto* child = reinterpret_cast(buf); + auto* parent = &child->getParentItem(cache->compressor_); + Handle{parent, *cache}.reset(); + } /* freeFunc */, + this /* userData for freeFunc */); + }; + } + } + + iobuf.trimStart(dataOffset); + iobuf.markExternallySharedOne(); + + if (item->hasChainedItem()) { + auto appendHelper = [&](ChainedItem& chainedItem) { + const uint32_t chainedItemDataOffset = chainedItem.getOffsetForMemory(); + + auto nextChain = converter(item, chainedItem); + + nextChain->trimStart(chainedItemDataOffset); + nextChain->markExternallySharedOne(); + + // Append immediately after the parent, IOBuf will present the data + // in the original insertion order. + // + // i.e. 1. Allocate parent + // 2. add A, add B, add C + // + // In memory: parent -> C -> B -> A + // In IOBuf: parent -> A -> B -> C + iobuf.appendChain(std::move(nextChain)); + }; + + forEachChainedItem(*item, std::move(appendHelper)); + } + + return iobuf; +} + +template +folly::IOBuf CacheAllocator::wrapAsIOBuf(const Item& item) { + folly::IOBuf ioBuf{folly::IOBuf::WRAP_BUFFER, item.getMemory(), + item.getSize()}; + + if (item.hasChainedItem()) { + auto appendHelper = [&](ChainedItem& chainedItem) { + auto nextChain = folly::IOBuf::wrapBuffer(chainedItem.getMemory(), + chainedItem.getSize()); + + // Append immediately after the parent, IOBuf will present the data + // in the original insertion order. + // + // i.e. 1. Allocate parent + // 2. 
add A, add B, add C + // + // In memory: parent -> C -> B -> A + // In IOBuf: parent -> A -> B -> C + ioBuf.appendChain(std::move(nextChain)); + }; + + forEachChainedItem(item, std::move(appendHelper)); + } + return ioBuf; +} + +template +PoolId CacheAllocator::addPool( + folly::StringPiece name, + size_t size, + const std::set& allocSizes, + MMConfig config, + std::shared_ptr rebalanceStrategy, + std::shared_ptr resizeStrategy, + bool ensureProvisionable) { + folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? + // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + + createMMContainers(pid, std::move(config)); + setRebalanceStrategy(pid, std::move(rebalanceStrategy)); + setResizeStrategy(pid, std::move(resizeStrategy)); + + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + + return pid; +} + +template +void CacheAllocator::overridePoolRebalanceStrategy( + PoolId pid, std::shared_ptr rebalanceStrategy) { + if (static_cast(pid) >= mmContainers_[0].size()) { + throw std::invalid_argument(folly::sformat( + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); + } + setRebalanceStrategy(pid, std::move(rebalanceStrategy)); +} + +template +void CacheAllocator::overridePoolResizeStrategy( + PoolId pid, std::shared_ptr resizeStrategy) { + if (static_cast(pid) >= mmContainers_[0].size()) { + throw std::invalid_argument(folly::sformat( + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); + } + setResizeStrategy(pid, std::move(resizeStrategy)); +} + +template +void CacheAllocator::overridePoolOptimizeStrategy( + std::shared_ptr optimizeStrategy) { + setPoolOptimizeStrategy(std::move(optimizeStrategy)); +} + +template +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, + const MMConfig& config) { + // TODO: add generic tier id checking + if (static_cast(pid) >= mmContainers_[tid].size()) { + throw std::invalid_argument(folly::sformat( + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); + } + auto& pool = allocator_[tid]->getPool(pid); + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { + MMConfig mmConfig = config; + mmConfig.addExtraConfig( + config_.trackTailHits + ? pool.getAllocationClass(static_cast(cid)) + .getAllocsPerSlab() + : 0); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); + } +} + +template +void CacheAllocator::createMMContainers(const PoolId pid, + MMConfig config) { + // pools on each layer should have the same number of class id, etc. 
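For the addPool() logic earlier in this hunk: the requested pool size is split across tiers in proportion to each tier's share of the total cache memory. A small worked sketch of that arithmetic, using made-up tier sizes (16 GiB and 48 GiB) and an 8 GiB pool request:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<uint64_t> tierMemSize = {
      16ULL << 30,  // tier 0: 16 GiB of DRAM
      48ULL << 30,  // tier 1: 48 GiB of a slower memory tier
  };
  const uint64_t requestedPoolSize = 8ULL << 30;  // addPool(..., 8 GiB, ...)

  uint64_t totalCacheSize = 0;
  for (auto s : tierMemSize) totalCacheSize += s;

  for (size_t tid = 0; tid < tierMemSize.size(); ++tid) {
    const double ratio =
        static_cast<double>(tierMemSize[tid]) / totalCacheSize;
    const uint64_t tierPoolSize =
        static_cast<uint64_t>(ratio * requestedPoolSize);
    // tier 0 gets 25% -> 2 GiB, tier 1 gets 75% -> 6 GiB
    std::printf("tier %zu: ratio %.2f, pool slice %.2f GiB\n", tid, ratio,
                tierPoolSize / double(1ULL << 30));
  }
}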
+ // TODO: think about deduplication + auto& pool = allocator_[0]->getPool(pid); + + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { + config.addExtraConfig( + config_.trackTailHits + ? pool.getAllocationClass(static_cast(cid)) + .getAllocsPerSlab() + : 0); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } + } +} + +template +PoolId CacheAllocator::getPoolId( + folly::StringPiece name) const noexcept { + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); +} + +// The Function returns a consolidated vector of Release Slab +// events from Pool Workers { Pool rebalancer, Pool Resizer and +// Memory Monitor}. +template +AllSlabReleaseEvents CacheAllocator::getAllSlabReleaseEvents( + PoolId poolId) const { + AllSlabReleaseEvents res; + // lock protects against workers being restarted + { + std::lock_guard l(workersMutex_); + if (poolRebalancer_) { + res.rebalancerEvents = poolRebalancer_->getSlabReleaseEvents(poolId); + } + if (poolResizer_) { + res.resizerEvents = poolResizer_->getSlabReleaseEvents(poolId); + } + if (memMonitor_) { + res.monitorEvents = memMonitor_->getSlabReleaseEvents(poolId); + } + } + return res; +} + +template +std::set CacheAllocator::filterCompactCachePools( + const PoolIds& poolIds) const { + PoolIds ret; + folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); + for (auto poolId : poolIds) { + if (!isCompactCachePool_[poolId]) { + // filter out slab pools backing the compact caches. + ret.insert(poolId); + } + } + return ret; +} + +template +std::set CacheAllocator::getRegularPoolIds() const { + folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); +} + +template +std::set CacheAllocator::getCCachePoolIds() const { + PoolIds ret; + folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); + for (PoolId id = 0; id < static_cast(MemoryPoolManager::kMaxPools); + id++) { + if (isCompactCachePool_[id]) { + // filter out slab pools backing the compact caches. + ret.insert(id); + } + } + return ret; +} + +template +std::set CacheAllocator::getRegularPoolIdsForResize() + const { + folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); + // If Slabs are getting advised away - as indicated by non-zero + // getAdvisedMemorySize - then pools may be overLimit even when + // all slabs are not allocated. Otherwise, pools may be overLimit + // only after all slabs are allocated. + return (allocator_[currentTier()]->allSlabsAllocated()) || + (allocator_[currentTier()]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[currentTier()]->getPoolsOverLimit()) + : std::set{}; +} + +template +const std::string CacheAllocator::getCacheName() const { + return config_.cacheName; +} + +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + +template +PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { + const auto& pool = allocator_[currentTier()]->getPool(poolId); + const auto& allocSizes = pool.getAllocSizes(); + auto mpStats = pool.getStats(); + const auto& classIds = mpStats.classIds; + + // check if this is a compact cache. 
+ bool isCompactCache = false; + { + folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); + isCompactCache = isCompactCachePool_[poolId]; + } + + std::unordered_map cacheStats; + uint64_t totalHits = 0; + // cacheStats is only menaningful for pools that are not compact caches. + // TODO export evictions, numItems etc from compact cache directly. + if (!isCompactCache) { + for (const ClassId cid : classIds) { + uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); + cacheStats.insert( + {cid, + {allocSizes[cid], (*stats_.allocAttempts)[poolId][cid].get(), + (*stats_.evictionAttempts)[poolId][cid].get(), + (*stats_.allocFailures)[poolId][cid].get(), + (*stats_.fragmentationSize)[poolId][cid].get(), classHits, + (*stats_.chainedItemEvictions)[poolId][cid].get(), + (*stats_.regularItemEvictions)[poolId][cid].get(), + getMMContainerStat(currentTier(), poolId, cid)}}); + totalHits += classHits; + } + } + + PoolStats ret; + ret.isCompactCache = isCompactCache; + ret.poolName = allocator_[currentTier()]->getPoolName(poolId); + ret.poolSize = pool.getPoolSize(); + ret.poolUsableSize = pool.getPoolUsableSize(); + ret.poolAdvisedSize = pool.getPoolAdvisedSize(); + ret.cacheStats = std::move(cacheStats); + ret.mpStats = std::move(mpStats); + ret.numPoolGetHits = totalHits; + ret.evictionAgeSecs = stats_.perPoolEvictionAgeSecs_[poolId].estimate(); + + return ret; +} + +template +double CacheAllocator::slabsApproxFreePercentage(TierId tid) const +{ + return allocator_[tid]->approxFreeSlabsPercentage(); +} + +template +AllocationClassBaseStat CacheAllocator::getAllocationClassStats( + TierId tid, PoolId pid, ClassId cid) const { + const auto &ac = allocator_[tid]->getPool(pid).getAllocationClass(cid); + + AllocationClassBaseStat stats{}; + stats.allocSize = ac.getAllocSize(); + stats.memorySize = ac.getNumSlabs() * Slab::kSize; + + if (slabsApproxFreePercentage(tid) > 0.0) { + auto totalMemory = MemoryAllocator::getMemorySize(memoryTierSize(tid)); + auto freeMemory = static_cast(totalMemory) * slabsApproxFreePercentage(tid) / 100.0; + + // amount of free memory which has the same ratio to entire free memory as + // this allocation class memory size has to used memory + auto scaledFreeMemory = static_cast(freeMemory * stats.memorySize / totalMemory); + + auto acAllocatedMemory = (100.0 - ac.approxFreePercentage()) / 100.0 * ac.getNumSlabs() * Slab::kSize; + auto acMaxAvailableMemory = ac.getNumSlabs() * Slab::kSize + scaledFreeMemory; + + if (acMaxAvailableMemory == 0) { + stats.approxFreePercent = 100.0; + } else { + stats.approxFreePercent = 100.0 - 100.0 * acAllocatedMemory / acMaxAvailableMemory; + } + } else { + stats.approxFreePercent = ac.approxFreePercentage(); + } + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][pid][cid]; + + return stats; +} + +template +PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( + PoolId pid, unsigned int slabProjectionLength) const { + PoolEvictionAgeStats stats; + const auto& pool = allocator_[currentTier()]->getPool(pid); + const auto& allocSizes = pool.getAllocSizes(); + for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { + auto& mmContainer = getMMContainer(currentTier(), pid, cid); + const auto numItemsPerSlab = + allocator_[currentTier()]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + const auto projectionLength = numItemsPerSlab * slabProjectionLength; + stats.classEvictionAgeStats[cid] = + mmContainer.getEvictionAgeStat(projectionLength); + } + return stats; +} + +template +CacheMetadata 
CacheAllocator::getCacheMetadata() const noexcept { + return CacheMetadata{kCachelibVersion, kCacheRamFormatVersion, + kCacheNvmFormatVersion, config_.getCacheSize()}; +} + +template +void CacheAllocator::releaseSlab(PoolId pid, + ClassId cid, + SlabReleaseMode mode, + const void* hint) { + releaseSlab(pid, cid, Slab::kInvalidClassId, mode, hint); +} + +template +void CacheAllocator::releaseSlab(PoolId pid, + ClassId victim, + ClassId receiver, + SlabReleaseMode mode, + const void* hint) { + stats_.numActiveSlabReleases.inc(); + SCOPE_EXIT { stats_.numActiveSlabReleases.dec(); }; + switch (mode) { + case SlabReleaseMode::kRebalance: + stats_.numReleasedForRebalance.inc(); + break; + case SlabReleaseMode::kResize: + stats_.numReleasedForResize.inc(); + break; + case SlabReleaseMode::kAdvise: + stats_.numReleasedForAdvise.inc(); + break; + } + + try { + auto releaseContext = allocator_[currentTier()]->startSlabRelease( + pid, victim, receiver, mode, hint, + [this]() -> bool { return shutDownInProgress_; }); + + // No work needed if the slab is already released + if (releaseContext.isReleased()) { + return; + } + + releaseSlabImpl(currentTier(), releaseContext); + if (!allocator_[currentTier()]->allAllocsFreed(releaseContext)) { + throw std::runtime_error( + folly::sformat("Was not able to free all allocs. PoolId: {}, AC: {}", + releaseContext.getPoolId(), + releaseContext.getClassId())); + } + + allocator_[currentTier()]->completeSlabRelease(releaseContext); + } catch (const exception::SlabReleaseAborted& e) { + stats_.numAbortedSlabReleases.inc(); + throw exception::SlabReleaseAborted(folly::sformat( + "Slab release aborted while releasing " + "a slab in pool {} victim {} receiver {}. Original ex msg: ", + pid, static_cast(victim), static_cast(receiver), e.what())); + } +} + +template +SlabReleaseStats CacheAllocator::getSlabReleaseStats() const noexcept { + std::lock_guard l(workersMutex_); + return SlabReleaseStats{stats_.numActiveSlabReleases.get(), + stats_.numReleasedForRebalance.get(), + stats_.numReleasedForResize.get(), + stats_.numReleasedForAdvise.get(), + poolRebalancer_ ? poolRebalancer_->getRunCount() + : 0ULL, + poolResizer_ ? poolResizer_->getRunCount() : 0ULL, + memMonitor_ ? memMonitor_->getRunCount() : 0ULL, + stats_.numMoveAttempts.get(), + stats_.numMoveSuccesses.get(), + stats_.numEvictionAttempts.get(), + stats_.numEvictionSuccesses.get(), + stats_.numSlabReleaseStuck.get()}; +} + +template +void CacheAllocator::releaseSlabImpl(TierId tid, + const SlabReleaseContext& releaseContext) { + auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); + bool releaseStuck = false; + + SCOPE_EXIT { + if (releaseStuck) { + stats_.numSlabReleaseStuck.dec(); + } + }; + + util::Throttler throttler( + config_.throttleConfig, + [this, &startTime, &releaseStuck](std::chrono::milliseconds curTime) { + if (!releaseStuck && + curTime >= startTime + config_.slabReleaseStuckThreshold) { + stats().numSlabReleaseStuck.inc(); + releaseStuck = true; + } + }); + + // Active allocations need to be freed before we can release this slab + // The idea is: + // 1. Iterate through each active allocation + // 2. Under AC lock, acquire ownership of this active allocation + // 3. If 2 is successful, Move or Evict + // 4. 
Move on to the next item if current item is freed + for (auto alloc : releaseContext.getActiveAllocations()) { + // Need to mark an item for release before proceeding + // If we can't mark as moving, it means the item is already freed + const bool isAlreadyFreed = + !markMovingForSlabRelease(releaseContext, alloc, throttler); + if (isAlreadyFreed) { + continue; + } + + Item& item = *static_cast(alloc); + + // Try to move this item and make sure we can free the memory + const bool isMoved = moveForSlabRelease(releaseContext, item, throttler); + + // if moving fails, evict it + if (!isMoved) { + evictForSlabRelease(releaseContext, item, throttler); + } + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); + } +} + +template +void CacheAllocator::throttleWith(util::Throttler& t, + std::function fn) { + const unsigned int rateLimit = 1024; + // execute every 1024 times we have actually throttled + if (t.throttle() && (t.numThrottles() % rateLimit) == 0) { + fn(); + } +} + +template +bool CacheAllocator::moveForSlabRelease( + const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { + if (!config_.moveCb) { + return false; + } + + bool isMoved = false; + auto startTime = util::getCurrentTimeSec(); + WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); + + for (unsigned int itemMovingAttempts = 0; + itemMovingAttempts < config_.movingTries; + ++itemMovingAttempts) { + stats_.numMoveAttempts.inc(); + + // Nothing to move and the key is likely also bogus for chained items. + if (oldItem.isOnlyMoving()) { + oldItem.unmarkMoving(); + const auto res = + releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + return true; + } + + if (!newItemHdl) { + // try to allocate again if it previously wasn't successful + newItemHdl = allocateNewItemForOldItem(oldItem); + } + + // if we have a valid handle, try to move, if not, we retry. + if (newItemHdl) { + isMoved = tryMovingForSlabRelease(oldItem, newItemHdl); + if (isMoved) { + break; + } + } + + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to move Item: {}. " + "Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, oldItem.toString(), + ctx.getPoolId(), ctx.getClassId()); + }); + } + + // Return false if we've exhausted moving tries. + if (!isMoved) { + return false; + } + + // Since item has been moved, we can directly free it. We don't need to + // worry about any stats related changes, because there is another item + // that's identical to this one to replace it. Here we just need to wait + // until all users have dropped the item handles before we can proceed. + startTime = util::getCurrentTimeSec(); + while (!oldItem.isOnlyMoving()) { + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still waiting for refcount to " + "drain Item: {}. 
Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, oldItem.toString(), + ctx.getPoolId(), ctx.getClassId()); + }); + } + + auto tid = getTierId(oldItem); + + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); + allocator_[tid]->free(&oldItem); + + (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + util::getFragmentation(*this, oldItem)); + stats_.numMoveSuccesses.inc(); + return true; +} + +template +typename CacheAllocator::ReadHandle +CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( + const ChainedItem& item, const Key& parentKey) { + ReadHandle parentHandle{}; + try { + parentHandle = findInternal(parentKey); + // If the parent is not the same as the parent of the chained item, + // it means someone has replaced our old parent already. So we abort. + if (!parentHandle || + parentHandle.get() != &item.getParentItem(compressor_)) { + return {}; + } + } catch (const exception::RefcountOverflow&) { + return {}; + } + + return parentHandle; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { + if (oldItem.isChainedItem()) { + const auto& oldChainedItem = oldItem.asChainedItem(); + const auto parentKey = oldChainedItem.getParentItem(compressor_).getKey(); + + // Grab lock to prevent anyone else from modifying the chain + auto l = chainedItemLocks_.lockExclusive(parentKey); + + auto parentHandle = validateAndGetParentHandleForChainedMoveLocked( + oldChainedItem, parentKey); + if (!parentHandle) { + return {}; + } + + // Set up the destination for the move. Since oldChainedItem would have + // the moving bit set, it won't be picked for eviction. + auto newItemHdl = + allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); + if (!newItemHdl) { + return {}; + } + + XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); + auto parentPtr = parentHandle.getInternal(); + XDCHECK_EQ(reinterpret_cast(parentPtr), + reinterpret_cast( + &oldChainedItem.getParentItem(compressor_))); + + return newItemHdl; + } + + const auto allocInfo = + allocator_[getTierId(oldItem)]->getAllocInfo(static_cast(&oldItem)); + + // Set up the destination for the move. Since oldItem would have the moving + // bit set, it won't be picked for eviction. + auto newItemHdl = allocateInternalTier(getTierId(oldItem), + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime(), + false); + if (!newItemHdl) { + return {}; + } + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + XDCHECK_EQ(reinterpret_cast(&getMMContainer(oldItem)), + reinterpret_cast(&getMMContainer(*newItemHdl))); + + return newItemHdl; +} + +template +bool CacheAllocator::tryMovingForSlabRelease( + Item& oldItem, WriteHandle& newItemHdl) { + // By holding onto a user-level synchronization object, we ensure moving + // a regular item or chained item is synchronized with any potential + // user-side mutation. + std::unique_ptr syncObj; + if (config_.movingSync) { + if (!oldItem.isChainedItem()) { + syncObj = config_.movingSync(oldItem.getKey()); + } else { + // Copy the key so we have a valid key to work with if the chained + // item is still valid. + const std::string parentKey = + oldItem.asChainedItem().getParentItem(compressor_).getKey().str(); + if (oldItem.isOnlyMoving()) { + // If chained item no longer has a refcount, its parent is already + // being released, so we abort this try to moving. 
+ return false; + } + syncObj = config_.movingSync(parentKey); + } + + // We need to differentiate between the following three scenarios: + // 1. nullptr indicates no move sync required for this particular item + // 2. moveSync.isValid() == true meaning we've obtained the sync + // 3. moveSync.isValid() == false meaning we need to abort and retry + if (syncObj && !syncObj->isValid()) { + return false; + } + } + + return oldItem.isChainedItem() + ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) + : moveRegularItem(oldItem, newItemHdl); +} + +template +void CacheAllocator::evictForSlabRelease( + const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) { + XDCHECK(!config_.isEvictionDisabled()); + + auto startTime = util::getCurrentTimeSec(); + while (true) { + stats_.numEvictionAttempts.inc(); + + // if the item is already in a state where only the moving bit is set, + // nothing needs to be done. We simply need to unmark moving bit and free + // the item. + if (item.isOnlyMoving()) { + item.unmarkMoving(); + const auto res = + releaseBackToAllocator(item, RemoveContext::kNormal, false); + XDCHECK(ReleaseRes::kReleased == res); + return; + } + + // Since we couldn't move, we now evict this item. Owning handle will be + // the item's handle for regular/normal items and will be the parent + // handle for chained items. + auto owningHandle = + item.isChainedItem() + ? evictChainedItemForSlabRelease(item.asChainedItem()) + : evictNormalItem(item); + + // we managed to evict the corresponding owner of the item and have the + // last handle for the owner. + if (owningHandle) { + const auto allocInfo = + allocator_[getTierId(item)]->getAllocInfo(static_cast(&item)); + if (owningHandle->hasChainedItem()) { + (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] + .inc(); + } else { + (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] + .inc(); + } + + stats_.numEvictionSuccesses.inc(); + + // we have the last handle. no longer need to hold on to the moving bit + item.unmarkMoving(); + + XDCHECK(owningHandle->isExclusive()); + + // manually decrement the refcount to call releaseBackToAllocator + const auto ref = decRef(*owningHandle); + XDCHECK(ref == 0); + const auto res = releaseBackToAllocator(*owningHandle.release(), + RemoveContext::kEviction, false); + XDCHECK(res == ReleaseRes::kReleased); + return; + } + + if (shutDownInProgress_) { + item.unmarkMoving(); + allocator_[getTierId(item)]->abortSlabRelease(ctx); + throw exception::SlabReleaseAborted( + folly::sformat("Slab Release aborted while trying to evict" + " Item: {} Pool: {}, Class: {}.", + item.toString(), ctx.getPoolId(), ctx.getClassId())); + } + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to evict Item: {}. " + "Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, item.toString(), + ctx.getPoolId(), ctx.getClassId()) + << (item.isChainedItem() + ? 
folly::sformat(" Parent: {}", + item.asChainedItem() + .getParentItem(compressor_) + .toString()) + : ""); + }); + } +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::evictNormalItem(Item& item, + bool skipIfTokenInvalid, + bool fromBgThread) { + XDCHECK(item.isMoving()); + + if (item.isOnlyMoving()) { + return WriteHandle{}; + } + + auto evictHandle = tryEvictToNextMemoryTier(item, fromBgThread); + if(evictHandle) return evictHandle; + + auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; + + const bool evictToNvmCache = shouldWriteToNvmCache(item); + auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) + : typename NvmCacheT::PutToken{}; + + if (skipIfTokenInvalid && evictToNvmCache && !token.isValid()) { + stats_.evictFailConcurrentFill.inc(); + return WriteHandle{}; + } + + // We remove the item from both access and mm containers. It doesn't matter + // if someone else calls remove on the item at this moment, the item cannot + // be freed as long as we have the moving bit set. + auto handle = accessContainer_->removeIf(item, std::move(predicate)); + + if (!handle) { + return handle; + } + + XDCHECK_EQ(reinterpret_cast(handle.get()), + reinterpret_cast(&item)); + XDCHECK_EQ(1u, handle->getRefCount()); + removeFromMMContainer(item); + + // now that we are the only handle and we actually removed something from + // the RAM cache, we enqueue it to nvmcache. + if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { + nvmCache_->put(handle, std::move(token)); + } + + return handle; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { + XDCHECK(child.isMoving()); + + // We have the child marked as moving, but dont know anything about the + // state of the parent. Unlike the case of regular eviction where we are + // sure that the child is inside the MMContainer, ensuring its parent is + // valid, we can not make any assumptions here. We try to find the parent + // first through the access container and then verify that the parent's + // chain points to the child before cleaning up the parent. If the parent + // was in the process of being re-allocated or child was being removed + // concurrently, we would synchronize here on one of the checks. + Item& expectedParent = child.getParentItem(compressor_); + + // Grab exclusive lock since we are modifying the chain. at this point, we + // dont know the state of the parent. so we need to do some validity checks + // after we have the chained item lock to ensure that we got the lock off of + // a valid state. + const std::string parentKey = expectedParent.getKey().str(); + auto l = chainedItemLocks_.lockExclusive(parentKey); + + // check if the child is still in mmContainer and the expected parent is + // valid under the chained item lock. + if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || + child.isOnlyMoving() || + &expectedParent != &child.getParentItem(compressor_) || + !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { + return {}; + } + + // search if the child is present in the chain + auto parentHandle = findInternal(parentKey); + if (!parentHandle || parentHandle != &expectedParent) { + return {}; + } + + ChainedItem* head = nullptr; + { // scope for the handle + auto headHandle = findChainedItem(expectedParent); + head = headHandle ? 
&headHandle->asChainedItem() : nullptr; + } + + bool found = false; + while (head) { + if (head == &child) { + found = true; + break; + } + head = head->getNext(compressor_); + } + + if (!found) { + return {}; + } + + // if we found the child in the parent's chain, we remove it and ensure that + // the handle we obtained was the last one. Before that, create a put token + // to guard any racing cache find to avoid item re-appearing in NvmCache. + const bool evictToNvmCache = shouldWriteToNvmCache(expectedParent); + + auto token = evictToNvmCache + ? nvmCache_->createPutToken(expectedParent.getKey()) + : typename NvmCacheT::PutToken{}; + + if (!accessContainer_->removeIf(expectedParent, + parentEvictForSlabReleasePredicate)) { + return {}; + } + + // at this point, we should be the last handle owner + XDCHECK_EQ(1u, parentHandle->getRefCount()); + + // We remove the parent from both access and mm containers. It doesn't + // matter if someone else calls remove on the parent at this moment, it + // cannot be freed since we hold an active item handle + removeFromMMContainer(*parentHandle); + + // In case someone else had removed this chained item from its parent by now + // So we check again to see if it has been unlinked from its parent + if (!child.isInMMContainer() || child.isOnlyMoving()) { + return {}; + } + + // check after removing from the MMContainer that the parent is still not + // being marked as moving. If parent is moving, it will release the child + // item and we will wait for that. + if (parentHandle->isMoving()) { + return {}; + } + + // now that we are the only handle and we actually removed something from + // the RAM cache, we enqueue it to nvmcache. + if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { + DCHECK(parentHandle->hasChainedItem()); + nvmCache_->put(parentHandle, std::move(token)); + } + + return parentHandle; +} + +template +bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { + if (!handle) { + return false; + } + + // We remove the item from both access and mm containers. + // We want to make sure the caller is the only one holding the handle. + auto removedHandle = + accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); + if (removedHandle) { + removeFromMMContainer(*(handle.getInternal())); + return true; + } + + return false; +} + +template +bool CacheAllocator::markMovingForSlabRelease( + const SlabReleaseContext& ctx, void* alloc, util::Throttler& throttler) { + // MemoryAllocator::processAllocForRelease will execute the callback + // if the item is not already free. So there are three outcomes here: + // 1. Item not freed yet and marked as moving + // 2. Item not freed yet but could not be marked as moving + // 3. Item freed already + // + // For 1), return true + // For 2), retry + // For 3), return false to abort since no action is required + + // At first, we assume this item was already freed + bool itemFreed = true; + bool markedMoving = false; + TierId tid = getTierId(alloc); + const auto fn = [&markedMoving, &itemFreed](void* memory) { + // Since this callback is executed, the item is not yet freed + itemFreed = false; + Item* item = static_cast(memory); + if (item->markMoving()) { + markedMoving = true; + } + }; + + auto startTime = util::getCurrentTimeSec(); + while (true) { + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); + + // If item is already freed we give up trying to mark the item moving + // and return false, otherwise if marked as moving, we return true. 
+ if (itemFreed) { + return false; + } else if (markedMoving) { + return true; + } + + // Reset this to true, since we always assume an item is freed + // when checking with the AllocationClass + itemFreed = true; + + if (shutDownInProgress_) { + XDCHECK(!static_cast(alloc)->isMoving()); + allocator_[tid]->abortSlabRelease(ctx); + throw exception::SlabReleaseAborted( + folly::sformat("Slab Release aborted while still trying to mark" + " as moving for Item: {}. Pool: {}, Class: {}.", + static_cast(alloc)->toString(), ctx.getPoolId(), + ctx.getClassId())); + } + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to mark as moving for " + "Item: {}. Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, + static_cast(alloc)->toString(), ctx.getPoolId(), + ctx.getClassId()); + }); + } +} + +template +template +CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, + size_t size, + Args&&... args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + + if (!config_.isCompactCacheEnabled()) { + throw std::logic_error("Compact cache is not enabled"); + } + + folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); + isCompactCachePool_[poolId] = true; + + auto ptr = std::make_unique( + compactCacheManager_->addAllocator(name.str(), poolId), + std::forward(args)...); + auto it = compactCaches_.emplace(poolId, std::move(ptr)); + XDCHECK(it.second); + return static_cast(it.first->second.get()); +} + +template +template +CCacheT* CacheAllocator::attachCompactCache(folly::StringPiece name, + Args&&... args) { + auto& allocator = compactCacheManager_->getAllocator(name.str()); + auto poolId = allocator.getPoolId(); + // if a compact cache with this name already exists, return without creating + // new instance + folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); + if (compactCaches_.find(poolId) != compactCaches_.end()) { + return static_cast(compactCaches_[poolId].get()); + } + + auto ptr = std::make_unique(allocator, std::forward(args)...); + auto it = compactCaches_.emplace(poolId, std::move(ptr)); + XDCHECK(it.second); + return static_cast(it.first->second.get()); +} + +template +const ICompactCache& CacheAllocator::getCompactCache( + PoolId pid) const { + folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); + if (!isCompactCachePool_[pid]) { + throw std::invalid_argument( + folly::sformat("PoolId {} is not a compact cache", pid)); + } + + auto it = compactCaches_.find(pid); + if (it == compactCaches_.end()) { + throw std::invalid_argument(folly::sformat( + "PoolId {} belongs to an un-attached compact cache", pid)); + } + return *it->second; +} + +template +void CacheAllocator::setPoolOptimizerFor(PoolId poolId, + bool enableAutoResizing) { + optimizerEnabled_[poolId] = enableAutoResizing; +} + +template +void CacheAllocator::resizeCompactCaches() { + compactCacheManager_->resizeAll(); +} + +template +typename CacheTrait::MMType::LruType CacheAllocator::getItemLruType( + const Item& item) const { + return getMMContainer(item).getLruType(item); +} + +// The order of the serialization is as follows: +// +// This is also the order of deserialization in the constructor, when +// we restore the cache allocator. 
+// +// --------------------------------- +// | accessContainer_ | +// | mmContainers_ | +// | compactCacheManager_ | +// | allocator_ | +// | metadata_ | +// --------------------------------- +template +folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { + if (stats_.numActiveSlabReleases.get() != 0) { + throw std::logic_error( + "There are still slabs being released at the moment"); + } + + *metadata_.allocatorVersion() = kCachelibVersion; + *metadata_.ramFormatVersion() = kCacheRamFormatVersion; + *metadata_.cacheCreationTime() = static_cast(cacheCreationTime_); + *metadata_.mmType() = MMType::kId; + *metadata_.accessType() = AccessType::kId; + + metadata_.compactCachePools()->clear(); + const auto pools = getPoolIds(); + { + folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); + for (PoolId pid : pools) { + for (unsigned int cid = 0; cid < (*stats_.fragmentationSize)[pid].size(); + ++cid) { + metadata_.fragmentationSize()[pid][static_cast(cid)] = + (*stats_.fragmentationSize)[pid][cid].get(); + } + if (isCompactCachePool_[pid]) { + metadata_.compactCachePools()->push_back(pid); + } + } + } + + *metadata_.numChainedParentItems() = stats_.numChainedParentItems.get(); + *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); + *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + + // TODO: implement serialization for multiple tiers + auto serializeMMContainers = [](MMContainers& mmContainers) { + MMSerializationTypeContainer state; + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { + for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } + } + } + } + return state; + }; + MMSerializationTypeContainer mmContainersState = + serializeMMContainers(mmContainers_); + + AccessSerializationType accessContainerState = accessContainer_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); + CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); + + AccessSerializationType chainedItemAccessContainerState = + chainedItemAccessContainer_->saveState(); + + // serialize to an iobuf queue. The caller can then copy over the serialized + // results into a single buffer. + folly::IOBufQueue queue; + Serializer::serializeToIOBufQueue(queue, metadata_); + Serializer::serializeToIOBufQueue(queue, allocatorState); + Serializer::serializeToIOBufQueue(queue, ccState); + Serializer::serializeToIOBufQueue(queue, mmContainersState); + Serializer::serializeToIOBufQueue(queue, accessContainerState); + Serializer::serializeToIOBufQueue(queue, chainedItemAccessContainerState); + return queue; +} + +template +bool CacheAllocator::stopWorkers(std::chrono::seconds timeout) { + bool success = true; + success &= stopPoolRebalancer(timeout); + success &= stopPoolResizer(timeout); + success &= stopMemMonitor(timeout); + success &= stopReaper(timeout); + success &= stopBackgroundEvictor(timeout); + success &= stopBackgroundPromoter(timeout); + return success; +} + +template +typename CacheAllocator::ShutDownStatus +CacheAllocator::shutDown() { + using ShmShutDownRes = typename ShmManager::ShutDownRes; + XLOG(DBG, "shutting down CacheAllocator"); + if (shmManager_ == nullptr) { + throw std::invalid_argument( + "shutDown can only be called once from a cached manager created on " + "shared memory. 
You may also be incorrectly constructing your " + "allocator. Are you passing in " + "AllocatorType::SharedMem* ?"); + } + XDCHECK(!config_.cacheDir.empty()); + + if (config_.enableFastShutdown) { + shutDownInProgress_ = true; + } + + stopWorkers(); + + const auto handleCount = getNumActiveHandles(); + if (handleCount != 0) { + XLOGF(ERR, "Found {} active handles while shutting down cache. aborting", + handleCount); + return ShutDownStatus::kFailed; + } + + const auto nvmShutDownStatusOpt = saveNvmCache(); + saveRamCache(); + const auto shmShutDownStatus = shmManager_->shutDown(); + const auto shmShutDownSucceeded = + (shmShutDownStatus == ShmShutDownRes::kSuccess); + shmManager_.reset(); + + // TODO: save per-tier state + + if (shmShutDownSucceeded) { + if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) + return ShutDownStatus::kSuccess; + + if (nvmShutDownStatusOpt && !*nvmShutDownStatusOpt) + return ShutDownStatus::kSavedOnlyDRAM; + } + + XLOGF(ERR, "Could not shutdown DRAM cache cleanly. ShutDownRes={}", + (shmShutDownStatus == ShmShutDownRes::kFailedWrite ? "kFailedWrite" + : "kFileDeleted")); + + if (nvmShutDownStatusOpt && *nvmShutDownStatusOpt) { + return ShutDownStatus::kSavedOnlyNvmCache; + } + + return ShutDownStatus::kFailed; +} + +template +std::optional CacheAllocator::saveNvmCache() { + if (!nvmCache_) { + return std::nullopt; + } + + // throw any exceptions from shutting down nvmcache since we dont know the + // state of RAM as well. + if (!nvmCache_->isEnabled()) { + nvmCache_->shutDown(); + return std::nullopt; + } + + if (!nvmCache_->shutDown()) { + XLOG(ERR, "Could not shutdown nvmcache cleanly"); + return false; + } + + nvmCacheState_.markSafeShutDown(); + return true; +} + +template +void CacheAllocator::saveRamCache() { + // serialize the cache state + auto serializedBuf = saveStateToIOBuf(); + std::unique_ptr ioBuf = serializedBuf.move(); + ioBuf->coalesce(); + + ShmSegmentOpts opts; + opts.typeOpts = PosixSysVSegmentOpts(config_.isUsingPosixShm()); + + void* infoAddr = shmManager_->createShm(detail::kShmInfoName, ioBuf->length(), + nullptr, opts).addr; + Serializer serializer(reinterpret_cast(infoAddr), + reinterpret_cast(infoAddr) + ioBuf->length()); + serializer.writeToBuffer(std::move(ioBuf)); +} + +template +typename CacheAllocator::MMContainers +CacheAllocator::deserializeMMContainers( + Deserializer& deserializer, + const typename Item::PtrCompressor& compressor) { + const auto container = + deserializer.deserialize(); + + /* TODO: right now, we create empty containers because deserialization + * only works for a single (topmost) tier. */ + MMContainers mmContainers{getNumTiers()}; + + for (auto& kvPool : *container.pools_ref()) { + auto i = static_cast(kvPool.first); + auto& pool = getPool(i); + for (auto& kv : kvPool.second) { + auto j = static_cast(kv.first); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + MMContainerPtr ptr = + std::make_unique(kv.second, + compressor); + auto config = ptr->getConfig(); + config.addExtraConfig(config_.trackTailHits + ? pool.getAllocationClass(j).getAllocsPerSlab() + : 0); + ptr->setConfig(config); + mmContainers[tid][i][j] = std::move(ptr); + } + } + } + // We need to drop the unevictableMMContainer in the desierializer. + // TODO: remove this at version 17. 
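For context, a hedged sketch of the shutdown-and-reattach flow that shutDown()/saveRamCache()/saveNvmCache() implement; the directory and size are illustrative, and setCacheSize()/enableCachePersistence() are assumed from the upstream CacheAllocatorConfig:

LruAllocator::Config config;
config.setCacheSize(1024UL * 1024 * 1024)        // 1 GiB, illustrative
    .enableCachePersistence("/tmp/cache-dir");   // cacheDir backing the ShmManager

{
  LruAllocator cache(LruAllocator::SharedMemNew, config);
  // ... allocate, insert, use the cache ...
  auto status = cache.shutDown();  // runs saveRamCache()/saveNvmCache(); kSuccess expected
}

// A later process (or a restart) re-attaches to the persisted segments.
LruAllocator restored(LruAllocator::SharedMemAttach, config);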
+ if (metadata_.allocatorVersion() <= 15) { + deserializer.deserialize(); + } + return mmContainers; +} + +template +serialization::CacheAllocatorMetadata +CacheAllocator::deserializeCacheAllocatorMetadata( + Deserializer& deserializer) { + auto meta = deserializer.deserialize(); + // TODO: + // Once everyone is on v8 or later, remove the outter if. + if (kCachelibVersion > 8) { + if (*meta.ramFormatVersion() != kCacheRamFormatVersion) { + throw std::runtime_error( + folly::sformat("Expected cache ram format version {}. But found {}.", + kCacheRamFormatVersion, *meta.ramFormatVersion())); + } + } + + if (*meta.accessType() != AccessType::kId) { + throw std::invalid_argument( + folly::sformat("Expected {}, got {} for AccessType", *meta.accessType(), + AccessType::kId)); + } + + if (*meta.mmType() != MMType::kId) { + throw std::invalid_argument(folly::sformat("Expected {}, got {} for MMType", + *meta.mmType(), MMType::kId)); + } + return meta; +} + +template +int64_t CacheAllocator::getNumActiveHandles() const { + return handleCount_.getSnapshot(); +} + +template +int64_t CacheAllocator::getHandleCountForThread() const { + return handleCount_.tlStats(); +} + +template +void CacheAllocator::resetHandleCountForThread_private() { + handleCount_.tlStats() = 0; +} + +template +void CacheAllocator::adjustHandleCountForThread_private( + int64_t delta) { + handleCount_.tlStats() += delta; +} + +template +void CacheAllocator::initStats() { + stats_.init(); + + // deserialize the fragmentation size of each thread. + for (const auto& pid : *metadata_.fragmentationSize()) { + for (const auto& cid : pid.second) { + (*stats_.fragmentationSize)[pid.first][cid.first].set( + static_cast(cid.second)); + } + } + + // deserialize item counter stats + stats_.numChainedParentItems.set(*metadata_.numChainedParentItems()); + stats_.numChainedChildItems.set(*metadata_.numChainedChildItems()); + stats_.numAbortedSlabReleases.set( + static_cast(*metadata_.numAbortedSlabReleases())); +} + +template +void CacheAllocator::forEachChainedItem( + const Item& parent, std::function func) { + auto l = chainedItemLocks_.lockShared(parent.getKey()); + + auto headHandle = findChainedItem(parent); + if (!headHandle) { + return; + } + + ChainedItem* head = &headHandle.get()->asChainedItem(); + while (head) { + func(*head); + head = head->getNext(compressor_); + } +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::findChainedItem(const Item& parent) const { + const auto cPtr = compressor_.compress(&parent); + return chainedItemAccessContainer_->find( + Key{reinterpret_cast(&cPtr), ChainedItem::kKeySize}); +} + +template +template +CacheChainedAllocs, Handle, Iter> +CacheAllocator::viewAsChainedAllocsT(const Handle& parent) { + XDCHECK(parent); + auto handle = parent.clone(); + if (!handle) { + throw std::invalid_argument("Failed to clone item handle"); + } + + if (!handle->hasChainedItem()) { + throw std::invalid_argument( + folly::sformat("Failed to materialize chain. Parent does not have " + "chained items. 
Parent: {}", + parent->toString())); + } + + auto l = chainedItemLocks_.lockShared(handle->getKey()); + auto head = findChainedItem(*handle); + return CacheChainedAllocs, Handle, Iter>{ + std::move(l), std::move(handle), *head, compressor_}; +} + +template +GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { + GlobalCacheStats ret{}; + stats_.populateGlobalCacheStats(ret); + + ret.numItems = accessContainer_->getStats().numKeys; + + const uint64_t currTime = util::getCurrentTimeSec(); + ret.cacheInstanceUpTime = currTime - cacheInstanceCreationTime_; + ret.ramUpTime = currTime - cacheCreationTime_; + ret.nvmUpTime = currTime - nvmCacheState_.getCreationTime(); + ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false; + ret.reaperStats = getReaperStats(); + ret.evictionStats = getBackgroundMoverStats(MoverDir::Evict); + ret.promotionStats = getBackgroundMoverStats(MoverDir::Promote); + ret.numActiveHandles = getNumActiveHandles(); + + ret.isNewRamCache = cacheCreationTime_ == cacheInstanceCreationTime_; + ret.isNewNvmCache = + nvmCacheState_.getCreationTime() == cacheInstanceCreationTime_; + + return ret; +} + +template +CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { + size_t totalCacheSize = 0; + for(auto& allocator: allocator_) { + totalCacheSize += allocator->getMemorySize(); + } + + auto addSize = [this](size_t a, PoolId pid) { + return a + allocator_[currentTier()]->getPool(pid).getPoolSize(); + }; + const auto regularPoolIds = getRegularPoolIds(); + const auto ccCachePoolIds = getCCachePoolIds(); + size_t regularCacheSize = std::accumulate( + regularPoolIds.begin(), regularPoolIds.end(), 0ULL, addSize); + size_t compactCacheSize = std::accumulate( + ccCachePoolIds.begin(), ccCachePoolIds.end(), 0ULL, addSize); + + std::vector slabsApproxFreePercentages; + for (TierId tid = 0; tid < getNumTiers(); ++tid) + slabsApproxFreePercentages.push_back(slabsApproxFreePercentage(tid)); + + return CacheMemoryStats{totalCacheSize, + regularCacheSize, + compactCacheSize, + allocator_[currentTier()]->getAdvisedMemorySize(), + memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0, + allocator_[currentTier()]->getUnreservedMemorySize(), + nvmCache_ ? nvmCache_->getSize() : 0, + util::getMemAvailable(), + util::getRSSBytes(), + slabsApproxFreePercentages}; +} + +template +bool CacheAllocator::autoResizeEnabledForPool(PoolId pid) const { + folly::SharedMutex::ReadHolder lock(compactCachePoolsLock_); + if (isCompactCachePool_[pid]) { + // compact caches need to be registered to enable auto resizing + return optimizerEnabled_[pid]; + } else { + // by default all regular pools participate in auto resizing + return true; + } +} + +template +void CacheAllocator::startCacheWorkers() { + initWorkers(); +} + +template +template +bool CacheAllocator::stopWorker(folly::StringPiece name, + std::unique_ptr& worker, + std::chrono::seconds timeout) { + std::lock_guard l(workersMutex_); + if (!worker) { + return true; + } + + bool ret = worker->stop(timeout); + if (ret) { + XLOGF(DBG1, "Stopped worker '{}'", name); + } else { + XLOGF(ERR, "Couldn't stop worker '{}', timeout: {} seconds", name, + timeout.count()); + } + worker.reset(); + return ret; +} + +template +template +bool CacheAllocator::startNewWorker( + folly::StringPiece name, + std::unique_ptr& worker, + std::chrono::milliseconds interval, + Args&&... 
args) { + if (!stopWorker(name, worker)) { + return false; + } + + std::lock_guard l(workersMutex_); + worker = std::make_unique(*this, std::forward(args)...); + bool ret = worker->start(interval, name); + if (ret) { + XLOGF(DBG1, "Started worker '{}'", name); + } else { + XLOGF(ERR, "Couldn't start worker '{}', interval: {} milliseconds", name, + interval.count()); + } + return ret; +} + +template +bool CacheAllocator::startNewPoolRebalancer( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + unsigned int freeAllocThreshold) { + return startNewWorker("PoolRebalancer", poolRebalancer_, interval, strategy, + freeAllocThreshold); +} + +template +bool CacheAllocator::startNewPoolResizer( + std::chrono::milliseconds interval, + unsigned int poolResizeSlabsPerIter, + std::shared_ptr strategy) { + return startNewWorker("PoolResizer", poolResizer_, interval, + poolResizeSlabsPerIter, strategy); +} +template +bool CacheAllocator::startNewPoolOptimizer( + std::chrono::seconds regularInterval, + std::chrono::seconds ccacheInterval, + std::shared_ptr strategy, + unsigned int ccacheStepSizePercent) { + // For now we are asking the worker to wake up every second to see whether + // it should do actual size optimization. Probably need to move to using + // the same interval for both, with confirmation of further experiments. + const auto workerInterval = std::chrono::seconds(1); + return startNewWorker("PoolOptimizer", poolOptimizer_, workerInterval, + strategy, regularInterval.count(), + ccacheInterval.count(), ccacheStepSizePercent); +} +template +bool CacheAllocator::startNewMemMonitor( + std::chrono::milliseconds interval, + MemoryMonitor::Config config, + std::shared_ptr strategy) { + return startNewWorker("MemoryMonitor", memMonitor_, interval, + std::move(config), strategy); +} +template +bool CacheAllocator::startNewReaper( + std::chrono::milliseconds interval, + util::Throttler::Config reaperThrottleConfig) { + return startNewWorker("Reaper", reaper_, interval, reaperThrottleConfig); +} + +template +auto CacheAllocator::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid) +{ + std::vector> asssignedMemory; + // TODO: for now, only evict from tier 0 + auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds()); + for (const auto pid : pools) { + const auto& mpStats = getPoolByTid(pid,tid).getStats(); + for (const auto cid : mpStats.classIds) { + if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) { + asssignedMemory.emplace_back(tid, pid, cid); + } + } + } + return asssignedMemory; +} + +template +bool CacheAllocator::startNewBackgroundEvictor( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + backgroundEvictor_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, strategy, MoverDir::Evict); + result = result && ret; + + if (result) { + backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0)); + } + } + return result; +} + +template +bool CacheAllocator::startNewBackgroundPromoter( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + XDCHECK(getNumTiers() > 1); + backgroundPromoter_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), 
backgroundPromoter_[i], interval, strategy, MoverDir::Promote); + result = result && ret; + + if (result) { + backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundPromoter_.size(), 1)); + } + } + return result; +} + +template +bool CacheAllocator::stopPoolRebalancer( + std::chrono::seconds timeout) { + return stopWorker("PoolRebalancer", poolRebalancer_, timeout); +} + +template +bool CacheAllocator::stopPoolResizer(std::chrono::seconds timeout) { + return stopWorker("PoolResizer", poolResizer_, timeout); +} + +template +bool CacheAllocator::stopPoolOptimizer( + std::chrono::seconds timeout) { + return stopWorker("PoolOptimizer", poolOptimizer_, timeout); +} + +template +bool CacheAllocator::stopMemMonitor(std::chrono::seconds timeout) { + return stopWorker("MemoryMonitor", memMonitor_, timeout); +} + +template +bool CacheAllocator::stopReaper(std::chrono::seconds timeout) { + return stopWorker("Reaper", reaper_, timeout); +} + +template +bool CacheAllocator::stopBackgroundEvictor(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundEvictor_.size(); i++) { + auto ret = stopWorker("BackgroundEvictor", backgroundEvictor_[i], timeout); + result = result && ret; + } + return result; +} + +template +bool CacheAllocator::stopBackgroundPromoter(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundPromoter_.size(); i++) { + auto ret = stopWorker("BackgroundPromoter", backgroundPromoter_[i], timeout); + result = result && ret; + } + return result; +} + +template +bool CacheAllocator::cleanupStrayShmSegments( + const std::string& cacheDir, bool posix /*TODO(SHM_FILE): const std::vector& config */) { + if (util::getStatIfExists(cacheDir, nullptr) && util::isDir(cacheDir)) { + try { + // cache dir exists. clean up only if there are no other processes + // attached. if another process was attached, the following would fail. + ShmManager::cleanup(cacheDir, posix); + + // TODO: cleanup per-tier state + } catch (const std::exception& e) { + XLOGF(ERR, "Error cleaning up {}. Exception: ", cacheDir, e.what()); + return false; + } + } else { + // cache dir did not exist. Try to nuke the segments we know by name. + // Any other concurrent process can not be attached to the segments or + // even if it does, we want to mark it for destruction. + ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix); + ShmManager::removeByName(cacheDir, detail::kShmCacheName + + std::to_string(0 /* TODO: per tier */), posix); + ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix); + ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName, + posix); + + // TODO(SHM_FILE): try to nuke segments of differente types (which require + // extra info) + // for (auto &tier : config) { + // ShmManager::removeByName(cacheDir, tierShmName, config_.memoryTiers[i].opts); + // } + } + return true; +} + +template +uint64_t CacheAllocator::getItemPtrAsOffset(const void* ptr) { + // Return unt64_t instead of uintptr_t to accommodate platforms where + // the two differ (e.g. Mac OS 12) - causing templating instantiation + // errors downstream. + + auto tid = getTierId(ptr); + + // if this succeeeds, the address is valid within the cache. 
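A hedged sketch of driving the background movers added in this patch; the strategy construction is illustrative (the concrete strategies shipped here, e.g. FreeThresholdStrategy, take their own tuning parameters), and the promoter only makes sense with more than one memory tier:

// makeStrategy() is a hypothetical factory returning a BackgroundMoverStrategy.
std::shared_ptr<BackgroundMoverStrategy> strategy = makeStrategy();

cache.startNewBackgroundEvictor(std::chrono::milliseconds(10), strategy,
                                /* threads */ 2);
cache.startNewBackgroundPromoter(std::chrono::milliseconds(10), strategy,
                                 /* threads */ 2); // requires a multi-tier config

// ... on teardown, stop all periodic workers in one call:
cache.stopWorkers(std::chrono::seconds(5));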
+ allocator_[tid]->getAllocInfo(ptr); + + if (!isOnShm_ || !shmManager_) { + throw std::invalid_argument("Shared memory not used"); + } + + const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid)); + + return reinterpret_cast(ptr) - + reinterpret_cast(shm.getCurrentMapping().addr); +} + +template +std::unordered_map +CacheAllocator::getNvmCacheStatsMap() const { + auto ret = nvmCache_ ? nvmCache_->getStatsMap() + : std::unordered_map{}; + if (nvmAdmissionPolicy_) { + auto policyStats = nvmAdmissionPolicy_->getCounters(); + for (const auto& kv : policyStats) { + ret[kv.first] = kv.second; + } + } + return ret; +} + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheAllocator.cpp b/bdm/allocator/CacheAllocator.cpp new file mode 100644 index 0000000000..0ae8fbed32 --- /dev/null +++ b/bdm/allocator/CacheAllocator.cpp @@ -0,0 +1,26 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/CacheAllocator.h" + +namespace facebook { +namespace cachelib { +template class CacheAllocator; +template class CacheAllocator; +template class CacheAllocator; +template class CacheAllocator; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheAllocator.h b/bdm/allocator/CacheAllocator.h new file mode 100644 index 0000000000..aef7a43c3a --- /dev/null +++ b/bdm/allocator/CacheAllocator.h @@ -0,0 +1,2499 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#include +#include +#pragma GCC diagnostic pop + +#include "cachelib/allocator/BackgroundMover.h" +#include "cachelib/allocator/CCacheManager.h" +#include "cachelib/allocator/Cache.h" +#include "cachelib/allocator/CacheAllocatorConfig.h" +#include "cachelib/allocator/CacheChainedItemIterator.h" +#include "cachelib/allocator/CacheItem.h" +#include "cachelib/allocator/CacheStats.h" +#include "cachelib/allocator/CacheStatsInternal.h" +#include "cachelib/allocator/CacheTraits.h" +#include "cachelib/allocator/CacheVersion.h" +#include "cachelib/allocator/ChainedAllocs.h" +#include "cachelib/allocator/ICompactCache.h" +#include "cachelib/allocator/KAllocation.h" +#include "cachelib/allocator/MemoryMonitor.h" +#include "cachelib/allocator/NvmAdmissionPolicy.h" +#include "cachelib/allocator/NvmCacheState.h" +#include "cachelib/allocator/PoolOptimizeStrategy.h" +#include "cachelib/allocator/PoolOptimizer.h" +#include "cachelib/allocator/PoolRebalancer.h" +#include "cachelib/allocator/PoolResizer.h" +#include "cachelib/allocator/ReadOnlySharedCacheView.h" +#include "cachelib/allocator/Reaper.h" +#include "cachelib/allocator/RebalanceStrategy.h" +#include "cachelib/allocator/Refcount.h" +#include "cachelib/allocator/TempShmMapping.h" +#include "cachelib/allocator/TlsActiveItemRing.h" +#include "cachelib/allocator/TypedHandle.h" +#include "cachelib/allocator/Util.h" +#include "cachelib/allocator/memory/MemoryAllocator.h" +#include "cachelib/allocator/memory/MemoryAllocatorStats.h" +#include "cachelib/allocator/memory/serialize/gen-cpp2/objects_types.h" +#include "cachelib/allocator/nvmcache/NvmCache.h" +#include "cachelib/allocator/serialize/gen-cpp2/objects_types.h" +#include "cachelib/common/Exceptions.h" +#include "cachelib/common/Hash.h" +#include "cachelib/common/Mutex.h" +#include "cachelib/common/PeriodicWorker.h" +#include "cachelib/common/Serialization.h" +#include "cachelib/common/Throttler.h" +#include "cachelib/common/Time.h" +#include "cachelib/common/Utils.h" +#include "cachelib/shm/ShmManager.h" + +namespace facebook { +namespace cachelib { + +template +class FbInternalRuntimeUpdateWrapper; + +template +class ReadOnlyMap; + +namespace objcache2 { +template +class ObjectCache; + +template +class ObjectCacheBase; +} // namespace objcache2 + +namespace cachebench { +template +class Cache; +namespace tests { +class CacheTest; +} +} // namespace cachebench + +namespace tests { +template +class BaseAllocatorTest; + +template +class AllocatorHitStatsTest; + +template +class AllocatorResizeTest; + +template +class FixedSizeArrayTest; + +template +class MapTest; + +class NvmCacheTest; + +template +class PoolOptimizeStrategyTest; + +class NvmAdmissionPolicyTest; + +class CacheAllocatorTestWrapper; +class PersistenceCache; +} // namespace tests + +namespace objcache { +template +class ObjectCache; +namespace test { +#define GET_CLASS_NAME(test_case_name, test_name) \ + test_case_name##_##test_name##_Test + +#define GET_DECORATED_CLASS_NAME(namespace, test_case_name, test_name) \ + namespace ::GET_CLASS_NAME(test_case_name, test_name) + +class GET_CLASS_NAME(ObjectCache, ObjectHandleInvalid); +} // namespace test +} // namespace objcache + +// CacheAllocator can provide an interface to make Keyed Allocations(Item) and +// takes two templated types 
that control how the allocation is +// maintained(MMType aka MemoryManagementType) and accessed(AccessType). The +// cache allocator internally has an allocator that it interacts with to make +// allocations. All active allocations are put into the AccessContainer and +// the MMContainer for maintenance. When the cache is full, allocations are +// garbage collected by the implementation of MMType. +// +// The MMType is used for keeping track of allocations that are currently +// under the control of cache allocator. The MMType is required to provide a +// data structure MMContainer with a well defined interface. For example, +// check MMLru.h's Container (TODO use boost concepts to enforce the +// interface if possible and have it defined in a .h file). The MMType must +// provide a Hook type that will be used to instrument the resultant Item to +// be compatible for use with the MMContainer similar to a boost intrusive +// member hook. MMType::Hook must be sufficient for MMType::Container to +// operate. The MMContainer is expected to implement interfaces to +// add/remove/evict/recordAccess a T& object into the container. This allows +// us to change/abstract away the memory management implementation of the +// cache from the other parts of the cache. +// +// Similar to the MMType, the AccessType is an intrusive data type that +// provides a container to access the keyed allocations. AccessType must +// provide an AccessType::Hook and AccessType::Container with +// find/insert/remove interface similar to a hash table. +// +template +class CacheAllocator : public CacheBase { + public: + using CacheT = CacheAllocator; + using MMType = typename CacheTrait::MMType; + using AccessType = typename CacheTrait::AccessType; + using Config = CacheAllocatorConfig; + + // configs for the MMtype and AccessType. + using MMConfig = typename MMType::Config; + using AccessConfig = typename AccessType::Config; + + using Item = CacheItem; + using ChainedItem = typename Item::ChainedItem; + + // the holder for the item when we hand it to the caller. This ensures + // that the reference count is maintained when the caller is done with the + // item. The ReadHandle/WriteHandle provides a getMemory() and getKey() + // interface. The caller is free to use the result of these two as long as the + // handle is active/alive. Using the result of the above interfaces after + // destroying the ReadHandle/WriteHandle is UB. The ReadHandle/WriteHandle + // safely wraps a pointer to the "const Item"/"Item". + using ReadHandle = typename Item::ReadHandle; + using WriteHandle = typename Item::WriteHandle; + // Following is deprecated as of allocator version 17 and this line will be + // removed at a future date + // using ItemHandle = WriteHandle; + template > + using TypedHandle = TypedHandleImpl; + + // TODO (sathya) some types take CacheT and some take CacheTrait. need to + // clean this up and come up with a consistent policy that is intuitive. 
+ using ChainedItemIter = CacheChainedItemIterator; + using WritableChainedItemIter = CacheChainedItemIterator; + using ChainedAllocs = CacheChainedAllocs; + using WritableChainedAllocs = + CacheChainedAllocs; + + using Key = typename Item::Key; + using PoolIds = std::set; + + using EventTracker = EventInterface; + + // holds information about removal, used in RemoveCb + struct RemoveCbData { + // remove or eviction + RemoveContext context; + + // item about to be freed back to allocator + Item& item; + + // Iterator range pointing to chained allocs associated with @item + folly::Range chainedAllocs; + }; + struct DestructorData { + DestructorData(DestructorContext ctx, + Item& it, + folly::Range iter, + PoolId id) + : context(ctx), item(it), chainedAllocs(iter), pool(id) {} + + // helps to convert RemoveContext to DestructorContext, + // the context for RemoveCB is re-used to create DestructorData, + // this can be removed if RemoveCB is dropped. + DestructorData(RemoveContext ctx, + Item& it, + folly::Range iter, + PoolId id) + : item(it), chainedAllocs(iter), pool(id) { + if (ctx == RemoveContext::kEviction) { + context = DestructorContext::kEvictedFromRAM; + } else { + context = DestructorContext::kRemovedFromRAM; + } + } + + // remove or eviction + DestructorContext context; + + // item about to be freed back to allocator + // when the item is evicted/removed from NVM, the item is created on the + // heap, functions (e.g. CacheAllocator::getAllocInfo) that assumes item is + // located in cache slab doesn't work in such case. + // chained items must be iterated though @chainedAllocs. + // Other APIs used to access chained items are not compatible and should not + // be used. + Item& item; + + // Iterator range pointing to chained allocs associated with @item + // when chained items are evicted/removed from NVM, items are created on the + // heap, functions (e.g. CacheAllocator::getAllocInfo) that assumes items + // are located in cache slab doesn't work in such case. + folly::Range chainedAllocs; + + // the pool that this item is/was + PoolId pool; + }; + + // call back to execute when moving an item, this could be a simple memcpy + // or something more complex. + // An optional parentItem pointer is provided if the item being moved is a + // chained item. + using MoveCb = + std::function; + + // call back type that is executed when the cache item is removed + // (evicted / freed) from RAM, only items inserted into cache (not nascent) + // successfully are tracked + using RemoveCb = std::function; + + // the destructor being executed when the item is removed from cache (both RAM + // and NVM), only items inserted into cache (not nascent) successfully are + // tracked. + using ItemDestructor = std::function; + + using NvmCacheT = NvmCache; + using NvmCacheConfig = typename NvmCacheT::Config; + using DeleteTombStoneGuard = typename NvmCacheT::DeleteTombStoneGuard; + + // Interface for the sync object provided by the user if movingSync is turned + // on. + // SyncObj is for CacheLib to obtain exclusive access to an item when + // it is moving it during slab release. Once held, the user should guarantee + // the item will not be accessed from another thread. + struct SyncObj { + virtual ~SyncObj() = default; + + // Override this function to indicate success/failure of the sync obj, + // if user-supplied SyncObj can fail. e.g. if a lock can timeout. 
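For the RemoveCb/ItemDestructor types defined above, a short usage sketch; setItemDestructor() is assumed from the upstream CacheAllocatorConfig and the callback body is illustrative:

config.setItemDestructor([](const LruAllocator::DestructorData& data) {
  // Invoked once per item leaving the cache, whether from RAM or NVM.
  if (data.context == DestructorContext::kEvictedFromRAM) {
    // e.g. account for an eviction in pool data.pool
  }
  // Chained items must be visited through this range, not via item APIs.
  for (const auto& chained : data.chainedAllocs) {
    (void)chained;
  }
});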
+ virtual bool isValid() const { return true; } + }; + using ChainedItemMovingSync = std::function(Key)>; + + using AccessContainer = typename Item::AccessContainer; + using MMContainer = typename Item::MMContainer; + + // serialization types + using MMSerializationType = typename MMType::SerializationType; + using MMSerializationConfigType = typename MMType::SerializationConfigType; + using MMSerializationTypeContainer = + typename MMType::SerializationTypeContainer; + using AccessSerializationType = typename AccessType::SerializationType; + + using ShmManager = facebook::cachelib::ShmManager; + + // The shared memory segments that can be persisted and re-attached to + enum SharedMemNewT { SharedMemNew }; + // Attach to a persisted shared memory segment + enum SharedMemAttachT { SharedMemAttach }; + + // instantiates a cache allocator on heap memory + // + // @param config the configuration for the whole cache allocator + explicit CacheAllocator(Config config); + + // instantiates a cache allocator on shared memory + // + // @param config the configuration for the whole cache allocator + CacheAllocator(SharedMemNewT, Config config); + + // restore a cache allocator from shared memory + // + // @param config the configuration for the whole cache allocator + // + // @throw std::invalid_argument if cannot restore successful + CacheAllocator(SharedMemAttachT, Config config); + + // Shared segments will be detached upon destruction + ~CacheAllocator() override; + + // create a new cache allocation. The allocation can be initialized + // appropriately and made accessible through insert or insertOrReplace. + // If the handle returned from this api is not passed on to + // insert/insertOrReplace, the allocation gets destroyed when the handle + // goes out of scope. + // + // @param id the pool id for the allocation that was previously + // created through addPool + // @param key the key for the allocation. This will be made a + // part of the Item and be available through getKey(). + // @param size the size of the allocation, exclusive of the key + // size. + // @param ttlSecs Time To Live(second) for the item, + // default with 0 means no expiration time. + // + // @return the handle for the item or an invalid handle(nullptr) if the + // allocation failed. Allocation can fail if we are out of memory + // and can not find an eviction. + // @throw std::invalid_argument if the poolId is invalid or the size + // requested is invalid or if the key is invalid(key.size() == 0 or + // key.size() > 255) + WriteHandle allocate(PoolId id, + Key key, + uint32_t size, + uint32_t ttlSecs = 0, + uint32_t creationTime = 0); + + // Allocate a chained item + // + // The resulting chained item does not have a parent item and + // will be freed once the handle is dropped + // + // The parent handle parameter here is mainly used to find the + // correct pool to allocate memory for this chained item + // + // @param parent handle to the cache item + // @param size the size for the chained allocation + // + // @return handle to the chained allocation + // @throw std::invalid_argument if the size requested is invalid or + // if the item is invalid + WriteHandle allocateChainedItem(const ReadHandle& parent, uint32_t size); + + // Link a chained item to a parent item and mark this parent handle as having + // chained allocations. + // The parent handle is not reset (to become a null handle) so that the caller + // can continue using it as before calling this api. 
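A minimal allocation sketch for allocate(); cache, poolId and payload are assumed to exist, and the item is made visible with insertOrReplace() as the comment above suggests:

auto handle = cache.allocate(poolId, "my-key",
                             static_cast<uint32_t>(payload.size()),
                             3600 /* TTL in seconds */);
if (handle) {  // allocation can fail if the pool cannot find an eviction
  std::memcpy(handle->getMemory(), payload.data(), payload.size());
  cache.insertOrReplace(handle);
}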
+ // + // @param parent handle to the parent item + // @param child chained item that will be linked to the parent + // + // @throw std::invalid_argument if parent is nullptr + void addChainedItem(WriteHandle& parent, WriteHandle child); + + // Pop the first chained item assocaited with this parent and unmark this + // parent handle as having chained allocations. + // The parent handle is not reset (to become a null handle) so that the caller + // can continue using it as before calling this api. + // + // @param parent handle to the parent item + // + // @return ChainedItem head if there exists one + // nullptr otherwise + WriteHandle popChainedItem(WriteHandle& parent); + + // Return the key to the parent item. + // + // This API is racy with transferChainedAndReplace and also with moving during + // a slab release. To use this safely. user needs to synchronize calls to this + // API using their user level lock (in exclusive mode). The same user level + // lock should've been provided via movingSync to CacheLib if moving is + // enabled for slab rebalancing. + // + // @throw std::invalid_argument if chainedItem is not actually a chained item. + Key getParentKey(const Item& chainedItem); + + // replace a chained item in the existing chain. old and new item must be + // chained items that have been allocated with the same parent that is + // passed in. oldItem must be in the chain and newItem must not be. + // + // Upon success a handle to the oldItem is returned for the caller + // + // @param oldItem the item we are replacing in the chain + // @param newItem the item we are replacing it with + // @param parent the parent for the chain + // + // @return handle to the oldItem on return. + // + // @throw std::invalid_argument if any of the pre-conditions fails + WriteHandle replaceChainedItem(Item& oldItem, + WriteHandle newItem, + Item& parent); + + // Transfers the ownership of the chain from the current parent to the new + // parent and inserts the new parent into the cache. Parent will be unmarked + // as having chained allocations and its nvmCache will be invalidated. Parent + // will not be null after calling this API. + // + // Caller must synchronize with any modifications to the parent's chain and + // any calls to find() for the same key to ensure there are no more concurrent + // parent handle while doing this. While calling this method, the cache does + // not guarantee a consistent view for the key and the caller must not rely on + // this. The new parent and old parent must be allocations for the same key. + // New parent must also be an allocation that is not added to the cache. + // + // + // @param parent the current parent of the chain we want to transfer + // @param newParent the new parent for the chain + // + // @throw std::invalid_argument if the parent does not have chained item or + // incorrect state of chained item or if any of the pre-conditions + // are not met + void transferChainAndReplace(WriteHandle& parent, WriteHandle& newParent); + + // Inserts the allocated handle into the AccessContainer, making it + // accessible for everyone. This needs to be the handle that the caller + // allocated through _allocate_. If this call fails, the allocation will be + // freed back when the handle gets out of scope in the caller. + // + // @param handle the handle for the allocation. + // + // @return true if the handle was successfully inserted into the hashtable + // and is now accessible to everyone. False if there was an error. 
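A short sketch of building a chain with allocateChainedItem()/addChainedItem(); sizes are illustrative:

auto parent = cache.allocate(poolId, "parent-key", 100);
if (parent) {
  auto child = cache.allocateChainedItem(parent, 4096);
  if (child) {
    // ... fill child->getMemory() ...
    cache.addChainedItem(parent, std::move(child));
  }
  cache.insertOrReplace(parent);  // make the parent (and its chain) visible
}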
+ // + // @throw std::invalid_argument if the handle is already accessible. + bool insert(const WriteHandle& handle); + + // Replaces the allocated handle into the AccessContainer, making it + // accessible for everyone. If an existing handle is already in the + // container, remove that handle. This needs to be the handle that the caller + // allocated through _allocate_. If this call fails, the allocation will be + // freed back when the handle gets out of scope in the caller. + // + // @param handle the handle for the allocation. + // + // @throw std::invalid_argument if the handle is already accessible. + // @throw cachelib::exception::RefcountOverflow if the item we are replacing + // is already out of refcounts. + // @return handle to the old item that had been replaced + WriteHandle insertOrReplace(const WriteHandle& handle); + + // look up an item by its key across the nvm cache as well if enabled. + // + // @param key the key for lookup + // + // @return the read handle for the item or a handle to nullptr if the + // key does not exist. + ReadHandle find(Key key); + + // Warning: this API is synchronous today with HybridCache. This means as + // opposed to find(), we will block on an item being read from + // flash until it is loaded into DRAM-cache. In find(), if an item + // is missing in dram, we will return a "not-ready" handle and + // user can choose to block or convert to folly::SemiFuture and + // process the item only when it becomes ready (loaded into DRAM). + // If blocking behavior is NOT what you want, a workaround is: + // auto readHandle = cache->find("my key"); + // if (!readHandle.isReady()) { + // auto sf = std::move(readHandle) + // .toSemiFuture() + // .defer([] (auto readHandle)) { + // return std::move(readHandle).toWriteHandle(); + // } + // } + // + // look up an item by its key across the nvm cache as well if enabled. Users + // should call this API only when they are going to mutate the item data. + // + // @param key the key for lookup + // @param isNvmInvalidate whether to do nvm invalidation; + // defaults to be true + // + // @return the write handle for the item or a handle to nullptr if the + // key does not exist. + WriteHandle findToWrite(Key key, bool doNvmInvalidation = true); + + // look up an item by its key. This ignores the nvm cache and only does RAM + // lookup. + // + // @param key the key for lookup + // + // @return the read handle for the item or a handle to nullptr if the key + // does not exist. + FOLLY_ALWAYS_INLINE ReadHandle findFast(Key key); + + // look up an item by its key. This ignores the nvm cache and only does RAM + // lookup. Users should call this API only when they are going to mutate the + // item data. + // + // @param key the key for lookup + // @param isNvmInvalidate whether to do nvm invalidation; + // defaults to be true + // + // @return the write handle for the item or a handle to nullptr if the + // key does not exist. + FOLLY_ALWAYS_INLINE WriteHandle + findFastToWrite(Key key, bool doNvmInvalidation = true); + + // look up an item by its key. This ignores the nvm cache and only does RAM + // lookup. This API does not update the stats related to cache gets and misses + // nor mark the item as useful (see markUseful below). + // + // @param key the key for lookup + // @return the handle for the item or a handle to nullptr if the key does + // not exist. + FOLLY_ALWAYS_INLINE ReadHandle peek(Key key); + + // Mark an item that was fetched through peek as useful. 
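Read and write lookups with the APIs above might look like this (the key is assumed to have been inserted earlier):

auto rh = cache.find("my-key");         // ReadHandle; may be not-ready with NVM enabled
if (rh) {
  const auto* bytes = reinterpret_cast<const char*>(rh->getMemory());
  // ... read bytes[0 .. rh->getSize()) ...
}

auto wh = cache.findToWrite("my-key");  // WriteHandle; invalidates the NVM copy by default
if (wh) {
  std::memset(wh->getMemory(), 0, wh->getSize());
}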
This is useful when + // users want to look into the cache and only mark items as useful when they + // inspect the contents of it. + // + // @param handle the item handle + // @param mode the mode of access for the lookup. defaults to + // AccessMode::kRead + void markUseful(const ReadHandle& handle, AccessMode mode); + + using AccessIterator = typename AccessContainer::Iterator; + // Iterator interface for the cache. It guarantees that all keys that were + // present when the iteration started will be accessible unless they are + // removed. Keys that are removed/inserted during the lifetime of an + // iterator are not guaranteed to be either visited or not-visited. + // Adding/Removing from the hash table while the iterator is alive will not + // inivalidate any iterator or the element that the iterator points at + // currently. The iterator internally holds a Handle to the item and hence + // the keys that the iterator holds reference to, will not be evictable + // until the iterator is destroyed. + AccessIterator begin() { return accessContainer_->begin(); } + + // return an iterator with a throttler for throttled iteration + AccessIterator begin(util::Throttler::Config config) { + return accessContainer_->begin(config); + } + + AccessIterator end() { return accessContainer_->end(); } + + enum class RemoveRes : uint8_t { + kSuccess, + kNotFoundInRam, + }; + // removes the allocation corresponding to the key, if present in the hash + // table. The key will not be accessible through find() after this returns + // success. The allocation for the key will be recycled once all active + // Item handles are released. + // + // @param key the key for the allocation. + // @return kSuccess if the key exists and was successfully removed. + // kNotFoundInRam if the key was not present in memory (doesn't + // check nvm) + RemoveRes remove(Key key); + + // remove the key that the iterator is pointing to. The element will + // not be accessible upon success. However, the elemenet will not actually be + // recycled until the iterator destroys the internal handle. + // + // @param it the iterator to the key to be destroyed. + // @return kSuccess if the element was still in the hashtable and it was + // successfully removed. + // kNotFoundInRam if the element the iterator was pointing to was + // deleted already. + RemoveRes remove(AccessIterator& it); + + // removes the allocation corresponding to the handle. The allocation will + // be freed when all the existing handles are released. + // + // @param it item read handle + // + // @return kSuccess if the item exists and was successfully removed. + // kNotFoundInRam otherwise + // + // @throw std::invalid_argument if item handle is null + RemoveRes remove(const ReadHandle& it); + + // view a read-only parent item as a chain of allocations if it has chained + // alloc. The returned chained-alloc is good to iterate upon, but will block + // any concurrent addChainedItem or popChainedItem for the same key until the + // ChainedAllocs object is released. This is ideal for use cases which do + // very brief operations on the chain of allocations. + // + // The ordering of the iteration for the chain is LIFO. Check + // CacheChainedAllocs.h for the API and usage. + // + // @param parent the parent allocation of the chain from a ReadHandle. 
+  // view a read-only parent item as a chain of allocations if it has chained
+  // alloc. The returned chained-alloc is good to iterate upon, but will block
+  // any concurrent addChainedItem or popChainedItem for the same key until the
+  // ChainedAllocs object is released. This is ideal for use cases which do
+  // very brief operations on the chain of allocations.
+  //
+  // The ordering of the iteration for the chain is LIFO. Check
+  // CacheChainedAllocs.h for the API and usage.
+  //
+  // @param parent the parent allocation of the chain from a ReadHandle.
+  // @return read-only chained alloc view of the parent
+  //
+  // @throw std::invalid_argument if the parent does not have chained allocs
+  ChainedAllocs viewAsChainedAllocs(const ReadHandle& parent) {
+    return viewAsChainedAllocsT(parent);
+  }
+
+  // view a writable parent item as a chain of allocations if it has chained
+  // alloc. The returned chained-alloc is good to iterate upon, but will block
+  // any concurrent addChainedItem or popChainedItem for the same key until the
+  // ChainedAllocs object is released. This is ideal for use cases which do
+  // very brief operations on the chain of allocations.
+  //
+  // The ordering of the iteration for the chain is LIFO. Check
+  // CacheChainedAllocs.h for the API and usage.
+  //
+  // @param parent the parent allocation of the chain from a WriteHandle.
+  // @return writable chained alloc view of the parent
+  //
+  // @throw std::invalid_argument if the parent does not have chained allocs
+  WritableChainedAllocs viewAsWritableChainedAllocs(const WriteHandle& parent) {
+    return viewAsChainedAllocsT(parent);
+  }
+
+  // Returns the full usable size for this item
+  // This can be bigger than item.getSize()
+  //
+  // @param item reference to an item
+  //
+  // @return the full usable size for this item
+  uint32_t getUsableSize(const Item& item) const;
+
+  // gets the allocation class assigned to BG worker
+  auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid);
+  bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid);
+  size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers);
+
+  // Get a random item from memory
+  // This is useful for profiling and sampling cachelib managed memory
+  //
+  // @return ReadHandle if a valid item is found
+  //
+  //         nullptr if the randomly chosen memory does not belong
+  //         to a valid item
+  ReadHandle getSampleItem();
+
+  // Convert a Read Handle to an IOBuf. The returned IOBuf gives a
+  // read-only view to the user. The item's ownership is retained by
+  // the IOBuf until its destruction.
+  //
+  // When the read handle has one or more chained items attached to it,
+  // user will also get a series of IOBufs (first of which is the Parent).
+  //
+  // **WARNING**: folly::IOBuf allows mutation to a cachelib item even when the
+  // item is read-only. User is responsible for ensuring no mutation occurs (i.e.
+  // only const functions are called). If mutation is required, please use
+  // `convertToIOBufForWrite`.
+  //
+  // @param handle read handle that will transfer its ownership to an IOBuf
+  //
+  // @return an IOBuf that contains the value of the item.
+  //         This IOBuf acts as a Read Handle, on destruction, it will
+  //         properly decrement the refcount (to release the item).
+  // @throw std::invalid_argument if ReadHandle is nullptr
+  folly::IOBuf convertToIOBuf(ReadHandle handle) {
+    return convertToIOBufT(handle);
+  }
+
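+  // Example (illustrative sketch only; assumes "cache" holds an item under
+  // "my key"):
+  //   auto handle = cache->find("my key");
+  //   if (handle) {
+  //     folly::IOBuf buf = cache->convertToIOBuf(std::move(handle));
+  //     // 'buf' keeps the item alive until the IOBuf is destroyed
+  //   }
+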
+  // Convert a Write Handle to an IOBuf. The returned IOBuf gives a
+  // writable view to the user. The item's ownership is retained by
+  // the IOBuf until its destruction.
+  //
+  // When the write handle has one or more chained items attached to it,
+  // user will also get a series of IOBufs (first of which is the Parent).
+  //
+  // @param handle write handle that will transfer its ownership to an IOBuf
+  //
+  // @return an IOBuf that contains the value of the item.
+  //         This IOBuf acts as a Write Handle, on destruction, it will
+  //         properly decrement the refcount (to release the item).
+  // @throw std::invalid_argument if WriteHandle is nullptr
+  folly::IOBuf convertToIOBufForWrite(WriteHandle handle) {
+    return convertToIOBufT(handle);
+  }
+
+  // TODO: When Read/Write Handles are ready, change this to allow
+  //       const-only access to data managed by iobuf and offer a
+  //       wrapAsWritableIOBuf() API.
+  //
+  // wrap an IOBuf over the data for an item. This IOBuf does not own the item
+  // and the caller is responsible for ensuring that the IOBuf stays valid
+  // within the item's lifetime. If the item has chained allocations, the
+  // chains are also wrapped into the iobuf as chained iobufs
+  //
+  // @param item the item to wrap around
+  //
+  // @return an IOBuf that contains the value of the item.
+  folly::IOBuf wrapAsIOBuf(const Item& item);
+
+  // creates a pool for the cache allocator with the corresponding name.
+  //
+  // @param name                name of the pool
+  // @param size                size of the pool
+  // @param allocSizes          allocation class sizes, if empty, a default
+  //                            one from the memory allocator will be used
+  // @param config              MMConfig for the MMContainer,
+  //                            default constructed if user doesn't supply one
+  // @param rebalanceStrategy   rebalance strategy for the pool. If not set,
+  //                            the default one will be used.
+  // @param resizeStrategy      resize strategy for the pool. If not set,
+  //                            the default one will be used.
+  // @param ensureProvisionable ensures that the size of the pool is enough
+  //                            to give one slab to each allocation class,
+  //                            false by default.
+  //
+  // @return a valid PoolId that the caller can use.
+  // @throw  std::invalid_argument if the size is invalid or there is not
+  //         enough space for creating the pool.
+  //         std::logic_error if we have run out of pools.
+  PoolId addPool(folly::StringPiece name,
+                 size_t size,
+                 const std::set<uint32_t>& allocSizes = {},
+                 MMConfig config = {},
+                 std::shared_ptr<RebalanceStrategy> rebalanceStrategy = nullptr,
+                 std::shared_ptr<RebalanceStrategy> resizeStrategy = nullptr,
+                 bool ensureProvisionable = false);
+
+  // update an existing pool's config
+  //
+  // @param pid    pool id for the pool to be updated
+  // @param config new config for the pool
+  //
+  // @throw std::invalid_argument if the poolId is invalid
+  void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config);
+
+  // update an existing pool's rebalance strategy
+  //
+  // @param pid               pool id for the pool to be updated
+  // @param rebalanceStrategy new rebalance strategy for the pool
+  //
+  // @throw std::invalid_argument if the poolId is invalid
+  void overridePoolRebalanceStrategy(
+      PoolId pid, std::shared_ptr<RebalanceStrategy> rebalanceStrategy);
+
+  // update an existing pool's resize strategy
+  //
+  // @param pid            pool id for the pool to be updated
+  // @param resizeStrategy new resize strategy for the pool
+  //
+  // @throw std::invalid_argument if the poolId is invalid
+  void overridePoolResizeStrategy(
+      PoolId pid, std::shared_ptr<RebalanceStrategy> resizeStrategy);
+
+  // update pool size optimization strategy for this cache
+  // @param optimizeStrategy new resize strategy
+  void overridePoolOptimizeStrategy(
+      std::shared_ptr<PoolOptimizeStrategy> optimizeStrategy);
+
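+  // Example (illustrative sketch only; "poolSize", "payload" and "payloadSize"
+  // are made up, and Item::getMemory() is assumed from the item API):
+  //   PoolId pid = cache->addPool("default", poolSize);
+  //   auto handle = cache->allocate(pid, "my key", payloadSize);
+  //   if (handle) {
+  //     std::memcpy(handle->getMemory(), payload, payloadSize);
+  //     cache->insertOrReplace(handle);
+  //   }
+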
+  /**
+   * PoolResizing can be done online while the cache allocator is being used
+   * to do allocations. Pools can be grown or shrunk using the following api.
+   * The actual resizing happens asynchronously and is controlled by the
+   * config parameters poolResizeIntervalSecs and poolResizeSlabsPerIter. The
+   * pool resizer releases slabs from pools that are over limit when the
+   * memory allocator is out of memory. If there is enough free memory
+   * available, the pool resizer does not do any resizing until the memory is
+   * exhausted and there is some pool that is over the limit
+   */
+
+  // shrink the existing pool by _bytes_.
+  // @param bytes   the number of bytes to be taken away from the pool
+  // @return  true if the operation succeeded. false if the size of the pool is
+  //          smaller than _bytes_
+  // @throw   std::invalid_argument if the poolId is invalid.
+  // TODO: should call shrinkPool for specific tier?
+  bool shrinkPool(PoolId pid, size_t bytes) {
+    return allocator_[currentTier()]->shrinkPool(pid, bytes);
+  }
+
+  // grow an existing pool by _bytes_. This will fail if there is no
+  // available memory across all the pools to provide for this pool
+  // @param bytes   the number of bytes to be added to the pool.
+  // @return  true if the pool was grown. false if the necessary number of
+  //          bytes were not available.
+  // @throw   std::invalid_argument if the poolId is invalid.
+  // TODO: should call growPool for specific tier?
+  bool growPool(PoolId pid, size_t bytes) {
+    return allocator_[currentTier()]->growPool(pid, bytes);
+  }
+
+  // move bytes from one pool to another. The source pool should be at least
+  // _bytes_ in size.
+  //
+  // @param src     the pool to be sized down and giving the memory.
+  // @param dest    the pool receiving the memory.
+  // @param bytes   the number of bytes to move from src to dest.
+  // @return  true if the resize succeeded. false if src does not have
+  //          the correct size to do the transfer.
+  // @throw   std::invalid_argument if src or dest is invalid pool
+  bool resizePools(PoolId src, PoolId dest, size_t bytes) override {
+    return allocator_[currentTier()]->resizePools(src, dest, bytes);
+  }
+
+  // Add a new compact cache with given name and size
+  //
+  // @param name  name of the compact cache pool
+  // @param size  size of the compact cache pool
+  // @param args  All of the arguments in CompactCache after allocator
+  //              So the signature of addCompactCache is:
+  //              addCompactCache(folly::StringPiece name,
+  //                              size_t size,
+  //                              RemoveCb removeCb,
+  //                              ReplaceCb replaceCb,
+  //                              ValidCb validCb,
+  //                              bool allowPromotions = true);
+  //              addCompactCache(folly::StringPiece name,
+  //                              size_t size,
+  //                              bool allowPromotions = true);
+  //
+  // @return pointer to CompactCache instance of the template type
+  //
+  // @throw std::logic_error       if compact cache is not enabled
+  // @throw std::invalid_argument  There is a memory pool that has the same
+  //                               name as the compact cache we are adding or
+  //                               if there is no sufficient space to create
+  //                               a compact cache.
+  template <typename CCacheT, typename... Args>
+  CCacheT* addCompactCache(folly::StringPiece name,
+                           size_t size,
+                           Args&&... args);
+
+  // Attach a compact cache to the given pool after warm roll
+  //
+  // @param name  name of the compact cache pool
+  // @param args  All of the arguments in CompactCache after allocator
+  //              So the signature of attachCompactCache is:
+  //              attachCompactCache(folly::StringPiece name,
+  //                                 RemoveCb removeCb,
+  //                                 ReplaceCb replaceCb,
+  //                                 ValidCb validCb,
+  //                                 bool allowPromotions = true);
+  //              attachCompactCache(folly::StringPiece name,
+  //                                 bool allowPromotions = true);
+  //
+  // @return pointer to CompactCache instance of the template type.
+  //
+  // @throw std::out_of_range      if the pool does not exist
+  // @throw std::invalid_argument  if the compact key/value size does not match
+  //                               from warm roll
+  template <typename CCacheT, typename... Args>
+  CCacheT* attachCompactCache(folly::StringPiece name, Args&&...
args); + + // Return the base iterface of an attached compact cache to pull out its + // stats. For non-active compact cache, this would throw + // std::invalid_argument. + const ICompactCache& getCompactCache(PoolId pid) const override; + + // The enum value that indicates the CacheAllocator's shutdown status. + enum class ShutDownStatus { + kSuccess = 0, // Successfully persisted the DRAM cache, and the NvmCache if + // enabled. + kSavedOnlyDRAM, // Successfully persisted the DRAM cache only; NvmCache is + // enabled but failed to persist it. + kSavedOnlyNvmCache, // Successfully persisted the enabled NvM cache only; + // Failed to persist DRAM cache. + kFailed // Failed to persist both the DRAM cache and the enabled NvmCache. + }; + + // Persists the state of the cache allocator. On a successful shutdown, + // this cache allocator can be restored on restart. + // + // precondition: serialization must happen without any reader or writer + // present. Any modification of this object afterwards will result in an + // invalid, inconsistent state for the serialized data. There must not be + // any outstanding active handles + // + // @throw std::invalid_argument if the cache allocator isn't using shared + // memory + // @throw std::logic_error if any component is not restorable. + // @return A ShutDownStatus value indicating the result of the shutDown + // operation. + // kSuccess - successfully shut down and can be re-attached + // kFailed - failure due to outstanding active handle or error with + // cache dir + // kSavedOnlyDRAM and kSavedOnlyNvmCache - partial content saved + ShutDownStatus shutDown(); + + // No-op for workers that are already running. Typically user uses this in + // conjunction with `config.delayWorkerStart()` to avoid initialization + // ordering issues with user callback for cachelib's workers. + void startCacheWorkers(); + + // Functions that stop existing ones (if any) and create new workers + + // start pool rebalancer + // @param interval the period this worker fires. + // @param strategy rebalancing strategy + // @param freeAllocThreshold threshold for free-alloc-slab for picking victim + // allocation class. free-alloc-slab is calculated by the number of free + // allocation divided by the number of allocations in one slab. Only + // allocation classes with a higher free-alloc-slab than the threshold would + // be picked as a victim. + // + // + bool startNewPoolRebalancer(std::chrono::milliseconds interval, + std::shared_ptr strategy, + unsigned int freeAllocThreshold); + + // start pool resizer + // @param interval the period this worker fires. + // @param poolResizeSlabsPerIter maximum number of slabs each pool may remove + // in resizing. + // @param strategy resizing strategy + bool startNewPoolResizer(std::chrono::milliseconds interval, + unsigned int poolResizeSlabsPerIter, + std::shared_ptr strategy); + + // start pool optimizer + // @param regularInterval the period for optimizing regular cache + // @param ccacheInterval the period for optimizing compact cache + // @param strategy pool optimizing strategy + // @param ccacheStepSizePercent the percentage number that controls the size + // of each movement in a compact cache + // optimization. 
+ bool startNewPoolOptimizer(std::chrono::seconds regularInterval, + std::chrono::seconds ccacheInterval, + std::shared_ptr strategy, + unsigned int ccacheStepSizePercent); + // start memory monitor + // @param memMonitorMode memory monitor mode + // @param interval the period this worker fires + // @param memAdvisePercentPerIter Percentage of + // upperLimitGB-lowerLimitGB to be + // advised every poll period. This + // governs the rate of advise + // @param memReclaimPercentPerIter Percentage of + // upperLimitGB-lowerLimitGB to be + // reclaimed every poll period. This + // governs the rate of reclaim + // @param memLowerLimit The lower limit of resident memory + // in GBytes + // that triggers reclaiming of + // previously advised away of memory + // from cache + // @param memUpperLimit The upper limit of resident memory + // in GBytes + // that triggers advising of memory + // from cache + // @param memMaxAdvisePercent Maximum percentage of item cache + // limit that + // can be advised away before advising + // is disabled leading to a probable + // OOM. + // @param strategy strategy to find an allocation class + // to release slab from + // @param reclaimRateLimitWindowSecs specifies window in seconds over + // which free/resident memory values + // are tracked to determine rate of + // change to rate limit reclaim + bool startNewMemMonitor(MemoryMonitor::Mode memMonitorMode, + std::chrono::milliseconds interval, + unsigned int memAdvisePercentPerIter, + unsigned int memReclaimPercentPerIter, + unsigned int memLowerLimitGB, + unsigned int memUpperLimitGB, + unsigned int memMaxAdvisePercent, + std::shared_ptr strategy, + std::chrono::seconds reclaimRateLimitWindowSecs); + // start memory monitor + // @param interval the period this worker fires + // @param config memory monitoring config + // @param strategy strategy to find an allocation class + // to release slab from + bool startNewMemMonitor(std::chrono::milliseconds interval, + MemoryMonitor::Config config, + std::shared_ptr strategy); + + // start reaper + // @param interval the period this worker fires + // @param reaperThrottleConfig throttling config + bool startNewReaper(std::chrono::milliseconds interval, + util::Throttler::Config reaperThrottleConfig); + + bool startNewBackgroundPromoter(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); + bool startNewBackgroundEvictor(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); + + // Stop existing workers with a timeout + bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{ + 0}); + bool stopPoolResizer(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopPoolOptimizer(std::chrono::seconds timeout = std::chrono::seconds{ + 0}); + bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0}); + + // Set pool optimization to either true or false + // + // @param poolId The ID of the pool to optimize + void setPoolOptimizerFor(PoolId poolId, bool enableAutoResizing); + + // stop the background workers + // returns true if all workers have been successfully stopped + bool stopWorkers(std::chrono::seconds timeout = std::chrono::seconds{0}); + + // get the allocation information for the specified memory address. 
+ // @throw std::invalid_argument if the memory does not belong to this + // cache allocator + AllocInfo getAllocInfo(const void* memory) const { + return allocator_[getTierId(memory)]->getAllocInfo(memory); + } + + // return the ids for the set of existing pools in this cache. + std::set getPoolIds() const override final { + // all tiers have the same pool ids. TODO: deduplicate + return allocator_[0]->getPoolIds(); + } + + // return a list of pool ids that are backing compact caches. This includes + // both attached and un-attached compact caches. + std::set getCCachePoolIds() const override final; + + // return a list of pool ids for regular pools. + std::set getRegularPoolIds() const override final; + + // return the pool with speicified id. + const MemoryPool& getPool(PoolId pid) const override final { + return allocator_[currentTier()]->getPool(pid); + } + + const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final { + return allocator_[tid]->getPool(pid); + } + + // calculate the number of slabs to be advised/reclaimed in each pool + PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { + auto regularPoolIds = getRegularPoolIds(); + return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds); + } + + // update number of slabs to advise in the cache + void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final { + allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise); + } + + // returns a valid PoolId corresponding to the name or kInvalidPoolId if the + // name is not a recognized pool + PoolId getPoolId(folly::StringPiece name) const noexcept; + + // returns the pool's name by its poolId. + std::string getPoolName(PoolId poolId) const { + // all tiers have the same pool names. + return allocator_[0]->getPoolName(poolId); + } + + // get stats related to all kinds of slab release events. + SlabReleaseStats getSlabReleaseStats() const noexcept override final; + + // return the distribution of the keys in the cache. This is expensive to + // compute at times even with caching. So use with caution. + // TODO think of a way to abstract this since it only makes sense for + // bucketed hashtables with chaining. + using DistributionStats = typename AccessContainer::DistributionStats; + DistributionStats getAccessContainerDistributionStats() const { + return accessContainer_->getDistributionStats(); + } + + // Provide stats on the number of keys cached and the access container for + // this cache. + using AccessStats = typename AccessContainer::Stats; + AccessStats getAccessContainerStats() const { + return accessContainer_->getStats(); + } + + // Get the total number of keys inserted into the access container + uint64_t getAccessContainerNumKeys() const { + return accessContainer_->getNumKeys(); + } + + // returns the reaper stats + ReaperStats getReaperStats() const { + auto stats = reaper_ ? 
reaper_->getStats() : ReaperStats{}; + return stats; + } + + // returns the background mover stats + BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const { + + auto stats = BackgroundMoverStats{}; + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) + stats += bg->getStats(); + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) + stats += bg->getStats(); + } + return stats; + + } + + /* BackgroundPromotionStats getBackgroundPromoterStats() const { + auto stats = BackgroundPromotionStats{}; + for (auto &bg : backgroundPromoter_) + stats += bg->getStats(); + return stats; + } + */ + + std::map>> + getBackgroundMoverClassStats(MoverDir direction) const { + std::map>> stats; + + if (direction == MoverDir::Evict) { + for (auto &bg : backgroundEvictor_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } else if (direction == MoverDir::Promote) { + for (auto &bg : backgroundPromoter_) { + for (auto &tid : bg->getClassStats()) { + for (auto &pid : tid.second) { + for (auto &cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } + } + } + } + } + + return stats; + } + + + // return the LruType of an item + typename MMType::LruType getItemLruType(const Item& item) const; + + // return the recent slab release events for a pool for rebalancer, Resizer + // and Memory monitor workers. + AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId pid) const override final; + + // get cache name + const std::string getCacheName() const override final; + + // whether it is object-cache + bool isObjectCache() const override final { return false; } + + // combined pool size for all memory tiers + size_t getPoolSize(PoolId pid) const; + + // pool stats by pool id + PoolStats getPoolStats(PoolId pid) const override final; + + // This can be expensive so it is not part of PoolStats + PoolEvictionAgeStats getPoolEvictionAgeStats( + PoolId pid, unsigned int slabProjectionLength) const override final; + + // return the cache's metadata + CacheMetadata getCacheMetadata() const noexcept override final; + + // return the overall cache stats + GlobalCacheStats getGlobalCacheStats() const override final; + + // return cache's memory usage stats + CacheMemoryStats getCacheMemoryStats() const override final; + + // return basic stats for Allocation Class + AllocationClassBaseStat getAllocationClassStats(TierId tid, PoolId pid, ClassId cid) + const override final; + + // return the nvm cache stats map + std::unordered_map getNvmCacheStatsMap() + const override final; + + // return the event tracker stats map + std::unordered_map getEventTrackerStatsMap() + const override { + std::unordered_map eventTrackerStats; + if (auto eventTracker = getEventTracker()) { + eventTracker->getStats(eventTrackerStats); + } + return eventTrackerStats; + } + + // Whether this cache allocator was created on shared memory. + bool isOnShm() const noexcept { return isOnShm_; } + + // Whether NvmCache is currently enabled + bool isNvmCacheEnabled() const noexcept { + return nvmCache_ && nvmCache_->isEnabled(); + } + + // unix timestamp when the cache was created. If 0, the cache creation + // time was not recorded from the older version of the binary. + // + // @return time when the cache was created. 
+  time_t getCacheCreationTime() const noexcept { return cacheCreationTime_; }
+  time_t getNVMCacheCreationTime() const {
+    return nvmCacheState_.getCreationTime();
+  }
+
+  // Inspects the cache without changing its state.
+  //
+  // @param key   for the cache item
+  // @return std::pair of read handles; the first represents the state
+  //         in the RAM and the second is a copy of the state in NVM
+  std::pair<ReadHandle, ReadHandle> inspectCache(Key key);
+
+  // blocks until the inflight operations are flushed to nvmcache. Used for
+  // benchmarking when we want to load up the cache first with some data and
+  // run the benchmarks after flushing.
+  void flushNvmCache();
+
+  // Dump the last N items for an evictable MM Container
+  // @return vector of the string of each item. Empty if nothing in LRU
+  // @throw  std::invalid_argument if the container does not exist
+  std::vector<std::string> dumpEvictionIterator(PoolId pid,
+                                                ClassId cid,
+                                                size_t numItems = 10);
+
+  // returns the current count of the active handles that are handed out
+  // through the API. This also includes the handles that are maintained by
+  // the iterator and internal rebalancing threads.
+  int64_t getNumActiveHandles() const;
+
+  // returns the current count of handles at a given time for this thread. If
+  // the threads do not transfer handles between them, the caller can rely on
+  // this being the current active outstanding handles. If handles can be
+  // transferred, then the caller needs to take a snapshot of the count before
+  // and after and make sure that the count reflects any transfer.
+  //
+  // TODO (sathya) wrap this into a guard object
+  //
+  // @return the current handle count. If handles are never transferred
+  //         between threads, this will always be >= 0. If handles are
+  //         transferred, this can be negative on threads that give away
+  //         handles and always non zero on threads that acquire a handle from
+  //         another thread.
+  int64_t getHandleCountForThread() const;
+
+  // (Deprecated) reset and adjust handle count for the current thread.
+  // Please do not use this API as it will be moved to a private function.
+  void resetHandleCountForThread_private();
+  void adjustHandleCountForThread_private(int64_t delta);
+
+  // madvise(MADV_DODUMP) all recently accessed items.
+  // this function is intended for signal handlers which can mprotect other
+  // code. Hence, it is inlined to make sure that the code lives in the
+  // caller's text segment
+  void FOLLY_ALWAYS_INLINE madviseRecentlyAccessedItems() const {
+    for (const auto& tlring : ring_.accessAllThreads()) {
+      tlring.madviseItems();
+    }
+  }
+
+  // Mark the item as dirty and enqueue for deletion from nvmcache
+  // @param item item to invalidate.
+  void invalidateNvm(Item& item);
+
+  // Attempts to clean up left-over shared memory from a previous instance of
+  // cachelib cache for the cache directory. If there are other processes
+  // using the same directory, we don't touch it. If the directory is not
+  // present, we do our best to clean up based on what is possible.
+  // It is hard to determine if we actually cleaned up something.
+  //
+  // returns true if there was no error in trying to cleanup the segment
+  // because another process was attached. False if the user tried to clean up
+  // and the cache was actually attached.
+  static bool cleanupStrayShmSegments(const std::string& cacheDir, bool posix
+                                      /*TODO: const std::vector& config = {} */);
+
+  // gives a relative offset to a pointer within the cache.
+  uint64_t getItemPtrAsOffset(const void* ptr);
+
+  // this ensures that we don't introduce any more hidden fields like vtable by
+  // inheriting from the Hooks and their bool interface.
+  static_assert((sizeof(typename MMType::template Hook<Item>) +
+                 sizeof(typename AccessType::template Hook<Item>) +
+                 sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) +
+                 sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item),
+                "vtable overhead");
+  // XXX: this will fail due to CompressedPtr change
+  // static_assert(32 == sizeof(Item), "item overhead is 32 bytes");
+
+  // make sure there is no overhead in ChainedItem on top of a regular Item
+  static_assert(sizeof(Item) == sizeof(ChainedItem),
+                "Item and ChainedItem must be the same size");
+
+  static_assert(std::is_standard_layout<KAllocation>::value,
+                "KAllocation not standard layout");
+  static_assert(std::is_standard_layout<ChainedItemPayload<CacheTrait>>::value,
+                "ChainedItemPayload not standard layout");
+
+// ensure that Item::alloc_ is the last member of Item. If not,
+// Item::alloc_::data[0] will not work as a variable sized struct.
+// gcc is strict about using offsetof in Item when Item has a default
+// constructor, hence becoming a non-POD. Suppress -Winvalid-offsetof for
+// this sake.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winvalid-offsetof"
+  static_assert(sizeof(Item) - offsetof(Item, alloc_) == sizeof(Item::alloc_),
+                "alloc_ incorrectly arranged");
+#pragma GCC diagnostic pop
+
+ private:
+  double slabsApproxFreePercentage(TierId tid) const;
+
+  // wrapper around Item's refcount and active handle tracking
+  FOLLY_ALWAYS_INLINE void incRef(Item& it);
+  FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it);
+
+  // drops the refcount and if needed, frees the allocation back to the memory
+  // allocator and executes the necessary callbacks. no-op if it is nullptr.
+  FOLLY_ALWAYS_INLINE void release(Item* it, bool isNascent);
+
+  // Differentiate different memory settings for the initialization
+  enum class InitMemType { kNone, kMemNew, kMemAttach };
+  // instantiates a cache allocator for common initialization
+  //
+  // @param types  the type of the memory used
+  // @param config the configuration for the whole cache allocator
+  CacheAllocator(InitMemType types, Config config);
+
+  // This is the last step in item release. We also use this for the eviction
+  // scenario where we have to do everything, but not release the allocation
+  // to the allocator and instead recycle it for another new allocation. If
+  // toRecycle is present, the item to be released and the item to be recycled
+  // will be the same if we want to recycle a parent item. If we want to
+  // recycle a child item, the parent would be the item to release and the
+  // child will be the item to recycle. If it is a chained item, we simply
+  // free it back.
+  //
+  // 1. It calls the remove callback
+  // 2. It releases the item and chained allocation memory back to allocator
+  //
+  // An item should have its refcount dropped to zero, and unlinked from
+  // access and mm container before being passed to this function.
+  //
+  // @param it        item to be released.
+  // @param ctx       removal context
+  // @param toRecycle An item that will be recycled, this item is to be
+  //                  ignored if it's found in the process of freeing
+  //                  a chained allocation
+  //
+  // @return One of ReleaseRes. In all cases, _it_ is always released back to
+  //         the allocator unless an exception is thrown
+  //
+  // @throw runtime_error if _it_ has pending refs or is not a regular item.
+  //        runtime_error if parent->chain is broken
+  enum class ReleaseRes {
+    kRecycled,    // _it_ was released and _toRecycle_ was recycled
+    kNotRecycled, // _it_ was released and _toRecycle_ was not recycled
+    kReleased,    // toRecycle == nullptr and it was released
+  };
+  ReleaseRes releaseBackToAllocator(Item& it,
+                                    RemoveContext ctx,
+                                    bool nascent = false,
+                                    const Item* toRecycle = nullptr);
+
+  // acquires a handle on the item. returns an empty handle if it is null.
+  // @param it    pointer to an item
+  // @return WriteHandle   return a handle to this item
+  // @throw std::overflow_error if the maximum item refcount is exceeded by
+  //        creating this item handle.
+  WriteHandle acquire(Item* it);
+
+  // creates an item handle with wait context.
+  WriteHandle createNvmCacheFillHandle() { return WriteHandle{*this}; }
+
+  // acquires the wait context for the handle. This is used by NvmCache to
+  // maintain a list of waiters
+  std::shared_ptr<WaitContext<ReadHandle>> getWaitContext(
+      ReadHandle& hdl) const {
+    return hdl.getItemWaitContext();
+  }
+
+  using MMContainerPtr = std::unique_ptr<MMContainer>;
+  using MMContainers =
+      std::vector<std::array<std::array<MMContainerPtr, MemoryAllocator::kMaxClasses>,
+                             MemoryPoolManager::kMaxPools>>;
+
+  void createMMContainers(const PoolId pid, MMConfig config);
+
+  TierId getTierId(const Item& item) const;
+  TierId getTierId(const void* ptr) const;
+
+  // acquire the MMContainer corresponding to the Item's class and pool.
+  //
+  // @return pointer to the MMContainer.
+  // @throw  std::invalid_argument if the Item does not point to a valid
+  //         allocation from the memory allocator.
+  MMContainer& getMMContainer(const Item& item) const noexcept;
+
+  MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept;
+
+  // Get stats of the specified pid and cid.
+  // If such mmcontainer is not valid (pool id or cid out of bound)
+  // or the mmcontainer is not initialized, return an empty stat.
+  MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept;
+
+  // create a new cache allocation. The allocation can be initialized
+  // appropriately and made accessible through insert or insertOrReplace.
+  // If the handle returned from this api is not passed on to
+  // insert/insertOrReplace, the allocation gets destroyed when the handle
+  // goes out of scope.
+  //
+  // @param id           the pool id for the allocation that was previously
+  //                     created through addPool
+  // @param key          the key for the allocation. This will be made a
+  //                     part of the Item and be available through getKey().
+  // @param size         the size of the allocation, exclusive of the key
+  //                     size.
+  // @param creationTime Timestamp when this item was created
+  // @param expiryTime   set an expiry timestamp for the item (0 means no
+  //                     expiration time).
+  //
+  // @return the handle for the item or an invalid handle(nullptr) if the
+  //         allocation failed. Allocation can fail if one such
+  //         allocation already exists or if we are out of memory and
+  //         can not find an eviction. Handle must be destroyed *before*
+  //         the instance of the CacheAllocator gets destroyed
+  // @throw  std::invalid_argument if the poolId is invalid or the size
+  //         requested is invalid or if the key is invalid(key.size() == 0 or
+  //         key.size() > 255)
+  WriteHandle allocateInternal(PoolId id,
+                               Key key,
+                               uint32_t size,
+                               uint32_t creationTime,
+                               uint32_t expiryTime,
+                               bool fromBgThread = false);
+
+  // create a new cache allocation on specific memory tier.
+  // For description see allocateInternal.
+ // + // @param tid id a memory tier + WriteHandle allocateInternalTier(TierId tid, + PoolId id, + Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread); + + // Allocate a chained item + // + // The resulting chained item does not have a parent item and + // will be freed once the handle is dropped + // + // The parent handle parameter here is mainly used to find the + // correct pool to allocate memory for this chained item + // + // @param parent handle to the cache item + // @param size the size for the chained allocation + // + // @return handle to the chained allocation + // @throw std::invalid_argument if the size requested is invalid or + // if the item is invalid + WriteHandle allocateChainedItemInternal(const ReadHandle& parent, + uint32_t size); + + // Given an item and its parentKey, validate that the parentKey + // corresponds to an item that's the parent of the supplied item. + // + // @param item item that we want to get the parent handle for + // @param parentKey key of the item's parent + // + // @return handle to the parent item if the validations pass + // otherwise, an empty Handle is returned. + // + ReadHandle validateAndGetParentHandleForChainedMoveLocked( + const ChainedItem& item, const Key& parentKey); + + // Given an existing item, allocate a new one for the + // existing one to later be moved into. + // + // @param oldItem the item we want to allocate a new item for + // + // @return handle to the newly allocated item + // + WriteHandle allocateNewItemForOldItem(const Item& oldItem); + + // internal helper that grabs a refcounted handle to the item. This does + // not record the access to reflect in the mmContainer. + // + // @param key key to look up in the access container + // + // @return handle if item is found, nullptr otherwise + // + // @throw std::overflow_error is the maximum item refcount is execeeded by + // creating this item handle. + WriteHandle findInternal(Key key) { + // Note: this method can not be const because we need a non-const + // reference to create the ItemReleaser. + return accessContainer_->find(key); + } + + // look up an item by its key. This ignores the nvm cache and only does RAM + // lookup. + // + // @param key the key for lookup + // @param mode the mode of access for the lookup. + // AccessMode::kRead or AccessMode::kWrite + // + // @return the handle for the item or a handle to nullptr if the key does + // not exist. + FOLLY_ALWAYS_INLINE WriteHandle findFastInternal(Key key, AccessMode mode); + + // look up an item by its key across the nvm cache as well if enabled. + // + // @param key the key for lookup + // @param mode the mode of access for the lookup. + // AccessMode::kRead or AccessMode::kWrite + // + // @return the handle for the item or a handle to nullptr if the key does + // not exist. + FOLLY_ALWAYS_INLINE WriteHandle findImpl(Key key, AccessMode mode); + + // look up an item by its key. This ignores the nvm cache and only does RAM + // lookup. + // + // @param key the key for lookup + // @param mode the mode of access for the lookup. + // AccessMode::kRead or AccessMode::kWrite + // + // @return the handle for the item or a handle to nullptr if the key does + // not exist. + FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode); + + // Moves a regular item to a different memory tier. 
+ // + // @param oldItem Reference to the item being moved + // @param newItemHdl Reference to the handle of the new item being moved into + // + // @return true If the move was completed, and the containers were updated + // successfully. + template + WriteHandle moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl, P&& predicate); + + // Moves a regular item to a different slab. This should only be used during + // slab release after the item's moving bit has been set. The user supplied + // callback is responsible for copying the contents and fixing the semantics + // of chained item. + // + // @param oldItem Reference to the item being moved + // @param newItemHdl Reference to the handle of the new item being moved into + // + // @return true If the move was completed, and the containers were updated + // successfully. + bool moveRegularItem(Item& oldItem, WriteHandle& newItemHdl); + + // template class for viewAsChainedAllocs that takes either ReadHandle or + // WriteHandle + template + CacheChainedAllocs viewAsChainedAllocsT( + const Handle& parent); + + // template class for convertToIOBuf that takes either ReadHandle or + // WriteHandle + template + folly::IOBuf convertToIOBufT(Handle& handle); + + // Moves a chained item to a different slab. This should only be used during + // slab release after the item's moving bit has been set. The user supplied + // callback is responsible for copying the contents and fixing the semantics + // of chained item. + // + // Note: If we have successfully moved the old item into the new, the + // newItemHdl is reset and no longer usable by the caller. + // + // @param oldItem Reference to the item being moved + // @param newItemHdl Reference to the handle of the new item being + // moved into + // + // @return true If the move was completed, and the containers were updated + // successfully. + bool moveChainedItem(ChainedItem& oldItem, WriteHandle& newItemHdl); + + // Transfers the chain ownership from parent to newParent. Parent + // will be unmarked as having chained allocations. Parent will not be null + // after calling this API. + // + // Parent and NewParent must be valid handles to items with same key and + // parent must have chained items and parent handle must be the only + // outstanding handle for parent. New parent must be without any chained item + // handles. + // + // Chained item lock for the parent's key needs to be held in exclusive mode. + // + // @param parent the current parent of the chain we want to transfer + // @param newParent the new parent for the chain + // + // @throw if any of the conditions for parent or newParent are not met. + void transferChainLocked(WriteHandle& parent, WriteHandle& newParent); + + // replace a chained item in the existing chain. This needs to be called + // with the chained item lock held exclusive + // + // @param oldItem the item we are replacing in the chain + // @param newItem the item we are replacing it with + // @param parent the parent for the chain + // + // @return handle to the oldItem + WriteHandle replaceChainedItemLocked(Item& oldItem, + WriteHandle newItemHdl, + const Item& parent); + + // Insert an item into MM container. The caller must hold a valid handle for + // the item. + // + // @param item Item that we want to insert. + // + // @throw std::runtime_error if the handle is already in the mm container + void insertInMMContainer(Item& item); + + // Removes an item from the corresponding MMContainer if it is in the + // container. 
The caller must hold a valid handle for the item. + // + // @param item Item that we want to remove. + // + // @return true if the item is successfully removed + // false if the item is not in MMContainer + bool removeFromMMContainer(Item& item); + + using EvictionIterator = typename MMContainer::Iterator; + + WriteHandle acquire(EvictionIterator& it) { return acquire(it.get()); } + + // Replaces an item in the MMContainer with another item, at the same + // position. + // + // @param oldItem item being replaced + // @param newItem item to replace oldItem with + // + // @return true If the replace was successful. Returns false if the + // destination item did not exist in the container, or if the + // source item already existed. + bool replaceInMMContainer(Item& oldItem, Item& newItem); + bool replaceInMMContainer(Item* oldItem, Item& newItem); + bool replaceInMMContainer(EvictionIterator& oldItemIt, Item& newItem); + + // Replaces an item in the MMContainer with another item, at the same + // position. Or, if the two chained items belong to two different MM + // containers, remove the old one from its MM container and add the new + // one to its MM container. + // + // @param oldItem item being replaced + // @param newItem item to replace oldItem with + // + // @return true If the replace was successful. Returns false if the + // destination item did not exist in the container, or if the + // source item already existed. + bool replaceChainedItemInMMContainer(Item& oldItem, Item& newItem); + + // Only replaces an item if it is accessible + bool replaceIfAccessible(Item& oldItem, Item& newItem); + + // Inserts the allocated handle into the AccessContainer, making it + // accessible for everyone. This needs to be the handle that the caller + // allocated through _allocate_. If this call fails, the allocation will be + // freed back when the handle gets out of scope in the caller. + // + // @param handle the handle for the allocation. + // @param event AllocatorApiEvent that corresponds to the current operation. + // supported events are INSERT, corresponding to the client + // insert call, and INSERT_FROM_NVM, cooresponding to the insert + // call that happens when an item is promoted from NVM storage + // to memory. + // + // @return true if the handle was successfully inserted into the hashtable + // and is now accessible to everyone. False if there was an error. + // + // @throw std::invalid_argument if the handle is already accessible or invalid + bool insertImpl(const WriteHandle& handle, AllocatorApiEvent event); + + // Removes an item from the access container and MM container. + // + // @param hk the hashed key for the item + // @param it Item to remove + // @param tombstone A tombstone for nvm::remove job created by + // nvm::createDeleteTombStone, can be empty if nvm is + // not enable, or removeFromNvm is false + // @param removeFromNvm if true clear key from nvm + // @param recordApiEvent should we record API event for this operation. + RemoveRes removeImpl(HashedKey hk, + Item& it, + DeleteTombStoneGuard tombstone, + bool removeFromNvm = true, + bool recordApiEvent = true); + + // Implementation to find a suitable eviction from the container. The + // two parameters together identify a single container. + // + // @param pid the id of the pool to look for evictions inside + // @param cid the id of the class to look for evictions inside + // @return An evicted item or nullptr if there is no suitable candidate. 
+ Item* findEviction(TierId tid, PoolId pid, ClassId cid); + + // Try to move the item down to the next memory tier + // + // @param tid current tier ID of the item + // @param pid the pool ID the item belong to. + // @param item the item to evict + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); + + bool tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); + + bool tryPromoteToNextMemoryTier(Item& item, bool fromBgThread); + + // Try to move the item down to the next memory tier + // + // @param item the item to evict + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread); + + size_t memoryTierSize(TierId tid) const; + + // Deserializer CacheAllocatorMetadata and verify the version + // + // @param deserializer Deserializer object + // @throws runtime_error + serialization::CacheAllocatorMetadata deserializeCacheAllocatorMetadata( + Deserializer& deserializer); + + MMContainers deserializeMMContainers( + Deserializer& deserializer, + const typename Item::PtrCompressor& compressor); + + unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final { + return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs); + } + + FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const { + return config_.eventTracker.get(); + } + + // Releases a slab from a pool into its corresponding memory pool + // or back to the slab allocator, depending on SlabReleaseMode. + // SlabReleaseMode::kRebalance -> back to the pool + // SlabReleaseMode::kResize -> back to the slab allocator + // + // @param pid Pool that will make make one slab available + // @param cid Class that will release a slab back to its pool + // or slab allocator + // @param mode the mode for slab release (rebalance/resize) + // @param hint hint referring to the slab. this can be an allocation + // that the user knows to exist in the slab. If this is + // nullptr, a random slab is selected from the pool and + // allocation class. + // + // @throw std::invalid_argument if the hint is invalid or if the pid or cid + // is invalid. + // @throw std::runtime_error if fail to release a slab due to internal error + void releaseSlab(PoolId pid, + ClassId cid, + SlabReleaseMode mode, + const void* hint = nullptr) final; + + // Releasing a slab from this allocation class id and pool id. The release + // could be for a pool resizing or allocation class rebalancing. + // + // All active allocations will be evicted in the process. + // + // This function will be blocked until a slab is freed + // + // @param pid Pool that will make one slab available + // @param victim Class that will release a slab back to its pool + // or slab allocator or a receiver if defined + // @param receiver Class that will receive when the mode is set to rebalance + // @param mode the mode for slab release (rebalance/resize) + // the mode must be rebalance if an valid receiver is specified + // @param hint hint referring to the slab. this can be an allocation + // that the user knows to exist in the slab. If this is + // nullptr, a random slab is selected from the pool and + // allocation class. + // + // @throw std::invalid_argument if the hint is invalid or if the pid or cid + // is invalid. 
Or if the mode is set to kResize but the receiver is + // also specified. Receiver class id can only be specified if the mode + // is set to kRebalance. + // @throw std::runtime_error if fail to release a slab due to internal error + void releaseSlab(PoolId pid, + ClassId victim, + ClassId receiver, + SlabReleaseMode mode, + const void* hint = nullptr) final; + + // @param releaseContext slab release context + void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext); + + // @return true when successfully marked as moving, + // fasle when this item has already been freed + bool markMovingForSlabRelease(const SlabReleaseContext& ctx, + void* alloc, + util::Throttler& throttler); + + // "Move" (by copying) the content in this item to another memory + // location by invoking the move callback. + // + // + // @param ctx slab release context + // @param item old item to be moved elsewhere + // @param throttler slow this function down as not to take too much cpu + // + // @return true if the item has been moved + // false if we have exhausted moving attempts + bool moveForSlabRelease(const SlabReleaseContext& ctx, + Item& item, + util::Throttler& throttler); + + // "Move" (by copying) the content in this item to another memory + // location by invoking the move callback. + // + // @param item old item to be moved elsewhere + // @param newItemHdl handle of new item to be moved into + // + // @return true if the item has been moved + // false if we have exhausted moving attempts + bool tryMovingForSlabRelease(Item& item, WriteHandle& newItemHdl); + + // Evict an item from access and mm containers and + // ensure it is safe for freeing. + // + // @param ctx slab release context + // @param item old item to be moved elsewhere + // @param throttler slow this function down as not to take too much cpu + void evictForSlabRelease(const SlabReleaseContext& ctx, + Item& item, + util::Throttler& throttler); + + // Helper function to evict a normal item for slab release + // + // @return last handle for corresponding to item on success. empty handle on + // failure. caller can retry if needed. + WriteHandle evictNormalItem(Item& item, bool skipIfTokenInvalid = false, bool fromBgThread = false); + + // Helper function to evict a child item for slab release + // As a side effect, the parent item is also evicted + // + // @return last handle to the parent item of the child on success. empty + // handle on failure. caller can retry. + WriteHandle evictChainedItemForSlabRelease(ChainedItem& item); + + // Helper function to remove a item if expired. + // + // @return true if it item expire and removed successfully. + bool removeIfExpired(const ReadHandle& handle); + + // exposed for the Reaper to iterate through the memory and find items to + // reap under the super charged mode. This is faster if there are lots of + // items in cache and only a small fraction of them are expired at any given + // time. + template + void traverseAndExpireItems(Fn&& f) { + // The intent here is to scan the memory to identify candidates for reaping + // without holding any locks. Candidates that are identified as potential + // ones are further processed by holding the right synchronization + // primitives. So we consciously exempt ourselves here from TSAN data race + // detection. 
+ folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__); + auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward(f)); + stats().numSkippedSlabReleases.add(slabsSkipped); + } + + // exposed for the background evictor to iterate through the memory and evict + // in batch. This should improve insertion path for tiered memory config + size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t evictions = 0; + size_t evictionCandidates = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + mmContainer.withEvictionIterator([&tries, &candidates, &batch, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + if (candidate->getRefCount() == 0 && candidate->markMoving()) { + candidates.push_back(candidate); + } + + ++itr; + } + }); + + for (Item *candidate : candidates) { + auto toReleaseHandle = + evictNormalItem(*candidate, true /* skipIfTokenInvalid */, true /* from BG thread */); + auto ref = candidate->unmarkMoving(); + + if (toReleaseHandle || ref == 0u) { + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[pid][cid].inc(); + } + + evictions++; + } else { + if (candidate->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } + } + + if (toReleaseHandle) { + XDCHECK(toReleaseHandle.get() == candidate); + XDCHECK_EQ(1u, toReleaseHandle->getRefCount()); + + // We manually release the item here because we don't want to + // invoke the Item Handle's destructor which will be decrementing + // an already zero refcount, which will throw exception + auto& itemToRelease = *toReleaseHandle.release(); + + // Decrementing the refcount because we want to recycle the item + const auto ref = decRef(itemToRelease); + XDCHECK_EQ(0u, ref); + + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } else if (ref == 0u) { + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + + return evictions; + } + + size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t promotions = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + + mmContainer.withPromotionIterator([&tries, &candidates, &batch, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + + // TODO: only allow it for read-only items? 
+ // or implement mvcc + if (!candidate->isExpired() && candidate->markMoving()) { + candidates.push_back(candidate); + } + + ++itr; + } + }); + + for (Item *candidate : candidates) { + auto promoted = tryPromoteToNextMemoryTier(*candidate, true); + auto ref = candidate->unmarkMoving(); + if (promoted) + promotions++; + + if (ref == 0u) { + // stats_.promotionMoveSuccess.inc(); + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + + return promotions; + } + + // returns true if nvmcache is enabled and we should write this item to + // nvmcache. + bool shouldWriteToNvmCache(const Item& item); + + // (this should be only called when we're the only thread accessing item) + // returns true if nvmcache is enabled and we should write this item. + bool shouldWriteToNvmCacheExclusive(const Item& item); + + // Serialize the metadata for the cache into an IOBUf. The caller can now + // use this to serialize into a serializer by estimating the size and + // calling writeToBuffer. + folly::IOBufQueue saveStateToIOBuf(); + + static typename MemoryAllocator::Config getAllocatorConfig( + const Config& config) { + return MemoryAllocator::Config{ + config.defaultAllocSizes.empty() + ? util::generateAllocSizes( + config.allocationClassSizeFactor, + config.maxAllocationClassSize, + config.minAllocationClassSize, + config.reduceFragmentationInAllocationClass) + : config.defaultAllocSizes, + config.enableZeroedSlabAllocs, config.disableFullCoredump, + config.lockMemory}; + } + + // starts one of the cache workers passing the current instance and the args + template + bool startNewWorker(folly::StringPiece name, + std::unique_ptr& worker, + std::chrono::milliseconds interval, + Args&&... args); + + // stops one of the workers belonging to this instance. + template + bool stopWorker(folly::StringPiece name, + std::unique_ptr& worker, + std::chrono::seconds timeout = std::chrono::seconds{0}); + + ShmSegmentOpts createShmCacheOpts(TierId tid); + + std::unique_ptr createNewMemoryAllocator(TierId tid); + std::unique_ptr restoreMemoryAllocator(TierId tid); + std::unique_ptr restoreCCacheManager(TierId tid); + + PoolIds filterCompactCachePools(const PoolIds& poolIds) const; + + // returns a list of pools excluding compact cache pools that are over the + // limit + PoolIds getRegularPoolIdsForResize() const override final; + + // return whether a pool participates in auto-resizing + bool autoResizeEnabledForPool(PoolId) const override final; + + // resize all compact caches to their configured size + void resizeCompactCaches() override; + + std::map serializeConfigParams() + const override final { + return config_.serialize(); + } + + typename Item::PtrCompressor createPtrCompressor() const { + return typename Item::PtrCompressor(allocator_); + } + + // helper utility to throttle and optionally log. + static void throttleWith(util::Throttler& t, std::function fn); + + // Write the item to nvm cache if enabled. This is called on eviction. + void pushToNvmCache(const Item& item); + + // Test utility function to move a key from ram into nvmcache. + bool pushToNvmCacheFromRamForTesting(Key key); + // Test utility functions to remove things from individual caches. 
+ bool removeFromRamForTesting(Key key); + void removeFromNvmForTesting(Key key); + + // @param dramCacheAttached boolean indicating if the dram cache was + // restored from previous state + void initCommon(bool dramCacheAttached); + void initNvmCache(bool dramCacheAttached); + void initWorkers(); + + // @param type the type of initialization + // @return nullptr if the type is invalid + // @return vector of pointers to memory allocator + // @throw std::runtime_error if type is invalid + std::vector> initAllocator(InitMemType type); + + std::vector> createPrivateAllocator(); + std::vector> createAllocators(); + std::vector> restoreAllocators(); + + // @param type the type of initialization + // @return nullptr if the type is invalid + // @return pointer to access container + // @throw std::runtime_error if type is invalid + std::unique_ptr initAccessContainer(InitMemType type, + const std::string name, + AccessConfig config, + bool usePosixShm); + + std::optional saveNvmCache(); + void saveRamCache(); + + static bool itemMovingPredicate(const Item& item) { + return item.getRefCount() == 0; + } + + static bool itemExpiryPredicate(const Item& item) { + return item.getRefCount() == 1 && item.isExpired(); + } + + static bool parentEvictForSlabReleasePredicate(const Item& item) { + return item.getRefCount() == 1 && !item.isMoving(); + } + + std::unique_ptr createDeserializer(); + + // Execute func on each item. `func` can throw exception but must ensure + // the item remains in a valid state + // + // @param item Parent item + // @param func Function that gets executed on each chained item + void forEachChainedItem(const Item& item, + std::function func); + + // @param item Record the item has been accessed in its mmContainer + // @param mode the mode of access + // @param stats stats object to avoid a thread local lookup. + // @return true if successfully recorded in MMContainer + bool recordAccessInMMContainer(Item& item, AccessMode mode); + + WriteHandle findChainedItem(const Item& parent) const; + + // Get the thread local version of the Stats + detail::Stats& stats() const noexcept { return stats_; } + + void initStats(); + + // return a read-only iterator to the item's chained allocations. The order of + // iteration on the item will be LIFO of the addChainedItem calls. + folly::Range viewAsChainedAllocsRange( + const Item& parent) const; + + // updates the maxWriteRate for DynamicRandomAdmissionPolicy + // returns true if update successfully + // false if no AdmissionPolicy is set or it is not DynamicRandom + bool updateMaxRateForDynamicRandomAP(uint64_t maxRate) { + return nvmCache_ ? nvmCache_->updateMaxRateForDynamicRandomAP(maxRate) + : false; + } + + // BEGIN private members + + TierId currentTier() const { + // TODO: every function which calls this method should be refactored. + // We should go case by case and either make such function work on + // all tiers or expose separate parameter to describe the tier ID. + return 0; + } + + unsigned getNumTiers() const { + return memoryTierConfigs.size(); + } + + bool addWaitContextForMovingItem( + folly::StringPiece key, std::shared_ptr> waiter); + + class MoveCtx { + public: + MoveCtx() {} + + ~MoveCtx() { + // prevent any further enqueue to waiters + // Note: we don't need to hold locks since no one can enqueue + // after this point. + wakeUpWaiters(); + } + + // record the item handle. Upon destruction we will wake up the waiters + // and pass a clone of the handle to the callBack. 
By default we pass + // a null handle + void setItemHandle(WriteHandle _it) { it = std::move(_it); } + + // enqueue a waiter into the waiter list + // @param waiter WaitContext + void addWaiter(std::shared_ptr> waiter) { + XDCHECK(waiter); + waiters.push_back(std::move(waiter)); + } + + private: + // notify all pending waiters that are waiting for the fetch. + void wakeUpWaiters() { + bool refcountOverflowed = false; + for (auto& w : waiters) { + // If refcount overflowed earlier, then we will return miss to + // all subsequent waitors. + if (refcountOverflowed) { + w->set(WriteHandle{}); + continue; + } + + try { + w->set(it.clone()); + } catch (const exception::RefcountOverflow&) { + // We'll return a miss to the user's pending read, + // so we should enqueue a delete via NvmCache. + // TODO: cache.remove(it); + refcountOverflowed = true; + } + } + } + + WriteHandle it; // will be set when Context is being filled + std::vector>> waiters; // list of + // waiters + }; + using MoveMap = + folly::F14ValueMap, + folly::HeterogeneousAccessHash>; + + static size_t getShardForKey(folly::StringPiece key) { + return folly::Hash()(key) % kShards; + } + + MoveMap& getMoveMapForShard(size_t shard) { + return movesMap_[shard].movesMap_; + } + + MoveMap& getMoveMap(folly::StringPiece key) { + return getMoveMapForShard(getShardForKey(key)); + } + + std::unique_lock getMoveLockForShard(size_t shard) { + return std::unique_lock(moveLock_[shard].moveLock_); + } + + std::unique_lock getMoveLock(folly::StringPiece key) { + return getMoveLockForShard(getShardForKey(key)); + } + + // Whether the memory allocator for this cache allocator was created on shared + // memory. The hash table, chained item hash table etc is also created on + // shared memory except for temporary shared memory mode when they're created + // on heap. + const bool isOnShm_{false}; + + const Config config_{}; + + const typename Config::MemoryTierConfigs memoryTierConfigs; + + // Manages the temporary shared memory segment for memory allocator that + // is not persisted when cache process exits. + std::unique_ptr tempShm_; + + std::unique_ptr shmManager_; + + // Deserialize data to restore cache allocator. Used only while attaching to + // existing shared memory. + std::unique_ptr deserializer_; + + // used only while attaching to existing shared memory. + serialization::CacheAllocatorMetadata metadata_{}; + + // configs for the access container and the mm container. + const MMConfig mmConfig_{}; + + // the memory allocator for allocating out of the available memory. + std::vector> allocator_; + + // compact cache allocator manager + // TODO: per tier? + std::unique_ptr compactCacheManager_; + + // compact cache instances reside here when user "add" or "attach" compact + // caches. The lifetime is tied to CacheAllocator. + // The bool is an indicator for whether the compact cache participates in + // size optimization in PoolOptimizer + std::unordered_map> compactCaches_; + + // check whether a pool is enabled for optimizing + std::array, MemoryPoolManager::kMaxPools> + optimizerEnabled_{}; + + // ptr compressor + typename Item::PtrCompressor compressor_; + + // Lock to synchronize addition of a new pool and its resizing/rebalancing + folly::SharedMutex poolsResizeAndRebalanceLock_; + + // container for the allocations which are currently being memory managed by + // the cache allocator. + // we need mmcontainer per allocator pool/allocation class. 
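+  // (For reference: getMMContainer(tid, pid, cid) above resolves the
+  // MMContainer for a given tier / pool / allocation-class triple, which is
+  // how the background evictor and promoter walk this structure.)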
+ MMContainers mmContainers_; + + // container that is used for accessing the allocations by their key. + std::unique_ptr accessContainer_; + + // container that is used for accessing the chained allocation + std::unique_ptr chainedItemAccessContainer_{nullptr}; + + friend ChainedAllocs; + friend WritableChainedAllocs; + // ensure any modification to a chain of chained items are synchronized + using ChainedItemLock = facebook::cachelib::SharedMutexBuckets; + ChainedItemLock chainedItemLocks_; + + // nvmCache + std::unique_ptr nvmCache_; + + // rebalancer for the pools + std::unique_ptr poolRebalancer_; + + // resizer for the pools. + std::unique_ptr poolResizer_; + + // automatic arena resizing i.e. pool optimization + std::unique_ptr poolOptimizer_; + + // free memory monitor + std::unique_ptr memMonitor_; + + // background evictor + std::vector>> backgroundEvictor_; + std::vector>> backgroundPromoter_; + + // check whether a pool is a slabs pool + std::array isCompactCachePool_{}; + + // lock to serilize access of isCompactCachePool_ array, including creation of + // compact cache pools + folly::SharedMutex compactCachePoolsLock_; + + // mutex protecting the creation and destruction of workers poolRebalancer_, + // poolResizer_, poolOptimizer_, memMonitor_, reaper_ + mutable std::mutex workersMutex_; + + static constexpr size_t kShards = 8192; // TODO: need to define right value + + struct MovesMapShard { + alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_; + }; + + struct MoveLock { + alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_; + }; + + // a map of all pending moves + std::vector movesMap_; + + // a map of move locks for each shard + std::vector moveLock_; + + // time when the ram cache was first created + const uint32_t cacheCreationTime_{0}; + + // time when CacheAllocator structure is created. Whenever a process restarts + // and even if cache content is persisted, this will be reset. It's similar + // to process uptime. 
(But alternatively if user explicitly shuts down and + // re-attach cache, this will be reset as well) + const uint32_t cacheInstanceCreationTime_{0}; + + // thread local accumulation of handle counts + mutable util::FastStats handleCount_{}; + + mutable detail::Stats stats_{}; + // allocator's items reaper to evict expired items in bg checking + std::unique_ptr> reaper_; + + class DummyTlsActiveItemRingTag {}; + folly::ThreadLocal ring_; + + // state for the nvmcache + NvmCacheState nvmCacheState_; + + // admission policy for nvmcache + std::shared_ptr> nvmAdmissionPolicy_; + + // indicates if the shutdown of cache is in progress or not + std::atomic shutDownInProgress_{false}; + + // END private members + + // Make this friend to give access to acquire and release + friend ReadHandle; + friend ReaperAPIWrapper; + friend BackgroundMoverAPIWrapper; + friend class CacheAPIWrapperForNvm; + friend class FbInternalRuntimeUpdateWrapper; + friend class objcache2::ObjectCache; + friend class objcache2::ObjectCacheBase; + template + friend class ReadOnlyMap; + + // tests + friend class facebook::cachelib::tests::NvmCacheTest; + FRIEND_TEST(CachelibAdminTest, WorkingSetAnalysisLoggingTest); + template + friend class facebook::cachelib::tests::BaseAllocatorTest; + template + friend class facebook::cachelib::tests::AllocatorHitStatsTest; + template + friend class facebook::cachelib::tests::AllocatorResizeTest; + template + friend class facebook::cachelib::tests::PoolOptimizeStrategyTest; + friend class facebook::cachelib::tests::NvmAdmissionPolicyTest; + friend class facebook::cachelib::tests::CacheAllocatorTestWrapper; + friend class facebook::cachelib::tests::PersistenceCache; + template + friend class facebook::cachelib::tests::FixedSizeArrayTest; + template + friend class facebook::cachelib::tests::MapTest; + + // benchmarks + template + friend class facebook::cachelib::cachebench::Cache; + friend class facebook::cachelib::cachebench::tests::CacheTest; + friend void lookupCachelibBufferManagerOnly(); + friend void lookupCachelibMap(); + friend void benchCachelibMap(); + friend void benchCachelibRangeMap(); + + // objectCache + template + friend class facebook::cachelib::objcache::ObjectCache; + friend class GET_DECORATED_CLASS_NAME(objcache::test, + ObjectCache, + ObjectHandleInvalid); +}; +} // namespace cachelib +} // namespace facebook +#include "cachelib/allocator/CacheAllocator-inl.h" + +namespace facebook { +namespace cachelib { + +// Declare templates ahead of use to reduce compilation time +extern template class CacheAllocator; +extern template class CacheAllocator; +extern template class CacheAllocator; +extern template class CacheAllocator; + +// CacheAllocator with an LRU eviction policy +// LRU policy can be configured to act as a segmented LRU as well +using LruAllocator = CacheAllocator; +using LruAllocatorSpinBuckets = CacheAllocator; + +// CacheAllocator with 2Q eviction policy +// Hot, Warm, Cold queues are maintained +// Item Life Time: +// 0. On access, each item is promoted to the head of its current +// queue +// 1. first enter Hot queue +// 2. if accessed while in Hot, item will qualify entry to Warm queue +// otherwise, item will enter cold queue +// 3. items in cold queue are evicted to make room for new items +using Lru2QAllocator = CacheAllocator; + +// CacheAllocator with Tiny LFU eviction policy +// It has a window initially to gauage the frequency of accesses of newly +// inserted items. 
And eventually it will onl admit items that are accessed +// beyond a threshold into the warm cache. +using TinyLFUAllocator = CacheAllocator; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheAllocatorConfig.h b/bdm/allocator/CacheAllocatorConfig.h new file mode 100644 index 0000000000..4060ca2eeb --- /dev/null +++ b/bdm/allocator/CacheAllocatorConfig.h @@ -0,0 +1,1332 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include "cachelib/allocator/Cache.h" +#include "cachelib/allocator/MM2Q.h" +#include "cachelib/allocator/MemoryMonitor.h" +#include "cachelib/allocator/MemoryTierCacheConfig.h" +#include "cachelib/allocator/NvmAdmissionPolicy.h" +#include "cachelib/allocator/PoolOptimizeStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/RebalanceStrategy.h" +#include "cachelib/allocator/Util.h" +#include "cachelib/common/EventInterface.h" +#include "cachelib/common/Throttler.h" + +namespace facebook { +namespace cachelib { + +// Config class for CacheAllocator. +template +class CacheAllocatorConfig { + public: + using AccessConfig = typename CacheT::AccessConfig; + using ChainedItemMovingSync = typename CacheT::ChainedItemMovingSync; + using RemoveCb = typename CacheT::RemoveCb; + using ItemDestructor = typename CacheT::ItemDestructor; + using NvmCacheEncodeCb = typename CacheT::NvmCacheT::EncodeCB; + using NvmCacheDecodeCb = typename CacheT::NvmCacheT::DecodeCB; + using NvmCacheDeviceEncryptor = typename CacheT::NvmCacheT::DeviceEncryptor; + using MoveCb = typename CacheT::MoveCb; + using NvmCacheConfig = typename CacheT::NvmCacheT::Config; + using MemoryTierConfigs = std::vector; + using Key = typename CacheT::Key; + using EventTrackerSharedPtr = std::shared_ptr; + using Item = typename CacheT::Item; + + // Set cache name as a string + CacheAllocatorConfig& setCacheName(const std::string&); + + // Set cache size in bytes. If size is smaller than 60GB (64'424'509'440), + // then we will enable full coredump. Otherwise, we will disable it. The + // reason we disable full coredump for large cache is because it takes a + // long time to dump, and also we might not have enough local storage. + CacheAllocatorConfig& setCacheSize(size_t _size); + + // Set default allocation sizes for a cache pool + CacheAllocatorConfig& setDefaultAllocSizes(std::set allocSizes); + + // Set default allocation sizes based on arguments + CacheAllocatorConfig& setDefaultAllocSizes( + double _allocationClassSizeFactor, + uint32_t _maxAllocationClassSize, + uint32_t _minAllocationClassSize, + bool _reduceFragmentationInAllocationClass); + + // Set the access config for cachelib's access container. Refer to our + // user guide for how to tune access container (configure hashtable). 
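+  // A minimal usage sketch (illustrative only; the variable names below are
+  // ours, not part of this patch):
+  //
+  //   LruAllocator::Config config;
+  //   config.setCacheName("example-cache")
+  //       .setCacheSize(4ULL * 1024 * 1024 * 1024) // 4 GB
+  //       .setAccessConfig(1'000'000);             // ~1M expected items
+  //   config.validate();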
+ CacheAllocatorConfig& setAccessConfig(AccessConfig config); + + // Set the access config for cachelib's access container based on the + // number of estimated cache entries. + CacheAllocatorConfig& setAccessConfig(size_t numEntries); + + // RemoveCallback is invoked for each item that is evicted or removed + // explicitly from RAM + CacheAllocatorConfig& setRemoveCallback(RemoveCb cb); + + // ItemDestructor is invoked for each item that is evicted or removed + // explicitly from cache (both RAM and NVM) + CacheAllocatorConfig& setItemDestructor(ItemDestructor destructor); + + // Config for NvmCache. If enabled, cachelib will also make use of flash. + CacheAllocatorConfig& enableNvmCache(NvmCacheConfig config); + + // enable the reject first admission policy through its parameters + // @param numEntries the number of entries to track across all splits + // @param numSplits the number of splits. we drop a whole split by + // FIFO + // @param suffixIgnoreLength the suffix of the key to ignore for tracking. + // disabled when set to 0 + // + // @param useDramHitSignal use hits in DRAM as signal for admission + CacheAllocatorConfig& enableRejectFirstAPForNvm(uint64_t numEntries, + uint32_t numSplits, + size_t suffixIgnoreLength, + bool useDramHitSignal); + + // enable an admission policy for NvmCache. If this is set, other supported + // options like enableRejectFirstAP etc are overlooked. + // + // @throw std::invalid_argument if nullptr is passed. + CacheAllocatorConfig& setNvmCacheAdmissionPolicy( + std::shared_ptr> policy); + + // enables encoding items before they go into nvmcache + CacheAllocatorConfig& setNvmCacheEncodeCallback(NvmCacheEncodeCb cb); + + // enables decoding items before they get back into ram cache + CacheAllocatorConfig& setNvmCacheDecodeCallback(NvmCacheDecodeCb cb); + + // enable encryption support for NvmCache. This will encrypt every byte + // written to the device. + CacheAllocatorConfig& enableNvmCacheEncryption( + std::shared_ptr encryptor); + + // return if NvmCache encryption is enabled + bool isNvmCacheEncryptionEnabled() const; + + // If enabled, it means we'll only store user-requested size into NvmCache + // instead of the full usable size returned by CacheAllocator::getUsableSize() + CacheAllocatorConfig& enableNvmCacheTruncateAllocSize(); + + // return if truncate-alloc-size is enabled. If true, it means we'll only + // store user-requested size into NvmCache instead of the full usable size + // returned by CacheAllocator::getUsableSize() + bool isNvmCacheTruncateAllocSizeEnabled() const; + + // Enable compact cache support. Refer to our user guide for how ccache works. + CacheAllocatorConfig& enableCompactCache(); + + // Configure chained items. Refer to our user guide for how chained items + // work. + // + // @param config Config for chained item's access container, it's similar to + // the main access container but only used for chained items + // @param lockPower this controls the number of locks (2^lockPower) for + // synchronizing operations on chained items. + CacheAllocatorConfig& configureChainedItems(AccessConfig config = {}, + uint32_t lockPower = 10); + + // Configure chained items. Refer to our user guide for how chained items + // work. This function calculates the optimal bucketsPower and locksPower for + // users based on estimated chained items number. + // + // @param numEntries number of estimated chained items + // @param lockPower this controls the number of locks (2^lockPower) for + // synchronizing operations on chained items. 
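+  // e.g. (illustrative): config.configureChainedItems(250'000 /* estimated
+  // chained items */); derives bucketsPower/locksPower for the chained-item
+  // hash table from that estimate.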
+ CacheAllocatorConfig& configureChainedItems(size_t numEntries, + uint32_t lockPower = 10); + + // enable tracking tail hits + CacheAllocatorConfig& enableTailHitsTracking(); + + // Turn on full core dump which includes all the cache memory. + // This is not recommended for production as it can significantly slow down + // the coredumping process. + CacheAllocatorConfig& setFullCoredump(bool enable); + + // Sets the configuration that controls whether nvmcache is recovered if + // possible when dram cache is not recovered. By default nvmcache is + // recovered even when dram cache is not recovered. + CacheAllocatorConfig& setDropNvmCacheOnShmNew(bool enable); + + // Turn off fast shutdown mode, which interrupts any releaseSlab operations + // in progress, so that workers in the process of releaseSlab may be + // completed sooner for shutdown to take place fast. + CacheAllocatorConfig& disableFastShutdownMode(); + + // when disabling full core dump, turning this option on will enable + // cachelib to track recently accessed items and keep them in the partial + // core dump. See CacheAllocator::madviseRecentItems() + CacheAllocatorConfig& setTrackRecentItemsForDump(bool enable); + + // Page in all the cache memory asynchronously when cache starts up. + // This helps ensure the system actually has enough memory to page in all of + // the cache. We'll fail early if there isn't enough memory. + // + // If memory monitor is enabled, this is not usually needed. + CacheAllocatorConfig& setMemoryLocking(bool enable); + + // This allows cache to be persisted across restarts. One example use case is + // to preserve the cache when releasing a new version of your service. Refer + // to our user guide for how to set up cache persistence. + // TODO: get rid of baseAddr or if set make sure all mapping are adjacent? + // We can also make baseAddr a per-tier configuration + CacheAllocatorConfig& enableCachePersistence(std::string directory, + void* baseAddr = nullptr); + + // Uses posix shm segments instead of the default sys-v shm + // segments. @throw std::invalid_argument if called without enabling + // cachePersistence(). + CacheAllocatorConfig& usePosixForShm(); + + // Configures cache memory tiers. Each tier represents a cache region inside + // byte-addressable memory such as DRAM, Pmem, CXLmem. + // Accepts vector of MemoryTierCacheConfig. Each vector element describes + // configuration for a single memory cache tier. Tier sizes are specified as + // ratios, the number of parts of total cache size each tier would occupy. + // @throw std::invalid_argument if: + // - the size of configs is 0 + // - the size of configs is greater than kMaxCacheMemoryTiers + CacheAllocatorConfig& configureMemoryTiers(const MemoryTierConfigs& configs); + + // Return reference to MemoryTierCacheConfigs. + const MemoryTierConfigs& getMemoryTierConfigs() const; + + // This turns on a background worker that periodically scans through the + // access container and look for expired items and remove them. + CacheAllocatorConfig& enableItemReaperInBackground( + std::chrono::milliseconds interval, util::Throttler::Config config = {}); + + // When using free memory monitoring mode, CacheAllocator shrinks the cache + // size when the system is under memory pressure. Cache will grow back when + // the memory pressure goes down. 
+ // + // When using resident memory monitoring mode, typically when the + // application runs inside containers, the lowerLimit and upperLimit are the + // opposite to that of the free memory monitoring mode. + // "upperLimit" refers to the upper bound of application memory. + // Cache size will actually decrease once an application reaches past it + // and grows when drops below "lowerLimit". + // + // @param interval waits for an interval between each run + // @param config memory monitoring config + // @param RebalanceStrategy an optional strategy to customize where the slab + // to give up when shrinking cache + CacheAllocatorConfig& enableMemoryMonitor( + std::chrono::milliseconds interval, + MemoryMonitor::Config config, + std::shared_ptr = {}); + + // Enable pool rebalancing. This allows each pool to internally rebalance + // slab memory distributed across different allocation classes. For example, + // if the 64 bytes allocation classes are receiving for allocation requests, + // eventually CacheAllocator will move more memory to it from other allocation + // classes. For more details, see our user guide. + CacheAllocatorConfig& enablePoolRebalancing( + std::shared_ptr defaultRebalanceStrategy, + std::chrono::milliseconds interval); + + // This lets you change pool size during runtime, and the pool resizer + // will slowly adjust each pool's memory size to the newly configured sizes. + CacheAllocatorConfig& enablePoolResizing( + std::shared_ptr resizeStrategy, + std::chrono::milliseconds interval, + uint32_t slabsToReleasePerIteration); + + // Enable pool size optimizer, which automatically adjust pool sizes as + // traffic changes or new memory added. + // For now we support different intervals between regular pools and compact + // caches. + CacheAllocatorConfig& enablePoolOptimizer( + std::shared_ptr optimizeStrategy, + std::chrono::seconds regularInterval, + std::chrono::seconds ccacheInterval, + uint32_t ccacheStepSizePercent); + + // Enable the background evictor - scans a tier to look for objects + // to evict to the next tier + CacheAllocatorConfig& enableBackgroundEvictor( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + CacheAllocatorConfig& enableBackgroundPromoter( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + // This enables an optimization for Pool rebalancing and resizing. + // The rough idea is to ensure only the least useful items are evicted when + // we move slab memory around. Come talk to Cache Library team if you think + // this can help your service. + CacheAllocatorConfig& enableMovingOnSlabRelease( + MoveCb cb, + ChainedItemMovingSync sync = {}, + uint32_t movingAttemptsLimit = 10); + + // Specify a threshold for detecting slab release stuck + CacheAllocatorConfig& setSlabReleaseStuckThreashold( + std::chrono::milliseconds threshold); + + // This customizes how many items we try to evict before giving up.s + // We may fail to evict if someone else (another thread) is using an item. + // Setting this to a high limit leads to a higher chance of successful + // evictions but it can lead to higher allocation latency as well. + // Unless you're very familiar with caching, come talk to Cache Library team + // before you start customizing this option. 
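+  // e.g. (illustrative): config.setEvictionSearchLimit(100); scans at most
+  // 100 candidates before giving up (maps onto the evictionSearchTries
+  // member below).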
+ CacheAllocatorConfig& setEvictionSearchLimit(uint32_t limit); + + // Specify a threshold for per-item outstanding references, beyond which, + // shared_ptr will be allocated instead of handles to support having more + // outstanding iobuf + // The default behavior is always using handles + CacheAllocatorConfig& setRefcountThresholdForConvertingToIOBuf(uint32_t); + + // This throttles various internal operations in cachelib. It may help + // improve latency in allocation and find paths. Come talk to Cache + // Library team if you find yourself customizing this. + CacheAllocatorConfig& setThrottlerConfig(util::Throttler::Config config); + + // Disable eviction. Not recommended unless you want to use cachelib as + // a in-memory storage. If you find yourself needing this, please come talk + // to Cache Library team. There is usually a better solutuon. + CacheAllocatorConfig& disableCacheEviction(); + + // Passes in a callback to initialize an event tracker when the allocator + // starts + CacheAllocatorConfig& setEventTracker(EventTrackerSharedPtr&&); + + // Set the minimum TTL for an item to be admitted into NVM cache. + // If nvmAdmissionMinTTL is set to be positive, any item with configured TTL + // smaller than this will always be rejected by NvmAdmissionPolicy. + CacheAllocatorConfig& setNvmAdmissionMinTTL(uint64_t ttl); + + // Skip promote children items in chained when parent fail to promote + CacheAllocatorConfig& setSkipPromoteChildrenWhenParentFailed(); + + // (deprecated) Disable cache eviction. + // Please do not create new callers. CacheLib will stop supporting disabled + // eviction. + [[deprecated]] CacheAllocatorConfig& deprecated_disableEviction(); + + bool isEvictionDisabled() const noexcept { return disableEviction; } + + // We will delay worker start until user explicitly calls + // CacheAllocator::startCacheWorkers() + CacheAllocatorConfig& setDelayCacheWorkersStart(); + + // skip promote children items in chained when parent fail to promote + bool isSkipPromoteChildrenWhenParentFailed() const noexcept { + return skipPromoteChildrenWhenParentFailed; + } + + // @return whether compact cache is enabled + bool isCompactCacheEnabled() const noexcept { return enableZeroedSlabAllocs; } + + // @return whether pool resizing is enabled + bool poolResizingEnabled() const noexcept { + return poolResizeInterval.count() > 0 && poolResizeSlabsPerIter > 0; + } + + // @return whether pool rebalancing is enabled + bool poolRebalancingEnabled() const noexcept { + return poolRebalanceInterval.count() > 0 && + defaultPoolRebalanceStrategy != nullptr; + } + + // @return whether pool optimizing is enabled + bool poolOptimizerEnabled() const noexcept { + return (regularPoolOptimizeInterval.count() > 0 || + compactCacheOptimizeInterval.count() > 0) && + poolOptimizeStrategy != nullptr; + } + + // @return whether background evictor thread is enabled + bool backgroundEvictorEnabled() const noexcept { + return backgroundEvictorInterval.count() > 0 && + backgroundEvictorStrategy != nullptr; + } + + bool backgroundPromoterEnabled() const noexcept { + return backgroundPromoterInterval.count() > 0 && + backgroundPromoterStrategy != nullptr; + } + + // @return whether memory monitor is enabled + bool memMonitoringEnabled() const noexcept { + return memMonitorConfig.mode != MemoryMonitor::Disabled && + memMonitorInterval.count() > 0; + } + + // @return whether reaper is enabled + bool itemsReaperEnabled() const noexcept { + return reaperInterval.count() > 0; + } + + const std::string& getCacheDir() 
const noexcept { return cacheDir; } + + const std::string& getCacheName() const noexcept { return cacheName; } + + size_t getCacheSize() const noexcept { return size; } + + bool isUsingPosixShm() const noexcept { return usePosixShm; } + + // validate the config, and return itself if valid + const CacheAllocatorConfig& validate() const; + + // check whether the RebalanceStrategy can be used with this config + bool validateStrategy( + const std::shared_ptr& strategy) const; + + // check whether the PoolOptimizeStrategy can be used with this config + bool validateStrategy( + const std::shared_ptr& strategy) const; + + // check that memory tier ratios are set properly + const CacheAllocatorConfig& validateMemoryTiers() const; + + // @return a map representation of the configs + std::map serialize() const; + + // The max number of memory cache tiers + inline static const size_t kMaxCacheMemoryTiers = 2; + + // Cache name for users to indentify their own cache. + std::string cacheName{""}; + + // Amount of memory for this cache instance (sum of all memory tiers' sizes) + size_t size = 1 * 1024 * 1024 * 1024; + + // Directory for shared memory related metadata + std::string cacheDir; + + // if true, uses posix shm; if not, uses sys-v (default) + bool usePosixShm{false}; + + // Attach shared memory to a fixed base address + void* slabMemoryBaseAddr = nullptr; + + // User defined default alloc sizes. If empty, we'll generate a default one. + // This set of alloc sizes will be used for pools that user do not supply + // a custom set of alloc sizes. + std::set defaultAllocSizes; + + // whether to detach allocator memory upon a core dump + bool disableFullCoredump{true}; + + // whether to enable fast shutdown, that would interrupt on-going slab + // release process, or not. + bool enableFastShutdown{true}; + + // if we want to track recent items for dumping them in core when we disable + // full core dump. + bool trackRecentItemsForDump{false}; + + // if enabled ensures that nvmcache is not persisted when dram cache is not + // presisted. + bool dropNvmCacheOnShmNew{false}; + + // TODO: + // BELOW are the config for various cache workers + // Today, they're set before CacheAllocator is created and stay + // fixed for the lifetime of CacheAllocator. Going foward, this + // will be allowed to be changed dynamically during runtime and + // trigger updates to the cache workers + // time interval to sleep between iterations of resizing the pools. + std::chrono::milliseconds poolResizeInterval{std::chrono::seconds(0)}; + + // number of slabs to be released per pool that is over the limit in each + // iteration. + unsigned int poolResizeSlabsPerIter{5}; + + // the rebalance strategy for the pool resizing if enabled. + std::shared_ptr poolResizeStrategy; + + // the strategy to be used when advising memory to pick a donor + std::shared_ptr poolAdviseStrategy; + + // time interval to sleep between iterators of rebalancing the pools. + std::chrono::milliseconds poolRebalanceInterval{std::chrono::seconds{1}}; + + // Free slabs pro-actively if the ratio of number of freeallocs to + // the number of allocs per slab in a slab class is above this + // threshold + // A value of 0 means, this feature is disabled. + unsigned int poolRebalancerFreeAllocThreshold{0}; + + // rebalancing strategy for all pools. By default the strategy will + // rebalance to avoid alloc fialures. 
+ std::shared_ptr defaultPoolRebalanceStrategy{ + new RebalanceStrategy{}}; + + // The slab release process is considered as being stuck if it does not + // make any progress for the below threshold + std::chrono::milliseconds slabReleaseStuckThreshold{std::chrono::seconds(60)}; + + // rebalance to avoid alloc fialures. + std::shared_ptr backgroundEvictorStrategy; + std::shared_ptr backgroundPromoterStrategy; + // time interval to sleep between runs of the background evictor + std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}}; + std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}}; + + size_t backgroundEvictorThreads{1}; + size_t backgroundPromoterThreads{1}; + + // time interval to sleep between iterations of pool size optimization, + // for regular pools and compact caches + std::chrono::seconds regularPoolOptimizeInterval{0}; + std::chrono::seconds compactCacheOptimizeInterval{0}; + + // step size for compact cache size optimization: how many percents of the + // victim to move + unsigned int ccacheOptimizeStepSizePercent{1}; + + // optimization strategy + std::shared_ptr poolOptimizeStrategy{nullptr}; + + // Callback for initializing the eventTracker on CacheAllocator construction. + EventTrackerSharedPtr eventTracker{nullptr}; + + // whether to allow tracking tail hits in MM2Q + bool trackTailHits{false}; + + // Memory monitoring config + MemoryMonitor::Config memMonitorConfig; + + // time interval to sleep between iterations of monitoring memory. + // Set to 0 to disable memory monitoring + std::chrono::milliseconds memMonitorInterval{0}; + + // throttler config of items reaper for iteration + util::Throttler::Config reaperConfig{}; + + // time to sleep between each reaping period. + std::chrono::milliseconds reaperInterval{5000}; + + // interval during which we adjust dynamically the refresh ratio. + std::chrono::milliseconds mmReconfigureInterval{0}; + + // + // TODO: + // ABOVE are the config for various cache workers + // + + // the number of tries to search for an item to evict + // 0 means it's infinite + unsigned int evictionSearchTries{50}; + + // If refcount is larger than this threshold, we will use shared_ptr + // for handles in IOBuf chains. + unsigned int thresholdForConvertingToIOBuf{ + std::numeric_limits::max()}; + + // number of attempts to move an item before giving up and try to + // evict the item + unsigned int movingTries{10}; + + // Config that specifes how throttler will behave + // How much time it will sleep and how long an interval between each sleep + util::Throttler::Config throttleConfig{}; + + // Access config for chained items, Only used if chained items are enabled + AccessConfig chainedItemAccessConfig{}; + + // User level synchronization function object. This will be held while + // executing the moveCb. This is only needed when using and moving is + // enabled for chained items. 
+ ChainedItemMovingSync movingSync{}; + + // determines how many locks we have for synchronizing chained items + // add/pop and between moving during slab rebalancing + uint32_t chainedItemsLockPower{10}; + + // Configuration for the main access container which manages the lookup + // for all normal items + AccessConfig accessConfig{}; + + // user defined callback invoked when an item is being evicted or freed from + // RAM + RemoveCb removeCb{}; + + // user defined item destructor invoked when an item is being + // evicted or freed from cache (both RAM and NVM) + ItemDestructor itemDestructor{}; + + // user defined call back to move the item. This is executed while holding + // the user provided movingSync. For items without chained allocations, + // there is no specific need for explicit movingSync and user can skip + // providing a movingSync and do explicit synchronization just in the + // moveCb if needed to protect the data being moved with concurrent + // readers. + MoveCb moveCb{}; + + // custom user provided admission policy + std::shared_ptr> nvmCacheAP{nullptr}; + + // Config for nvmcache type + folly::Optional nvmConfig; + + // configuration for reject first admission policy to nvmcache. 0 indicates + // a disabled policy. + uint64_t rejectFirstAPNumEntries{0}; + uint32_t rejectFirstAPNumSplits{20}; + + // if non zero specifies the suffix of the key to be ignored when tracking. + // this enables tracking group of keys that have common prefix. + size_t rejectFirstSuffixIgnoreLength{0}; + + // if enabled, uses the fact that item got a hit in DRAM as a signal to + // admit + bool rejectFirstUseDramHitSignal{true}; + + // Must enable this in order to call `allocateZeroedSlab`. + // Otherwise, it will throw. + // This is required for compact cache + bool enableZeroedSlabAllocs = false; + + // Asynchronously page in the cache memory before they are accessed by the + // application. This can ensure all of cache memory is accounted for in the + // RSS even if application does not access all of it, avoiding any + // surprises (i.e. OOM). + // + // This can only be turned on the first time we're creating the cache. + // This option has no effect when attaching to existing cache. + bool lockMemory{false}; + + // These configs configure how MemoryAllocator will be generating + // allocation class sizes for each pool by default + double allocationClassSizeFactor{1.25}; + uint32_t maxAllocationClassSize{Slab::kSize}; + uint32_t minAllocationClassSize{72}; + bool reduceFragmentationInAllocationClass{false}; + + // The minimum TTL an item need to have in order to be admitted into NVM + // cache. 
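+  // e.g. (illustrative): config.setNvmAdmissionMinTTL(600); only items with a
+  // configured TTL of at least 600s are admitted. Note the setter throws
+  // unless enableNvmCache() was called first.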
+ uint64_t nvmAdmissionMinTTL{0}; + + // Skip promote children items in chained when parent fail to promote + bool skipPromoteChildrenWhenParentFailed{false}; + + // If true, we will delay worker start until user explicitly calls + // CacheAllocator::startCacheWorkers() + bool delayCacheWorkersStart{false}; + + // see MultiTierDataMovement.md + double promotionAcWatermark{4.0}; + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + double numDuplicateElements{0.0}; // inclusivness of the cache + double syncPromotion{0.0}; // can promotion be done synchronously in user thread + + uint64_t evictorThreads{1}; + uint64_t promoterThreads{1}; + + uint64_t maxEvictionBatch{40}; + uint64_t maxPromotionBatch{10}; + + uint64_t minEvictionBatch{1}; + uint64_t minPromotionBatch{1}; + + uint64_t maxEvictionPromotionHotness{60}; + + + friend CacheT; + + private: + void mergeWithPrefix( + std::map& configMap, + const std::map& configMapToMerge, + const std::string& prefix) const; + std::string stringifyAddr(const void* addr) const; + std::string stringifyRebalanceStrategy( + const std::shared_ptr& strategy) const; + + // Configuration for memory tiers. + MemoryTierConfigs memoryTierConfigs{ + {MemoryTierCacheConfig::fromShm().setRatio(1)}}; + + // if turned on, cache allocator will not evict any item when the + // system is out of memory. The user must free previously allocated + // items to make more room. + bool disableEviction = false; +}; + +template +CacheAllocatorConfig& CacheAllocatorConfig::setCacheName( + const std::string& _cacheName) { + cacheName = _cacheName; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setCacheSize(size_t _size) { + size = _size; + constexpr size_t maxCacheSizeWithCoredump = 64'424'509'440; // 60GB + if (size <= maxCacheSizeWithCoredump) { + return setFullCoredump(true); + } + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setDefaultAllocSizes( + std::set allocSizes) { + defaultAllocSizes = allocSizes; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setDefaultAllocSizes( + double _allocationClassSizeFactor, + uint32_t _maxAllocationClassSize, + uint32_t _minAllocationClassSize, + bool _reduceFragmentationInAllocationClass) { + allocationClassSizeFactor = _allocationClassSizeFactor; + maxAllocationClassSize = _maxAllocationClassSize; + minAllocationClassSize = _minAllocationClassSize; + reduceFragmentationInAllocationClass = _reduceFragmentationInAllocationClass; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setAccessConfig( + AccessConfig config) { + accessConfig = std::move(config); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setAccessConfig( + size_t numEntries) { + AccessConfig config{}; + config.sizeBucketsPowerAndLocksPower(numEntries); + accessConfig = std::move(config); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setRemoveCallback( + RemoveCb cb) { + removeCb = std::move(cb); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setItemDestructor( + ItemDestructor destructor) { + itemDestructor = std::move(destructor); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableRejectFirstAPForNvm( + uint64_t numEntries, + uint32_t numSplits, + size_t suffixIgnoreLength, + bool useDramHitSignal) { + if (numEntries == 0) { + throw std::invalid_argument( + "Enalbing reject first AP needs non zero numEntries"); + } + 
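+  // numEntries was validated above; a non-zero rejectFirstAPNumEntries is
+  // what marks the reject-first policy as enabled (0 keeps it disabled,
+  // matching the member's default).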
rejectFirstAPNumEntries = numEntries; + rejectFirstAPNumSplits = numSplits; + rejectFirstSuffixIgnoreLength = suffixIgnoreLength; + rejectFirstUseDramHitSignal = useDramHitSignal; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableNvmCache( + NvmCacheConfig config) { + nvmConfig.assign(config); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setNvmCacheAdmissionPolicy( + std::shared_ptr> policy) { + if (!nvmConfig) { + throw std::invalid_argument( + "NvmCache admission policy callback can not be set unless nvmcache is " + "used"); + } + + if (!policy) { + throw std::invalid_argument("Setting a null admission policy"); + } + + nvmCacheAP = std::move(policy); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setNvmCacheEncodeCallback( + NvmCacheEncodeCb cb) { + if (!nvmConfig) { + throw std::invalid_argument( + "NvmCache filter callback can not be set unless nvmcache is used"); + } + nvmConfig->encodeCb = std::move(cb); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setNvmCacheDecodeCallback( + NvmCacheDecodeCb cb) { + if (!nvmConfig) { + throw std::invalid_argument( + "NvmCache filter callback can not be set unless nvmcache is used"); + } + nvmConfig->decodeCb = std::move(cb); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableNvmCacheEncryption( + std::shared_ptr encryptor) { + if (!nvmConfig) { + throw std::invalid_argument( + "NvmCache encrytion/decrytion callbacks can not be set unless nvmcache " + "is used"); + } + if (!encryptor) { + throw std::invalid_argument("Set a nullptr encryptor is NOT allowed"); + } + nvmConfig->deviceEncryptor = std::move(encryptor); + return *this; +} + +template +bool CacheAllocatorConfig::isNvmCacheEncryptionEnabled() const { + return nvmConfig && nvmConfig->deviceEncryptor; +} + +template +CacheAllocatorConfig& +CacheAllocatorConfig::enableNvmCacheTruncateAllocSize() { + if (!nvmConfig) { + throw std::invalid_argument( + "NvmCache mode can not be adjusted unless nvmcache is used"); + } + nvmConfig->truncateItemToOriginalAllocSizeInNvm = true; + return *this; +} + +template +bool CacheAllocatorConfig::isNvmCacheTruncateAllocSizeEnabled() const { + return nvmConfig && nvmConfig->truncateItemToOriginalAllocSizeInNvm; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableCompactCache() { + enableZeroedSlabAllocs = true; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::configureChainedItems( + AccessConfig config, uint32_t lockPower) { + chainedItemAccessConfig = config; + chainedItemsLockPower = lockPower; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::configureChainedItems( + size_t numEntries, uint32_t lockPower) { + AccessConfig config{}; + config.sizeBucketsPowerAndLocksPower(numEntries); + chainedItemAccessConfig = std::move(config); + chainedItemsLockPower = lockPower; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableTailHitsTracking() { + trackTailHits = true; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setFullCoredump(bool enable) { + disableFullCoredump = !enable; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setDropNvmCacheOnShmNew( + bool enable) { + dropNvmCacheOnShmNew = enable; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::disableFastShutdownMode() { + enableFastShutdown = false; + return *this; +} + +template 
+CacheAllocatorConfig& CacheAllocatorConfig::setTrackRecentItemsForDump( + bool enable) { + trackRecentItemsForDump = enable; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setMemoryLocking( + bool enable) { + lockMemory = enable; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableCachePersistence( + std::string cacheDirectory, void* baseAddr) { + cacheDir = cacheDirectory; + slabMemoryBaseAddr = baseAddr; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::usePosixForShm() { + if (cacheDir.empty()) { + throw std::invalid_argument( + "Posix shm can be set only when cache persistence is enabled"); + } + usePosixShm = true; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableItemReaperInBackground( + std::chrono::milliseconds interval, util::Throttler::Config config) { + reaperInterval = interval; + reaperConfig = config; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::configureMemoryTiers( + const MemoryTierConfigs& config) { + if (config.size() > kMaxCacheMemoryTiers) { + throw std::invalid_argument(folly::sformat( + "Too many memory tiers. The number of supported tiers is {}.", + kMaxCacheMemoryTiers)); + } + if (!config.size()) { + throw std::invalid_argument( + "There must be at least one memory tier config."); + } + memoryTierConfigs = config; + return *this; +} + +template +const typename CacheAllocatorConfig::MemoryTierConfigs& +CacheAllocatorConfig::getMemoryTierConfigs() const { + return memoryTierConfigs; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::disableCacheEviction() { + disableEviction = true; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableMemoryMonitor( + std::chrono::milliseconds interval, + MemoryMonitor::Config config, + std::shared_ptr adviseStrategy) { + memMonitorInterval = interval; + memMonitorConfig = std::move(config); + poolAdviseStrategy = adviseStrategy; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enablePoolOptimizer( + std::shared_ptr strategy, + std::chrono::seconds regularPoolInterval, + std::chrono::seconds compactCacheInterval, + uint32_t ccacheStepSizePercent) { + if (validateStrategy(strategy)) { + regularPoolOptimizeInterval = regularPoolInterval; + compactCacheOptimizeInterval = compactCacheInterval; + poolOptimizeStrategy = strategy; + ccacheOptimizeStepSizePercent = ccacheStepSizePercent; + } else { + throw std::invalid_argument( + "Invalid pool optimize strategy for the cache allocator."); + } + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enablePoolRebalancing( + std::shared_ptr defaultRebalanceStrategy, + std::chrono::milliseconds interval) { + if (validateStrategy(defaultRebalanceStrategy)) { + defaultPoolRebalanceStrategy = defaultRebalanceStrategy; + poolRebalanceInterval = interval; + } else { + throw std::invalid_argument( + "Invalid rebalance strategy for the cache allocator."); + } + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundEvictor( + std::shared_ptr strategy, + std::chrono::milliseconds interval, size_t evictorThreads) { + backgroundEvictorStrategy = strategy; + backgroundEvictorInterval = interval; + backgroundEvictorThreads = evictorThreads; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundPromoter( + std::shared_ptr strategy, + std::chrono::milliseconds interval, size_t promoterThreads) { + 
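+  // Mirrors enableBackgroundEvictor() above: no strategy validation is done
+  // here; backgroundPromoterEnabled() only reports true once a strategy and a
+  // non-zero interval are both set.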
backgroundPromoterStrategy = strategy; + backgroundPromoterInterval = interval; + backgroundPromoterThreads = promoterThreads; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enablePoolResizing( + std::shared_ptr resizeStrategy, + std::chrono::milliseconds interval, + uint32_t slabsToReleasePerIteration) { + if (validateStrategy(resizeStrategy)) { + poolResizeStrategy = resizeStrategy; + poolResizeInterval = interval; + poolResizeSlabsPerIter = slabsToReleasePerIteration; + } else { + throw std::invalid_argument( + "Invalid pool resizing strategy for the cache allocator."); + } + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::enableMovingOnSlabRelease( + MoveCb cb, ChainedItemMovingSync sync, uint32_t movingAttemptsLimit) { + moveCb = cb; + movingSync = sync; + movingTries = movingAttemptsLimit; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setSlabReleaseStuckThreashold( + std::chrono::milliseconds threshold) { + slabReleaseStuckThreshold = threshold; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setEvictionSearchLimit( + uint32_t limit) { + evictionSearchTries = limit; + return *this; +} + +template +CacheAllocatorConfig& +CacheAllocatorConfig::setRefcountThresholdForConvertingToIOBuf( + uint32_t threshold) { + thresholdForConvertingToIOBuf = threshold; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setThrottlerConfig( + util::Throttler::Config config) { + throttleConfig = config; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setEventTracker( + EventTrackerSharedPtr&& otherEventTracker) { + eventTracker = std::move(otherEventTracker); + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setNvmAdmissionMinTTL( + uint64_t ttl) { + if (!nvmConfig) { + throw std::invalid_argument( + "NvmAdmissionMinTTL can not be set unless nvmcache is used"); + } + + nvmAdmissionMinTTL = ttl; + return *this; +} + +template +CacheAllocatorConfig& +CacheAllocatorConfig::setSkipPromoteChildrenWhenParentFailed() { + skipPromoteChildrenWhenParentFailed = true; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::deprecated_disableEviction() { + disableEviction = true; + return *this; +} + +template +CacheAllocatorConfig& CacheAllocatorConfig::setDelayCacheWorkersStart() { + delayCacheWorkersStart = true; + return *this; +} + +template +const CacheAllocatorConfig& CacheAllocatorConfig::validate() const { + // we can track tail hits only if MMType is MM2Q + if (trackTailHits && T::MMType::kId != MM2Q::kId) { + throw std::invalid_argument( + "Tail hits tracking cannot be enabled on MMTypes except MM2Q."); + } + + size_t maxCacheSize = CompressedPtr::getMaxAddressableSize(); + // Configured cache size should not exceed the maximal addressable space for + // cache. 
+ if (size > maxCacheSize) { + throw std::invalid_argument(folly::sformat( + "config cache size: {} exceeds max addressable space for cache: {}", + size, + maxCacheSize)); + } + + // we don't allow user to enable both RemoveCB and ItemDestructor + if (removeCb && itemDestructor) { + throw std::invalid_argument( + "It's not allowed to enable both RemoveCB and ItemDestructor."); + } + + return validateMemoryTiers(); +} + +template +bool CacheAllocatorConfig::validateStrategy( + const std::shared_ptr& strategy) const { + if (!strategy) { + return true; + } + + auto type = strategy->getType(); + return type != RebalanceStrategy::NumTypes && + (type != RebalanceStrategy::MarginalHits || trackTailHits); +} + +template +bool CacheAllocatorConfig::validateStrategy( + const std::shared_ptr& strategy) const { + if (!strategy) { + return true; + } + + auto type = strategy->getType(); + return type != PoolOptimizeStrategy::NumTypes && + (type != PoolOptimizeStrategy::MarginalHits || trackTailHits); +} + +template +const CacheAllocatorConfig& CacheAllocatorConfig::validateMemoryTiers() + const { + size_t parts = 0; + for (const auto& tierConfig : memoryTierConfigs) { + if (!tierConfig.getRatio()) { + throw std::invalid_argument("Tier ratio must be an integer number >=1."); + } + parts += tierConfig.getRatio(); + } + + if (parts > size) { + throw std::invalid_argument( + "Sum of tier ratios must be less than total cache size."); + } + return *this; +} + +template +std::map CacheAllocatorConfig::serialize() const { + std::map configMap; + + configMap["size"] = std::to_string(size); + configMap["cacheDir"] = cacheDir; + configMap["posixShm"] = isUsingPosixShm() ? "set" : "empty"; + + configMap["defaultAllocSizes"] = ""; + // Stringify std::set + for (auto& elem : defaultAllocSizes) { + if (configMap["defaultAllocSizes"] != "") { + configMap["defaultAllocSizes"] += ", "; + } + configMap["defaultAllocSizes"] += std::to_string(elem); + } + configMap["disableFullCoredump"] = std::to_string(disableFullCoredump); + configMap["dropNvmCacheOnShmNew"] = std::to_string(dropNvmCacheOnShmNew); + configMap["trackRecentItemsForDump"] = + std::to_string(trackRecentItemsForDump); + configMap["poolResizeInterval"] = util::toString(poolResizeInterval); + configMap["poolResizeSlabsPerIter"] = std::to_string(poolResizeSlabsPerIter); + + configMap["poolRebalanceInterval"] = util::toString(poolRebalanceInterval); + configMap["slabReleaseStuckThreshold"] = + util::toString(slabReleaseStuckThreshold); + configMap["trackTailHits"] = std::to_string(trackTailHits); + // Stringify enum + switch (memMonitorConfig.mode) { + case MemoryMonitor::FreeMemory: + configMap["memMonitorMode"] = "Free Memory"; + break; + case MemoryMonitor::ResidentMemory: + configMap["memMonitorMode"] = "Resident Memory"; + break; + case MemoryMonitor::Disabled: + configMap["memMonitorMode"] = "Disabled"; + break; + default: + configMap["memMonitorMode"] = "Unknown"; + } + configMap["memMonitorInterval"] = util::toString(memMonitorInterval); + configMap["memAdvisePercentPerIter"] = + std::to_string(memMonitorConfig.maxAdvisePercentPerIter); + configMap["memReclaimPercentPerIter"] = + std::to_string(memMonitorConfig.maxReclaimPercentPerIter); + configMap["memMaxAdvisePercent"] = + std::to_string(memMonitorConfig.maxAdvisePercent); + configMap["memLowerLimitGB"] = std::to_string(memMonitorConfig.lowerLimitGB); + configMap["memUpperLimitGB"] = std::to_string(memMonitorConfig.upperLimitGB); + configMap["reclaimRateLimitWindowSecs"] = + 
std::to_string(memMonitorConfig.reclaimRateLimitWindowSecs.count()); + configMap["reaperInterval"] = util::toString(reaperInterval); + configMap["mmReconfigureInterval"] = util::toString(mmReconfigureInterval); + configMap["disableEviction"] = std::to_string(disableEviction); + configMap["evictionSearchTries"] = std::to_string(evictionSearchTries); + configMap["thresholdForConvertingToIOBuf"] = + std::to_string(thresholdForConvertingToIOBuf); + configMap["movingTries"] = std::to_string(movingTries); + configMap["chainedItemsLockPower"] = std::to_string(chainedItemsLockPower); + configMap["removeCb"] = removeCb ? "set" : "empty"; + configMap["nvmAP"] = nvmCacheAP ? "custom" : "empty"; + configMap["nvmAPRejectFirst"] = rejectFirstAPNumEntries ? "set" : "empty"; + configMap["moveCb"] = moveCb ? "set" : "empty"; + configMap["enableZeroedSlabAllocs"] = std::to_string(enableZeroedSlabAllocs); + configMap["lockMemory"] = std::to_string(lockMemory); + configMap["allocationClassSizeFactor"] = + std::to_string(allocationClassSizeFactor); + configMap["maxAllocationClassSize"] = std::to_string(maxAllocationClassSize); + configMap["minAllocationClassSize"] = std::to_string(minAllocationClassSize); + configMap["reduceFragmentationInAllocationClass"] = + std::to_string(reduceFragmentationInAllocationClass); + configMap["slabMemoryBaseAddr"] = stringifyAddr(slabMemoryBaseAddr); + configMap["poolResizeStrategy"] = + stringifyRebalanceStrategy(poolResizeStrategy); + configMap["poolAdviseStrategy"] = + stringifyRebalanceStrategy(poolAdviseStrategy); + configMap["defaultPoolRebalanceStrategy"] = + stringifyRebalanceStrategy(defaultPoolRebalanceStrategy); + configMap["eventTracker"] = eventTracker ? "set" : "empty"; + configMap["nvmAdmissionMinTTL"] = std::to_string(nvmAdmissionMinTTL); + configMap["delayCacheWorkersStart"] = + delayCacheWorkersStart ? 
"true" : "false"; + mergeWithPrefix(configMap, throttleConfig.serialize(), "throttleConfig"); + mergeWithPrefix(configMap, + chainedItemAccessConfig.serialize(), + "chainedItemAccessConfig"); + mergeWithPrefix(configMap, accessConfig.serialize(), "accessConfig"); + mergeWithPrefix(configMap, reaperConfig.serialize(), "reaperConfig"); + if (nvmConfig) + mergeWithPrefix(configMap, nvmConfig->serialize(), "nvmConfig"); + + return configMap; +} + +template +void CacheAllocatorConfig::mergeWithPrefix( + std::map& configMap, + const std::map& configMapToMerge, + const std::string& prefix) const { + for (auto const& kv : configMapToMerge) { + configMap[prefix + "::" + kv.first] = kv.second; + } +} + +template +std::string CacheAllocatorConfig::stringifyAddr(const void* addr) const { + if (addr == nullptr) + return ""; + const std::string HEX = "0123456789abcdef"; + uintptr_t num = (uintptr_t)slabMemoryBaseAddr; + std::string res = ""; + while (num) { + res = HEX[(num & 0xf)] + res; + num >>= 4; + } + return res; +} + +template +std::string CacheAllocatorConfig::stringifyRebalanceStrategy( + const std::shared_ptr& strategy) const { + if (!strategy) + return "empty"; + switch (strategy->getType()) { + case RebalanceStrategy::PickNothingOrTest: + return "PickNothingOrTest"; + case RebalanceStrategy::Random: + return "Random"; + case RebalanceStrategy::MarginalHits: + return "MarginalHits"; + case RebalanceStrategy::FreeMem: + return "FreeMem"; + case RebalanceStrategy::HitsPerSlab: + return "HitsPerSlab"; + case RebalanceStrategy::LruTailAge: + return "LruTailAge"; + case RebalanceStrategy::PoolResize: + return "PoolResize"; + case RebalanceStrategy::StressRebalance: + return "StressRebalance"; + default: + return "undefined"; + } +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheChainedItemIterator.h b/bdm/allocator/CacheChainedItemIterator.h new file mode 100644 index 0000000000..2d55d5eebd --- /dev/null +++ b/bdm/allocator/CacheChainedItemIterator.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include "cachelib/common/Iterators.h" + +namespace facebook { +namespace cachelib { + +namespace tests { +template +class BaseAllocatorTest; +} // namespace tests + +// Class to iterate through chained items in the special case that the caller +// has the item but no itemhandle (e.g. during release) +template +class CacheChainedItemIterator + : public detail::IteratorFacade, + ItemT, + std::forward_iterator_tag> { + public: + using Item = ItemT; + CacheChainedItemIterator() = default; + + Item& dereference() const { + if (curr_) { + return *curr_; + } + if (curIOBuf_) { + return *reinterpret_cast(curIOBuf_->writableData()); + } + throw std::runtime_error("no item to dereference"); + } + + // advance the iterator. + // Do nothing if uninitizliaed. 
+ void increment() { + if (curr_) { + curr_ = curr_->asChainedItem().getNext(*compressor_); + } + if (curIOBuf_) { + curIOBuf_ = curIOBuf_->next(); + } + } + + bool equal(const CacheChainedItemIterator& other) const { + if (curr_ || other.curr_) { + return curr_ == other.curr_; + } + return curIOBuf_ == other.curIOBuf_; + } + + private: + using PtrCompressor = typename Item::PtrCompressor; + // Private because only CacheT can create this. + // @param item Pointer to chained item (nullptr for null iterator) + // @param compressor Compressor used to get pointer to next in chain + explicit CacheChainedItemIterator(Item* item, const PtrCompressor& compressor) + : curr_(item), compressor_(&compressor) { + // If @item is not nullptr, check that it is a chained item + if (curr_ && !curr_->isChainedItem()) { + throw std::invalid_argument( + "Cannot initialize ChainedAllocIterator, Item is not a ChainedItem"); + } + } + + // only NvmCacheT can create with this constructor + // this is used to construct chained item for ItemDestructor + // with DipperItem on Navy, Item is allocated at heap (as IOBuf) + // instead of in allocator memory pool. + explicit CacheChainedItemIterator(folly::IOBuf* iobuf) : curIOBuf_(iobuf) { + // If @item is not nullptr, check that it is a chained item or parent item + // sine IOBuf chains is a circle, so we need to let the parent be the end + // iterator + if (curIOBuf_ && !dereference().isChainedItem() && + !dereference().hasChainedItem()) { + throw std::invalid_argument( + "Cannot initialize ChainedAllocIterator, Item is not a ChainedItem"); + } + } + + // Current iterator position in chain + Item* curr_{nullptr}; + + // Removed/evicted from NVM + folly::IOBuf* curIOBuf_{nullptr}; + + // Pointer compressor to traverse the chain. + const PtrCompressor* compressor_{nullptr}; + + friend Cache; + friend typename Cache::NvmCacheT; + friend typename Cache::ChainedAllocs; + friend typename Cache::WritableChainedAllocs; + + // For testing + template + friend class facebook::cachelib::tests::BaseAllocatorTest; +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheDetails.cpp b/bdm/allocator/CacheDetails.cpp new file mode 100644 index 0000000000..5438b5cbad --- /dev/null +++ b/bdm/allocator/CacheDetails.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cachelib/allocator/CacheDetails.h" +namespace facebook { +namespace cachelib { +namespace detail { + +const std::string kShmInfoName = "shm_info"; +const std::string kShmCacheName = "shm_cache"; +const std::string kShmHashTableName = "shm_hash_table"; +const std::string kShmChainedItemHashTableName = "shm_chained_alloc_hash_table"; + +} // namespace detail +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheDetails.h b/bdm/allocator/CacheDetails.h new file mode 100644 index 0000000000..06c03cf9fa --- /dev/null +++ b/bdm/allocator/CacheDetails.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// Conatains common symbols shared across different build targets. We provide +// a thin wrapper in ReadOnlySharedCacheView that depends on symbols used in +// allocator + +namespace facebook { +namespace cachelib { +namespace detail { +// identifier for the metadata info +extern const std::string kShmInfoName; + +// identifier for the main cache segment +extern const std::string kShmCacheName; + +// identifier for the main hash table if used +extern const std::string kShmHashTableName; + +// identifier for the auxilary hash table for chained items +extern const std::string kShmChainedItemHashTableName; + +} // namespace detail +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheItem-inl.h b/bdm/allocator/CacheItem-inl.h new file mode 100644 index 0000000000..db0bbe7ca8 --- /dev/null +++ b/bdm/allocator/CacheItem-inl.h @@ -0,0 +1,512 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// CacheItem implementations. 
+namespace facebook { +namespace cachelib { +template +uint32_t CacheItem::getRequiredSize(Key key, + uint32_t size) noexcept { + const uint64_t requiredSize = + static_cast(size) + key.size() + sizeof(Item); + + XDCHECK_LE(requiredSize, + static_cast(std::numeric_limits::max())); + if (requiredSize > + static_cast(std::numeric_limits::max())) { + return 0; + } + return static_cast(requiredSize); +} + +template +uint64_t CacheItem::getRefcountMax() noexcept { + return RefcountWithFlags::kAccessRefMask; +} + +template +CacheItem::CacheItem(Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) + : creationTime_(creationTime), expiryTime_(expiryTime), alloc_(key, size) {} + +template +CacheItem::CacheItem(Key key, uint32_t size, uint32_t creationTime) + : CacheItem(key, size, creationTime, 0 /* expiryTime_ */) {} + +template +const typename CacheItem::Key CacheItem::getKey() + const noexcept { + return alloc_.getKey(); +} + +template +const void* CacheItem::getMemory() const noexcept { + return getMemoryInternal(); +} + +template +void* CacheItem::getMemory() noexcept { + return getMemoryInternal(); +} + +template +void* CacheItem::getMemoryInternal() const noexcept { + if (isChainedItem()) { + return asChainedItem().getMemory(); + } else { + return alloc_.getMemory(); + } +} + +template +uint32_t CacheItem::getOffsetForMemory() const noexcept { + return reinterpret_cast(getMemory()) - + reinterpret_cast(this); +} + +template +uint32_t CacheItem::getSize() const noexcept { + if (isChainedItem()) { + return asChainedItem().getSize(); + } else { + return alloc_.getSize(); + } +} + +template +uint32_t CacheItem::getExpiryTime() const noexcept { + return expiryTime_; +} + +template +bool CacheItem::isExpired() const noexcept { + thread_local uint32_t staleTime = 0; + + if (expiryTime_ == 0) { + return false; + } + + if (expiryTime_ < staleTime) { + return true; + } + + uint32_t currentTime = static_cast(util::getCurrentTimeSec()); + if (currentTime != staleTime) { + staleTime = currentTime; + } + return expiryTime_ < currentTime; +} + +template +bool CacheItem::isExpired(uint32_t currentTimeSec) const noexcept { + return (expiryTime_ > 0 && expiryTime_ < currentTimeSec); +} + +template +uint32_t CacheItem::getCreationTime() const noexcept { + return creationTime_; +} + +template +std::chrono::seconds CacheItem::getConfiguredTTL() const noexcept { + return std::chrono::seconds(expiryTime_ > 0 ? 
expiryTime_ - creationTime_ + : 0); +} + +template +uint32_t CacheItem::getLastAccessTime() const noexcept { + return mmHook_.getUpdateTime(); +} + +template +std::string CacheItem::toString() const { + if (isChainedItem()) { + return asChainedItem().toString(); + } else { + return folly::sformat( + "item: " + "memory={}:raw-ref={}:size={}:key={}:hex-key={}:" + "isInMMContainer={}:isAccessible={}:isMoving={}:references={}:ctime={}:" + "expTime={}:updateTime={}:isNvmClean={}:isNvmEvicted={}:hasChainedItem=" + "{}", + this, getRefCountAndFlagsRaw(), getSize(), + folly::humanify(getKey().str()), folly::hexlify(getKey()), + isInMMContainer(), isAccessible(), isMoving(), getRefCount(), + getCreationTime(), getExpiryTime(), getLastAccessTime(), isNvmClean(), + isNvmEvicted(), hasChainedItem()); + } +} + +template +void CacheItem::changeKey(Key key) { + if (!isChainedItem()) { + throw std::invalid_argument("Item is not chained type"); + } + + alloc_.changeKey(key); + XDCHECK_EQ(key, getKey()); +} + +template +RefcountWithFlags::Value CacheItem::getRefCount() const noexcept { + return ref_.getAccessRef(); +} + +template +RefcountWithFlags::Value CacheItem::getRefCountAndFlagsRaw() + const noexcept { + return ref_.getRaw(); +} + +template +bool CacheItem::isDrained() const noexcept { + return ref_.isDrained(); +} + +template +bool CacheItem::isExclusive() const noexcept { + return ref_.isExclusive(); +} + +template +void CacheItem::markAccessible() noexcept { + ref_.markAccessible(); +} + +template +void CacheItem::unmarkAccessible() noexcept { + ref_.unmarkAccessible(); +} + +template +void CacheItem::markInMMContainer() noexcept { + ref_.markInMMContainer(); +} + +template +void CacheItem::unmarkInMMContainer() noexcept { + ref_.unmarkInMMContainer(); +} + +template +bool CacheItem::isAccessible() const noexcept { + return ref_.isAccessible(); +} + +template +bool CacheItem::isInMMContainer() const noexcept { + return ref_.isInMMContainer(); +} + +template +bool CacheItem::markMoving() noexcept { + return ref_.markMoving(); +} + +template +RefcountWithFlags::Value CacheItem::unmarkMoving() noexcept { + return ref_.unmarkMoving(); +} + +template +bool CacheItem::isMoving() const noexcept { + return ref_.isMoving(); +} + +template +bool CacheItem::isOnlyMoving() const noexcept { + return ref_.isOnlyMoving(); +} + +template +void CacheItem::markNvmClean() noexcept { + ref_.markNvmClean(); +} + +template +void CacheItem::unmarkNvmClean() noexcept { + ref_.unmarkNvmClean(); +} + +template +bool CacheItem::isNvmClean() const noexcept { + return ref_.isNvmClean(); +} + +template +void CacheItem::markNvmEvicted() noexcept { + ref_.markNvmEvicted(); +} + +template +void CacheItem::unmarkNvmEvicted() noexcept { + ref_.unmarkNvmEvicted(); +} + +template +bool CacheItem::isNvmEvicted() const noexcept { + return ref_.isNvmEvicted(); +} + +template +void CacheItem::markIncomplete() noexcept { + ref_.markIncomplete(); +} + +template +void CacheItem::unmarkIncomplete() noexcept { + ref_.unmarkIncomplete(); +} + +template +bool CacheItem::isIncomplete() const noexcept { + return ref_.isIncomplete(); +} + +template +void CacheItem::markIsChainedItem() noexcept { + XDCHECK(!hasChainedItem()); + ref_.markIsChainedItem(); +} + +template +void CacheItem::unmarkIsChainedItem() noexcept { + XDCHECK(!hasChainedItem()); + ref_.unmarkIsChainedItem(); +} + +template +void CacheItem::markHasChainedItem() noexcept { + XDCHECK(!isChainedItem()); + ref_.markHasChainedItem(); +} + +template +void 
CacheItem::unmarkHasChainedItem() noexcept { + XDCHECK(!isChainedItem()); + ref_.unmarkHasChainedItem(); +} + +template +typename CacheItem::ChainedItem& +CacheItem::asChainedItem() noexcept { + return *static_cast(this); +} + +template +const typename CacheItem::ChainedItem& +CacheItem::asChainedItem() const noexcept { + return *static_cast(this); +} + +template +bool CacheItem::isChainedItem() const noexcept { + return ref_.isChainedItem(); +} + +template +bool CacheItem::hasChainedItem() const noexcept { + return ref_.hasChainedItem(); +} + +template +template +void CacheItem::setFlag() noexcept { + ref_.template setFlag(); +} + +template +template +void CacheItem::unSetFlag() noexcept { + ref_.template unSetFlag(); +} + +template +template +bool CacheItem::isFlagSet() const noexcept { + return ref_.template isFlagSet(); +} + +template +bool CacheItem::updateExpiryTime(uint32_t expiryTimeSecs) noexcept { + // check for moving to make sure we are not updating the expiry time while at + // the same time re-allocating the item with the old state of the expiry time + // in moveRegularItem(). See D6852328 + if (isMoving() || !isInMMContainer() || isChainedItem()) { + return false; + } + // attempt to atomically update the value of expiryTime + while (true) { + uint32_t currExpTime = expiryTime_; + if (__sync_bool_compare_and_swap(&expiryTime_, currExpTime, + expiryTimeSecs)) { + return true; + } + } +} + +template +bool CacheItem::extendTTL(std::chrono::seconds ttl) noexcept { + return updateExpiryTime(util::getCurrentTimeSec() + ttl.count()); +} + +// Chained items are chained in a single linked list. The payload of each +// chained item is chained to the next item. +template +class CACHELIB_PACKED_ATTR ChainedItemPayload { + public: + using ChainedItem = CacheChainedItem; + using Item = CacheItem; + using PtrCompressor = typename ChainedItem::PtrCompressor; + + // Pointer to the next chained allocation. initialize to nullptr. + SListHook hook_{}; + + // Payload + mutable unsigned char data_[0]; + + // Usable memory for this allocation. The caller is free to do whatever he + // wants with it and needs to ensure concurrency for access into this + // piece of memory. + void* getMemory() const noexcept { return &data_; } + + ChainedItem* getNext(const PtrCompressor& compressor) const noexcept { + return static_cast(hook_.getNext(compressor)); + } + + void setNext(const ChainedItem* next, + const PtrCompressor& compressor) noexcept { + hook_.setNext(static_cast(next), compressor); + XDCHECK_EQ(reinterpret_cast(getNext(compressor)), + reinterpret_cast(next)); + } +}; + +template +uint32_t CacheChainedItem::getRequiredSize(uint32_t size) noexcept { + const uint64_t requiredSize = static_cast(size) + + static_cast(kKeySize) + sizeof(Item) + + sizeof(Payload); + XDCHECK_LE(requiredSize, + static_cast(std::numeric_limits::max())); + if (requiredSize > + static_cast(std::numeric_limits::max())) { + return 0; + } + return static_cast(requiredSize); +} + +template +CacheChainedItem::CacheChainedItem(CompressedPtr ptr, + uint32_t size, + uint32_t creationTime) + : Item(Key{reinterpret_cast(&ptr), kKeySize}, + size + sizeof(Payload), + creationTime) { + this->markIsChainedItem(); + + // Explicitly call ChainedItemPayload's ctor to initialize it properly, since + // ChainedItemPayload is not a member of CacheChainedItem. 
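+  // (The placement new below runs in the data region that alloc_ hands back
+  // via getMemory(), so the payload's SList hook starts out null and
+  // getNext() on a freshly constructed chained item returns no successor.)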
+ new (reinterpret_cast(&getPayload())) Payload(); +} + +template +void CacheChainedItem::changeKey(CompressedPtr newKey) { + if (this->isAccessible()) { + throw std::invalid_argument(folly::sformat( + "chained item {} is still in access container while modifying the key ", + toString())); + } + Item::changeKey(Key{reinterpret_cast(&newKey), kKeySize}); +} + +template +typename CacheChainedItem::Item& +CacheChainedItem::getParentItem( + const PtrCompressor& compressor) const noexcept { + const auto compressedPtr = + *reinterpret_cast(this->getKey().begin()); + return *compressor.unCompress(compressedPtr); +} + +template +void* CacheChainedItem::getMemory() const noexcept { + return getPayload().getMemory(); +} + +template +uint32_t CacheChainedItem::getSize() const noexcept { + // Chained Item has its own embedded payload in its KAllocation, so we + // need to deduct its size here to give the user the accurate usable size + return this->alloc_.getSize() - sizeof(Payload); +} + +template +std::string CacheChainedItem::toString() const { + const auto cPtr = + *reinterpret_cast(Item::getKey().data()); + return folly::sformat( + "chained item: " + "memory={}:raw-ref={}:size={}:parent-compressed-ptr={}:" + "isInMMContainer={}:isAccessible={}:isMoving={}:references={}:ctime={}:" + "expTime={}:updateTime={}", + this, Item::getRefCountAndFlagsRaw(), Item::getSize(), cPtr.getRaw(), + Item::isInMMContainer(), Item::isAccessible(), Item::isMoving(), + Item::getRefCount(), Item::getCreationTime(), Item::getExpiryTime(), + Item::getLastAccessTime()); +} + +template +void CacheChainedItem::appendChain( + ChainedItem& newChain, const PtrCompressor& compressor) { + if (getNext(compressor)) { + throw std::invalid_argument( + folly::sformat("Item: {} is not the last item in a chain. Next: {}", + toString(), getNext(compressor)->toString())); + } + setNext(&newChain, compressor); +} + +template +typename CacheChainedItem::ChainedItem* +CacheChainedItem::getNext( + const PtrCompressor& compressor) const noexcept { + return getPayload().getNext(compressor); +} + +template +void CacheChainedItem::setNext( + const ChainedItem* next, const PtrCompressor& compressor) noexcept { + getPayload().setNext(next, compressor); + XDCHECK_EQ(reinterpret_cast(getNext(compressor)), + reinterpret_cast(next)); +} + +template +typename CacheChainedItem::Payload& +CacheChainedItem::getPayload() { + return *reinterpret_cast(this->alloc_.getMemory()); +} + +template +const typename CacheChainedItem::Payload& +CacheChainedItem::getPayload() const { + return *reinterpret_cast(this->alloc_.getMemory()); +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheItem.h b/bdm/allocator/CacheItem.h new file mode 100644 index 0000000000..61b374720e --- /dev/null +++ b/bdm/allocator/CacheItem.h @@ -0,0 +1,573 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +#include "cachelib/allocator/Cache.h" +#include "cachelib/allocator/CacheChainedItemIterator.h" +#include "cachelib/allocator/Handle.h" +#include "cachelib/allocator/KAllocation.h" +#include "cachelib/allocator/Refcount.h" +#include "cachelib/allocator/TypedHandle.h" +#include "cachelib/allocator/datastruct/SList.h" +#include "cachelib/allocator/memory/CompressedPtr.h" +#include "cachelib/allocator/memory/MemoryAllocator.h" +#include "cachelib/common/CompilerUtils.h" +#include "cachelib/common/Exceptions.h" +#include "cachelib/common/Mutex.h" + +namespace facebook { + +namespace cachelib { + +namespace tests { +template +class BaseAllocatorTest; + +template +class AllocatorHitStatsTest; + +template +class MapTest; + +class CacheAllocatorTestWrapper; +} // namespace tests + +// forward declaration +template +class CacheAllocator; + +template +class CacheChainedItem; + +template +class ChainedItemPayload; + +template +class NvmCache; + +template +class CacheChainedAllocs; + +template +class Map; + +// This is the actual representation of the cache item. It has two member +// hooks of type MMType::Hook and AccessType::Hook to ensure that the CacheItem +// can be put in the MMType::Container and AccessType::Container. +template +class CACHELIB_PACKED_ATTR CacheItem { + public: + /** + * CacheAllocator is what the user will be interacting to cache + * anything. NvmCache is an abstraction that abstracts away NVM + * devices. It is abstracted inside CacheAllocator and user does + * not deal with it directly. An item in NVM takes longer to fetch + * than one that resides in RAM. + */ + using CacheT = CacheAllocator; + using NvmCacheT = NvmCache; + using Flags = RefcountWithFlags::Flags; + + /** + * Currently there are two types of items that can be cached. + * A ChainedItem is a dependent item on a regular item. A chained + * item does not have a real key but is instead associated with + * a regular item (its parent item). Multiple chained items can be + * linked to the same regular item and together they can cache data + * much bigger that of a single item. + */ + using Item = CacheItem; + using ChainedItem = CacheChainedItem; + + /** + * A cache item is roughly consisted of the following parts: + * + * --------------------- + * | Intrusive Hooks | + * | Reference & Flags | + * | Creation Time | + * | Expiry Time | + * | Payload | + * --------------------- + * + * Intrusive hooks are used for access/mm containers. They contain + * compressed pointers that link an item to said container in addition + * to other metadata that the container itself deems useful to keep. + * + * Payload in this case is KAllocation which contains its own metadata + * that describes the length of the payload, the size of the key in + * addition to the actual key and the data. + */ + using AccessHook = typename CacheTrait::AccessType::template Hook; + using MMHook = typename CacheTrait::MMType::template Hook; + using Key = KAllocation::Key; + + /** + * User primarily interacts with an item through its handle. + * An item handle is essentially a std::shared_ptr like structure + * that ensures the item's refcount is properly maintained and ensures + * the item is freed when it is not linked to access/mm containers + * and its refcount drops to 0. 
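+   *
+   * Rough caller-side sketch (allocate/insertOrReplace are assumed from the
+   * CacheAllocator API and are not declared in this header; the payload copy
+   * is illustrative only):
+   *
+   *   auto handle = cache.allocate(poolId, "key", valueSize);
+   *   if (handle) {
+   *     std::memcpy(handle->getMemory(), data, valueSize);
+   *     cache.insertOrReplace(handle);
+   *   }  // handle goes out of scope here and its reference is released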
+ */ + using ReadHandle = detail::ReadHandleImpl; + using WriteHandle = detail::WriteHandleImpl; + using Handle = WriteHandle; + using HandleMaker = std::function; + + /** + * Item* and ChainedItem* are represented in this compressed form + * inside the access and mm containers. They are more efficient to + * store than raw pointers and can be leveraged to allow the cache + * to be mapped to different addresses on shared memory. + */ + using CompressedPtr = facebook::cachelib::CompressedPtr; + using SingleTierPtrCompressor = MemoryAllocator::SingleTierPtrCompressor; + using PtrCompressor = MemoryAllocator::PtrCompressor; + + // Get the required size for a cache item given the size of memory + // user wants to allocate and the key size for the item + // + // @return required size if it's within the max size of uint32_t, + // 0 otherwise + static uint32_t getRequiredSize(Key key, uint32_t size) noexcept; + + // Get the number of maximum outstanding handles there can be at any given + // time for an item + static uint64_t getRefcountMax() noexcept; + + // Copying or moving this can launch a nuclear missile and blow up everything + CacheItem(const CacheItem&) = delete; + CacheItem(CacheItem&&) = delete; + void operator=(const CacheItem&) = delete; + void operator=(CacheItem&&) = delete; + + // Fetch the key corresponding to the allocation + const Key getKey() const noexcept; + + // Readonly memory for this allocation. + const void* getMemory() const noexcept; + + // Writable memory for this allocation. The caller is free to do whatever he + // wants with it and needs to ensure thread safety for access into this + // piece of memory. + void* getMemory() noexcept; + + // Cast item's readonly memory to a readonly user type + template + const T* getMemoryAs() const noexcept { + return reinterpret_cast(getMemory()); + } + + // Cast item's writable memory to a writable user type + template + T* getMemoryAs() noexcept { + return reinterpret_cast(getMemory()); + } + + // This is the size of the memory allocation requested by the user. + // The memory range [getMemory(), getMemory() + getSize()) is usable. + uint32_t getSize() const noexcept; + + // Return timestamp of when this item was created + uint32_t getCreationTime() const noexcept; + + // return the original configured time to live in seconds. + std::chrono::seconds getConfiguredTTL() const noexcept; + + // Return the last time someone accessed this item + uint32_t getLastAccessTime() const noexcept; + + // Convenience method for debug purposes. + std::string toString() const; + + // return the expiry time of the item + uint32_t getExpiryTime() const noexcept; + + // check if the item reaches the expiry timestamp + // expiryTime_ == 0 means no time limitation for this Item + bool isExpired() const noexcept; + + // Check if the item is expired relative to the provided timestamp. + bool isExpired(uint32_t currentTimeSecs) const noexcept; + + /** + * Access specific flags for an item + * + * These flags are set atomically and any of the APIs here will give a + * consistent view on all the flags that are set or unset at that moment. + * + * However the content of the flag can be changed after any of these calls + * are returned, so to reliably rely on them, the user needs to make sure + * they're either the sole owner of this item or every one accessing this + * item is only reading its content. 
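+   *
+   * For example (illustrative): a flag read such as isNvmClean() returns a
+   * consistent snapshot, but cache internals may mark or unmark that flag
+   * right after the call, so the result is only stable while the caller is
+   * the item's sole owner or all concurrent accessors are read-only.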
+ */ + bool isChainedItem() const noexcept; + bool hasChainedItem() const noexcept; + + /** + * Keep track of whether the item was modified while in ram cache + */ + bool isNvmClean() const noexcept; + void markNvmClean() noexcept; + void unmarkNvmClean() noexcept; + + /** + * Marks that the item was potentially evicted from the nvmcache and might + * need to be rewritten even if it was nvm-clean + */ + void markNvmEvicted() noexcept; + void unmarkNvmEvicted() noexcept; + bool isNvmEvicted() const noexcept; + + /** + * Marks that the item is migrating between memory tiers and + * not ready for access now. Accessing thread should wait. + */ + void markIncomplete() noexcept; + void unmarkIncomplete() noexcept; + bool isIncomplete() const noexcept; + + /** + * Function to set the timestamp for when to expire an item + * + * This API will only succeed when an item is a regular item, and user + * has already inserted it into the cache (via @insert or @insertOrReplace). + * In addition, the item cannot be in a "moving" state. + * + * @param expiryTime the expiryTime value to update to + * + * @return boolean indicating whether expiry time was successfully updated + * false when item is not linked in cache, or in moving state, or a + * chained item + */ + bool updateExpiryTime(uint32_t expiryTimeSecs) noexcept; + + // Same as @updateExpiryTime, but sets expiry time to @ttl seconds from now. + // It has the same restrictions as @updateExpiryTime. An item must be a + // regular item and is part of the cache and NOT in the moving state. + // + // @param ttl TTL (from now) + // @return boolean indicating whether expiry time was successfully updated + // false when item is not linked in cache, or in moving state, or a + // chained item + bool extendTTL(std::chrono::seconds ttl) noexcept; + + // Return the refcount of an item + RefcountWithFlags::Value getRefCount() const noexcept; + + // Returns true if the item is in access container, false otherwise + bool isAccessible() const noexcept; + + protected: + // construct an item without expiry timestamp. + CacheItem(Key key, uint32_t size, uint32_t creationTime); + + // @param key Key for this item + // @param size Size allocated by the user. This may be smaller than + // the full usable size + // @param creationTime Timestamp when this item was created + // @param expiryTime Timestamp when this item will be expired. + CacheItem(Key key, uint32_t size, uint32_t creationTime, uint32_t expiryTime); + + // changes the item's key. This is only supported for ChainedItems. For + // regular items, the key does not change with the lifetime of the item. For + // ChainedItems since the key is the parent item, the key can change when + // the parent item is being moved or tranferred. + // + // @throw std::invalid_argument if item is not a chained item or the key + // size does not match with the current key + void changeKey(Key key); + + void* getMemoryInternal() const noexcept; + + /** + * CacheItem's refcount contain admin references, access referneces, and + * flags, refer to Refcount.h for details. + * + * Currently we support up to 2^18 references on any given item. + * Increment and decrement may throw the following due to over/under-flow. + * cachelib::exception::RefcountOverflow + * cachelib::exception::RefcountUnderflow + */ + RefcountWithFlags::Value getRefCountAndFlagsRaw() const noexcept; + + FOLLY_ALWAYS_INLINE void incRef() { + if (LIKELY(ref_.incRef())) { + return; + } + throw exception::RefcountOverflow( + folly::sformat("Refcount maxed out. 
item: {}", toString())); + } + + FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef() { + return ref_.decRef(); + } + + // Whether or not an item is completely drained of all references including + // the internal ones. This means there is no access refcount bits and zero + // admin bits set. I.e. refcount is 0 and the item is not linked, accessible, + // nor moving + bool isDrained() const noexcept; + + // Whether or not we hold the last exclusive access to this item + // Refcount is 1 and the item is not linked, accessible, nor moving + bool isExclusive() const noexcept; + + /** + * The following three functions correspond to the state of the allocation + * in the memory management container. This is protected by the + * MMContainer's internal locking. Inspecting this outside the mm + * container will be racy. + */ + void markInMMContainer() noexcept; + void unmarkInMMContainer() noexcept; + bool isInMMContainer() const noexcept; + + /** + * The following three functions correspond to the state of the allocation + * in the access container. This will be protected by the access container + * lock. Depending on their state outside of the access container might be + * racy + */ + void markAccessible() noexcept; + void unmarkAccessible() noexcept; + + /** + * The following two functions corresond to whether or not an item is + * currently in the process of being moved. This happens during a slab + * rebalance or resize operation. + * + * An item can only be marked moving when `isInMMContainer` returns true. + * This operation is atomic. + * + * User can also query if an item "isOnlyMoving". This returns true only + * if the refcount is 0 and only the moving bit is set. + * + * Unmarking moving does not depend on `isInMMContainer` + */ + bool markMoving() noexcept; + RefcountWithFlags::Value unmarkMoving() noexcept; + bool isMoving() const noexcept; + bool isOnlyMoving() const noexcept; + + /** + * Item cannot be marked both chained allocation and + * marked as having chained allocations at the same time + */ + void markIsChainedItem() noexcept; + void unmarkIsChainedItem() noexcept; + void markHasChainedItem() noexcept; + void unmarkHasChainedItem() noexcept; + ChainedItem& asChainedItem() noexcept; + const ChainedItem& asChainedItem() const noexcept; + + // Returns the offset of the beginning of usable memory for an item + uint32_t getOffsetForMemory() const noexcept; + + /** + * Functions to set, unset and get bits + */ + template + void setFlag() noexcept; + template + void unSetFlag() noexcept; + template + bool isFlagSet() const noexcept; + + /** + * The following are the data members of CacheItem + * + * Hooks to access and mm containers are public since other parts of the + * code need access to them. Everything else should be private. + */ + public: + // Hook for the access type + AccessHook accessHook_; + + using AccessContainer = typename CacheTrait::AccessType::template Container< + Item, + &Item::accessHook_, + typename CacheTrait::AccessTypeLocks>; + + // Hook for the mm type + MMHook mmHook_; + + using MMContainer = + typename CacheTrait::MMType::template Container; + + protected: + // Refcount for the item and also flags on the items state + RefcountWithFlags ref_; + + // Time when this cache item is created + const uint32_t creationTime_{0}; + + // Expiry timestamp for the item + // 0 means no time limitation + uint32_t expiryTime_{0}; + + // The actual allocation. 
+ KAllocation alloc_; + + friend ChainedItem; + friend CacheT; + friend AccessContainer; + friend MMContainer; + friend NvmCacheT; + template + friend class CacheChainedAllocs; + template + friend class CacheChainedItemIterator; + friend class facebook::cachelib::tests::CacheAllocatorTestWrapper; + template + friend class Map; + + // tests + template + friend class facebook::cachelib::tests::BaseAllocatorTest; + friend class facebook::cachelib::tests::MapTest>; + FRIEND_TEST(LruAllocatorTest, ItemSampling); + FRIEND_TEST(LruAllocatorTest, AddChainedAllocationSimple); + FRIEND_TEST(ItemTest, ChangeKey); + FRIEND_TEST(ItemTest, ToString); + FRIEND_TEST(ItemTest, CreationTime); + FRIEND_TEST(ItemTest, ExpiryTime); + FRIEND_TEST(ItemTest, ChainedItemConstruction); + FRIEND_TEST(ItemTest, NonStringKey); + template + friend class facebook::cachelib::tests::AllocatorHitStatsTest; +}; + +// A chained item has a hook pointing to the next chained item. The hook is +// a compressed pointer stored at the beginning of KAllocation's data. +// A chained item's key is a compressed pointer to its parent item. +// +// Memory layout: +// | --------------------- | +// | AccessHook | +// | MMHook | +// | RefCountWithFlags | +// | creationTime_ | +// | --------------------- | +// | K | size_ | +// | A | ---------------- | +// | l | | keyData | <-- sizeof(CompressedPtr) +// | l | | -------- | +// | o | | P | hook | <-- sizeof(SlistHook) +// | c | data_ | a | data | +// | a | | y | | +// | t | | l | | +// | i | | o | | +// | o | | a | | +// | n | | d | | +// | --------------------- | +template +class CACHELIB_PACKED_ATTR CacheChainedItem : public CacheItem { + public: + using Item = CacheItem; + using ChainedItem = CacheChainedItem; + using Payload = ChainedItemPayload; + using CompressedPtr = typename Item::CompressedPtr; + using PtrCompressor = typename Item::PtrCompressor; + + /** + * Key for CacheChainedItem is the raw pointer to the parent item, + * so it is 8 bytes big. + */ + using Key = typename Item::Key; + static constexpr uint32_t kKeySize = sizeof(CompressedPtr); + + // Get the required size for a cache item given the size of memory + // user wants to allocate + // + // @return required size if it's within the max size of uint32_t, + // 0 otherwise + static uint32_t getRequiredSize(uint32_t size) noexcept; + + // Get the parent item this chained allocation is associated with + Item& getParentItem(const PtrCompressor& compressor) const noexcept; + + // Usable memory for this allocation. The caller is free to do whatever he + // wants with it and needs to ensure concurrency for access into this + // piece of memory. + void* getMemory() const noexcept; + + // This is the size of the memory allocation requested by the user. + // The memory range [getMemory(), getMemory() + getSize()) is usable. + uint32_t getSize() const noexcept; + + // Convenience method for debug purposes. + std::string toString() const; + + protected: + // The key of a chained allocation is the address of its parent item + // + // @param ptr Compressed ptr to parent item + // @param allocSize This is the size of the entire allocation for + // constructing this item + // @param creationTime Timestamp when this item was created + CacheChainedItem(CompressedPtr key, uint32_t size, uint32_t creationTime); + + // reset the key of the ChainedItem. For regular Items, we dont allow doing + // this. However for chained items since the parent is the key, we need to + // allow this for transferring the ownership from one parent to another. 
+ // + // @throw std::invalid_argument if the chained item is still in accessible + // state. + void changeKey(CompressedPtr newKey); + + // Append chain to this item. The new chain can contain one or more items + // but this item to which the new chain is being appended must be a single + // item, or the last item of an existing chain + // + // @throw std::invalid_argument if this item is already part of a chain + // and is not the last item + void appendChain(ChainedItem& newChain, const PtrCompressor& compressor); + + // get the next in the chain for this chained item. + ChainedItem* getNext(const PtrCompressor& compressor) const noexcept; + + // set the next in chain for this chained Item + void setNext(const ChainedItem* next, + const PtrCompressor& compressor) noexcept; + + Payload& getPayload(); + const Payload& getPayload() const; + + friend Payload; + friend CacheAllocator; + template + friend class CacheChainedAllocs; + template + friend class CacheChainedItemIterator; + friend NvmCache>; + template + friend class facebook::cachelib::tests::BaseAllocatorTest; + FRIEND_TEST(ItemTest, ChainedItemConstruction); + FRIEND_TEST(ItemTest, ToString); + FRIEND_TEST(ItemTest, ChangeKey); +}; +} // namespace cachelib +} // namespace facebook + +#include "cachelib/allocator/CacheItem-inl.h" diff --git a/bdm/allocator/CacheStats.cpp b/bdm/allocator/CacheStats.cpp new file mode 100644 index 0000000000..5ce7ad9c92 --- /dev/null +++ b/bdm/allocator/CacheStats.cpp @@ -0,0 +1,340 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cachelib/allocator/CacheStats.h" + +#include "cachelib/allocator/CacheStatsInternal.h" + +namespace facebook { +namespace cachelib { +namespace detail { + +void Stats::init() { + cacheHits = std::make_unique(); + allocAttempts = std::make_unique(); + evictionAttempts = std::make_unique(); + fragmentationSize = std::make_unique(); + allocFailures = std::make_unique(); + chainedItemEvictions = std::make_unique(); + regularItemEvictions = std::make_unique(); + auto initToZero = [](auto& a) { + for (auto& s : a) { + for (auto& c : s) { + c.set(0); + } + } + }; + + initToZero(*allocAttempts); + initToZero(*evictionAttempts); + initToZero(*allocFailures); + initToZero(*fragmentationSize); + initToZero(*chainedItemEvictions); + initToZero(*regularItemEvictions); + + classAllocLatency = std::make_unique(); +} + +template +struct SizeVerify {}; + +void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { +#ifndef SKIP_SIZE_VERIFY + SizeVerify a = SizeVerify<16176>{}; + std::ignore = a; +#endif + ret.numCacheGets = numCacheGets.get(); + ret.numCacheGetMiss = numCacheGetMiss.get(); + ret.numCacheGetExpiries = numCacheGetExpiries.get(); + ret.numCacheRemoves = numCacheRemoves.get(); + ret.numCacheRemoveRamHits = numCacheRemoveRamHits.get(); + ret.numRamDestructorCalls = numRamDestructorCalls.get(); + ret.numDestructorExceptions = numDestructorExceptions.get(); + + ret.numNvmGets = numNvmGets.get(); + ret.numNvmGetMiss = numNvmGetMiss.get(); + ret.numNvmGetMissFast = numNvmGetMissFast.get(); + ret.numNvmGetMissExpired = numNvmGetMissExpired.get(); + ret.numNvmGetMissDueToInflightRemove = numNvmGetMissDueToInflightRemove.get(); + ret.numNvmGetMissErrs = numNvmGetMissErrs.get(); + ret.numNvmGetCoalesced = numNvmGetCoalesced.get(); + ret.numNvmPuts = numNvmPuts.get(); + ret.numNvmDeletes = numNvmDeletes.get(); + ret.numNvmSkippedDeletes = numNvmSkippedDeletes.get(); + ret.numNvmPutErrs = numNvmPutErrs.get(); + ret.numNvmPutEncodeFailure = numNvmPutEncodeFailure.get(); + ret.numNvmAbortedPutOnTombstone += numNvmAbortedPutOnTombstone.get(); + ret.numNvmCompactionFiltered += numNvmCompactionFiltered.get(); + ret.numNvmAbortedPutOnInflightGet = numNvmAbortedPutOnInflightGet.get(); + ret.numNvmCleanEvict = numNvmCleanEvict.get(); + ret.numNvmCleanDoubleEvict = numNvmCleanDoubleEvict.get(); + ret.numNvmDestructorCalls = numNvmDestructorCalls.get(); + ret.numNvmDestructorRefcountOverflow = numNvmDestructorRefcountOverflow.get(); + ret.numNvmExpiredEvict = numNvmExpiredEvict.get(); + ret.numNvmPutFromClean = numNvmPutFromClean.get(); + ret.numNvmEvictions = numNvmEvictions.get(); + + ret.numNvmEncryptionErrors = numNvmEncryptionErrors.get(); + ret.numNvmDecryptionErrors = numNvmDecryptionErrors.get(); + + ret.numNvmRejectsByExpiry = numNvmRejectsByExpiry.get(); + ret.numNvmRejectsByClean = numNvmRejectsByClean.get(); + ret.numNvmRejectsByAP = numNvmRejectsByAP.get(); + + ret.numChainedParentItems = numChainedParentItems.get(); + ret.numChainedChildItems = numChainedChildItems.get(); + ret.numNvmAllocAttempts = numNvmAllocAttempts.get(); + ret.numNvmAllocForItemDestructor = numNvmAllocForItemDestructor.get(); + ret.numNvmItemDestructorAllocErrors = numNvmItemDestructorAllocErrors.get(); + + ret.allocateLatencyNs = this->allocateLatency_.estimate(); + ret.moveChainedLatencyNs = this->moveChainedLatency_.estimate(); + ret.moveRegularLatencyNs = this->moveRegularLatency_.estimate(); + ret.nvmLookupLatencyNs = this->nvmLookupLatency_.estimate(); + ret.nvmInsertLatencyNs = 
this->nvmInsertLatency_.estimate(); + ret.nvmRemoveLatencyNs = this->nvmRemoveLatency_.estimate(); + ret.ramEvictionAgeSecs = this->ramEvictionAgeSecs_.estimate(); + ret.ramItemLifeTimeSecs = this->ramItemLifeTimeSecs_.estimate(); + ret.nvmSmallLifetimeSecs = this->nvmSmallLifetimeSecs_.estimate(); + ret.nvmLargeLifetimeSecs = this->nvmLargeLifetimeSecs_.estimate(); + ret.nvmEvictionSecondsPastExpiry = + this->nvmEvictionSecondsPastExpiry_.estimate(); + ret.nvmEvictionSecondsToExpiry = this->nvmEvictionSecondsToExpiry_.estimate(); + ret.nvmPutSize = this->nvmPutSize_.estimate(); + + auto accum = [](const PerPoolClassAtomicCounters& c) { + uint64_t sum = 0; + for (const auto& x : c) { + for (const auto& v : x) { + sum += v.get(); + } + } + return sum; + }; + ret.allocAttempts = accum(*allocAttempts); + ret.evictionAttempts = accum(*evictionAttempts); + ret.allocFailures = accum(*allocFailures); + ret.numEvictions = accum(*chainedItemEvictions); + ret.numEvictions += accum(*regularItemEvictions); + + ret.invalidAllocs = invalidAllocs.get(); + ret.numRefcountOverflow = numRefcountOverflow.get(); + + ret.numEvictionFailureFromAccessContainer = evictFailAC.get(); + ret.numEvictionFailureFromConcurrentFill = evictFailConcurrentFill.get(); + ret.numEvictionFailureFromParentAccessContainer = evictFailParentAC.get(); + ret.numEvictionFailureFromMoving = evictFailMove.get(); + ret.numEvictionFailureFromParentMoving = evictFailParentMove.get(); + ret.numAbortedSlabReleases = numAbortedSlabReleases.get(); + ret.numSkippedSlabReleases = numSkippedSlabReleases.get(); +} + +} // namespace detail + +PoolStats& PoolStats::operator+=(const PoolStats& other) { + auto verify = [](bool isCompatible) { + if (!isCompatible) { + throw std::invalid_argument( + "attempting to aggregate incompatible pool stats"); + } + }; + + XDCHECK_EQ(cacheStats.size(), mpStats.acStats.size()); + + verify(cacheStats.size() == other.cacheStats.size()); + verify(mpStats.acStats.size() == other.mpStats.acStats.size()); + verify(getClassIds() == other.getClassIds()); + + // aggregate mp stats + { + auto& d = mpStats; + const auto& s = other.mpStats; + d.freeSlabs += s.freeSlabs; + d.slabsUnAllocated += s.slabsUnAllocated; + d.numSlabResize += s.numSlabResize; + d.numSlabRebalance += s.numSlabRebalance; + d.numSlabAdvise += s.numSlabAdvise; + } + + for (const ClassId i : other.getClassIds()) { + verify(cacheStats.at(i).allocSize == other.cacheStats.at(i).allocSize); + + // aggregate CacheStat stats + { + auto& d = cacheStats.at(i); + const auto& s = other.cacheStats.at(i); + d.allocAttempts += s.allocAttempts; + d.evictionAttempts += s.evictionAttempts; + d.allocFailures += s.allocFailures; + d.fragmentationSize += s.fragmentationSize; + d.numHits += s.numHits; + d.chainedItemEvictions += s.chainedItemEvictions; + d.regularItemEvictions += s.regularItemEvictions; + } + + // aggregate container stats within CacheStat + { + auto& d = cacheStats.at(i).containerStat; + const auto& s = other.cacheStats.at(i).containerStat; + d.size += s.size; + + if (d.oldestTimeSec < s.oldestTimeSec) { + d.oldestTimeSec = s.oldestTimeSec; + } + + d.numHotAccesses += s.numHotAccesses; + d.numColdAccesses += s.numColdAccesses; + d.numWarmAccesses += s.numWarmAccesses; + } + + // aggregate ac stats + { + auto& d = mpStats.acStats.at(i); + const auto& s = other.mpStats.acStats.at(i); + // allocsPerSlab is fixed for each allocation class, and thus + // there is no need to aggregate it + /* d.allocsPerSlab */ + d.usedSlabs += s.usedSlabs; + d.freeSlabs += 
s.freeSlabs; + d.freeAllocs += s.freeAllocs; + d.activeAllocs += s.activeAllocs; + d.full = d.full && s.full ? true : false; + } + } + + // aggregate rest of PoolStats + numPoolGetHits += other.numPoolGetHits; + return *this; +} + +uint64_t PoolStats::numFreeAllocs() const noexcept { + return mpStats.numFreeAllocs(); +} + +size_t PoolStats::freeMemoryBytes() const noexcept { + return mpStats.freeMemory(); +} + +uint64_t PoolStats::numEvictions() const noexcept { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.numEvictions(); + } + return n; +} + +uint64_t PoolStats::numItems() const noexcept { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.numItems(); + } + return n; +} + +uint64_t PoolStats::numAllocFailures() const { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.allocFailures; + } + return n; +} + +uint64_t PoolStats::numAllocAttempts() const { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.allocAttempts; + } + return n; +} + +uint64_t PoolStats::numEvictionAttempts() const { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.evictionAttempts; + } + return n; +} + +uint64_t PoolStats::totalFragmentation() const { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.fragmentationSize; + } + return n; +} + +uint64_t PoolStats::numActiveAllocs() const noexcept { + uint64_t n = 0; + for (const auto& ac : mpStats.acStats) { + n += ac.second.activeAllocs; + } + return n; +} + +uint64_t PoolStats::minEvictionAge() const { + if (isCompactCache) { + return 0; + } + + XDCHECK_GT(cacheStats.size(), 0u); + + // treat 0 eviction age as higher values so that we filter that out for min. + // 0 eviction age usually means we dont have anything in that classId. a is + // min than b only when a is not 0 and is less than b or if b is 0. all + // other cases, a < b is false. + return std::min_element(cacheStats.begin(), cacheStats.end(), + [](auto& a, auto& b) { + uint64_t aVal = a.second.getEvictionAge(); + uint64_t bVal = b.second.getEvictionAge(); + return (aVal != 0 && aVal < bVal) || bVal == 0; + }) + ->second.getEvictionAge(); +} + +uint64_t PoolStats::maxEvictionAge() const { + if (isCompactCache) { + return 0; + } + + XDCHECK_GT(cacheStats.size(), 0u); + return std::max_element(cacheStats.begin(), cacheStats.end(), + [](auto& a, auto& b) { + return a.second.getEvictionAge() < + b.second.getEvictionAge(); + }) + ->second.getEvictionAge(); +} + +namespace { +double hitRatioCalc(uint64_t ops, uint64_t miss) { + return miss == 0 || ops == 0 ? 100.0 + : 100.0 - 100.0 * static_cast(miss) / + static_cast(ops); +} +} // namespace + +uint64_t PoolStats::numEvictableItems() const noexcept { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.numEvictableItems(); + } + return n; +} + +double CCacheStats::hitRatio() const { return hitRatioCalc(get, getMiss); } + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheStats.h b/bdm/allocator/CacheStats.h new file mode 100644 index 0000000000..2913fa5c60 --- /dev/null +++ b/bdm/allocator/CacheStats.h @@ -0,0 +1,734 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "cachelib/allocator/Util.h" +#include "cachelib/allocator/memory/MemoryAllocator.h" +#include "cachelib/allocator/memory/MemoryAllocatorStats.h" +#include "cachelib/allocator/memory/Slab.h" +#include "cachelib/common/FastStats.h" +#include "cachelib/common/PercentileStats.h" +#include "cachelib/common/RollingStats.h" +#include "cachelib/common/Time.h" + +namespace facebook { +namespace cachelib { + +// stats class for a single eviction queue +struct EvictionStatPerType { + // the age of the oldest element in seconds + uint64_t oldestElementAge = 0ULL; + + // number of elements in the eviction queue + uint64_t size = 0ULL; + + // the estimated age after removing a slab worth of elements + uint64_t projectedAge = 0ULL; +}; + +// stats class for one MM container (a.k.a one allocation class) related to +// evictions +struct EvictionAgeStat { + EvictionStatPerType warmQueueStat; + + EvictionStatPerType hotQueueStat; + + EvictionStatPerType coldQueueStat; +}; + +// stats related to evictions for a pool +struct PoolEvictionAgeStats { + // Map from allocation class id to the eviction age stats + std::unordered_map classEvictionAgeStats; + + uint64_t getOldestElementAge(ClassId cid) const { + return classEvictionAgeStats.at(cid).warmQueueStat.oldestElementAge; + } + + const EvictionStatPerType& getWarmEvictionStat(ClassId cid) const { + return classEvictionAgeStats.at(cid).warmQueueStat; + } + + const EvictionStatPerType& getHotEvictionStat(ClassId cid) const { + return classEvictionAgeStats.at(cid).hotQueueStat; + } + + const EvictionStatPerType& getColdEvictionStat(ClassId cid) const { + return classEvictionAgeStats.at(cid).coldQueueStat; + } +}; + +// Stats for MM container +struct MMContainerStat { + // number of elements in the container. + size_t size; + + // what is the unix timestamp in seconds of the oldest element existing in + // the container. + uint64_t oldestTimeSec; + + // refresh time for LRU + uint64_t lruRefreshTime; + + // TODO: Make the MMContainerStat generic by moving the Lru/2Q specific + // stats inside MMType and exporting them through a generic stats interface. + // number of hits in each lru. + uint64_t numHotAccesses; + uint64_t numColdAccesses; + uint64_t numWarmAccesses; + uint64_t numTailAccesses; +}; + +struct AllocationClassBaseStat { + // size of allocation class + size_t allocSize{0}; + + // size of memory assigned to this allocation class + size_t memorySize{0}; + + // percent of free memory in this class + double approxFreePercent{0.0}; + + // Rolling allocation latency (in ns) + util::RollingStats allocLatencyNs; +}; + +// cache related stats for a given allocation class. +struct CacheStat { + // allocation size for this container. + uint32_t allocSize; + + // number of attempts to allocate memory + uint64_t allocAttempts{0}; + + // number of eviction attempts + uint64_t evictionAttempts{0}; + + // number of failed attempts + uint64_t allocFailures{0}; + + // total fragmented memory size in bytes + uint64_t fragmentationSize{0}; + + // number of hits for this container. 
+ uint64_t numHits; + + // number of evictions from this class id that was of a chained item + uint64_t chainedItemEvictions; + + // number of regular items that were evicted from this classId + uint64_t regularItemEvictions; + + // the stats from the mm container + MMContainerStat containerStat; + + uint64_t numItems() const noexcept { return numEvictableItems(); } + + // number of elements in this MMContainer + size_t numEvictableItems() const noexcept { return containerStat.size; } + + // total number of evictions. + uint64_t numEvictions() const noexcept { + return chainedItemEvictions + regularItemEvictions; + } + + // the current oldest item in the container in seconds. + uint64_t getEvictionAge() const noexcept { + return containerStat.oldestTimeSec != 0 + ? util::getCurrentTimeSec() - containerStat.oldestTimeSec + : 0; + } +}; + +// Stats for a pool +struct PoolStats { + // pool name given by users of this pool. + std::string poolName; + + // true if the pool is a compact cache pool. + bool isCompactCache; + + // total pool size assigned by users when adding pool. + uint64_t poolSize; + + // total size of the pool that is actively usable, taking advising into + // account + uint64_t poolUsableSize; + + // total size of the pool that is set to be advised away. + uint64_t poolAdvisedSize; + + // container stats that provide evictions etc. + std::unordered_map cacheStats; + + // stats from the memory allocator perspective. this is a map of MPStat + // for each allocation class that this pool has. + MPStats mpStats; + + // number of get hits for this pool. + uint64_t numPoolGetHits; + + // estimates for eviction age for items in this pool + util::PercentileStats::Estimates evictionAgeSecs{}; + + const std::set& getClassIds() const noexcept { + return mpStats.classIds; + } + + // number of attempts to allocate + uint64_t numAllocAttempts() const; + + // number of attempts to evict + uint64_t numEvictionAttempts() const; + + // number of attempts that failed + uint64_t numAllocFailures() const; + + // toal memory fragmentation size of this pool. + uint64_t totalFragmentation() const; + + // total number of free allocs for this pool + uint64_t numFreeAllocs() const noexcept; + + // amount of cache memory that is not allocated. + size_t freeMemoryBytes() const noexcept; + + // number of evictions for this pool + uint64_t numEvictions() const noexcept; + + // number of all items in this pool + uint64_t numItems() const noexcept; + + // number of evictable items + uint64_t numEvictableItems() const noexcept; + + // total number of allocations currently in this pool + uint64_t numActiveAllocs() const noexcept; + + // number of hits for an alloc class in this pool + uint64_t numHitsForClass(ClassId cid) const { + return cacheStats.at(cid).numHits; + } + + // number of slabs in this class id + uint64_t numSlabsForClass(ClassId cid) const { + return mpStats.acStats.at(cid).totalSlabs(); + } + + // alloc size corresponding to the class id + uint32_t allocSizeForClass(ClassId cid) const { + return cacheStats.at(cid).allocSize; + } + + // mm container eviction age for the class + uint64_t evictionAgeForClass(ClassId cid) const { + return cacheStats.at(cid).getEvictionAge(); + } + + // total free allocs for the class + uint64_t numFreeAllocsForClass(ClassId cid) const { + return mpStats.acStats.at(cid).freeAllocs; + } + + // This is the real eviction age of this pool as this number + // guarantees the time any item inserted into this pool will live + // ignores the classIds that are not used. 
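+  // For example (illustrative): with per-class eviction ages of {0, 120, 300}
+  // seconds, the empty class reporting 0 is ignored and minEvictionAge()
+  // returns 120, i.e. the weakest guarantee among classes that hold items.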
+  uint64_t minEvictionAge() const;
+
+  // computes the maximum eviction age across all class Ids
+  uint64_t maxEvictionAge() const;
+
+  // aggregates this pool's stats with another compatible one. To be
+  // compatible, they need to have the same number of classIds
+  //
+  // throws when the operation is not compatible.
+  PoolStats& operator+=(const PoolStats& other);
+};
+
+// Stats for slab release events
+struct SlabReleaseStats {
+  uint64_t numActiveSlabReleases;
+  uint64_t numSlabReleaseForRebalance;
+  uint64_t numSlabReleaseForResize;
+  uint64_t numSlabReleaseForAdvise;
+  uint64_t numSlabReleaseForRebalanceAttempts;
+  uint64_t numSlabReleaseForResizeAttempts;
+  uint64_t numSlabReleaseForAdviseAttempts;
+  uint64_t numMoveAttempts;
+  uint64_t numMoveSuccesses;
+  uint64_t numEvictionAttempts;
+  uint64_t numEvictionSuccesses;
+  uint64_t numSlabReleaseStuck;
+};
+
+// Stats for reaper
+struct ReaperStats {
+  // the total number of items the reaper has visited.
+  uint64_t numVisitedItems{0};
+
+  // the number of items reaped.
+  uint64_t numReapedItems{0};
+
+  // the number of visit errors encountered.
+  uint64_t numVisitErrs{0};
+
+  // number of times we went through the whole cache
+  uint64_t numTraversals{0};
+
+  // indicates the time in ms for the last iteration across the entire cache
+  uint64_t lastTraversalTimeMs{0};
+
+  // indicates the minimum of all traversal times in ms
+  uint64_t minTraversalTimeMs{0};
+
+  // indicates the maximum of all traversal times in ms
+  uint64_t maxTraversalTimeMs{0};
+
+  // indicates the average of all traversal times in ms
+  uint64_t avgTraversalTimeMs{0};
+};
+
+struct BackgroundStrategyStats {
+  std::map > highEvictionAcWatermarks;
+  std::map > acLatencies;
+
+  BackgroundStrategyStats& operator+=(const BackgroundStrategyStats& rhs) {
+    for (const auto& entry : rhs.highEvictionAcWatermarks) {
+      highEvictionAcWatermarks[entry.first] = entry.second;
+    }
+
+    for (const auto& entry : rhs.acLatencies) {
+      acLatencies[entry.first] = entry.second;
+    }
+    return *this;
+  }
+};
+
+// Mover Stats
+struct BackgroundMoverStats {
+  // the number of items this worker moved by looking at pools/classes stats
+  uint64_t numMovedItems{0};
+
+  // number of times this background worker has run
+  uint64_t runCount{0};
+
+  // total number of classes
+  uint64_t totalClasses{0};
+
+  // total number of bytes moved
+  uint64_t totalBytesMoved{0};
+
+  BackgroundStrategyStats strategyStats;
+
+  BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) {
+    numMovedItems += rhs.numMovedItems;
+    runCount += rhs.runCount;
+    totalClasses += rhs.totalClasses;
+    totalBytesMoved += rhs.totalBytesMoved;
+    strategyStats += rhs.strategyStats;
+    return *this;
+  }
+};
+
+struct BackgroundPromotionStats {
+  // the number of items this worker promoted by looking at pools/classes stats
+  uint64_t numPromotedItems{0};
+
+  // number of times this background worker has run
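A small sketch of how the mover stats above are typically consumed: one BackgroundMoverStats snapshot per background worker, folded together with the operator+= just defined. The collectMoverTotals helper is hypothetical.

    // Hypothetical aggregation of per-worker mover stats via operator+=.
    #include <vector>

    #include "cachelib/allocator/CacheStats.h"

    using facebook::cachelib::BackgroundMoverStats;

    BackgroundMoverStats collectMoverTotals(
        const std::vector<BackgroundMoverStats>& perWorker) {
      BackgroundMoverStats total;
      for (const auto& s : perWorker) {
        total += s;  // sums items, runs, classes, bytes; merges strategy stats
      }
      return total;
    }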
+ uint64_t runCount{0}; + + BackgroundPromotionStats& operator+=(const BackgroundPromotionStats& rhs) { + numPromotedItems += rhs.numPromotedItems; + runCount += rhs.runCount; + return *this; + } +}; + +// CacheMetadata type to export +struct CacheMetadata { + // allocator_version + int allocatorVersion; + // ram_format_version + int ramFormatVersion; + // nvm_format_version + int nvmFormatVersion; + // cache_total_size + size_t cacheSize; +}; + +// forward declaration +namespace detail { +struct Stats; +} + +// Stats that apply globally in cache and +// the ones that are aggregated over all pools +struct GlobalCacheStats { + // background eviction stats + BackgroundMoverStats evictionStats; + + BackgroundMoverStats promotionStats; + + // number of calls to CacheAllocator::find + uint64_t numCacheGets{0}; + + // number of such calls being a miss in the cache. + uint64_t numCacheGetMiss{0}; + + // number of such calls being an expiry in the cache. This is also included + // in the numCacheGetMiss stats above. + uint64_t numCacheGetExpiries{0}; + + // number of remove calls to CacheAllocator::remove that requires + // a lookup first and then remove the item + uint64_t numCacheRemoves{0}; + + // number of remove calls that resulted in a ram hit + uint64_t numCacheRemoveRamHits{0}; + + // number of item destructor calls from ram + uint64_t numRamDestructorCalls{0}; + + // number of nvm gets + uint64_t numNvmGets{0}; + + // number of nvm misses + uint64_t numNvmGetMiss{0}; + + // number of nvm isses due to internal errors + uint64_t numNvmGetMissErrs{0}; + + // number of nvm misses due to inflight remove on the same key + uint64_t numNvmGetMissDueToInflightRemove{0}; + + // number of nvm misses that happened synchronously + uint64_t numNvmGetMissFast{0}; + + // number of nvm gets that are expired + uint64_t numNvmGetMissExpired{0}; + + // number of gets that joined a concurrent fill for same item + uint64_t numNvmGetCoalesced{0}; + + // number of deletes issues to nvm + uint64_t numNvmDeletes{0}; + + // number of deletes skipped and not issued to nvm + uint64_t numNvmSkippedDeletes{0}; + + // number of writes to nvm + uint64_t numNvmPuts{0}; + + // number of put errors; + uint64_t numNvmPutErrs{0}; + + // number of put failures due to encode call back + uint64_t numNvmPutEncodeFailure{0}; + + // number of puts that observed an inflight delete and aborted + uint64_t numNvmAbortedPutOnTombstone{0}; + + // number of items that are filtered by compaction + uint64_t numNvmCompactionFiltered{0}; + + // number of puts that observed an inflight get and aborted + uint64_t numNvmAbortedPutOnInflightGet{0}; + + // number of evictions from NvmCache + uint64_t numNvmEvictions{0}; + + // number of evictions from nvm that found an inconsistent state in RAM + uint64_t numNvmUncleanEvict{0}; + + // number of evictions that were issued for an item that was in RAM in clean + // state + uint64_t numNvmCleanEvict{0}; + + // number of evictions that were issued more than once on an unclean item. + uint64_t numNvmCleanDoubleEvict{0}; + + // number of evictions that were already expired + uint64_t numNvmExpiredEvict{0}; + + // number of item destructor calls from nvm + uint64_t numNvmDestructorCalls{0}; + + // number of RefcountOverflow happens causing item destructor + // being skipped in nvm + uint64_t numNvmDestructorRefcountOverflow{0}; + + // number of puts to nvm of a clean item in RAM due to nvm eviction. 
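The counters above are cumulative, so rates have to be derived by the caller. A minimal sketch follows; the two helpers are hypothetical and not part of the patch.

    // Hypothetical helpers deriving hit rates from GlobalCacheStats counters.
    #include "cachelib/allocator/CacheStats.h"

    using facebook::cachelib::GlobalCacheStats;

    double ramHitRate(const GlobalCacheStats& s) {
      return s.numCacheGets == 0
                 ? 0.0
                 : 1.0 - static_cast<double>(s.numCacheGetMiss) / s.numCacheGets;
    }

    double nvmHitRate(const GlobalCacheStats& s) {
      return s.numNvmGets == 0
                 ? 0.0
                 : 1.0 - static_cast<double>(s.numNvmGetMiss) / s.numNvmGets;
    }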
+ uint64_t numNvmPutFromClean{0}; + + // attempts made from nvm cache to allocate an item for promotion + uint64_t numNvmAllocAttempts{0}; + + // attempts made from nvm cache to allocate an item for its destructor + uint64_t numNvmAllocForItemDestructor{0}; + // heap allocate errors for item destrutor + uint64_t numNvmItemDestructorAllocErrors{0}; + + // size of itemRemoved_ hash set in nvm + uint64_t numNvmItemRemovedSetSize{0}; + + // number of attempts to allocate an item + uint64_t allocAttempts{0}; + + // number of eviction attempts + uint64_t evictionAttempts{0}; + + // number of failures to allocate an item due to internal error + uint64_t allocFailures{0}; + + // number of evictions across all the pools in the cache. + uint64_t numEvictions{0}; + + // number of allocation attempts with invalid input params. + uint64_t invalidAllocs{0}; + + // total number of items + uint64_t numItems{0}; + + // number of refcount overflows + uint64_t numRefcountOverflow{0}; + + // number of exception occurred inside item destructor + uint64_t numDestructorExceptions{0}; + + // number of allocated and CHAINED items that are parents (i.e., + // consisting of at least one chained child) + uint64_t numChainedChildItems{0}; + + // number of allocated and CHAINED items that are children (i.e., + // allocated with a parent handle that it's chained to) + uint64_t numChainedParentItems{0}; + + // number of eviction failures + uint64_t numEvictionFailureFromAccessContainer{0}; + uint64_t numEvictionFailureFromConcurrentFill{0}; + uint64_t numEvictionFailureFromParentAccessContainer{0}; + uint64_t numEvictionFailureFromMoving{0}; + uint64_t numEvictionFailureFromParentMoving{0}; + + // latency and percentile stats of various cachelib operations + util::PercentileStats::Estimates allocateLatencyNs{}; + util::PercentileStats::Estimates moveChainedLatencyNs{}; + util::PercentileStats::Estimates moveRegularLatencyNs{}; + util::PercentileStats::Estimates nvmLookupLatencyNs{}; + util::PercentileStats::Estimates nvmInsertLatencyNs{}; + util::PercentileStats::Estimates nvmRemoveLatencyNs{}; + util::PercentileStats::Estimates ramEvictionAgeSecs{}; + util::PercentileStats::Estimates ramItemLifeTimeSecs{}; + util::PercentileStats::Estimates nvmSmallLifetimeSecs{}; + util::PercentileStats::Estimates nvmLargeLifetimeSecs{}; + util::PercentileStats::Estimates nvmEvictionSecondsPastExpiry{}; + util::PercentileStats::Estimates nvmEvictionSecondsToExpiry{}; + util::PercentileStats::Estimates nvmPutSize{}; + + // time when CacheAllocator structure is created. Whenever a process restarts + // and even if cache content is persisted, this will be reset. It's similar + // to process uptime. 
(But alternatively if user explicitly shuts down and + // re-attach cache, this will be reset as well) + uint64_t cacheInstanceUpTime{0}; + + // time since the ram cache was created in seconds + uint64_t ramUpTime{0}; + + // time since the nvm cache was created in seconds + uint64_t nvmUpTime{0}; + + // If true, it means ram cache is brand new, or it was not restored from a + // previous cache instance + bool isNewRamCache{false}; + + // If true, it means nvm cache is brand new, or it was not restored from a + // previous cache instance + bool isNewNvmCache{false}; + + // if nvmcache is currently active and serving gets + bool nvmCacheEnabled; + + // stats related to the reaper + ReaperStats reaperStats; + + uint64_t numNvmRejectsByExpiry{}; + uint64_t numNvmRejectsByClean{}; + uint64_t numNvmRejectsByAP{}; + + // Decryption and Encryption errors + uint64_t numNvmEncryptionErrors{0}; + uint64_t numNvmDecryptionErrors{0}; + + // Number of times slab release was aborted due to shutdown + uint64_t numAbortedSlabReleases{0}; + + // Number of times slab was skipped when reaper runs + uint64_t numSkippedSlabReleases{0}; + + // current active handles outstanding. This stat should + // not go to negative. If it's negative, it means we have + // leaked handles (or some sort of accounting bug internally) + int64_t numActiveHandles; +}; + +struct CacheMemoryStats { + // current memory used for cache in bytes. This excludes the memory used for + // headers. This can change as memory is advised and reclaimed. + size_t cacheSize{0}; + + // regular pool memory size in bytes + size_t regularCacheSize{0}; + + // compact cache pool memory size in bytes + size_t compactCacheSize{0}; + + // current advised away memory size in bytes. + size_t advisedSize{0}; + + // maximum advised pct of regular cache. + size_t maxAdvisedPct{0}; + + // amount of memory that is not assigned for any pool in bytes + size_t unReservedSize{0}; + + // size of the nvm cache in addition to the ram cache. + size_t nvmCacheSize{0}; + + // returns the advised memory in the unit of slabs. 
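A sketch of how the advising-related fields combine; it anticipates the numAdvisedSlabs()/usableCacheSize() helpers defined just below, and printAdvisedSummary itself is hypothetical.

    // Hypothetical summary of the memory-advising state in CacheMemoryStats.
    #include <cstdio>

    #include "cachelib/allocator/CacheStats.h"

    using facebook::cachelib::CacheMemoryStats;

    void printAdvisedSummary(const CacheMemoryStats& m) {
      const double advisedPct =
          m.cacheSize == 0 ? 0.0 : 100.0 * m.advisedSize / m.cacheSize;
      std::printf("usable=%zu bytes, advised=%zu bytes (%.1f%%, cap %zu%%)\n",
                  m.cacheSize - m.advisedSize, m.advisedSize, advisedPct,
                  m.maxAdvisedPct);
    }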
+ size_t numAdvisedSlabs() const { return advisedSize / Slab::kSize; } + + // returne usable portion of the cache size + size_t usableCacheSize() const { return cacheSize - advisedSize; } + + // amount of memory available on the host + size_t memAvailableSize{0}; + + // rss size of the process + size_t memRssSize{0}; + + // percentage of free slabs + std::vector slabsApproxFreePercentages{0.0}; +}; + +// Stats for compact cache +struct CCacheStats { + uint64_t get; + uint64_t getHit; + uint64_t getMiss; + uint64_t getErr; + uint64_t tailHits; + + uint64_t set; + uint64_t setHit; + uint64_t setMiss; + uint64_t setErr; + uint64_t evictions; + + uint64_t del; + uint64_t delHit; + uint64_t delMiss; + uint64_t delErr; + + uint64_t purgeSuccess; + uint64_t purgeErr; + uint64_t lockTimeout; + uint64_t promoteTimeout; + + double hitRatio() const; + + CCacheStats& operator+=(const CCacheStats& other) { + get += other.get; + getHit += other.getHit; + getMiss += other.getMiss; + getErr += other.getErr; + tailHits += other.tailHits; + + set += other.set; + setHit += other.setHit; + setMiss += other.setMiss; + setErr += other.setErr; + evictions += other.evictions; + + del += other.del; + delHit += other.delHit; + delMiss += other.delMiss; + delErr += other.delErr; + + purgeSuccess += other.purgeSuccess; + purgeErr += other.purgeErr; + lockTimeout += other.lockTimeout; + promoteTimeout += other.promoteTimeout; + + return *this; + } +}; + +// Types of background workers +enum PoolWorkerType { + POOL_REBALANCER = 0, + POOL_RESIZER, + MEMORY_MONITOR, + MAX_POOL_WORKER +}; + +/* Slab release event data */ +struct SlabReleaseData { + // Time when release occured. + std::chrono::system_clock::time_point timeOfRelease; + // The class where the slab was released from. + ClassId from; + // The receiver of the released slab. + ClassId to; + // The sequence of this event, with respect to other release events logged by + // this process. + uint64_t sequenceNum; + // Time release took. + uint64_t durationMs; + // PoolId of the pool where the rebalance occurred. + PoolId pid; + // Number of slabs in the victim class after rebalancing. + unsigned int numSlabsInVictim; + // Number of slabs in the receiver class after rebalancing. + unsigned int numSlabsInReceiver; + // Allocation size of the victim class. + uint32_t victimAllocSize; + // Allocation size of the receiver class. + uint32_t receiverAllocSize; + // Eviction age of the victim class. + uint64_t victimEvictionAge; + // Eviction age of the receiver class. + uint64_t receiverEvictionAge; + // Number of free allocs in the victim class + uint64_t numFreeAllocsInVictim; +}; + +using SlabReleaseEvents = std::vector; + +// Slab release events organized by their type +struct AllSlabReleaseEvents { + SlabReleaseEvents rebalancerEvents; + SlabReleaseEvents resizerEvents; + SlabReleaseEvents monitorEvents; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheStatsInternal.h b/bdm/allocator/CacheStatsInternal.h new file mode 100644 index 0000000000..50a70c2c22 --- /dev/null +++ b/bdm/allocator/CacheStatsInternal.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
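Since CCacheStats above only exposes raw counters plus operator+=, per-pool snapshots are usually folded into a single total before computing ratios. A hedged sketch; sumCompactCacheStats is hypothetical.

    // Hypothetical aggregation of compact-cache stats using operator+= above.
    #include <vector>

    #include "cachelib/allocator/CacheStats.h"

    using facebook::cachelib::CCacheStats;

    CCacheStats sumCompactCacheStats(const std::vector<CCacheStats>& perPool) {
      CCacheStats total{};  // aggregate-initialized to all zeros
      for (const auto& s : perPool) {
        total += s;
      }
      return total;
    }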
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include + +#include "cachelib/allocator/Cache.h" +#include "cachelib/allocator/memory/MemoryAllocator.h" +#include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/RollingStats.h" + +namespace facebook { +namespace cachelib { + +// forward declaration +struct GlobalCacheStats; + +namespace detail { + +// collection of stats that are updated at a high frequency, making it +// necessary to track them as thread local counters that are aggregated. +struct Stats { + // overall hit ratio related stats for the cache. + // number of calls to CacheAllocator::find + TLCounter numCacheGets{0}; + + // number of such calls being a miss in the cache. + TLCounter numCacheGetMiss{0}; + + // number of such calls being an expiry in the cache. This is also included + // in the numCacheGetMiss stats above. + TLCounter numCacheGetExpiries{0}; + + // number of remove calls to CacheAllocator::remove that requires + // a lookup first and then remove the item + TLCounter numCacheRemoves{0}; + + // number of remove calls that resulted in a ram hit + TLCounter numCacheRemoveRamHits{0}; + + // number of item destructor calls from ram + TLCounter numRamDestructorCalls{0}; + + // number of nvm gets + TLCounter numNvmGets{0}; + + // number of nvm get miss that happened synchronously + TLCounter numNvmGetMissFast{0}; + + // number of nvm misses + TLCounter numNvmGetMiss{0}; + + // number of nvm isses due to internal errors + TLCounter numNvmGetMissErrs{0}; + + // number of nvm misses due to inflight remove on the same key + TLCounter numNvmGetMissDueToInflightRemove{0}; + + // number of nvm gets that are expired + TLCounter numNvmGetMissExpired{0}; + + // number of gets that joined a concurrent fill for same item + AtomicCounter numNvmGetCoalesced{0}; + + // number of deletes issues to nvm + TLCounter numNvmDeletes{0}; + + // number of deletes skipped and not issued to nvm + TLCounter numNvmSkippedDeletes{0}; + + // number of writes to nvm + TLCounter numNvmPuts{0}; + + // number of put errors; + TLCounter numNvmPutErrs{0}; + + // number of put failures due to encode call back + AtomicCounter numNvmPutEncodeFailure{0}; + + // number of puts that observed an inflight delete and aborted + AtomicCounter numNvmAbortedPutOnTombstone{0}; + + // number of puts that observed an inflight concurrent get and aborted + AtomicCounter numNvmAbortedPutOnInflightGet{0}; + + // number of items that are filtered by compaction + AtomicCounter numNvmCompactionFiltered{0}; + + // number of evictions from NvmCache + TLCounter numNvmEvictions{0}; + + // number of evictions from nvm that found an inconsistent state in RAM + AtomicCounter numNvmUncleanEvict{0}; + + // number of evictions that were issued for an item that was in RAM in clean + // state + AtomicCounter numNvmCleanEvict{0}; + + // number of evictions that were issued more than once on an unclean item. 
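The split between TLCounter and AtomicCounter above reflects a common trade-off: hot-path counters are cheapest as per-thread values that are only summed when read, while rarely-updated ones can stay plain atomics. The snippet below illustrates that pattern only; it is not how cachelib's TLCounter in common/AtomicCounter.h is implemented.

    // Illustrative thread-local counter: writes touch only the caller's slot,
    // reads sum all registered slots.
    #include <atomic>
    #include <cstdint>
    #include <mutex>
    #include <vector>

    namespace {
    std::mutex g_slotMutex;
    std::vector<const std::atomic<uint64_t>*> g_slots;

    std::atomic<uint64_t>& localSlot() {
      // The slot is leaked on purpose so readers never observe a dangling
      // pointer after a thread exits.
      thread_local std::atomic<uint64_t>* slot = [] {
        auto* s = new std::atomic<uint64_t>(0);
        std::lock_guard<std::mutex> g(g_slotMutex);
        g_slots.push_back(s);
        return s;
      }();
      return *slot;
    }
    } // namespace

    void bumpCounter() { localSlot().fetch_add(1, std::memory_order_relaxed); }

    uint64_t readCounter() {
      std::lock_guard<std::mutex> g(g_slotMutex);
      uint64_t sum = 0;
      for (const auto* s : g_slots) {
        sum += s->load(std::memory_order_relaxed);
      }
      return sum;
    }

The write path stays uncontended, which is consistent with the struct above using TLCounter for counters bumped on every lookup and AtomicCounter for slower paths such as slab release.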
+ AtomicCounter numNvmCleanDoubleEvict{0}; + + // number of evictions that were already expired + AtomicCounter numNvmExpiredEvict{0}; + + // number of item destructor calls from nvm + AtomicCounter numNvmDestructorCalls{0}; + + // number of RefcountOverflow happens causing item destructor + // being skipped in nvm + AtomicCounter numNvmDestructorRefcountOverflow{0}; + + // number of entries that were clean in RAM, but evicted and rewritten to + // nvmcache because the nvmcache version was evicted + AtomicCounter numNvmPutFromClean{0}; + + // Decryption and Encryption errors + AtomicCounter numNvmEncryptionErrors{0}; + AtomicCounter numNvmDecryptionErrors{0}; + + // basic admission policy stats + TLCounter numNvmRejectsByExpiry{0}; + TLCounter numNvmRejectsByClean{0}; + TLCounter numNvmRejectsByAP{0}; + + // attempts made from nvm cache to allocate an item for promotion + TLCounter numNvmAllocAttempts{0}; + + // attempts made from nvm cache to allocate an item for its destructor + TLCounter numNvmAllocForItemDestructor{0}; + // heap allocate errors for item destrutor + TLCounter numNvmItemDestructorAllocErrors{0}; + + // the number of allocated items that are permanent + TLCounter numPermanentItems{0}; + + // the number of allocated and CHAINED items that are parents (i.e., + // consisting of at least one chained child) + TLCounter numChainedParentItems{0}; + + // the number of allocated and CHAINED items that are children (i.e., + // allocated with a parent handle that it's chained to) + TLCounter numChainedChildItems{0}; + + // the numbers for move and evictions in the process of slab release. + AtomicCounter numMoveAttempts{0}; + AtomicCounter numMoveSuccesses{0}; + AtomicCounter numEvictionAttempts{0}; + AtomicCounter numEvictionSuccesses{0}; + + // the number times a refcount overflow occurred, resulting in an exception + // being thrown + AtomicCounter numRefcountOverflow{0}; + + // number of exception occurred inside item destructor + AtomicCounter numDestructorExceptions{0}; + + // The number of slabs being released right now. + // This must be zero when `saveState()` is called. + AtomicCounter numActiveSlabReleases{0}; + // Number of different slab releases. + AtomicCounter numReleasedForRebalance{0}; + AtomicCounter numReleasedForResize{0}; + AtomicCounter numReleasedForAdvise{0}; + AtomicCounter numAbortedSlabReleases{0}; + AtomicCounter numSkippedSlabReleases{0}; + + // Flag indicating the slab release stuck + AtomicCounter numSlabReleaseStuck{0}; + + // allocations with invalid parameters + AtomicCounter invalidAllocs{0}; + + // latency stats of various cachelib operations + mutable util::PercentileStats allocateLatency_; + mutable util::PercentileStats moveChainedLatency_; + mutable util::PercentileStats moveRegularLatency_; + mutable util::PercentileStats nvmLookupLatency_; + mutable util::PercentileStats nvmInsertLatency_; + mutable util::PercentileStats nvmRemoveLatency_; + + // percentile stats for various cache statistics + mutable util::PercentileStats ramEvictionAgeSecs_; + mutable util::PercentileStats ramItemLifeTimeSecs_; + mutable util::PercentileStats nvmSmallLifetimeSecs_; + mutable util::PercentileStats nvmLargeLifetimeSecs_; + mutable util::PercentileStats nvmEvictionSecondsPastExpiry_; + mutable util::PercentileStats nvmEvictionSecondsToExpiry_; + + // per-pool percentile stats for eviction age + std::array + perPoolEvictionAgeSecs_; + + // This tracks in each window what are the percentiles of the sizes of + // items that we have written to flash. 
This is at-the-moment view of what + // we're currently writing into flash. + mutable util::PercentileStats nvmPutSize_; + + using PerPoolClassAtomicCounters = + std::array, + MemoryPoolManager::kMaxPools>; + + // count of a stat for a specific allocation class + using PerPoolClassTLCounters = + std::array, + MemoryPoolManager::kMaxPools>; + + // hit count for every alloc class in every pool + std::unique_ptr cacheHits{}; + std::unique_ptr allocAttempts{}; + std::unique_ptr evictionAttempts{}; + std::unique_ptr allocFailures{}; + std::unique_ptr fragmentationSize{}; + std::unique_ptr chainedItemEvictions{}; + std::unique_ptr regularItemEvictions{}; + + using PerTierPoolClassRollingStats = std::array< + std::array, + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; + + // rolling latency tracking for every alloc class in every pool + std::unique_ptr classAllocLatency{}; + + // Eviction failures due to parent cannot be removed from access container + AtomicCounter evictFailParentAC{0}; + + // Eviction failures due to parent cannot be removed because it's being + // moved + AtomicCounter evictFailParentMove{0}; + + // Eviction failures because this item cannot be removed from access + // container + AtomicCounter evictFailAC{0}; + + // Eviction failures because this item has a potential concurrent fill + // from nvm cache. For consistency reason, we cannot evict it. Refer + // to NvmCache.h for more details. + AtomicCounter evictFailConcurrentFill{0}; + + // Eviction failures because this item is being moved + AtomicCounter evictFailMove{0}; + + void init(); + + void populateGlobalCacheStats(GlobalCacheStats& ret) const; +}; + +} // namespace detail +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheTraits.h b/bdm/allocator/CacheTraits.h new file mode 100644 index 0000000000..0b51686f05 --- /dev/null +++ b/bdm/allocator/CacheTraits.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "cachelib/allocator/ChainedHashTable.h" +#include "cachelib/allocator/MM2Q.h" +#include "cachelib/allocator/MMLru.h" +#include "cachelib/allocator/MMTinyLFU.h" +#include "cachelib/common/Mutex.h" + +namespace facebook { +namespace cachelib { +// The cache traits supported by CacheLib. +// Cache trait is a combination of MMType, AccessType and AccesTypeLock. +// MMType is the type of MM (memory management) container used by the cache, +// which controls a cache item's life time. +// AccessType is the type of access container, which controls how an item is +// accessed. 
+// AccessTypeLock is the lock type for the access container that supports +// multiple locking primitives +struct LruCacheTrait { + using MMType = MMLru; + using AccessType = ChainedHashTable; + using AccessTypeLocks = SharedMutexBuckets; +}; + +struct LruCacheWithSpinBucketsTrait { + using MMType = MMLru; + using AccessType = ChainedHashTable; + using AccessTypeLocks = SpinBuckets; +}; + +struct Lru2QCacheTrait { + using MMType = MM2Q; + using AccessType = ChainedHashTable; + using AccessTypeLocks = SharedMutexBuckets; +}; + +struct TinyLFUCacheTrait { + using MMType = MMTinyLFU; + using AccessType = ChainedHashTable; + using AccessTypeLocks = SharedMutexBuckets; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/CacheVersion.h b/bdm/allocator/CacheVersion.h new file mode 100644 index 0000000000..cd2ca7b2d7 --- /dev/null +++ b/bdm/allocator/CacheVersion.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace facebook { +namespace cachelib { +// If you're bumping kCacheRamFormatVersion or kCacheNvmFormatVersion, +// you *MUST* bump this as well. +// +// If the change is not an incompatible one, but you want to keep track of it, +// then you only need to bump this version. +// I.e. you're rolling out a new feature that is cache compatible with previous +// Cachelib instances. +constexpr uint64_t kCachelibVersion = 17; + +// Updating this version will cause RAM cache to be dropped for all +// cachelib users!!! Proceed with care!! You must coordinate with +// cachelib users either directly or via Cache Library group. +// +// If you're bumping this version, you *MUST* bump kCachelibVersion +// as well. +constexpr uint64_t kCacheRamFormatVersion = 3; + +// Updating this version will cause NVM cache to be dropped for all +// cachelib users!!! Proceed with care!! You must coordinate with +// cachelib users either directly or via Cache Library group. +// +// If you're bumping this version, you *MUST* bump kCachelibVersion +// as well. +constexpr uint64_t kCacheNvmFormatVersion = 2; + +// @return a string as version. +// cachelib: X, ram: Y, nvm: Z +inline const std::string& getCacheVersionString() { + static std::string kVersionStr = + "{ \"cachelib\" : " + std::to_string(kCachelibVersion) + + ", \"ram\" : " + std::to_string(kCacheRamFormatVersion) + + ", \"nvm\" : " + std::to_string(kCacheNvmFormatVersion) + "}"; + return kVersionStr; +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/ChainedAllocs.h b/bdm/allocator/ChainedAllocs.h new file mode 100644 index 0000000000..fdf3ae1dcd --- /dev/null +++ b/bdm/allocator/ChainedAllocs.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
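The traits in CacheTraits.h above are plain type bundles, so new combinations can be expressed the same way, for instance a 2Q eviction policy with spin-lock buckets. MyLru2QSpinTrait below is hypothetical and not one of the traits shipped in this file.

    // Hypothetical trait combining the building blocks from CacheTraits.h.
    #include "cachelib/allocator/CacheTraits.h"

    namespace facebook {
    namespace cachelib {

    struct MyLru2QSpinTrait {
      using MMType = MM2Q;                  // 2Q eviction policy
      using AccessType = ChainedHashTable;  // intrusive chained hash table
      using AccessTypeLocks = SpinBuckets;  // spin locks for the access buckets
    };

    } // namespace cachelib
    } // namespace facebook

The three aliases are what the rest of the allocator keys off to pick the eviction policy, the access container, and the lock flavor.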
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace facebook { +namespace cachelib { + +// exposes the parent and its chain of allocations through an iterator and +// index. The chain is traversed in the LIFO order. The caller needs to ensure +// that there are no concurrent addChainedItem or popChainedItem while this +// happens. +template +class CacheChainedAllocs { + public: + using Item = typename Cache::Item; + using ChainedItem = typename Iter::Item; + + CacheChainedAllocs(CacheChainedAllocs&&) = default; + CacheChainedAllocs& operator=(CacheChainedAllocs&&) = default; + + // return the parent of the chain. + const Item& getParentItem() const noexcept { return *parent_; } + // iterate and compute the length of the chain. This is O(N) computation. + // + // @return the length of the chain + size_t computeChainLength() const { + const auto chain = getChain(); + return std::distance(chain.begin(), chain.end()); + } + + // return the nTh in the chain from the beginning. n = 0 is the first in the + // chain and last inserted. + ChainedItem* getNthInChain(size_t n) { + size_t i = 0; + for (auto& c : getChain()) { + if (i++ == n) { + return &c; + } + } + return nullptr; + } + + folly::Range getChain() const { + return folly::Range{Iter{&head_, compressor_}, Iter{}}; + } + + private: + friend Cache; + using LockType = typename Cache::ChainedItemLock; + using ReadLockHolder = typename LockType::ReadLockHolder; + using PtrCompressor = typename Item::PtrCompressor; + + CacheChainedAllocs(const CacheChainedAllocs&) = delete; + CacheChainedAllocs& operator=(const CacheChainedAllocs&) = delete; + + // only the cache can create this view of chained allocs + // + // @param l the lock to be held while iterating on the chain + // @param parent handle to the parent + // @param head beginning of the chain of the allocations + // @param c pointer compressor to traverse the chain + CacheChainedAllocs(ReadLockHolder l, + Handle parent, + Item& head, + const PtrCompressor& c) + : lock_(std::move(l)), + parent_(std::move(parent)), + head_(head), + compressor_(c) { + if (!parent_ || !parent_->hasChainedItem()) { + throw std::invalid_argument("Parent does not have a chain"); + } + + if (!head_.isChainedItem()) { + throw std::invalid_argument("Head of chained allocation is invalid"); + } + } + + // lock protecting the traversal of the chain + ReadLockHolder lock_; + + // handle to the parent item. holding this ensures that remaining of the + // chain is not evicted. + Handle parent_; + + // verify this would not cause issues with the moving slab release logic. + // Evicting logic is fine since it looks for the parent's refcount + Item& head_; + + // pointer compressor to traverse the chain. + const PtrCompressor& compressor_; +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/ChainedHashTable-inl.h b/bdm/allocator/ChainedHashTable-inl.h new file mode 100644 index 0000000000..2655a4b12d --- /dev/null +++ b/bdm/allocator/ChainedHashTable-inl.h @@ -0,0 +1,624 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
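Because getChain() in ChainedAllocs.h above walks the chain in LIFO order, callers usually iterate it directly instead of calling getNthInChain() in a loop. A hedged sketch: ChainedAllocsT stands for a concrete CacheChainedAllocs instantiation obtained from the cache (the accessor producing it is not part of this hunk), and getSize() on the chained item is an assumption.

    // Hypothetical traversal of a chained-allocation view.
    #include <cstddef>

    template <typename ChainedAllocsT>
    std::size_t totalChainedPayload(const ChainedAllocsT& chainedAllocs) {
      std::size_t total = 0;
      for (const auto& chained : chainedAllocs.getChain()) {
        total += chained.getSize();  // assumed accessor on the chained item
      }
      return total;
    }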
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#include +#include +#pragma GCC diagnostic pop + +namespace facebook { +namespace cachelib { + +template T::*HookPtr> +ChainedHashTable::Impl::Impl(size_t numBuckets, + const PtrCompressor& compressor, + const Hasher& hasher) + : numBuckets_(numBuckets), + numBucketsMask_(numBuckets - 1), + compressor_(compressor), + hasher_(hasher) { + if (numBuckets == 0) { + throw std::invalid_argument("Can not have 0 buckets"); + } + if (numBuckets & (numBuckets - 1)) { + throw std::invalid_argument("Number of buckets must be a power of two"); + } + hashTable_ = std::make_unique(numBuckets_); + CompressedPtr* memStart = hashTable_.get(); + std::fill(memStart, memStart + numBuckets_, CompressedPtr{}); +} + +template T::*HookPtr> +ChainedHashTable::Impl::Impl(size_t numBuckets, + void* memStart, + const PtrCompressor& compressor, + const Hasher& hasher, + bool resetMem) + : numBuckets_(numBuckets), + numBucketsMask_(numBuckets - 1), + hashTable_(static_cast(memStart)), + restorable_(true), + compressor_(compressor), + hasher_(hasher) { + if (numBuckets == 0) { + throw std::invalid_argument("Can not have 0 buckets"); + } + if (numBuckets & (numBuckets - 1)) { + throw std::invalid_argument("Number of buckets must be a power of two"); + } + if (resetMem) { + CompressedPtr* memStartBucket = static_cast(memStart); + std::fill(memStartBucket, memStartBucket + numBuckets_, CompressedPtr{}); + } +} + +template T::*HookPtr> +ChainedHashTable::Impl::Impl::~Impl() { + if (restorable_) { + hashTable_.release(); + } +} + +template T::*HookPtr> +typename ChainedHashTable::Impl::BucketId +ChainedHashTable::Impl::getBucket( + typename T::Key k) const noexcept { + return (*hasher_)(k.data(), k.size()) & numBucketsMask_; +} + +template T::*HookPtr> +bool ChainedHashTable::Impl::insertInBucket( + T& node, BucketId bucket) noexcept { + XDCHECK_LT(bucket, numBuckets_); + const auto existing = findInBucket(node.getKey(), bucket); + if (existing != nullptr) { + // already there + return false; + } + + // insert at the head of the bucket + const auto head = hashTable_[bucket]; + hashTable_[bucket] = compressor_.compress(&node); + setHashNext(node, head); + return true; +} + +template T::*HookPtr> +T* ChainedHashTable::Impl::insertOrReplaceInBucket( + T& node, BucketId bucket) noexcept { + XDCHECK_LT(bucket, numBuckets_); + + // See if we can find the key and the previous node + T* curr = compressor_.unCompress(hashTable_[bucket]); + T* prev = nullptr; + + const auto key = node.getKey(); + while (curr != nullptr && key != curr->getKey()) { + prev = curr; + curr = getHashNext(*curr); + } + + // insert if the key doesn't exist + if (!curr) { + const auto head = hashTable_[bucket]; + hashTable_[bucket] = compressor_.compress(&node); + setHashNext(node, head); + return nullptr; + } + + // replace + if (prev) { + setHashNext(*prev, &node); + } else { + hashTable_[bucket] 
= compressor_.compress(&node); + } + setHashNext(node, getHashNext(*curr)); + + return curr; +} + +template T::*HookPtr> +void ChainedHashTable::Impl::removeFromBucket( + T& node, BucketId bucket) noexcept { + // node must be present in hashtable. + XDCHECK_EQ(reinterpret_cast(findInBucket(node.getKey(), bucket)), + reinterpret_cast(&node)) + << node.toString(); + + T* const prev = findPrevInBucket(node, bucket); + if (prev != nullptr) { + setHashNext(*prev, getHashNext(node)); + } else { + XDCHECK_EQ(reinterpret_cast(&node), + reinterpret_cast( + compressor_.unCompress(hashTable_[bucket]))); + hashTable_[bucket] = getHashNextCompressed(node); + } +} + +template T::*HookPtr> +T* ChainedHashTable::Impl::findInBucket( + Key key, BucketId bucket) const noexcept { + XDCHECK_LT(bucket, numBuckets_); + T* curr = compressor_.unCompress(hashTable_[bucket]); + while (curr != nullptr && curr->getKey() != key) { + curr = getHashNext(*curr); + } + return curr; +} + +template T::*HookPtr> +T* ChainedHashTable::Impl::findPrevInBucket( + const T& node, BucketId bucket) const noexcept { + XDCHECK_LT(bucket, numBuckets_); + T* curr = compressor_.unCompress(hashTable_[bucket]); + T* prev = nullptr; + + const auto key = node.getKey(); + while (curr != nullptr && key != curr->getKey()) { + prev = curr; + curr = getHashNext(*curr); + } + // node must be in the hashtable + XDCHECK(curr != nullptr); + return prev; +} + +template T::*HookPtr> +template +void ChainedHashTable::Impl::forEachBucketElem(BucketId bucket, + F&& func) const { + XDCHECK_LT(bucket, numBuckets_); + T* curr = compressor_.unCompress(hashTable_[bucket]); + + while (curr != nullptr) { + func(curr); + curr = getHashNext(*curr); + } +} + +template T::*HookPtr> +unsigned int ChainedHashTable::Impl::getBucketNumElems( + BucketId bucket) const { + XDCHECK_LT(bucket, numBuckets_); + + T* curr = compressor_.unCompress(hashTable_[bucket]); + + unsigned int numElems = 0; + while (curr != nullptr) { + ++numElems; + curr = getHashNext(*curr); + } + return numElems; +} + +// AccessContainer interface +template T::*HookPtr, + typename LockT> +ChainedHashTable::Container::Container( + const serialization::ChainedHashTableObject& object, + const Config& config, + ShmAddr memSegment, + const PtrCompressor& compressor, + HandleMaker hm) + : Container(object, + config, + memSegment.addr, + memSegment.size, + compressor, + std::move(hm)) {} + +template T::*HookPtr, + typename LockT> +ChainedHashTable::Container::Container( + const serialization::ChainedHashTableObject& object, + const Config& config, + void* memStart, + size_t nBytes, + const PtrCompressor& compressor, + HandleMaker hm) + : config_{config}, + handleMaker_(std::move(hm)), + ht_{config_.getNumBuckets(), memStart, compressor, config_.getHasher(), + false /* resetMem */}, + locks_{config_.getLocksPower(), config_.getHasher()}, + numKeys_(*object.numKeys()) { + if (config_.getBucketsPower() != + static_cast(*object.bucketsPower())) { + throw std::invalid_argument(folly::sformat( + "Hashtable bucket power not compatible. old = {}, new = {}", + *object.bucketsPower(), + config.getBucketsPower())); + } + + if (nBytes != ht_.size()) { + throw std::invalid_argument( + folly::sformat("Hashtable size not compatible. 
old = {}, new = {}", + ht_.size(), + nBytes)); + } + + // checking hasher magic id not equal to 0 is to ensure it'll be + // a warm roll going from a cachelib without hasher magic id to + // one with a magic id + if (*object.hasherMagicId() != 0 && + *object.hasherMagicId() != config_.getHasher()->getMagicId()) { + throw std::invalid_argument(folly::sformat( + "Hash object's ID mismatch. expected = {}, actual = {}", + *object.hasherMagicId(), config_.getHasher()->getMagicId())); + } +} + +template T::*HookPtr, + typename LockT> +typename ChainedHashTable::Container::DistributionStats +ChainedHashTable::Container::getDistributionStats() const { + const auto now = util::getCurrentTimeSec(); + const uint64_t numKeys = numKeys_; + + std::unique_lock statsLockGuard(cachedStatsLock_); + const auto numKeysDifference = numKeys > cachedStats_.numKeys + ? numKeys - cachedStats_.numKeys + : cachedStats_.numKeys - numKeys; + + const bool needToRecompute = + (now - cachedStatsUpdateTime_ > 10 * 60 /* seconds */) || + (cachedStats_.numKeys > 0 && + (static_cast(numKeysDifference) / + static_cast(cachedStats_.numKeys) > + 0.05)); + + // return the cached value or if someone else is already computing. + if (!needToRecompute || !canRecomputeDistributionStats_) { + return cachedStats_; + } + + // record that we are iterating so that we dont cause everyone who + // observes this to recompute + canRecomputeDistributionStats_ = false; + + // release the lock. + statsLockGuard.unlock(); + + // compute the distribution + std::map distribution; + const auto numBuckets = ht_.getNumBuckets(); + for (BucketId currBucket = 0; currBucket < numBuckets; ++currBucket) { + auto l = locks_.lockShared(currBucket); + ++distribution[ht_.getBucketNumElems(currBucket)]; + } + + // acquire lock + statsLockGuard.lock(); + cachedStats_.numKeys = numKeys; + cachedStats_.itemDistribution = std::move(distribution); + cachedStats_.numBuckets = ht_.getNumBuckets(); + cachedStatsUpdateTime_ = now; + canRecomputeDistributionStats_ = true; + return cachedStats_; +} + +template T::*HookPtr, + typename LockT> +bool ChainedHashTable::Container::insert(T& node) noexcept { + if (node.isAccessible()) { + // already in hash table. + return false; + } + + const auto bucket = ht_.getBucket(node.getKey()); + auto l = locks_.lockExclusive(bucket); + const bool res = ht_.insertInBucket(node, bucket); + + if (res) { + node.markAccessible(); + numKeys_.fetch_add(1, std::memory_order_relaxed); + } + + return res; +} + +template T::*HookPtr, + typename LockT> +typename T::Handle +ChainedHashTable::Container::insertOrReplace(T& node) { + if (node.isAccessible()) { + return handleMaker_(nullptr); + } + + const auto bucket = ht_.getBucket(node.getKey()); + auto l = locks_.lockExclusive(bucket); + T* oldNode = ht_.insertOrReplaceInBucket(node, bucket); + XDCHECK_NE(reinterpret_cast(&node), + reinterpret_cast(oldNode)); + + // grab a handle to the old node before we mark it as not being in the hash + // table. + typename T::Handle handle; + try { + handle = handleMaker_(oldNode); + } catch (const std::exception&) { + // put the element back since we failed to grab handle. 
+ ht_.insertOrReplaceInBucket(*oldNode, bucket); + XDCHECK_EQ( + reinterpret_cast(ht_.findInBucket(node.getKey(), bucket)), + reinterpret_cast(oldNode)) + << oldNode->toString(); + throw; + } + + node.markAccessible(); + + if (oldNode) { + oldNode->unmarkAccessible(); + } else { + numKeys_.fetch_add(1, std::memory_order_relaxed); + } + + return handle; +} + +template T::*HookPtr, + typename LockT> +bool ChainedHashTable::Container::replaceIfAccessible( + T& oldNode, T& newNode) noexcept { + return replaceIf(oldNode, newNode, [](T&) { return true; }); +} + +template T::*HookPtr, + typename LockT> +template +bool ChainedHashTable::Container::replaceIf(T& oldNode, + T& newNode, + F&& predicate) { + const auto key = newNode.getKey(); + const auto bucket = ht_.getBucket(key); + auto l = locks_.lockExclusive(bucket); + + if (oldNode.isAccessible() && predicate(oldNode)) { + ht_.insertOrReplaceInBucket(newNode, bucket); + oldNode.unmarkAccessible(); + newNode.markAccessible(); + return true; + } + return false; +} + +template T::*HookPtr, + typename LockT> +bool ChainedHashTable::Container::remove(T& node) noexcept { + const auto bucket = ht_.getBucket(node.getKey()); + auto l = locks_.lockExclusive(bucket); + + // check inside the lock to prevent from racing removes + if (!node.isAccessible()) { + return false; + } + + ht_.removeFromBucket(node, bucket); + node.unmarkAccessible(); + + numKeys_.fetch_sub(1, std::memory_order_relaxed); + return true; +} + +template T::*HookPtr, + typename LockT> +typename T::Handle ChainedHashTable::Container::removeIf( + T& node, const std::function& predicate) { + const auto bucket = ht_.getBucket(node.getKey()); + auto l = locks_.lockExclusive(bucket); + + // check inside the lock to prevent from racing removes + if (node.isAccessible() && predicate(node)) { + // grab the handle before we do any other state change. this ensures that + // if handle maker throws an exception, we leave the item in a consistent + // state. + auto handle = handleMaker_(&node); + ht_.removeFromBucket(node, bucket); + node.unmarkAccessible(); + numKeys_.fetch_sub(1, std::memory_order_relaxed); + return handle; + } else { + return handleMaker_(nullptr); + } +} + +template T::*HookPtr, + typename LockT> +typename T::Handle ChainedHashTable::Container::find( + Key key) const { + const auto bucket = ht_.getBucket(key); + auto l = locks_.lockShared(bucket); + return handleMaker_(ht_.findInBucket(key, bucket)); +} + +template T::*HookPtr, + typename LockT> +serialization::ChainedHashTableObject +ChainedHashTable::Container::saveState() const { + if (!ht_.isRestorable()) { + throw std::logic_error( + "hashtable is not restorable since the memory is not managed by user"); + } + + if (numIterators_ != 0) { + throw std::logic_error( + folly::sformat("There are {} pending iterators", numIterators_.load())); + } + + serialization::ChainedHashTableObject object; + *object.bucketsPower() = config_.getBucketsPower(); + *object.locksPower() = config_.getLocksPower(); + *object.numKeys() = numKeys_; + *object.hasherMagicId() = config_.getHasher()->getMagicId(); + return object; +} + +template T::*HookPtr, + typename LockT> +void ChainedHashTable::Container::getBucketElems( + BucketId bucket, std::vector& handles) const { + handles.clear(); + auto l = locks_.lockShared(bucket); + + ht_.forEachBucketElem(bucket, [this, &handles](T* e) { + try { + XDCHECK(e); + handles.emplace_back(handleMaker_(e)); + } catch (const std::exception&) { + // if we are not able to acquire a handle, skip over them. 
+ } + }); +} + +// Container's Iterator +// with/without throtter to iterate +template T::*HookPtr, + typename LockT> +typename ChainedHashTable::Container::Iterator& +ChainedHashTable::Container::Iterator::operator++() { + if (throttler_) { + throttler_->throttle(); + } + + ++curSor_; + if (curSor_ < bucketElems_.size()) { + return *this; + } + + ++currBucket_; + for (; currBucket_ < container_->config_.getNumBuckets(); ++currBucket_) { + container_->getBucketElems(currBucket_, bucketElems_); + if (!bucketElems_.empty()) { + curSor_ = 0; + return *this; + } else if (throttler_) { + throttler_->throttle(); + } + } + + // reach the end + bucketElems_.clear(); + curSor_ = 0; + return *this; +} + +template T::*HookPtr, + typename LockT> +T& ChainedHashTable::Container::Iterator::operator*() { + return *curr(); +} + +template T::*HookPtr, + typename LockT> +ChainedHashTable::Container::Iterator::Iterator( + Container& container, + folly::Optional throttlerConfig) + : container_(&container) { + if (throttlerConfig) { + throttler_.assign(util::Throttler(*throttlerConfig)); + } + + ++container_->numIterators_; + + reset(); +} + +template T::*HookPtr, + typename LockT> +ChainedHashTable::Container::Iterator::Iterator( + Iterator&& other) noexcept + : container_{other.container_}, + currBucket_{other.currBucket_}, + curSor_{other.curSor_}, + bucketElems_(std::move(other.bucketElems_)) { + // increment the iterator count when we move. + ++container_->numIterators_; +} + +template T::*HookPtr, + typename LockT> +typename ChainedHashTable::Container::Iterator& +ChainedHashTable::Container::Iterator::operator=( + Iterator&& other) noexcept { + if (this != &other) { + this->~Iterator(); + new (this) Iterator(std::move(other)); + } + return *this; +} + +template T::*HookPtr, + typename LockT> +ChainedHashTable::Container::Iterator::Iterator( + Container& container, EndIterT) + : container_(&container), currBucket_{container_->config_.getNumBuckets()} { + // increment the iterator for both the end and begin() types so that the + // destructor can just blindly decrement. + ++container_->numIterators_; + XDCHECK_EQ(0u, curSor_); +} + +template T::*HookPtr, + typename LockT> +typename ChainedHashTable::Container::Iterator +ChainedHashTable::Container::begin( + folly::Optional throttlerConfig) { + return Iterator(*this, throttlerConfig); +} + +template T::*HookPtr, + typename LockT> +void ChainedHashTable::Container::Iterator::reset() { + curSor_ = 0; + currBucket_ = 0; + container_->getBucketElems(currBucket_, bucketElems_); + while (bucketElems_.empty() && + ++currBucket_ < container_->config_.getNumBuckets()) { + if (throttler_) { + throttler_->throttle(); + } + container_->getBucketElems(currBucket_, bucketElems_); + } + XDCHECK_EQ(0u, curSor_); +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/ChainedHashTable.h b/bdm/allocator/ChainedHashTable.h new file mode 100644 index 0000000000..f3c87bfa7a --- /dev/null +++ b/bdm/allocator/ChainedHashTable.h @@ -0,0 +1,698 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include "cachelib/allocator/Cache.h" +#include "cachelib/allocator/memory/serialize/gen-cpp2/objects_types.h" +#include "cachelib/common/CompilerUtils.h" +#include "cachelib/common/Mutex.h" +#include "cachelib/common/Throttler.h" +#include "cachelib/shm/Shm.h" + +namespace facebook { +namespace cachelib { + +/** + * Implementation of a hash table with chaining. The elements of the hash + * table need to have a public member of type Hook . Expects T to provide a + * getKey(), getHash() and appropriate key comparison operators for + * doing the key comparisons. The hashtable container guarantees thread + * safety. The container acts as an intrusive member-hook hashtable. + */ +class ChainedHashTable { + public: + // unique identifier per AccessType + static const int kId; + + template + struct Hook; + + private: + // Implements a hash table with chaining. + template T::*HookPtr> + class Impl { + public: + using Key = typename T::Key; + using BucketId = size_t; + using CompressedPtr = typename T::CompressedPtr; + using PtrCompressor = typename T::PtrCompressor; + + // allocate memory for hash table; the memory is managed by Impl. + // + // @param numBuckets the number of buckets to be allocated, power of two + // @param compressor object used to compress/decompress node pointers + // @param hasher object used to hash the key for its bucket id + Impl(size_t numBuckets, + const PtrCompressor& compressor, + const Hasher& hasher); + + // allocate memory for hash table; the memory is managed by the user. + // + // @param numBuckets the number of buckets to be allocated, power of two + // @param memStart user managed memory. The size must be enough to + // accommodate the number of the buckets + // @param compressor object used to compress/decompress node pointers + // @param hasher object used to hash the key for its bucket id + // @param resetMem fill memory with CompressedPtr{} + Impl(size_t numBuckets, + void* memStart, + const PtrCompressor& compressor, + const Hasher& hasher, + bool resetMem = false); + + // hash table memory is not released if managed by user. + // i.e. Impl::isRestorable() == true + ~Impl(); + + // prohibit copying + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; + + T* getHashNext(const T& node) const noexcept { + return (node.*HookPtr).getHashNext(compressor_); + } + + CompressedPtr getHashNextCompressed(const T& node) const noexcept { + return (node.*HookPtr).getHashNext(); + } + + void setHashNext(T& node, T* next) const noexcept { + (node.*HookPtr).setHashNext(next, compressor_); + } + + void setHashNext(T& node, CompressedPtr next) { + (node.*HookPtr).setHashNext(next); + } + + // inserts the element into the bucket. + // + // @param node node to be inserted into the hashtable + // @param bucket the hashtable bucket that the node belongs to + // @return True if the insertion was success. False if not. Insertion + // fails if there is already a node with similar key in the + // hashtable. + bool insertInBucket(T& node, BucketId bucket) noexcept; + + // inserts or replaces the element into the bucket. 
+ // + // @param node node to be inserted into the hashtable + // @param bucket the hashtable bucket that the node belongs to + // @return old node if it exists, nullptr otherwise + T* insertOrReplaceInBucket(T& node, BucketId bucket) noexcept; + + // removes the node from the bucket. + // + // precondition: node must be in the bucket. + // @param node the node to be removed. + // @param bucket the hashtable bucket that the node belongs to + void removeFromBucket(T& node, BucketId bucket) noexcept; + + // finds the node corresponding to the key from the bucket and returns it + // if found. + // + // @param key the key for the node we are looking for. + // @param bucket the hashtable bucket that the key belongs to + // @return a T* corresponding to the node or nullptr if there is no such + // node with the key in the bucket. + T* findInBucket(Key key, BucketId bucket) const noexcept; + + // gets the bucket for the key by using the corresponding hash function. + BucketId getBucket(Key k) const noexcept; + + // Call 'func' on each element in the given bucket. + // + // @param bucket the bucket id to fetch. + template + void forEachBucketElem(BucketId bucket, F&& func) const; + + // fetch the number of elements of a given bucket + // + // @param bucket the bucket id to fetch. + unsigned int getBucketNumElems(BucketId bucket) const; + + // true if the hash table can be restored + bool isRestorable() const noexcept { return restorable_; } + + // return the hashtable size in bytes + size_t size() const noexcept { return numBuckets_ * sizeof(CompressedPtr); } + + // return the number of buckets in hash table + size_t getNumBuckets() const noexcept { return numBuckets_; } + + private: + // finds the previous node in the hash chain for this node if one exists + // such that prev->next is node. + // + // @param node the node for which we are looking for the previous + // @param bucket the hashtable bucket that the node belongs to + // @return previous node for this node in the hash chain or nullptr if + // this node is in the head of the hash chain. + T* findPrevInBucket(const T& node, BucketId bucket) const noexcept; + + // number of buckets we have in the hashtable, must be power of two + const size_t numBuckets_{0}; + + // materialized value of numBuckets_ - 1 + const size_t numBucketsMask_{0}; + + // actual buckets. + std::unique_ptr hashTable_; + + // indicate whether or not the hash table uses user-managed memory and + // is thus restorable from serialized state + const bool restorable_{false}; + + // object used to compress/decompress node pointers to reduce memory + // footprint of Hook + const PtrCompressor compressor_; + + // Hash the key + const Hasher hasher_; + }; + + public: + using SerializationType = serialization::ChainedHashTableObject; + + // node used for chaining the hash table for collision. + template + struct CACHELIB_PACKED_ATTR Hook { + using CompressedPtr = typename T::CompressedPtr; + using PtrCompressor = typename T::PtrCompressor; + // sets the next in the hash chain to the passed in value. + void setHashNext(T* n, const PtrCompressor& compressor) noexcept { + next_ = compressor.compress(n); + } + + void setHashNext(CompressedPtr n) noexcept { next_ = n; } + + // gets the next in hash chain for this node. + T* getHashNext(const PtrCompressor& compressor) const noexcept { + return compressor.unCompress(next_); + } + + CompressedPtr getHashNext() const noexcept { return next_; } + + private: + CompressedPtr next_{}; + }; + + // Config class for the chained hash table. 
+ class Config { + public: + // Do not add 'noexcept' here - causes GCC to delete this method: + // "config() is implicitly deleted because its exception-specification + // does not match the implicit exception-specification + // " + // followed by: + // "CacheAllocatorConfig.h:522:29: error: use of deleted function + // constexpr facebook::cachelib::ChainedHashTable::Config::Config() + Config() = default; + + // @param bucketsPower number of buckets in base 2 logarithm + // @param locksPower number of locks in base 2 logarithm + // @param pageSize page size + Config(unsigned int bucketsPower, + unsigned int locksPower, + PageSizeT pageSize = PageSizeT::NORMAL) + : Config(bucketsPower, + locksPower, + std::make_shared(), + pageSize) {} + + // @param bucketsPower number of buckets in base 2 logarithm + // @param locksPower number of locks in base 2 logarithm + // @param hasher the key hash function + // @param pageSize page size + Config(unsigned int bucketsPower, + unsigned int locksPower, + Hasher hasher, + PageSizeT pageSize = PageSizeT::NORMAL) + : bucketsPower_(bucketsPower), + locksPower_(locksPower), + pageSize_(pageSize), + hasher_(std::move(hasher)) { + if (bucketsPower_ > kMaxBucketPower || locksPower_ > kMaxLockPower || + locksPower_ > bucketsPower_) { + throw std::invalid_argument(folly::sformat( + "Invalid arguments to the config constructor bucketPower = {}, " + "lockPower = {}", + bucketsPower_, locksPower_)); + } + } + + Config(const Config&) = default; + Config& operator=(const Config&) = default; + + size_t getNumBuckets() const noexcept { + return static_cast(1) << bucketsPower_; + } + + size_t getNumLocks() const noexcept { + return static_cast(1) << locksPower_; + } + + // Estimate bucketsPower and LocksPower based on cache entries. + void sizeBucketsPowerAndLocksPower(size_t cacheEntries) { + // The percentage of used buckets vs unused buckets is measured by a load + // factor. For optimal performance, the load factor should not be more + // than 60%. + bucketsPower_ = + static_cast(ceil(log2(cacheEntries * 1.6 /* load factor */))); + + if (bucketsPower_ > kMaxBucketPower) { + throw std::invalid_argument(folly::sformat( + "Invalid arguments to the config constructor cacheEntries = {}", + cacheEntries)); + } + + // 1 lock per 1000 buckets. + locksPower_ = std::max(1, bucketsPower_ - 10); + } + + unsigned int getBucketsPower() const noexcept { return bucketsPower_; } + + unsigned int getLocksPower() const noexcept { return locksPower_; } + + const Hasher& getHasher() const noexcept { return hasher_; } + + std::map serialize() const { + std::map configMap; + configMap["BucketsPower"] = std::to_string(bucketsPower_); + configMap["LocksPower"] = std::to_string(locksPower_); + configMap["Hasher"] = + hasher_->getMagicId() == 1 ? "FNVHash" : "MurmurHash2"; + return configMap; + } + + PageSizeT getPageSize() const { return pageSize_; } + + private: + // 4 billion buckets should be good enough for everyone. + static constexpr unsigned int kMaxBucketPower = 32; + static constexpr unsigned int kMaxLockPower = 32; + + // The following are expressed as powers of two to make the modulo + // arithmetic simpler. + + // total number of buckets in the hashtable expressed as power of two. + unsigned int bucketsPower_{10}; + + // total number of locks for the hashtable expressed as a power of two. + unsigned int locksPower_{5}; + + PageSizeT pageSize_{PageSizeT::NORMAL}; + + Hasher hasher_ = std::make_shared(); + }; + + // Interface for the Container that implements a hash table. 
Maintains + // the node's isInAccessContainer state. T must implement an interface to + // markAccessible(), unmarkAccessible() and isAccessible(). + template T::*HookPtr, + typename LockT = facebook::cachelib::SharedMutexBuckets> + struct Container { + private: + using BucketId = typename Impl::BucketId; + + public: + using Key = typename T::Key; + using Handle = typename T::Handle; + using HandleMaker = typename T::HandleMaker; + using CompressedPtr = typename T::CompressedPtr; + using PtrCompressor = typename T::PtrCompressor; + + // default handle maker that calls incRef + static const HandleMaker kDefaultHandleMaker; + + // container with default config. + Container() noexcept + : Container(Config{}, PtrCompressor(), kDefaultHandleMaker) {} + + // create hash table container with local-managed memory + // @param config the config for the hashtable + // @param compressor object used to compress/decompress node pointers + // @param hm the functor that creates a Handle from T* + Container(Config c, + const PtrCompressor& compressor, + HandleMaker hm = kDefaultHandleMaker) + : config_(std::move(c)), + handleMaker_(std::move(hm)), + ht_{config_.getNumBuckets(), compressor, config_.getHasher()}, + locks_{config_.getLocksPower(), config_.getHasher()} {} + + // create hash table container with user-managed memory + // + // @param c config for hash table + // @param memStart hash table memory managed by the user + // @param compressor object used to compress/decompress node pointers + // @param hm the functor that creates a Handle from T* + Container(Config c, + void* memStart, + const PtrCompressor& compressor, + HandleMaker hm = kDefaultHandleMaker) + : config_(std::move(c)), + handleMaker_(std::move(hm)), + ht_{config_.getNumBuckets(), memStart, compressor, + config_.getHasher(), true /* resetMem */}, + locks_{config_.getLocksPower(), config_.getHasher()} {} + + // restore hash table from serialized data. + // + // @param object serialized object + // @param newConfig the new set of configurations + // @param memSegment shared memory segment for the hash table + // @param compressor object used to compress/decompress node pointers + // @param hm the functor that creates a Handle from T* + // + // @throw std::invalid argument if the bucket power in new config does not + // match the previous state or the size of the memSegment does not + // match the old state. + Container(const serialization::ChainedHashTableObject& object, + const Config& newConfig, + ShmAddr memSegment, + const PtrCompressor& compressor, + HandleMaker hm = kDefaultHandleMaker); + + // restore hash table from previous state. This only works when the + // hash table memory is managed by the user. + // + // @param object serialized object + // @param newConfig the new set of configurations + // @param memStart hash table memory managed by the user + // @param nBytes size of memory allocation pointed to by memStart + // @param compressor object used to compress/decompress node pointers + // @param hm the functor that creates a Handle from T* + // + // @throw std::invalid argument if the bucket power in new config does not + // match the previous state or the size of the memSegment does not + // match the old state. 
+ Container(const serialization::ChainedHashTableObject& object, + const Config& newConfig, + void* memStart, + size_t nBytes, + const PtrCompressor& compressor, + HandleMaker hm = kDefaultHandleMaker); + + Container(const Container&) = delete; + Container& operator=(const Container&) = delete; + + // inserts the node into the hash table and marks it as being in the + // hashtable upon success. If another node exists with the same key, the + // insert fails. On failure the state of the node is unchanged. + // + // @param node the node to be inserted into the hashtable + // @return True if the node was successfully inserted into the hashtable. + // False if not. + bool insert(T& node) noexcept; + + // inserts or replaces the node into the hash table and marks it being in + // the hashtable upon success. If another node exists with the same key, the + // that node is removed. On failure the state of the node is unchanged. + // + // @param node the node to be inserted into the hashtable + // @return if the node was successfully inserted into the hashtable, + // returns a null handle. If the node replaced an existing node, + // a handle to the old node is returned. + // + // @throw std::overflow_error is the maximum item refcount is execeeded by + // creating this item handle. + Handle insertOrReplace(T& node); + + // replaces a node into the hash table, only if another node exists with + // the same key and is marked accessible. + // + // @param oldNode expected current node in the hash table + // @param newNode the new node for the key + // + // @return true if oldNode exists, is accessible, and was replaced + // successfully. + bool replaceIfAccessible(T& oldNode, T& newNode) noexcept; + + // replaces a node if predicate returns true on the existing node + // + // @param oldNode expected current node in the hash table + // @param newNode the new node for the key + // @param predicate asseses if condition is met for the oldNode to merit + // a replace + // + // @return true if oldNode exists, is accessible, predicate is true, and + // was replaced successfully. + template + bool replaceIf(T& oldNode, T& newNode, F&& predicate); + + // removes the node from the hashtable and unmarks it as accessible. If + // the node does not exists, returns False. + // + // @param node node to be removed from the hashtable. + // @return True if the node was in the hashtable and if it was + // successfully removed. False if the node was not in the + // hashtable. + bool remove(T& node) noexcept; + + // remove a node from the container if it exists for the key and the + // predicate returns true for the node. This is intended to simplify the + // eviction purposes to guarantee a good selection of candidate. + // + // @param node the node to be removed + // @param predicate the predicate check for the node + // + // @return handle to the node if we successfully removed it. returns a + // null handle if the node was either not in the container or the + // predicate failed. + Handle removeIf(T& node, + const std::function& predicate); + + // finds the node corresponding to the key in the hashtable and returns a + // handle to that node. + // + // @param key the lookup key + // @param args arguments to construct a handle for T. + // + // @return Handle with valid T* if there is a node corresponding to the + // key or a Handle with nullptr if not. + // + // @throw std::overflow_error is the maximum item refcount is execeeded by + // creating this item handle. 
+ Handle find(Key key) const; + + // for saving the state of the hash table + // + // precondition: serialization must happen without any reader or writer + // present. Any modification of this object afterwards will result in an + // invalid, inconsistent state for the serialized data. + // + // @throw std::logic_error if the container has any pending iterators that + // need to be destroyed or if the container can not be restored. + serialization::ChainedHashTableObject saveState() const; + + // get the required size for the buckets. + static size_t getRequiredSize(size_t numBuckets) noexcept { + return sizeof(CompressedPtr) * numBuckets; + } + + const Config& getConfig() const noexcept { return config_; } + + unsigned int getHashpower() const noexcept { + return config_.getBucketsPower(); + } + + // Iterator interface for the hashtable. Iterates over the hashtable + // bucket by bucket and takes a snapshot of the bucket to iterate over. It + // guarantees that all keys that were present when the iteration started + // will be accessible unless they are removed. Keys that are + // removed/inserted during the lifetime of an iterator are not guaranteed + // to be either visited or not-visited. Adding/Removing from the hash + // table while the iterator is alive will not invalidate any iterator or + // the element that the iterator points at currently. The iterator + // internally holds a Handle to the item. + class Iterator { + public: + ~Iterator() { + XDCHECK_GT(container_->numIterators_.load(), 0u); + --container_->numIterators_; + } + Iterator(const Iterator&) = delete; + Iterator& operator=(const Iterator&) = delete; + + Iterator(Iterator&&) noexcept; + Iterator& operator=(Iterator&&) noexcept; + enum EndIterT { EndIter }; + + // increment the iterator to the next element. + // with/without throttler + Iterator& operator++(); + + // dereference the current element that the iterator is pointing to. + T& operator*(); + T* operator->() { return &(*(*this)); } + + bool operator==(const Iterator& other) const noexcept { + return container_ == other.container_ && + currBucket_ == other.currBucket_ && curSor_ == other.curSor_; + } + + bool operator!=(const Iterator& other) const noexcept { + return !(*this == other); + } + + // TODO(jiayueb): change to return ReadHandle after fixing all the breaks + const Handle& asHandle() { return curr(); } + + // reset the Iterator to begin of container + void reset(); + + private: + // container for the iterator + using C = Container; + + // construct an iterator with the given + friend C; + explicit Iterator(C& ht, + folly::Optional + throttlerConfig = folly::none); + + Iterator(C& ht, EndIterT); + + // the container over which we are iterating + C* container_; + + // current bucket that the iterator is pointing to. + BucketId currBucket_{0}; + + // cursor into the current bucket. + unsigned int curSor_{0}; + + // current bucket. + std::vector bucketElems_; + + // optional throttler + folly::Optional throttler_ = folly::none; + + // returns the handle for current item in the iterator. + Handle& curr() { + if (curSor_ < bucketElems_.size()) { + return bucketElems_[curSor_]; + } + throw std::logic_error( + "Iterator in invalid state with curSor_: " + + folly::to(curSor_) + ", currBucket_: " + + folly::to(currBucket_) + ", total buckets: " + + folly::to(container_->config_.getNumBuckets())); + } + }; + + // Iterator interface to the container. 
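+  //
+  // A minimal iteration sketch (illustrative only; `ht` is assumed to be a
+  // Container instance and T is assumed to expose getKey()):
+  //
+  //   for (auto it = ht.begin(); it != ht.end(); ++it) {
+  //     processKey(it->getKey()); // the iterator pins *it via an internal Handle
+  //   }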
+ // whether it constructs iterator of begin with a throttler config + Iterator begin(folly::Optional throttlerConfig); + + Iterator begin() { return Iterator(*this); } + Iterator end() { return Iterator(*this, Iterator::EndIter); } + + // Stats describing the distribution of items (keys) in the hash table + struct DistributionStats { + uint64_t numKeys{0}; + uint64_t numBuckets{0}; + // map from bucket id to number of items in the bucket. + std::map itemDistribution{}; + }; + + struct Stats { + uint64_t numKeys; + uint64_t numBuckets; + }; + + // Get the distribution stats. This function will use cached results + // if the difference since last updated is not significant. This is + // expensive. Call at your discretion. + // + // Critiera for refreshing the stats: + // - 10 minutes since last update, OR + // - 5% more or less number of keys in the hash table + DistributionStats getDistributionStats() const; + + // lightweight stats that give the number of keys and buckets inside the + // container. This is guaranteed to be fast. + Stats getStats() const noexcept { return {numKeys_, ht_.getNumBuckets()}; } + + // Get the total number of keys inserted into the hash table + uint64_t getNumKeys() const noexcept { + return numKeys_.load(std::memory_order_relaxed); + } + + private: + using Hashtable = Impl; + + // Fetch a vector of handle to the items belonging to a given bucket. This + // is for use by the iterator. 'handles' will be cleared and then populated + // with handles for the items in the given bucket. Items will be skipped if + // the handle cannot be acquired for any reason. + void getBucketElems(BucketId bucket, std::vector& handles) const; + + // config for the hash table. + const Config config_{}; + + // handle maker to convert the T* to T::Handle + HandleMaker handleMaker_; + + // the hashtable buckets + Hashtable ht_; + + // locks protecting the hashtable buckets + mutable LockT locks_; + + std::atomic numIterators_{0}; + + // Cached stats for distribution + // This is updated if the number of keys changes by more than 5%, or + // it has been 10 minutes since the stats has last been updated. + mutable std::mutex cachedStatsLock_; + mutable DistributionStats cachedStats_{}; + + // if we can recompute the cachedStats if it is too old. Set to false when + // another thread is computing it. + mutable bool canRecomputeDistributionStats_{true}; + + // when the distribution was last computed. + mutable time_t cachedStatsUpdateTime_{0}; + + // number of the keys stored in this hash table + std::atomic numKeys_{0}; + }; +}; + +template T::*HookPtr, + typename LockT> +const typename T::HandleMaker + ChainedHashTable::Container::kDefaultHandleMaker = + [](T* t) -> typename T::Handle { + if (t) { + t->incRef(); + } + return typename T::Handle{t}; +}; +} // namespace cachelib +} // namespace facebook + +#include "cachelib/allocator/ChainedHashTable-inl.h" diff --git a/bdm/allocator/ContainerTypes.cpp b/bdm/allocator/ContainerTypes.cpp new file mode 100644 index 0000000000..ceaa8101c9 --- /dev/null +++ b/bdm/allocator/ContainerTypes.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/ChainedHashTable.h" +#include "cachelib/allocator/MM2Q.h" +#include "cachelib/allocator/MMLru.h" +#include "cachelib/allocator/MMTinyLFU.h" +namespace facebook { +namespace cachelib { +// Types of AccessContainer and MMContainer +// MMType +const int MMLru::kId = 1; +const int MM2Q::kId = 2; +const int MMTinyLFU::kId = 3; + +// AccessType +const int ChainedHashTable::kId = 1; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/FreeMemStrategy.cpp b/bdm/allocator/FreeMemStrategy.cpp new file mode 100644 index 0000000000..81978a9d7d --- /dev/null +++ b/bdm/allocator/FreeMemStrategy.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/FreeMemStrategy.h" + +#include + +#include +#include + +#include "cachelib/allocator/Util.h" + +namespace facebook { +namespace cachelib { + +FreeMemStrategy::FreeMemStrategy(Config config) + : RebalanceStrategy(FreeMem), config_(std::move(config)) {} + +// The list of allocation classes to be rebalanced is determined by: +// +// 0. Filter out classes that have below minSlabThreshold_ +// +// 1. Filter out classes that have just gained a slab recently +// +// 2. Pick the first class we find with free memory past the threshold +RebalanceContext FreeMemStrategy::pickVictimAndReceiverImpl( + const CacheBase& cache, PoolId pid) { + const auto& pool = cache.getPool(pid); + if (pool.getUnAllocatedSlabMemory() > + config_.maxUnAllocatedSlabs * Slab::kSize) { + return kNoOpContext; + } + + const auto poolStats = cache.getPoolStats(pid); + + // ignore allocation classes that have fewer than the threshold of slabs. + const auto victims = filterByNumEvictableSlabs( + poolStats, std::move(poolStats.getClassIds()), config_.minSlabs); + + if (victims.empty()) { + XLOG(DBG, "Rebalancing: No victims available"); + return kNoOpContext; + } + + RebalanceContext ctx; + ctx.victimClassId = pickVictimByFreeMem( + victims, poolStats, config_.getFreeMemThreshold(), getPoolState(pid)); + + if (ctx.victimClassId == Slab::kInvalidClassId) { + return kNoOpContext; + } + + XLOGF(DBG, "Rebalancing: victimAC = {}", static_cast(ctx.victimClassId)); + return ctx; +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/FreeMemStrategy.h b/bdm/allocator/FreeMemStrategy.h new file mode 100644 index 0000000000..bca4ea78dc --- /dev/null +++ b/bdm/allocator/FreeMemStrategy.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/RebalanceStrategy.h" + +namespace facebook { +namespace cachelib { + +// Strategy that frees a slab from any allocation class that's above the free +// memory limit. This strategy only picks the victim but not the receiver. +class FreeMemStrategy : public RebalanceStrategy { + public: + struct Config { + // minimum number of slabs to retain in every allocation class. + unsigned int minSlabs{1}; + + // use free memory if it is amounts to more than this many slabs. + unsigned int numFreeSlabs{3}; + + // this strategy will not rebalance anything if the number + // of free slabs is more than this number + size_t maxUnAllocatedSlabs{1000}; + + // free memory threshold to be used for picking victim. + size_t getFreeMemThreshold() const noexcept { + return numFreeSlabs * Slab::kSize; + } + + Config() noexcept {} + Config(unsigned int _minSlabs, + unsigned int _numFreeSlabs, + unsigned int _maxUnAllocatedSlabs) noexcept + : minSlabs{_minSlabs}, + numFreeSlabs(_numFreeSlabs), + maxUnAllocatedSlabs(_maxUnAllocatedSlabs) {} + }; + + explicit FreeMemStrategy(Config config = {}); + + RebalanceContext pickVictimAndReceiverImpl(const CacheBase& cache, + PoolId pid) final; + + private: + const Config config_; +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/FreeThresholdStrategy.cpp b/bdm/allocator/FreeThresholdStrategy.cpp new file mode 100644 index 0000000000..d4387b2888 --- /dev/null +++ b/bdm/allocator/FreeThresholdStrategy.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+#include "cachelib/allocator/memory/MemoryPoolManager.h"
+#include "cachelib/allocator/memory/MemoryAllocator.h"
+#include "cachelib/allocator/Cache.h"
+#include "cachelib/allocator/CacheStats.h"
+
+#include
+
+namespace facebook {
+namespace cachelib {
+
+FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
+                                             double highEvictionAcWatermark,
+                                             uint64_t maxEvictionBatch,
+                                             uint64_t minEvictionBatch)
+    : lowEvictionAcWatermark(lowEvictionAcWatermark),
+      highEvictionAcWatermark(highEvictionAcWatermark),
+      maxEvictionBatch(maxEvictionBatch),
+      minEvictionBatch(minEvictionBatch),
+      highEvictionAcWatermarks(
+          CacheBase::kMaxTiers,
+          std::vector<std::vector<std::vector<double>>>(
+              MemoryPoolManager::kMaxPools,
+              std::vector<std::vector<double>>(
+                  MemoryAllocator::kMaxClasses,
+                  std::vector<double>(3, highEvictionAcWatermark)))),
+      acLatencies(CacheBase::kMaxTiers,
+                  std::vector<std::vector<std::vector<double>>>(
+                      MemoryPoolManager::kMaxPools,
+                      std::vector<std::vector<double>>(
+                          MemoryAllocator::kMaxClasses,
+                          std::vector<double>(2, 0.0)))) {}
+
+std::vector FreeThresholdStrategy::calculateBatchSizes(
+    const CacheBase& cache,
+    std::vector> acVec) {
+  std::vector batches{};
+  for (auto [tid, pid, cid] : acVec) {
+    auto stats = cache.getAllocationClassStats(tid, pid, cid);
+    if (stats.approxFreePercent >= highEvictionAcWatermark) {
+      batches.push_back(0);
+    } else {
+      auto toFreeMemPercent =
+          highEvictionAcWatermark - stats.approxFreePercent;
+      auto toFreeItems = static_cast(
+          toFreeMemPercent * stats.memorySize / stats.allocSize);
+      batches.push_back(toFreeItems);
+
+      // Feed the moving-average allocation latency estimate for this
+      // allocation class into the rolling latency window.
+      auto acAllocLatencyNs =
+          cache.getAllocationClassStats(tid, pid, cid).allocLatencyNs.estimate();
+      calculateLatency(acAllocLatencyNs, tid, pid, cid);
+    }
+  }
+
+  if (batches.size() == 0) {
+    return batches;
+  }
+
+  auto maxBatch = *std::max_element(batches.begin(), batches.end());
+  if (maxBatch == 0) {
+    return batches;
+  }
+
+  std::transform(
+      batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+        if (numItems == 0) {
+          return 0UL;
+        }
+
+        auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+        if (cappedBatchSize < minEvictionBatch) {
+          return minEvictionBatch;
+        } else {
+          return cappedBatchSize;
+        }
+      });
+
+  return batches;
+}
+
+// Maintain a rolling window of the last two latency samples for the given
+// allocation class: slot 0 holds the latest sample, slot 1 the previous one.
+void FreeThresholdStrategy::calculateLatency(uint64_t acLatency,
+                                             unsigned int tid,
+                                             PoolId pid,
+                                             ClassId cid) {
+  auto prevLatency = acLatencies[tid][pid][cid][0];
+  acLatencies[tid][pid][cid][1] = prevLatency;
+  acLatencies[tid][pid][cid][0] = acLatency;
+}
+
+BackgroundStrategyStats FreeThresholdStrategy::getStats() {
+  BackgroundStrategyStats s;
+
+  // Only tier 0 and pool 0 are reported for now.
+  auto numClasses = MemoryAllocator::kMaxClasses;
+  for (int i = 0; i < 1; i++) {
+    for (int j = 0; j < 1; j++) {
+      for (int k = 0; k < numClasses; k++) {
+        s.highEvictionAcWatermarks[k] =
+            std::make_tuple(highEvictionAcWatermarks[i][j][k][0],
+                            highEvictionAcWatermarks[i][j][k][1],
+                            highEvictionAcWatermarks[i][j][k][2]);
+        s.acLatencies[k] =
+            std::make_pair(acLatencies[i][j][k][0], acLatencies[i][j][k][1]);
+      }
+    }
+  }
+  return s;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/bdm/allocator/FreeThresholdStrategy.h b/bdm/allocator/FreeThresholdStrategy.h
new file mode 100644
index 0000000000..d4b205f518
--- /dev/null
+++ b/bdm/allocator/FreeThresholdStrategy.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Background mover strategy that sizes eviction batches so that each
+// allocation class is brought back above its free-memory watermark. It also
+// keeps a rolling window of allocation latencies per allocation class.
+class FreeThresholdStrategy : public BackgroundMoverStrategy {
+ public:
+  FreeThresholdStrategy(double lowEvictionAcWatermark,
+                        double highEvictionAcWatermark,
+                        uint64_t maxEvictionBatch,
+                        uint64_t minEvictionBatch);
+  ~FreeThresholdStrategy() {}
+
+  std::vector calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector> acVecs);
+  BackgroundStrategyStats getStats();
+
+ private:
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  uint64_t maxEvictionBatch{40};
+  uint64_t minEvictionBatch{5};
+
+  // per [tier][pool][class] high eviction watermark values (three slots per
+  // class, reported via getStats()).
+  std::vector<std::vector<std::vector<std::vector<double>>>> highEvictionAcWatermarks;
+
+  // per [tier][pool][class] rolling latency window: slot 0 holds the latest
+  // allocation latency sample, slot 1 the previous one.
+  std::vector<std::vector<std::vector<std::vector<double>>>> acLatencies;
+
+  // TODO: calculate the benefit of eviction for a given allocation class.
+  // void calculateBenefitMig(uint64_t p99, unsigned int tid, PoolId pid, ClassId cid);
+
+  // record the latest allocation latency sample for an allocation class.
+  void calculateLatency(uint64_t acLatency, unsigned int tid, PoolId pid, ClassId cid);
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/bdm/allocator/Handle.h b/bdm/allocator/Handle.h
new file mode 100644
index 0000000000..ce455a0bca
--- /dev/null
+++ b/bdm/allocator/Handle.h
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+template +struct ReadHandleImpl { + using Item = T; + using CacheT = typename T::CacheT; + + ReadHandleImpl() = default; + /*implicit*/ ReadHandleImpl(std::nullptr_t) {} + + // reset the handle by releasing the item it holds. + void reset() noexcept { + waitContext_.reset(); + + if (it_ == nullptr) { + return; + } + + assert(alloc_ != nullptr); + try { + alloc_->release(it_, isNascent()); + } catch (const std::exception& e) { + XLOGF(CRITICAL, "Failed to release {:#10x} : {}", static_cast(it_), + e.what()); + } + it_ = nullptr; + } + + // Waits for item (if async op in progress) and then releases item's + // ownership to the caller. + Item* release() noexcept { + auto ret = getInternal(); + if (waitContext_) { + waitContext_->releaseHandle(); + waitContext_.reset(); + } else { + it_ = nullptr; + } + return ret; + } + + ~ReadHandleImpl() noexcept { reset(); } + + ReadHandleImpl(const ReadHandleImpl&) = delete; + ReadHandleImpl& operator=(const ReadHandleImpl&) = delete; + + FOLLY_ALWAYS_INLINE ReadHandleImpl(ReadHandleImpl&& other) noexcept + : alloc_(other.alloc_), + it_(other.releaseItem()), + waitContext_(std::move(other.waitContext_)), + flags_(other.getFlags()) {} + + FOLLY_ALWAYS_INLINE ReadHandleImpl& operator=( + ReadHandleImpl&& other) noexcept { + if (this != &other) { + this->~ReadHandleImpl(); + new (this) ReadHandleImpl(std::move(other)); + } + return *this; + } + + // == and != operators for comparison with Item* + friend bool operator==(const ReadHandleImpl& a, const Item* it) noexcept { + return a.get() == it; + } + friend bool operator==(const Item* it, const ReadHandleImpl& a) noexcept { + return a == it; + } + friend bool operator!=(const ReadHandleImpl& a, const Item* it) noexcept { + return !(a == it); + } + friend bool operator!=(const Item* it, const ReadHandleImpl& a) noexcept { + return !(a == it); + } + + // == and != operators for comparison with nullptr + friend bool operator==(const ReadHandleImpl& a, std::nullptr_t) noexcept { + return a.get() == nullptr; + } + friend bool operator==(std::nullptr_t nullp, + const ReadHandleImpl& a) noexcept { + return a == nullp; + } + friend bool operator!=(const ReadHandleImpl& a, + std::nullptr_t nullp) noexcept { + return !(a == nullp); + } + friend bool operator!=(std::nullptr_t nullp, + const ReadHandleImpl& a) noexcept { + return !(a == nullp); + } + + // == and != operator + friend bool operator==(const ReadHandleImpl& a, + const ReadHandleImpl& b) noexcept { + return a.get() == b.get(); + } + friend bool operator!=(const ReadHandleImpl& a, + const ReadHandleImpl& b) noexcept { + return !(a == b); + } + + // for use in bool contexts like `if (handle) { ... }` + FOLLY_ALWAYS_INLINE explicit operator bool() const noexcept { + return get() != nullptr; + } + + // Accessors always return a const item. + FOLLY_ALWAYS_INLINE const Item* operator->() const noexcept { + return getInternal(); + } + FOLLY_ALWAYS_INLINE const Item& operator*() const noexcept { + return *getInternal(); + } + FOLLY_ALWAYS_INLINE const Item* get() const noexcept { return getInternal(); } + + // Convert to semi future. + folly::SemiFuture toSemiFuture() && { + if (isReady()) { + return folly::makeSemiFuture(std::forward(*this)); + } + folly::Promise promise; + auto semiFuture = promise.getSemiFuture(); + auto cb = onReady([p = std::move(promise)](ReadHandleImpl handle) mutable { + p.setValue(std::move(handle)); + }); + if (cb) { + // Handle became ready after the initial isReady check. So we will run + // the callback to set the promise inline. 
+ cb(std::move(*this)); + return semiFuture; + } else { + return std::move(semiFuture).deferValue([](ReadHandleImpl handle) { + if (handle) { + // Increment one refcount on user thread since we transferred a handle + // from a cachelib internal thread. + handle.alloc_->adjustHandleCountForThread_private(1); + } + return handle; + }); + } + } + + WriteHandleImpl toWriteHandle() && { + XDCHECK_NE(alloc_, nullptr); + XDCHECK_NE(getInternal(), nullptr); + alloc_->invalidateNvm(*getInternal()); + return WriteHandleImpl{std::move(*this)}; + } + + using ReadyCallback = folly::Function; + + // Return true iff item handle is ready to use. + // Empty handles are considered ready with it_ == nullptr. + FOLLY_ALWAYS_INLINE bool isReady() const noexcept { + return waitContext_ ? waitContext_->isReady() : true; + } + + // Return true if this item has a wait context which means + // it has missed in DRAM and went to nvm cache. + bool wentToNvm() const noexcept { + return getFlags() & static_cast(HandleFlags::kWentToNvm); + } + + // Return true if this handle couldn't be fulfilled because the item had + // already expired. If item is present then the source of truth + // lies with the actual item. + bool wasExpired() const noexcept { + return getFlags() & static_cast(HandleFlags::kExpired); + } + + // blocks until `isReady() == true`. + void wait() const noexcept { + if (isReady()) { + return; + } + CHECK(waitContext_.get() != nullptr); + waitContext_->wait(); + } + + // Clones Item handle. returns an empty handle if it is null. + // @return HandleImpl return a handle to this item + // @throw std::overflow_error is the maximum item refcount is execeeded by + // creating this item handle. + ReadHandleImpl clone() const { + ReadHandleImpl hdl{}; + if (alloc_) { + hdl = alloc_->acquire(getInternal()); + } + hdl.cloneFlags(*this); + return hdl; + } + + bool isWriteHandle() const { return false; } + + protected: + // accessor. Calling getInternal() on handle with isReady() == false blocks + // the thread until the handle is ready. + FOLLY_ALWAYS_INLINE Item* getInternal() const noexcept { + return waitContext_ ? waitContext_->get() : it_; + } + + private: + struct ItemWaitContext : public WaitContext { + explicit ItemWaitContext(CacheT& alloc) : alloc_(alloc) {} + + // @return managed item pointer + // NOTE: get() blocks the thread until isReady is true + Item* get() const noexcept { + wait(); + XDCHECK(isReady()); + return it_.load(std::memory_order_acquire); + } + + // Wait until we have the item + void wait() const noexcept { + if (isReady()) { + return; + } + baton_.wait(); + XDCHECK(isReady()); + } + + uint8_t getFlags() const { return flags_; } + + // Assumes ownership of the item managed by hdl + // and invokes the onReadyCallback_ + // postcondition: `isReady() == true` + // + // NOTE: It's a bug to set a hdl that's already ready and can + // terminate the application. This is only used internally within + // cachelib and shouldn't be exposed outside of cachelib to applications. + // + // Active item handle count semantics + // ---------------------------------- + // + // CacheLib keeps track of thread-local handle count in order to detect + // if we have leaked handles on shutdown. The reason for thread-local is + // to achieve optimal concurrency without forcing all threads to sync on + // updating the same atomics. For async get (from nvm-cache), there are + // three scenarios that we need to consider regarding the handle count. + // 1. 
User calls "wait()" on a handle or relies on checking "isReady()". + // 2. User adds a "onReadyCallback" via "onReady()". + // 3. User converts handle to a "SemiFuture". + // + // For (2), user must increment the active handle count, if user's cb + // is successfully enqueued. Something like the following. User can do + // this at the beginning of their onReadyCallback. Now, beware that + // user should NOT execute the onReadyCallback if the onReady() enqueue + // failed, as that means the item is already ready, and there's no need + // to adjust the refcount. + // + // TODO: T87126824. The behavior for (2) is far from ideal. CacheLib will + // figure out a way to resolve this API and avoid the need for user + // to explicitly adjust the handle count. + // + // For (1) and (3), do nothing. Cachelib automatically handles handle + // counts internally. In particular, for (1), the current thread will + // have "zero" outstanding handle for this handle user is accessing, + // instead when the handle is destructed, we will "inc" the handle count + // and then "dec" the handle count to achieve a net-zero change in count. + // For (3), the for the "handle count" associated with the original + // item-handle that had been converted to SemiFuture, we have done the + // same as (1) which we will achieve a net-zero change at destruction. + // In addition, we will be bumping the handle count by 1, when SemiFuture + // is evaluated (via defer callback). This is because we have cloned + // an item handle to be passed to the SemiFuture. + void set(ReadHandleImpl hdl) override { + XDCHECK(!isReady()); + SCOPE_EXIT { hdl.release(); }; + + flags_ = hdl.getFlags(); + auto it = hdl.getInternal(); + it_.store(it, std::memory_order_release); + // Handles are fulfilled by threads different from the owners. Adjust + // the refcount tracking accordingly. use the local copy to not make + // this an atomic load check. + if (it) { + alloc_.adjustHandleCountForThread_private(-1); + } + { + std::lock_guard l(mtx_); + if (onReadyCallback_) { + // We will construct another handle that will be transferred to + // another thread. So we will decrement a count locally to be back + // to 0 on this thread. In the user thread, they must increment by + // 1. It is done automatically if the user converted their Handle + // to a SemiFuture via toSemiFuture(). + auto readHandle = hdl.clone(); + if (readHandle) { + alloc_.adjustHandleCountForThread_private(-1); + } + onReadyCallback_(std::move(readHandle)); + } + } + baton_.post(); + } + + // @return true iff we have the item + bool isReady() const noexcept { + return it_.load(std::memory_order_acquire) != + reinterpret_cast(kItemNotReady); + } + + // Set the onReady callback + // @param cb ReadyCallback + // + // @return callback function back if handle does not have waitContext_ + // or if waitContext_ is ready. Caller is + // expected to call this function + // empty function if waitContext exists and is not ready + // + ReadyCallback onReady(ReadyCallback&& callBack) { + std::lock_guard l(mtx_); + if (isReady()) { + return std::move(callBack); + } + onReadyCallback_ = std::move(callBack); + // callback consumed, return empty function + return ReadyCallback(); + } + + void releaseHandle() noexcept { + // After @wait, callback is invoked. We don't have to worry about mutex. 
+ wait(); + if (it_.exchange(nullptr, std::memory_order_release) != nullptr) { + alloc_.adjustHandleCountForThread_private(1); + } + } + + ~ItemWaitContext() override { + if (!isReady()) { + XDCHECK(false); + XLOG(CRITICAL, "Destorying an unresolved handle"); + return; + } + auto it = it_.load(std::memory_order_acquire); + if (it == nullptr) { + return; + } + + // If we have a wait context, we acquired the handle from another thread + // that asynchronously created the handle. Fix up the thread local + // refcount so that alloc_.release does not decrement it to negative. + alloc_.adjustHandleCountForThread_private(1); + try { + alloc_.release(it, /* isNascent */ false); + } catch (const std::exception& e) { + XLOGF(CRITICAL, "Failed to release {:#10x} : {}", + static_cast(it), e.what()); + } + } + + protected: + friend class ReadHandleImpl; + // Method used only by ReadHandleImpl ctor + void discard() { + it_.store(nullptr, std::memory_order_relaxed); + } + private: + // we are waiting on Item* to be set to a value. One of the valid values is + // nullptr. So choose something that we dont expect to indicate a ptr + // state that is not valid. + static constexpr uintptr_t kItemNotReady = 0x1221; + mutable folly::fibers::Baton baton_; //< baton to wait on for the handle to + // be "ready" + std::mutex mtx_; //< mutex to set and get onReadyCallback_ + ReadyCallback onReadyCallback_; //< callback invoked when "ready" + std::atomic it_{reinterpret_cast(kItemNotReady)}; //< The item + uint8_t flags_{}; //< flags associated with the handle generated by NvmCache + CacheT& alloc_; //< allocator instance + }; + + // Set the onReady callback which should be invoked once the item is ready. + // If the item is ready, the callback is returned back to the user for + // execution. + // + // If the callback is successfully enqueued, then within the callback, user + // must increment per-thread handle count by 1. + // cache->adjustHandleCountForThread_private(1); + // This is needed because cachelib had previously moved a handle from an + // internal thread to this callback, and cachelib internally removed a + // 1. It is done automatically if the user converted their Handle + // to a SemiFuture via toSemiFuture(). For more details, refer to comments + // around ItemWaitContext. + // + // @param callBack callback function + // + // @return an empty function if the callback was enqueued to be + // executed when the handle becomes ready. + // if the handle becomes/is ready, this returns the + // original callback back to the caller to execute. + // + FOLLY_NODISCARD ReadyCallback onReady(ReadyCallback&& callBack) { + return (waitContext_) ? waitContext_->onReady(std::move(callBack)) + : std::move(callBack); + } + + std::shared_ptr getItemWaitContext() const noexcept { + return waitContext_; + } + + // Internal book keeping to track handles that correspond to items that are + // not present in cache. This state is mutated, but does not affect the user + // visible meaning of the item handle(public API). Hence this is const. + // markNascent is set when we know the handle is constructed for an item that + // is not inserted into the cache yet. we unmark the nascent flag when we know + // the item was successfully inserted into the cache by the caller. 
+ void markNascent() const { + flags_ |= static_cast(HandleFlags::kNascent); + } + void unmarkNascent() const { + flags_ &= ~static_cast(HandleFlags::kNascent); + } + bool isNascent() const { + return flags_ & static_cast(HandleFlags::kNascent); + } + + void markExpired() { flags_ |= static_cast(HandleFlags::kExpired); } + void markWentToNvm() { + flags_ |= static_cast(HandleFlags::kWentToNvm); + } + + uint8_t getFlags() const { + return waitContext_ ? waitContext_->getFlags() : flags_; + } + void cloneFlags(const ReadHandleImpl& other) { flags_ = other.getFlags(); } + + Item* releaseItem() noexcept { return std::exchange(it_, nullptr); } + + // User of a handle can access cache via this accessor + CacheT& getCache() const { + XDCHECK(alloc_); + return *alloc_; + } + + // Handle which has the item already + FOLLY_ALWAYS_INLINE ReadHandleImpl(Item* it, CacheT& alloc) noexcept + : alloc_(&alloc), it_(it) { + if (it_ && it_->isIncomplete()) { + waitContext_ = std::make_shared(alloc); + if (!alloc_->addWaitContextForMovingItem(it->getKey(), waitContext_)) { + waitContext_->discard(); + waitContext_.reset(); + } + } + } + + // handle that has a wait context allocated. Used for async handles + // In this case, the it_ will be filled in asynchronously and mulitple + // Handles can wait on the one underlying handle + explicit ReadHandleImpl(CacheT& alloc) noexcept + : alloc_(&alloc), + it_(nullptr), + waitContext_(std::make_shared(alloc)) {} + + // Only CacheAllocator and NvmCache can create non-default constructed handles + friend CacheT; + friend typename CacheT::NvmCacheT; + + // Object-cache's c++ allocator will need to create a zero refcount handle in + // order to access CacheAllocator API. Search for this function for details. + template + friend HandleT* objcacheInitializeZeroRefcountHandle(void* handleStorage, + Item2* it, + Cache2& alloc); + + // A handle is marked as nascent when it was not yet inserted into the cache. + // However, user can override it by marking an item as "not nascent" even if + // it's not inserted into the cache. Unmarking it means a not-yet-inserted + // item will still be processed by RemoveCallback if user frees it. Today, + // the only user who can do this is Cachelib's ObjectCache API to ensure the + // correct RAII behavior for an object. + template + friend void objcacheUnmarkNascent(const HandleT& hdl); + + // Object-cache's c++ allocator needs to access CacheAllocator directly from + // an item handle in order to access CacheAllocator APIs. + template + friend typename HandleT::CacheT& objcacheGetCache(const HandleT& hdl); + + // instance of the cache this handle and item belong to. + CacheT* alloc_ = nullptr; + + // pointer to item when the item does not have a wait context associated. + Item* it_ = nullptr; + + // The waitContext allows an application to wait until the item is fetched. + // This provides a future kind of interfaces (see ItemWaitContext for + // details). 
+ std::shared_ptr waitContext_; + + mutable uint8_t flags_{}; + + // Only CacheAllocator and NvmCache can create non-default constructed handles + friend CacheT; + friend typename CacheT::NvmCacheT; + + // Following methods are only used in tests where we need to access private + // methods in ReadHandle + template + friend T1 createHandleWithWaitContextForTest(T2&); + template + friend std::shared_ptr getWaitContextForTest( + T1&); + FRIEND_TEST(ItemHandleTest, WaitContext_readycb); + FRIEND_TEST(ItemHandleTest, WaitContext_ready_immediate); + FRIEND_TEST(ItemHandleTest, onReadyWithNoWaitContext); +}; + +// WriteHandleImpl is a sub class of ReadHandleImpl to function as a mutable +// handle. User is able to obtain a mutable item from a "write handle". +template +struct WriteHandleImpl : public ReadHandleImpl { + using Item = T; + using CacheT = typename T::CacheT; + using ReadHandle = ReadHandleImpl; + using ReadHandle::ReadHandle; // inherit constructors + + // Accessors always return a non-const item. + FOLLY_ALWAYS_INLINE Item* operator->() const noexcept { + return ReadHandle::getInternal(); + } + FOLLY_ALWAYS_INLINE Item& operator*() const noexcept { + return *ReadHandle::getInternal(); + } + FOLLY_ALWAYS_INLINE Item* get() const noexcept { + return ReadHandle::getInternal(); + } + + // Clones write handle. returns an empty handle if it is null. + // @return WriteHandleImpl return a handle to this item + // @throw std::overflow_error is the maximum item refcount is execeeded by + // creating this item handle. + WriteHandleImpl clone() const { return WriteHandleImpl{ReadHandle::clone()}; } + + bool isWriteHandle() const { return true; } + + // Friends + friend ReadHandle; + // Only CacheAllocator and NvmCache can create non-default constructed handles + friend CacheT; + friend typename CacheT::NvmCacheT; + + // Object-cache's c++ allocator will need to create a zero refcount handle in + // order to access CacheAllocator API. Search for this function for details. + template + friend HandleT* objcacheInitializeZeroRefcountHandle(void* handleStorage, + Item2* it, + Cache2& alloc); + + // A handle is marked as nascent when it was not yet inserted into the cache. + // However, user can override it by marking an item as "not nascent" even if + // it's not inserted into the cache. Unmarking it means a not-yet-inserted + // item will still be processed by RemoveCallback if user frees it. Today, + // the only user who can do this is Cachelib's ObjectCache API to ensure the + // correct RAII behavior for an object. + template + friend void objcacheUnmarkNascent(const HandleT& hdl); + + // Object-cache's c++ allocator needs to access CacheAllocator directly from + // an item handle in order to access CacheAllocator APIs. 
+ template + friend typename HandleT::CacheT& objcacheGetCache(const HandleT& hdl); + + // Following methods are only used in tests where we need to access private + // methods in WriteHandle + template + friend T1 createHandleWithWaitContextForTest(T2&); + template + friend std::shared_ptr getWaitContextForTest( + T1&); + FRIEND_TEST(ItemHandleTest, WaitContext_readycb); + FRIEND_TEST(ItemHandleTest, WaitContext_ready_immediate); + FRIEND_TEST(ItemHandleTest, onReadyWithNoWaitContext); + + private: + explicit WriteHandleImpl(ReadHandle&& readHandle) + : ReadHandle(std::move(readHandle)) {} +}; + +template +std::ostream& operator<<(std::ostream& os, const ReadHandleImpl& it) { + if (it) { + os << it->toString(); + } else { + os << "nullptr"; + } + return os; +} +} // namespace detail +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/HitsPerSlabStrategy.cpp b/bdm/allocator/HitsPerSlabStrategy.cpp new file mode 100644 index 0000000000..20fd15bbd8 --- /dev/null +++ b/bdm/allocator/HitsPerSlabStrategy.cpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/HitsPerSlabStrategy.h" + +#include + +#include +#include + +#include "cachelib/allocator/Util.h" + +namespace facebook { +namespace cachelib { + +HitsPerSlabStrategy::HitsPerSlabStrategy(Config config) + : RebalanceStrategy(HitsPerSlab), config_(std::move(config)) {} + +// The list of allocation classes to be rebalanced is determined by: +// +// 0. Filter out classes that have below minSlabThreshold_ +// +// 1. Filter out classes that have just gained a slab recently +// +// 2. pick victim from the one that has poorest hitsPerSlab +ClassId HitsPerSlabStrategy::pickVictim(const Config& config, + const CacheBase& cache, + PoolId pid, + const PoolStats& stats) { + auto victims = stats.getClassIds(); + + // ignore allocation classes that have fewer than the threshold of slabs. + victims = + filterByNumEvictableSlabs(stats, std::move(victims), config.minSlabs); + + // ignore allocation classes that recently gained a slab. These will be + // growing in their eviction age and we want to let the evicitons stabilize + // before we consider them again. + victims = filterVictimsByHoldOff(pid, stats, std::move(victims)); + + // filter out alloc classes with less than the minimum tail age + if (config.minLruTailAge != 0) { + // we are only concerned about the eviction age and not the projected age. 
+ const auto poolEvictionAgeStats = + cache.getPoolEvictionAgeStats(pid, /* projectionLength */ 0); + victims = filterByMinTailAge(poolEvictionAgeStats, std::move(victims), + config.minLruTailAge); + } + + if (victims.empty()) { + return Slab::kInvalidClassId; + } + + const auto& poolState = getPoolState(pid); + auto victimClassId = pickVictimByFreeMem( + victims, stats, config.getFreeMemThreshold(), poolState); + + if (victimClassId != Slab::kInvalidClassId) { + return victimClassId; + } + + return *std::min_element( + victims.begin(), victims.end(), [&](ClassId a, ClassId b) { + double weight_a = + config.getWeight ? config.getWeight(pid, a, stats) : 1; + double weight_b = + config.getWeight ? config.getWeight(pid, b, stats) : 1; + return poolState.at(a).projectedDeltaHitsPerSlab(stats) * weight_a < + poolState.at(b).projectedDeltaHitsPerSlab(stats) * weight_b; + }); +} + +// The list of allocation classes to be receiver is determined by: +// +// 0. Filter out classes that have no evictions +// +// 1. Filter out classes that have no slabs +// +// 2. pick receiver from the one that has highest hitsPerSlab +ClassId HitsPerSlabStrategy::pickReceiver(const Config& config, + PoolId pid, + const PoolStats& stats, + ClassId victim) const { + auto receivers = stats.getClassIds(); + receivers.erase(victim); + + const auto& poolState = getPoolState(pid); + // filter out alloc classes that are not evicting + receivers = filterByNoEvictions(stats, std::move(receivers), poolState); + + // filter out receivers who currently dont have any slabs. Their delta hits + // do not make much sense. + receivers = filterByNumEvictableSlabs(stats, std::move(receivers), 0); + + if (receivers.empty()) { + return Slab::kInvalidClassId; + } + + return *std::max_element( + receivers.begin(), receivers.end(), [&](ClassId a, ClassId b) { + double weight_a = + config.getWeight ? config.getWeight(pid, a, stats) : 1; + double weight_b = + config.getWeight ? 
config.getWeight(pid, b, stats) : 1; + return poolState.at(a).deltaHitsPerSlab(stats) * weight_a < + poolState.at(b).deltaHitsPerSlab(stats) * weight_b; + }); +} + +RebalanceContext HitsPerSlabStrategy::pickVictimAndReceiverImpl( + const CacheBase& cache, PoolId pid) { + if (!cache.getPool(pid).allSlabsAllocated()) { + XLOGF(DBG, + "Pool Id: {}" + " does not have all its slabs allocated" + " and does not need rebalancing.", + static_cast(pid)); + return kNoOpContext; + } + + const auto poolStats = cache.getPoolStats(pid); + + const auto config = getConfigCopy(); + + RebalanceContext ctx; + ctx.victimClassId = pickVictim(config, cache, pid, poolStats); + ctx.receiverClassId = pickReceiver(config, pid, poolStats, ctx.victimClassId); + + if (ctx.victimClassId == ctx.receiverClassId || + ctx.victimClassId == Slab::kInvalidClassId || + ctx.receiverClassId == Slab::kInvalidClassId) { + return kNoOpContext; + } + + auto& poolState = getPoolState(pid); + double weightVictim = 1; + double weightReceiver = 1; + if (config.getWeight) { + weightReceiver = config.getWeight(pid, ctx.receiverClassId, poolStats); + weightVictim = config.getWeight(pid, ctx.victimClassId, poolStats); + } + const auto victimProjectedDeltaHitsPerSlab = + poolState.at(ctx.victimClassId).projectedDeltaHitsPerSlab(poolStats) * + weightVictim; + const auto receiverDeltaHitsPerSlab = + poolState.at(ctx.receiverClassId).deltaHitsPerSlab(poolStats) * + weightReceiver; + + XLOGF(DBG, + "Rebalancing: receiver = {}, receiver delta hits per slab = {}, victim " + "= {}, victim projected delta hits per slab = {}", + static_cast(ctx.receiverClassId), receiverDeltaHitsPerSlab, + static_cast(ctx.victimClassId), victimProjectedDeltaHitsPerSlab); + + const auto improvement = + receiverDeltaHitsPerSlab - victimProjectedDeltaHitsPerSlab; + if (receiverDeltaHitsPerSlab < victimProjectedDeltaHitsPerSlab || + improvement < config.minDiff || + improvement < config.diffRatio * static_cast( + victimProjectedDeltaHitsPerSlab)) { + XLOG(DBG, " Not enough to trigger slab rebalancing"); + return kNoOpContext; + } + + // start a hold off so that the receiver does not become a victim soon + // enough. + poolState.at(ctx.receiverClassId).startHoldOff(); + + // update all alloc classes' hits state to current hits so that next time we + // only look at the delta hits sicne the last rebalance. + for (const auto i : poolStats.getClassIds()) { + poolState[i].updateHits(poolStats); + } + + return ctx; +} + +ClassId HitsPerSlabStrategy::pickVictimImpl(const CacheBase& cache, + PoolId pid) { + const auto poolStats = cache.getPoolStats(pid); + const auto config = getConfigCopy(); + auto victimClassId = pickVictim(config, cache, pid, poolStats); + + auto& poolState = getPoolState(pid); + // update all alloc classes' hits state to current hits so that next time we + // only look at the delta hits sicne the last resize. + for (const auto i : poolStats.getClassIds()) { + poolState[i].updateHits(poolStats); + } + + return victimClassId; +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/HitsPerSlabStrategy.h b/bdm/allocator/HitsPerSlabStrategy.h new file mode 100644 index 0000000000..db863db296 --- /dev/null +++ b/bdm/allocator/HitsPerSlabStrategy.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/RebalanceStrategy.h" + +namespace facebook { +namespace cachelib { + +// Strategy that rebalances the slabs by moving slabs from the allocation class +// with the lowest hits per slab to the highest hits per slab within the pool. +class HitsPerSlabStrategy : public RebalanceStrategy { + public: + struct Config : public BaseConfig { + // Absolute difference to be rebalanced + unsigned int minDiff{100}; + + // Relative difference to be rebalanced + double diffRatio{0.1}; + + // minimum number of slabs to retain in every allocation class. + unsigned int minSlabs{1}; + + // use free memory if it amounts to more than this many slabs. + unsigned int numSlabsFreeMem{3}; + + // minimum tail age for an allocation class to be eligible to be a victim + unsigned int minLruTailAge{0}; + + // optionial weight function based on allocation class size + using WeightFn = std::function; + WeightFn getWeight = {}; + + // free memory threshold used to pick victim. + size_t getFreeMemThreshold() const noexcept { + return numSlabsFreeMem * Slab::kSize; + } + + Config() noexcept {} + Config(double ratio, unsigned int _minSlabs) noexcept + : Config(ratio, _minSlabs, 0) {} + Config(double ratio, + unsigned int _minSlabs, + unsigned int _minLruTailAge) noexcept + : diffRatio(ratio), + minSlabs(_minSlabs), + minLruTailAge(_minLruTailAge) {} + Config(double ratio, + unsigned int _minSlabs, + unsigned int _minLruTailAge, + const WeightFn& weightFunction) noexcept + : diffRatio(ratio), + minSlabs(_minSlabs), + minLruTailAge(_minLruTailAge), + getWeight(weightFunction) {} + }; + + // Update the config. This will not affect the current rebalancing, but + // will take effect in the next round + void updateConfig(const BaseConfig& baseConfig) override final { + std::lock_guard l(configLock_); + config_ = static_cast(baseConfig); + } + + explicit HitsPerSlabStrategy(Config config = {}); + + protected: + // This returns a copy of the current config. + // This ensures that we're always looking at the same config even though + // someone else may have updated the config during rebalancing + Config getConfigCopy() const { + std::lock_guard l(configLock_); + return config_; + } + + RebalanceContext pickVictimAndReceiverImpl(const CacheBase& cache, + PoolId pid) override final; + + ClassId pickVictimImpl(const CacheBase& cache, PoolId pid) override final; + + private: + static AllocInfo makeAllocInfo(PoolId pid, + ClassId cid, + const PoolStats& stats) { + return AllocInfo{pid, cid, stats.allocSizeForClass(cid)}; + } + + ClassId pickVictim(const Config& config, + const CacheBase& cache, + PoolId pid, + const PoolStats& stats); + + ClassId pickReceiver(const Config& config, + PoolId pid, + const PoolStats& stats, + ClassId victim) const; + + // Config for this strategy, this can be updated anytime. 
+ // Do not access this directly, always use `getConfig()` to + // obtain a copy first + Config config_; + mutable std::mutex configLock_; +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/ICompactCache.h b/bdm/allocator/ICompactCache.h new file mode 100644 index 0000000000..42e6ff21a5 --- /dev/null +++ b/bdm/allocator/ICompactCache.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/CacheStats.h" + +namespace facebook { +namespace cachelib { + +/** + * A virtual interface for compact cache + */ +class ICompactCache { + public: + ICompactCache() {} + virtual ~ICompactCache() {} + + // return the name of the compact cache. + virtual std::string getName() const = 0; + + // return the pool id for the compact cache + virtual PoolId getPoolId() const = 0; + + // get the size of the ccache's allocator (in bytes). Returns 0 if it is + // disabled. + virtual size_t getSize() const = 0; + + // get the config size of the compact cache + virtual size_t getConfiguredSize() const = 0; + + // get the stats about the compact cache + virtual CCacheStats getStats() const = 0; + + // resize the compact cache according to configured size + virtual void resize() = 0; +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/KAllocation.h b/bdm/allocator/KAllocation.h new file mode 100644 index 0000000000..533d298c13 --- /dev/null +++ b/bdm/allocator/KAllocation.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#include +#include +#pragma GCC diagnostic pop +#include + +#include "cachelib/common/BytesEqual.h" +#include "cachelib/common/CompilerUtils.h" + +namespace facebook { +namespace cachelib { + +/** + * Represents an allocation with a key. Provides a byte array interface that + * holds the key and a contiguous space of memory. + */ +class CACHELIB_PACKED_ATTR KAllocation { + public: + using KeyLenT = uint8_t; + // Maximum size of the key. 
static constexpr KeyLenT kKeyMaxLen = std::numeric_limits<KeyLenT>::max(); + // Maximum number of bits of the value (payload minus the key) + static constexpr uint32_t kMaxValSizeBits = + NumBits<uint32_t>::value - NumBits<KeyLenT>::value; + // Maximum size of the value (payload minus the key) + static constexpr uint32_t kMaxValSize = + (static_cast<uint32_t>(1) << kMaxValSizeBits) - 1; + + // type of the key for allocations. It is a folly::StringPiece aka + // Range<const char*> with a custom comparison operator that should do + // better than folly's compare. + class Key : public folly::StringPiece { + public: + using folly::StringPiece::StringPiece; + + /* implicit */ + Key(folly::StringPiece rhs) : folly::StringPiece(rhs) {} + + bool operator==(Key other) const { + return size() == other.size() && bytesEqual(data(), other.data(), size()); + } + + bool operator!=(Key other) const { return !(*this == other); } + }; + + // constructs the KAllocation instance by initializing the key and the + // appropriate fields. The key is copied into the allocation. + // + // total size of variable allocation in KAllocation: + // valSize + keySize + // + // @param key the key for the allocation + // @param valSize size of the value + // + // @throw std::invalid_argument if the key/size is invalid. + KAllocation(const Key key, uint32_t valSize) + : size_((static_cast<uint32_t>(key.size()) << kMaxValSizeBits) | + valSize) { + if (valSize > kMaxValSize) { + throw std::invalid_argument(folly::sformat( + "value size exceeded maximum allowed. total size: {}", valSize)); + } + + throwIfKeyInvalid(key); + + // Copy the key into the allocation + memcpy(&data_[0], key.start(), getKeySize()); + } + + KAllocation(const KAllocation&) = delete; + KAllocation& operator=(const KAllocation&) = delete; + + // returns the key corresponding to the allocation. + const Key getKey() const noexcept { + return Key{reinterpret_cast<const char*>(&data_[0]), getKeySize()}; + } + + // updates the current key with the new one. The key size must match. + void changeKey(Key key) { + if (key.size() != getKeySize()) { + throw std::invalid_argument("Key size mismatch"); + } + std::memcpy(&data_[0], key.start(), getKeySize()); + } + + // return a void* to the usable memory block. There are no alignment + // guarantees. + // TODO add support for alignment + void* getMemory() const noexcept { return &data_[getKeySize()]; } + + // get the size of the value. + uint32_t getSize() const noexcept { return size_ & kMaxValSize; } + + // Check if the key is valid. The length of the key needs to be in (0, + // kKeyMaxLen) to be valid + static bool isKeyValid(folly::StringPiece key) { + // StringPiece empty() does not really check for start being nullptr + return (key.size() <= kKeyMaxLen) && (!key.empty()) && (key.start()); + } + + // Throw readable exception if the key is invalid. + static void throwIfKeyInvalid(folly::StringPiece key) { + if (!isKeyValid(key)) { + // We need to construct the key for the error message manually here for + // two reasons + // + // 1) The StringPiece can start with a nullptr, and have a non-0 length, + // this in turn means that std::string's constructor will throw a + // std::logic_error. So we construct the string manually + // 2) The StringPiece might not be null terminated. So we construct the + // std::string manually with a pointer and size. Which mandates good + // internal representation + auto badKey = + (key.start()) ?
std::string(key.start(), key.size()) : std::string{}; + throw std::invalid_argument{ + folly::sformat("Invalid cache key : {}", folly::humanify(badKey))}; + } + } + + private: + // Top 8 bits are for key size (up to 255 bytes) + // Bottom 24 bits are for value size (up to 16777215 bytes) + const uint32_t size_; + + // beginning of the byte array. First keylen bytes correspond to the key and + // the next size - keylen_ bytes are usable. + mutable unsigned char data_[0]; + + uint32_t getKeySize() const noexcept { + return static_cast(size_ >> kMaxValSizeBits); + } +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/LruTailAgeStrategy.cpp b/bdm/allocator/LruTailAgeStrategy.cpp new file mode 100644 index 0000000000..182b0cc6a3 --- /dev/null +++ b/bdm/allocator/LruTailAgeStrategy.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/LruTailAgeStrategy.h" + +#include + +#include +#include + +#include "cachelib/allocator/Util.h" + +namespace facebook { +namespace cachelib { + +LruTailAgeStrategy::LruTailAgeStrategy(Config config) + : RebalanceStrategy(LruTailAge), config_(std::move(config)) {} + +uint64_t LruTailAgeStrategy::getOldestElementAge( + const PoolEvictionAgeStats& poolEvictionAgeStats, ClassId cid) const { + switch (config_.queueSelector) { + case Config::QueueSelector::kHot: + return poolEvictionAgeStats.classEvictionAgeStats.at(cid) + .hotQueueStat.oldestElementAge; + case Config::QueueSelector::kWarm: + return poolEvictionAgeStats.classEvictionAgeStats.at(cid) + .warmQueueStat.oldestElementAge; + case Config::QueueSelector::kCold: + return poolEvictionAgeStats.classEvictionAgeStats.at(cid) + .coldQueueStat.oldestElementAge; + default: + XDCHECK(false) << "queue selector is invalid"; + return 0; + } +} + +uint64_t LruTailAgeStrategy::getProjectedAge( + const PoolEvictionAgeStats& poolEvictionAgeStats, ClassId cid) const { + switch (config_.queueSelector) { + case Config::QueueSelector::kHot: + return poolEvictionAgeStats.classEvictionAgeStats.at(cid) + .hotQueueStat.projectedAge; + case Config::QueueSelector::kWarm: + return poolEvictionAgeStats.classEvictionAgeStats.at(cid) + .warmQueueStat.projectedAge; + case Config::QueueSelector::kCold: + return poolEvictionAgeStats.classEvictionAgeStats.at(cid) + .coldQueueStat.projectedAge; + default: + XDCHECK(false) << "queue selector is invalid"; + return 0; + } +} + +// The list of allocation classes to be rebalanced is determined by: +// +// 0. Filter out classes that have below minSlabThreshold_ +// +// 1. Filter out classes that have just gained a slab recently +// +// 2. Compute weighted tail age from all the remaining ACs +// +// 3. 
Pick an AC with the oldest tail age higher than the weighted average +ClassId LruTailAgeStrategy::pickVictim( + const Config& config, + PoolId pid, + const PoolStats& poolStats, + const PoolEvictionAgeStats& poolEvictionAgeStats) { + auto victims = poolStats.getClassIds(); + + // ignore allocation classes that have fewer than the threshold of slabs. + victims = + filterByNumEvictableSlabs(poolStats, std::move(victims), config.minSlabs); + + // ignore allocation classes that recently gained a slab. These will be + // growing in their eviction age and we want to let the evictions stabilize + // before we consider them again. + victims = filterVictimsByHoldOff(pid, poolStats, std::move(victims)); + + if (victims.empty()) { + XLOG(DBG, "Rebalancing: No victims available"); + return Slab::kInvalidClassId; + } + + auto victimClassId = pickVictimByFreeMem( + victims, poolStats, config.getFreeMemThreshold(), getPoolState(pid)); + + if (victimClassId != Slab::kInvalidClassId) { + return victimClassId; + } + + // the oldest projected age among the victims + return *std::max_element( + victims.begin(), victims.end(), [&](ClassId a, ClassId b) { + return ( + getProjectedAge(poolEvictionAgeStats, a) * + (config.getWeight ? config.getWeight(pid, a, poolStats) : 1.0) < + getProjectedAge(poolEvictionAgeStats, b) * + (config.getWeight ? config.getWeight(pid, b, poolStats) : 1.0)); + }); +} + +ClassId LruTailAgeStrategy::pickReceiver( + const Config& config, + PoolId pid, + const PoolStats& stats, + ClassId victim, + const PoolEvictionAgeStats& poolEvictionAgeStats) const { + auto receivers = stats.getClassIds(); + receivers.erase(victim); + + receivers = filterByNoEvictions(stats, receivers, getPoolState(pid)); + if (receivers.empty()) { + return Slab::kInvalidClassId; + } + + // the youngest age among the potential receivers + return *std::min_element( + receivers.begin(), receivers.end(), [&](ClassId a, ClassId b) { + return (getOldestElementAge(poolEvictionAgeStats, a) * + (config.getWeight ? config.getWeight(pid, a, stats) : 1.0) < + getOldestElementAge(poolEvictionAgeStats, b) * + (config.getWeight ?
config.getWeight(pid, b, stats) : 1.0)); + }); +} + +RebalanceContext LruTailAgeStrategy::pickVictimAndReceiverImpl( + const CacheBase& cache, PoolId pid) { + if (!cache.getPool(pid).allSlabsAllocated()) { + XLOGF(DBG, + "Pool Id: {}" + " does not have all its slabs allocated" + " and does not need rebalancing.", + static_cast<int>(pid)); + + return kNoOpContext; + } + + const auto config = getConfigCopy(); + + const auto poolStats = cache.getPoolStats(pid); + const auto poolEvictionAgeStats = + cache.getPoolEvictionAgeStats(pid, config.slabProjectionLength); + + RebalanceContext ctx; + ctx.victimClassId = pickVictim(config, pid, poolStats, poolEvictionAgeStats); + ctx.receiverClassId = pickReceiver(config, pid, poolStats, ctx.victimClassId, + poolEvictionAgeStats); + if (ctx.victimClassId == ctx.receiverClassId || + ctx.victimClassId == Slab::kInvalidClassId || + ctx.receiverClassId == Slab::kInvalidClassId) { + return kNoOpContext; + } + + if (!config.getWeight) { + const auto victimProjectedTailAge = + getProjectedAge(poolEvictionAgeStats, ctx.victimClassId); + const auto receiverTailAge = + getOldestElementAge(poolEvictionAgeStats, ctx.receiverClassId); + + XLOGF(DBG, "Rebalancing: receiver = {}, receiverTailAge = {}, victim = {}", + static_cast<int>(ctx.receiverClassId), receiverTailAge, + static_cast<int>(ctx.victimClassId)); + + const auto improvement = victimProjectedTailAge - receiverTailAge; + if (victimProjectedTailAge < receiverTailAge || + improvement < config.minTailAgeDifference || + improvement < config.tailAgeDifferenceRatio * + static_cast<double>(victimProjectedTailAge)) { + return kNoOpContext; + } + } + + // start a hold off so that the receiver does not become a victim soon + // enough. + getPoolState(pid).at(ctx.receiverClassId).startHoldOff(); + return ctx; +} + +ClassId LruTailAgeStrategy::pickVictimImpl(const CacheBase& cache, PoolId pid) { + const auto config = getConfigCopy(); + const auto poolEvictionAgeStats = + cache.getPoolEvictionAgeStats(pid, config.slabProjectionLength); + return pickVictim(config, pid, cache.getPoolStats(pid), poolEvictionAgeStats); +} +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/LruTailAgeStrategy.h b/bdm/allocator/LruTailAgeStrategy.h new file mode 100644 index 0000000000..2a90045ba4 --- /dev/null +++ b/bdm/allocator/LruTailAgeStrategy.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/RebalanceStrategy.h" + +namespace facebook { +namespace cachelib { + +// If an allocation class has its tail age higher than a threshold, +// we look at how much it is higher than the average tail age. If the +// difference is greater than the tail age difference ratio specified +// in the config, that allocation class will release a slab.
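The LruTailAgeStrategy code above selects the victim as the class with the oldest (weighted) projected tail age and the receiver as the class with the youngest (weighted) tail age. Below is a minimal standalone sketch of that selection with made-up class ages; it does not use the cachelib API, and the AgeStats struct and class ids are purely illustrative.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>

// Per-class ages; in the real strategy these come from PoolEvictionAgeStats,
// and the weight from the optional Config::getWeight function (default 1.0).
struct AgeStats {
  uint64_t projectedAge;
  uint64_t oldestElementAge;
  double weight = 1.0;
};

int main() {
  // Hypothetical pool with three allocation classes.
  std::map<int, AgeStats> classes{
      {0, {4000, 3500}}, {1, {1200, 900}}, {2, {600, 450}}};

  // Victim: largest weighted projected tail age, i.e. the class whose tail
  // items are the stalest and can give up a slab with the least impact.
  auto victim = std::max_element(
      classes.begin(), classes.end(), [](const auto& a, const auto& b) {
        return a.second.projectedAge * a.second.weight <
               b.second.projectedAge * b.second.weight;
      });

  // Receiver: smallest weighted oldest-element age, i.e. the class that is
  // currently evicting the youngest items and needs more memory.
  auto receiver = std::min_element(
      classes.begin(), classes.end(), [](const auto& a, const auto& b) {
        return a.second.oldestElementAge * a.second.weight <
               b.second.oldestElementAge * b.second.weight;
      });

  std::cout << "victim class " << victim->first << ", receiver class "
            << receiver->first << "\n";  // victim class 0, receiver class 2
}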
+class LruTailAgeStrategy : public RebalanceStrategy { + public: + struct Config : public BaseConfig { + // any LRU whose tail age surpasses the average tail age by this ratio is to + // be rebalanced + double tailAgeDifferenceRatio{0.25}; + + // minimum tail age difference between victim and receiver for a slab + // rebalance to happen + unsigned int minTailAgeDifference{100}; + + // minimum number of slabs to retain in every allocation class. + unsigned int minSlabs{1}; + + // use free memory if it amounts to more than this many slabs. + unsigned int numSlabsFreeMem{3}; + + // how many slabs worth of items do we project to determine a victim. + unsigned int slabProjectionLength{1}; + + // The weight function can be set as a config parameter. By default, the + // weight function is null, and no weighted tail age is computed. + // If the weight function is set, tailAgeDifferenceRatio and + // minTailAgeDifference are ignored + using WeightFn = std::function<double(PoolId, ClassId, const PoolStats&)>; + WeightFn getWeight = {}; + + // This lets us specify which queue's eviction age to use. + // Note not all eviction policies provide hot, warm, and cold queues. + // We leave it up to the policy to determine how to define hot, warm, cold + // eviction ages. For example, in LRU, we use the same eviction-age + // for all three stats. + enum class QueueSelector { kHot, kWarm, kCold }; + QueueSelector queueSelector{QueueSelector::kWarm}; + + // The free memory threshold to be used to pick victim class. + size_t getFreeMemThreshold() const noexcept { + return numSlabsFreeMem * Slab::kSize; + } + + Config() noexcept {} + Config(double ratio, unsigned int _minSlabs) noexcept + : tailAgeDifferenceRatio(ratio), minSlabs{_minSlabs} {} + Config(double ratio, + unsigned int _minSlabs, + const WeightFn& _getWeight) noexcept + : tailAgeDifferenceRatio(ratio), + minSlabs{_minSlabs}, + getWeight(_getWeight) {} + }; + + // Update the config. This will not affect the current rebalancing, but + // will take effect in the next round + void updateConfig(const BaseConfig& baseConfig) override { + std::lock_guard<std::mutex> l(configLock_); + config_ = static_cast<const Config&>(baseConfig); + } + + explicit LruTailAgeStrategy(Config config = {}); + + protected: + // This returns a copy of the current config. + // This ensures that we're always looking at the same config even though + // someone else may have updated the config during rebalancing + Config getConfigCopy() const { + std::lock_guard<std::mutex> l(configLock_); + return config_; + } + + RebalanceContext pickVictimAndReceiverImpl(const CacheBase& cache, + PoolId pid) override final; + + ClassId pickVictimImpl(const CacheBase& cache, PoolId pid) override final; + + private: + static AllocInfo makeAllocInfo(PoolId pid, + ClassId cid, + const PoolStats& stats) { + return AllocInfo{pid, cid, stats.allocSizeForClass(cid)}; + } + + ClassId pickVictim(const Config& config, + PoolId pid, + const PoolStats& stats, + const PoolEvictionAgeStats& poolEvictionAgeStats); + + ClassId pickReceiver(const Config& config, + PoolId pid, + const PoolStats& stats, + ClassId victim, + const PoolEvictionAgeStats& poolEvictionAgeStats) const; + + uint64_t getOldestElementAge(const PoolEvictionAgeStats& poolEvictionAgeStats, + ClassId cid) const; + + uint64_t getProjectedAge(const PoolEvictionAgeStats& poolEvictionAgeStats, + ClassId cid) const; + + // Config for this strategy, this can be updated anytime.
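Both strategies in this patch guard their Config behind a mutex and take a private copy at the start of each rebalancing round (updateConfig / getConfigCopy), so a concurrent config update cannot produce a torn or shifting view mid-round. The following is a minimal sketch of that pattern, assuming a simplified Config and no cachelib types; StrategyLike and runOneRound are hypothetical names.

#include <mutex>

struct Config {
  double tailAgeDifferenceRatio{0.25};
  unsigned int minTailAgeDifference{100};
};

class StrategyLike {
 public:
  void updateConfig(const Config& newConfig) {
    std::lock_guard<std::mutex> l(configLock_);
    config_ = newConfig;  // takes effect at the next round
  }

  void runOneRound() {
    // Copy once so the whole round sees one consistent set of thresholds.
    const Config config = getConfigCopy();
    // ... pick victim/receiver using `config` only ...
    (void)config;
  }

 private:
  Config getConfigCopy() const {
    std::lock_guard<std::mutex> l(configLock_);
    return config_;
  }

  Config config_;
  mutable std::mutex configLock_;
};

int main() {
  StrategyLike s;
  s.updateConfig(Config{0.3, 200});
  s.runOneRound();
}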
+ // Do not access this directly, always use `getConfig()` to + // obtain a copy first + Config config_; + mutable std::mutex configLock_; +}; +} // namespace cachelib +} // namespace facebook diff --git a/bdm/allocator/MM2Q-inl.h b/bdm/allocator/MM2Q-inl.h new file mode 100644 index 0000000000..0b0df33413 --- /dev/null +++ b/bdm/allocator/MM2Q-inl.h @@ -0,0 +1,477 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace facebook { +namespace cachelib { + +/* Container Interface Implementation */ +template <typename T, MM2Q::Hook<T> T::*HookPtr> +MM2Q::Container<T, HookPtr>::Container(const serialization::MM2QObject& object, + PtrCompressor compressor) + : lru_(*object.lrus(), compressor), + tailTrackingEnabled_(*object.tailTrackingEnabled()), + config_(*object.config()) { + lruRefreshTime_ = config_.lruRefreshTime; + nextReconfigureTime_ = config_.mmReconfigureIntervalSecs.count() == 0 + ? std::numeric_limits