diff --git a/.gitignore b/.gitignore index 01a3f4641..f2424a220 100644 --- a/.gitignore +++ b/.gitignore @@ -91,4 +91,16 @@ clang-format-diff.py .py3/ cmake-build-debug -cmake-build-release \ No newline at end of file +cmake-build-release + +*_example +inode_vptrs +.cache/ +.conda/ +*.txt +__pycache__/ +*.csv + +include/csv2/ +debug.* +*.log \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index bb394f9ea..3d1050205 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,6 +256,7 @@ endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") set(CMAKE_CXX_FLAGS "-mavx2 -lpmem ${CMAKE_CXX_FLAGS}") add_definitions(-DART) add_definitions(-DUSE_PMEM) +add_definitions(-DART_PLUS) option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) @@ -589,6 +590,13 @@ set(SOURCES db/art/art_node.cc db/art/compactor.cc db/art/global_memtable.cc + db/art/heat_buckets.cc + db/art/clf_model.cc + db/art/filter_cache_heap.cc + db/art/filter_cache_item.cc + db/art/filter_cache.cc + db/art/filter_cache_client.cc + db/art/greedy_algo.cc db/art/heat_group.cc db/art/lock.cc db/art/logger.cc diff --git a/TARGETS b/TARGETS index 0ea8e033e..f18fc69c9 100644 --- a/TARGETS +++ b/TARGETS @@ -447,6 +447,13 @@ cpp_library( "db/art/art_node.cc" "db/art/compactor.cc" "db/art/global_memtable.cc", + "db/art/heat_buckets.cc", + "db/art/clf_model.cc", + "db/art/filter_cache_heap.cc", + "db/art/filter_cache_item.cc", + "db/art/filter_cache.cc", + "db/art/filter_cache_client.cc", + "db/art/greedy_algo.cc", "db/art/heat_group.cc", "db/art/lock.cc", "db/art/logger.cc", diff --git a/YCSB/Makefile b/YCSB/Makefile index 1b2a65816..10ba2d280 100644 --- a/YCSB/Makefile +++ b/YCSB/Makefile @@ -10,13 +10,19 @@ #---------------------build config------------------------- DEBUG_BUILD ?= 0 -EXTRA_CXXFLAGS ?= -I../include -I../include/rocksdb -EXTRA_LDFLAGS ?= -L../ -lpmem -ldl -lz -lbz2 -lsnappy -llz4 -lzstd +EXTRA_CXXFLAGS ?= -I../include -I../include/rocksdb +EXTRA_LDFLAGS ?= -L../ -lpmem -ldl BIND_ROCKSDB ?= 1 BIND_LEVELDB ?= 0 BIND_LMDB ?= 0 +EXTRA_LDFLAGS += -lstdc++ +EXTRA_LDFLAGS += -lsocket++ +# EXTRA_LDFLAGS += -lpython3.12 +# EXTRA_CXXFLAGS += -I$(PYTHON_INCLUDE_PATH) +# EXTRA_CXXFLAGS += -L$(PYTHON_LIBRARY_PATH) + #---------------------------------------------------------- ifeq ($(DEBUG_BUILD), 1) diff --git a/YCSB/leveldb/leveldb_db.cc b/YCSB/leveldb/leveldb_db.cc index 27eca6c3d..b201a1467 100644 --- a/YCSB/leveldb/leveldb_db.cc +++ b/YCSB/leveldb/leveldb_db.cc @@ -151,16 +151,6 @@ void LeveldbDB::GetOptions(const utils::Properties &props, leveldb::Options *opt } } -void LeveldbDB::SerializeRow(const std::vector &values, std::string *data) { - for (const Field &field : values) { - uint32_t len = field.name.size(); - data->append(reinterpret_cast(&len), sizeof(uint32_t)); - data->append(field.name.data(), field.name.size()); - len = field.value.size(); - data->append(reinterpret_cast(&len), sizeof(uint32_t)); - data->append(field.value.data(), field.value.size()); - } -} void LeveldbDB::DeserializeRowFilter(std::vector *values, const std::string &data, const std::vector &fields) { diff --git a/YCSB/rocksdb/rocksdb.properties b/YCSB/rocksdb/rocksdb.properties index 2540c7d4a..d4b7f9774 100644 --- a/YCSB/rocksdb/rocksdb.properties +++ b/YCSB/rocksdb/rocksdb.properties @@ -1,5 +1,5 @@ -rocksdb.dbname=/tmp/tmp_data/db_nvm_l0 -rocksdb.nvm_path=/mnt/chen/nodememory +rocksdb.dbname=/mnt/walsm/tmp/tmp_data/db_test_art 
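+# note: dbname above and nvm_path below are machine-specific example mount points
+# (assumed PMEM layout); adjust them to the local filesystem before running YCSB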
+rocksdb.nvm_path=/mnt/walsm/node_memory rocksdb.format=single rocksdb.destroy=false @@ -23,5 +23,7 @@ rocksdb.cache_size=8388608 rocksdb.compressed_cache_size=0 rocksdb.bloom_bits=0 +# set total_threads to 32, see rocksdb_db.cc rocksdb.increase_parallelism=true -rocksdb.optimize_level_style_compaction=true +# rocksdb.optimize_level_style_compaction=true +rocksdb.optimize_universal_style_compaction=true diff --git a/YCSB/rocksdb/rocksdb_db.cc b/YCSB/rocksdb/rocksdb_db.cc index d7cd575d0..4f8dda0cd 100644 --- a/YCSB/rocksdb/rocksdb_db.cc +++ b/YCSB/rocksdb/rocksdb_db.cc @@ -18,6 +18,7 @@ #include #include #include +#include namespace { const std::string PROP_NAME = "rocksdb.dbname"; @@ -98,6 +99,9 @@ namespace { const std::string PROP_OPTIMIZE_LEVELCOMP = "rocksdb.optimize_level_style_compaction"; const std::string PROP_OPTIMIZE_LEVELCOMP_DEFAULT = "false"; + const std::string PROP_OPTIMIZE_UNIVERSALCOMP = "rocksdb.optimize_universal_style_compaction"; + const std::string PROP_OPTIMIZE_UNIVERSALCOMP_DEFAULT = "false"; + const std::string PROP_OPTIONS_FILE = "rocksdb.optionsfile"; const std::string PROP_OPTIONS_FILE_DEFAULT = ""; @@ -351,11 +355,14 @@ void RocksdbDB::GetOptions(const utils::Properties &props, rocksdb::Options *opt opt->table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options)); if (props.GetProperty(PROP_INCREASE_PARALLELISM, PROP_INCREASE_PARALLELISM_DEFAULT) == "true") { - opt->IncreaseParallelism(); + opt->IncreaseParallelism(32); } if (props.GetProperty(PROP_OPTIMIZE_LEVELCOMP, PROP_OPTIMIZE_LEVELCOMP_DEFAULT) == "true") { opt->OptimizeLevelStyleCompaction(); } + if (props.GetProperty(PROP_OPTIMIZE_UNIVERSALCOMP, PROP_OPTIMIZE_UNIVERSALCOMP_DEFAULT) == "true") { + opt->OptimizeUniversalStyleCompaction(); + } } } @@ -424,6 +431,12 @@ DB::Status RocksdbDB::ReadSingle(const std::string &table, const std::string &ke std::vector &result) { std::string data; rocksdb::Status s = db_->Get(rocksdb::ReadOptions(), key, &data); + #ifdef GEN_WORKLOAD + std::fstream f; + f.open("../workload/workload", std::ios::out | std::ios::app); + f << key < &values) { + /* std::string data; rocksdb::Status s = db_->Get(rocksdb::ReadOptions(), key, &data); if (s.IsNotFound()) { @@ -491,6 +505,9 @@ DB::Status RocksdbDB::UpdateSingle(const std::string &table, const std::string & throw utils::Exception(std::string("RocksDB Put: ") + s.ToString()); } return kOK; + */ + // use insert, not read-modify-write + return InsertSingle(table, key, values); } DB::Status RocksdbDB::MergeSingle(const std::string &table, const std::string &key, diff --git a/YCSB/workloads/workloadt b/YCSB/workloads/workloadt new file mode 100644 index 000000000..a69512474 --- /dev/null +++ b/YCSB/workloads/workloadt @@ -0,0 +1,16 @@ +# Yahoo! 
Cloud System Benchmark +# Workload T: For Debug + + +recordcount=5000000 +operationcount=2200000 +workload=com.yahoo.ycsb.workloads.CoreWorkload + +readallfields=true + +readproportion=1 +updateproportion=0 +scanproportion=0 +insertproportion=0 + +requestdistribution=zipfian \ No newline at end of file diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 1347693e3..a5a8e6d5a 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -45,6 +45,13 @@ if test -z "$OUTPUT"; then exit 1 fi +ROCKSDB_DISABLE_SNAPPY=1 +ROCKSDB_DISABLE_ZLIB=1 +ROCKSDB_DISABLE_BZIP=1 +ROCKSDB_DISABLE_LZ4=1 +ROCKSDB_DISABLE_ZSTD=1 +ROCKSDB_DISABLE_GFLAGS=1 + # we depend on C++11 PLATFORM_CXXFLAGS="-std=c++11" # we currently depend on POSIX platform @@ -272,7 +279,11 @@ JAVA_LDFLAGS="$PLATFORM_LDFLAGS" JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS" JAVAC_ARGS="-source 7" -COMMON_FLAGS="$COMMON_FLAGS -DUSE_PMEM -DART" +COMMON_FLAGS="$COMMON_FLAGS -DUSE_PMEM -DART -DART_PLUS" +COMMON_FLAGS="$COMMON_FLAGS -lstdc++ -lsocket++" +# COMMON_FLAGS="$COMMON_FLAGS -lstdc++ -lpython3.12" +# COMMON_FLAGS="$COMMON_FLAGS -I$PYTHON_INCLUDE_PATH" +# COMMON_FLAGS="$COMMON_FLAGS -L$PYTHON_LIBRARY_PATH" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpmem" JAVA_LDFLAGS="$JAVA_LDFLAGS -lpmem" diff --git a/db/art/clf_model.cc b/db/art/clf_model.cc new file mode 100644 index 000000000..2c8224979 --- /dev/null +++ b/db/art/clf_model.cc @@ -0,0 +1,232 @@ +#include "clf_model.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { + +uint16_t ClfModel::feature_num_; +std::string ClfModel::dataset_name_; +std::string ClfModel::dataset_path_; +std::string ClfModel::host_, ClfModel::port_; +size_t ClfModel::buffer_size_; + +void ClfModel::write_debug_dataset() { + assert(feature_num_ > 0); + // ready for writer + std::ofstream stream(dataset_path_); + csv2::Writer> writer(stream); + + // init hotness values + std::map hotness_map; + double base_hotness = 0.01; + for (int i = 0; i < 200; i ++) { + float r = static_cast (rand()) / static_cast (RAND_MAX) + base_hotness; + hotness_map[i] = r; + } + + // init header vector + std::vector> rows; + std::vector header; + header.emplace_back("Level"); + for (int i = 0; i < 20; i ++) { + header.emplace_back("Rate_" + std::to_string(i)); + header.emplace_back("Hotness_" + std::to_string(i)); + } + header.emplace_back("Target"); + header.emplace_back("Count"); + rows.emplace_back(header); + + // ready for shuffling + std::vector ids; + for(int i = 0; i < 200; i ++) { + ids.emplace_back(i); + } + + // generate values + for (int i = 0; i < 1000; i ++) { + // std::vector value; + std::vector values; + uint32_t level = i / 200; + uint16_t target = 5 - level; + uint32_t count = (1000 - i) * 100; + float r = static_cast (rand()) / static_cast (RAND_MAX); + if (r > 0.10 * level) { + target -= 1; + count -= 100; + } + + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::shuffle(ids.begin(), ids.end(), std::default_random_engine(seed)); + values.emplace_back(std::to_string(level)); + for (int j = 0; j < 20; j ++) { + values.emplace_back(std::to_string(uint32_t(ids[j] * 0.005 * RATE_SIGNIFICANT_DIGITS_FACTOR))); + values.emplace_back(std::to_string(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * hotness_map[ids[j]]))); + } + values.emplace_back(std::to_string(target)); // Target column + values.emplace_back(std::to_string(count)); // Count column + assert(values.size() == feature_num_ 
+ 2); + rows.emplace_back(values); + } + + writer.write_rows(rows); + stream.close(); +} + +void ClfModel::write_real_dataset(std::vector>& datas, std::vector& tags, std::vector& get_cnts) { + assert(feature_num_ > 0); + // tags is real class of all segments, + // we also need to write these tags to dataset besides features + assert(datas.size()==tags.size()); + assert(datas.size()==get_cnts.size()); + // ready for writer + std::ofstream stream(dataset_path_); + csv2::Writer> writer(stream); + + // init csv header vector + std::vector> rows; + std::vector header; + uint16_t ranges_num = (feature_num_ - 1) / 2; + header.emplace_back("Level"); + for (int i = 0; i < ranges_num; i ++) { + header.emplace_back("Rate_" + std::to_string(i)); + header.emplace_back("Hotness_" + std::to_string(i)); + } + // remind that targeted class is in csv Target column + // corresponding to code of lgb.py in ../models dir + header.emplace_back("Target"); + header.emplace_back("Count"); + rows.emplace_back(header); + + std::vector values; + size_t idx = 0; + for (std::vector& data : datas) { + // resize features vector to size feature_num_ + prepare_data(data); + values.clear(); + for (uint32_t& value : data) { + values.emplace_back(std::to_string(value)); + } + // remember to write real tag and get cnt to dataset + values.emplace_back(std::to_string(tags[idx])); + values.emplace_back(std::to_string(get_cnts[idx++])); + assert(values.size() == feature_num_ + 2); + rows.emplace_back(values); + } + + writer.write_rows(rows); + stream.close(); +} + +void ClfModel::write_dataset(std::vector>& datas, std::vector& tags, std::vector& get_cnts) { + assert(feature_num_ > 0); + if (datas.empty()) { + write_debug_dataset(); + // dataset_cnt_ += 1; + return; + } + + assert(feature_num_ % 2 != 0); // features num: 2r + 1 + + write_real_dataset(datas, tags, get_cnts); + // dataset_cnt_ += 1; + return; +} + +void ClfModel::make_train(std::vector>& datas, std::vector& tags, std::vector& get_cnts) { + assert(feature_num_ > 0); + write_dataset(datas, tags, get_cnts); + + // already write dataset + // send msg to LightGBM server, let server read dataset and train new model + libsocket::inet_stream sock(host_, port_, LIBSOCKET_IPv4); + std::string message = TRAIN_PREFIX + dataset_name_; + // already write dataset, send dataset path to server + // should not receive any message from server + std::string recv_buffer; + recv_buffer.resize(buffer_size_); + sock << message; + sock >> recv_buffer; // wait for training end + // will destroy sock when leaving this func scope +} + +void ClfModel::make_predict_samples(std::vector>& datas) { + assert(feature_num_ > 0); + csv2::Reader, + csv2::quote_character<'"'>, + csv2::first_row_is_header, + csv2::trim_policy::trim_whitespace> csv; + std::vector data; + if (csv.mmap(dataset_path_)) { + // const auto header = csv.header(); + int cnt = 0; + for (auto row : csv) { + // only choose first 10 samples + if ((++cnt) > 10) { + break; + } + data.clear(); + for (auto cell : row) { + std::string value; + cell.read_value(value); + data.emplace_back(stoul(value)); + } + // remind that csv reader will read a empty row in the end, that is why !data.empty() + // csv file last two column is real tag and get cnt + // we need to pop out last column + if (!data.empty()) { + data.pop_back(); // pop out get cnt + data.pop_back(); // pop out real tag + } + assert(data.size() == feature_num_); + datas.emplace_back(data); + } + } +} + +void ClfModel::make_real_predict(std::vector>& datas, std::vector& preds) { + 
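+    // Wire format sketch (inferred from the code below; the concrete values are made up):
+    //   request : PREDICT_PREFIX + "<level> <rate_0> <hotness_0> <rate_1> <hotness_1> ..."
+    //             e.g. with a hypothetical prefix "pred": "pred 3 5000 1234 3000 987"
+    //   response: a single integer units num in [MIN_UNITS_NUM, MAX_UNITS_NUM], e.g. "4"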
assert(preds.empty());
+    libsocket::inet_stream sock(host_, port_, LIBSOCKET_IPv4);
+    std::string message, recv_buffer;
+    for (std::vector& data : datas) {
+        if (!data.empty()) {
+            prepare_data(data);
+            message.clear();
+            recv_buffer.clear();
+            recv_buffer.resize(buffer_size_);
+            message = std::to_string(data[0]);
+            for (size_t i = 1; i < data.size(); i ++) {
+                message = message + " " + std::to_string(data[i]);
+            }
+            message = PREDICT_PREFIX + message;
+            assert(message.size() <= buffer_size_);
+            sock << message;
+            // only receive the predicted tag integer
+            sock >> recv_buffer;
+            uint16_t pred = std::stoul(recv_buffer);
+            assert(pred >= MIN_UNITS_NUM && pred <= MAX_UNITS_NUM);
+            preds.emplace_back(pred);
+        }
+    }
+    // only write pred results to vector preds, and return nothing
+    assert(datas.size() == preds.size());
+}
+
+void ClfModel::make_predict(std::vector>& datas, std::vector& preds) {
+    preds.clear();
+
+    // empty datas means we are debugging class ClfModel
+    if (datas.empty()) {
+        make_predict_samples(datas);
+    }
+    // only write pred results to vector preds, and return nothing
+    make_real_predict(datas, preds);
+    return;
+}
+
+}
\ No newline at end of file
diff --git a/db/art/clf_model.h b/db/art/clf_model.h
new file mode 100644
index 000000000..1aaf3ea21
--- /dev/null
+++ b/db/art/clf_model.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "macros.h"
+
+// dataset data point format:
+// every data point accounts for one segment
+// suppose we consider r key ranges
+// every key range has an id and a hotness value (see heat_buckets)
+// so the data point feature format is:
+// LSM-Tree level, Key Range 1 rate, Key Range 1 hotness, Key Range 2 rate, Key Range 2 hotness, ..., Key Range r rate, Key Range r hotness
+// we also need to append the best units num (from solving the programming problem) and the visit count to every row
+// so in the data csv, one row looks like:
+// LSM-Tree level, key range 1 rate, key range 1 hotness, ..., best units num (for the segment), visit count of this segment (in the last long period)
+// for example, assume that segment 1 can be divided into key range 3 (50% keys), key range 4 (30% keys), key range 6 (20% keys)
+// sort these key ranges by their key rate (e.g.
key range 3 rate = 50%) and set feature num to 7 +// data format can be like: +// 5 (segment 1 level in LSM-Tree), 50% (key range 3 rate), 5234 (key range 3 hotness), 30% (key range 4 rate), 2222 (key range 4 hotness), 20% (key range 6 rate), 11111 (key range 6 hotness) +// remind that heat_buckets recorded hotness is double type, +// because data feature only accept uint32_t type, +// we use uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * hotness) to closely estimate its hotness value +// also use uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * rate) to closely estimate its rate in this segment + +namespace ROCKSDB_NAMESPACE { + +struct RangeRatePair; +class ClfModel; + +bool RangeRatePairLessorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2); +bool RangeRatePairGreatorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2); + +struct RangeRatePair { + uint32_t range_id; + double rate_in_segment; + RangeRatePair(const uint32_t& id, const double& rate) { + range_id = id; rate_in_segment = rate; + } +}; + +inline bool RangeRatePairLessorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2) { + return pair_1.rate_in_segment < pair_2.rate_in_segment; +} + +inline bool RangeRatePairGreatorComparor(const RangeRatePair& pair_1, const RangeRatePair& pair_2) { + return pair_1.rate_in_segment > pair_2.rate_in_segment; +} + +class ClfModel { +private: + static uint16_t feature_num_; // model input features num + static std::string dataset_name_; // dataset csv file name + static std::string dataset_path_; // path to save dataset csv file + static std::string host_, port_; // lightgbm server connection + static size_t buffer_size_; // socket receive buffer max size +public: + // init member vars + ClfModel() { + feature_num_ = 0; + dataset_name_ = DATASET_NAME; + // MODEL_PATH must end with '/' + dataset_path_ = MODEL_PATH; + dataset_path_ = dataset_path_ + DATASET_NAME; + host_ = HOST; port_ = PORT; + buffer_size_ = BUFFER_SIZE; + } + + // check whether ready, only need check feature_nums_ now + bool is_ready() { return feature_num_ > 0; } + + // make ready for training, only need init feature_nums_ now + // when first call ClfModel, we need to use current segments information to init features_num_ + // we can calcuate feature nums for every segment, + // feature num = level feature num (1) + 2 * num of key ranges + // we set features_num_ to largest feature num + void make_ready(std::vector& features_nums) { + if (features_nums.empty()) { + feature_num_ = MAX_FEATURES_NUM; // debug feature num, see ../lgb_server files + } else { + // we may limit feature_num_ because of the socket transmit size limit is 1024 bytes + // so feature_num_ may be limit to at most about 3 * 30 + 1 = 91 + feature_num_ = *max_element(features_nums.begin(), features_nums.end()); + if (feature_num_ > MAX_FEATURES_NUM) { + feature_num_ = MAX_FEATURES_NUM; + } + } + + // std::cout << "[DEBUG] ClfModel ready, feature_num_: " << feature_num_ << std::endl; + } + + ~ClfModel() { + return; // do nothing + } + + // resize data point features + void prepare_data(std::vector& data) { + // at least level feature and one key range, so data size always >= 3 + assert(data.size() >= 3); + data.resize(feature_num_, 0); + } + + // resize every data point and write to csv file for training + void write_debug_dataset(); + void write_real_dataset(std::vector>& datas, std::vector& tags, std::vector& get_cnts); + void write_dataset(std::vector>& datas, std::vector& tags, std::vector& get_cnts); + + // write 
dataset then send msg to train new model in LightGBM server side + void make_train(std::vector>& datas, std::vector& tags, std::vector& get_cnts); + + // predict + void make_predict_samples(std::vector>& datas); + void make_real_predict(std::vector>& datas, std::vector& preds); + void make_predict(std::vector>& datas, std::vector& preds); +}; + +} \ No newline at end of file diff --git a/db/art/filter_cache.cc b/db/art/filter_cache.cc new file mode 100644 index 000000000..8c0f99c23 --- /dev/null +++ b/db/art/filter_cache.cc @@ -0,0 +1,811 @@ +#include "filter_cache.h" +#include + +namespace ROCKSDB_NAMESPACE { + +FilterCache FilterCacheManager::filter_cache_; +HeatBuckets FilterCacheManager::heat_buckets_; +ClfModel FilterCacheManager::clf_model_; +GreedyAlgo FilterCacheManager::greedy_algo_; +FilterCacheHeapManager FilterCacheManager::heap_manager_; +uint32_t FilterCacheManager::get_cnt_; +uint32_t FilterCacheManager::period_cnt_; +uint32_t FilterCacheManager::last_long_period_; +uint32_t FilterCacheManager::last_short_period_; +std::mutex FilterCacheManager::update_mutex_; +bool FilterCacheManager::train_signal_; +std::map FilterCacheManager::last_count_recorder_; +std::map FilterCacheManager::current_count_recorder_; +std::mutex FilterCacheManager::count_mutex_; +bool FilterCacheManager::is_ready_; + +bool FilterCache::check_key(const uint32_t& segment_id, const std::string& key) { + auto it = filter_cache_.find(segment_id); + if (it == filter_cache_.end()) { + // not in cache, that means we havent insert segment FilterCacheItem info into cache + // actually, we start inserting after every segment becomes available + return true; + } else { + return (it->second).check_key(key); + } +} + +void FilterCache::enable_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced, + std::set& level_0_segment_ids, std::set& failed_segment_ids) { + failed_segment_ids.clear(); + filter_cache_mutex_.lock(); + for (auto it = segment_units_num_recorder.begin(); it != segment_units_num_recorder.end(); it ++) { + const uint32_t segment_id = it->first; + const uint16_t units_num = it->second; + auto cache_it = filter_cache_.find(segment_id); + bool is_level_0 = level_0_segment_ids.count(segment_id); + if (cache_it != filter_cache_.end()) { + // filter units cached + if (is_forced || is_level_0 || !is_full()) { + const uint32_t old_size = (cache_it->second).approximate_size(); + (cache_it->second).enable_units(units_num); + used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); + if (is_level_0) { + level_0_used_space_size_ = level_0_used_space_size_ - old_size + (cache_it->second).approximate_size(); + } + } else { + failed_segment_ids.insert(segment_id); + } + } else { + // filter units not cached + // now cache it + if (is_forced || is_level_0 || !is_full()) { + FilterCacheItem cache_item(units_num); + filter_cache_.insert(std::make_pair(segment_id, cache_item)); + used_space_size_ = used_space_size_ + cache_item.approximate_size(); + if (is_level_0) { + level_0_used_space_size_ = level_0_used_space_size_ + cache_item.approximate_size(); + } + } else { + failed_segment_ids.insert(segment_id); + } + } + } + filter_cache_mutex_.unlock(); +} + +void FilterCache::update_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced, + std::set& level_0_segment_ids, std::set& failed_segment_ids) { + filter_cache_mutex_.lock(); + for (auto it = segment_units_num_recorder.begin(); it != segment_units_num_recorder.end(); it ++) { + 
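+        // each entry maps a segment id to its target number of enabled filter units;
+        // unlike enable_for_segments, segments that are not present in the cache are
+        // skipped instead of being inserted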
const uint32_t segment_id = it->first; + const uint16_t units_num = it->second; + auto cache_it = filter_cache_.find(segment_id); + bool is_level_0 = level_0_segment_ids.count(segment_id); + if (cache_it != filter_cache_.end()) { + // filter units cached + if (is_forced || is_level_0 || !is_full()) { + const uint32_t old_size = (cache_it->second).approximate_size(); + (cache_it->second).enable_units(units_num); + used_space_size_ = used_space_size_ - old_size + (cache_it->second).approximate_size(); + if (is_level_0) { + level_0_used_space_size_ = level_0_used_space_size_ - old_size + (cache_it->second).approximate_size(); + } + } else { + failed_segment_ids.insert(segment_id); + } + } else { + // filter units not cached + // do nothing!!! + } + } + filter_cache_mutex_.unlock(); +} + +bool FilterCache::is_full() { + return double(used_space_size_) / double(cache_size_) >= FULL_RATE; +} + +bool FilterCache::is_ready() { + return double(used_space_size_) / double(cache_size_) >= READY_RATE; +} + +void FilterCache::release_for_segments(std::vector& segment_ids, std::set& level_0_segment_ids) { + std::sort(segment_ids.begin(), segment_ids.end()); + // delete key-value pair in filter_cache_ + filter_cache_mutex_.lock(); + auto it = filter_cache_.begin(); + size_t idx = 0; + while (it != filter_cache_.end() && idx < segment_ids.size()) { + if (it->first < segment_ids[idx]) { + it ++; + } else if (it->first > segment_ids[idx]) { + idx ++; + } else { + used_space_size_ = used_space_size_ - (it->second).approximate_size(); + if (level_0_segment_ids.count(it->first)) { + level_0_used_space_size_ = level_0_used_space_size_ - (it->second).approximate_size(); + } + it = filter_cache_.erase(it); + } + } + filter_cache_mutex_.unlock(); +} + +bool FilterCacheManager::make_heat_buckets_ready(const std::string& key, + std::unordered_map>& segment_info_recorder) { + // heat_buckets not ready, still sample into pool + if (!heat_buckets_.is_ready()) { + std::vector> segments_infos; + for (auto it = segment_info_recorder.begin(); it != segment_info_recorder.end(); it ++) { + assert((it->second).size() == 2); + segments_infos.emplace_back(it->second); + } + // segments_infos can be empty, then use default number of buckets + heat_buckets_.sample(key, segments_infos); + } + return heat_buckets_.is_ready(); +} + +void FilterCacheManager::hit_heat_buckets(const std::string& key) { + if (heat_buckets_.is_ready()) { + get_cnt_ += 1; + if (get_cnt_ >= PERIOD_COUNT) { + heat_buckets_.hit(key, true); + get_cnt_ = 0; + period_cnt_ += 1; + } else { + heat_buckets_.hit(key, false); + } + } + if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { + update_mutex_.lock(); + + if (period_cnt_ - last_long_period_ >= TRAIN_PERIODS) { + last_long_period_ = period_cnt_; + update_count_recorder(); + std::map estimate_count_recorder; + estimate_counts_for_all(estimate_count_recorder); + heap_manager_.sync_visit_cnt(estimate_count_recorder); + train_signal_ = true; + } + + update_mutex_.unlock(); + } + if (period_cnt_ - last_short_period_ >= 1) { + update_mutex_.lock(); + + if (period_cnt_ - last_short_period_ >= 1) { + last_short_period_ = period_cnt_; + std::map estimate_count_recorder; + estimate_counts_for_all(estimate_count_recorder); + heap_manager_.sync_visit_cnt(estimate_count_recorder); + } + + update_mutex_.unlock(); + } +} + +bool FilterCacheManager::make_clf_model_ready(std::vector& features_nums) { + clf_model_.make_ready(features_nums); + return clf_model_.is_ready(); +} + +bool FilterCacheManager::check_key(const 
uint32_t& segment_id, const std::string& key) { + // move hit_count_recorder to a background thread + // hit_count_recorder(segment_id); // one get opt will cause query to many segments. + // so one get opt only call one hit_heat_buckets, but call many hit_count_recorder + return filter_cache_.check_key(segment_id, key); +} + +void FilterCacheManager::hit_count_recorder(const uint32_t& segment_id) { + count_mutex_.lock(); + + auto it = current_count_recorder_.find(segment_id); + if (it == current_count_recorder_.end()) { + // segment havent been visited, need to insert count + current_count_recorder_.insert(std::make_pair(segment_id, 1)); + } else { + // segment have been visited, only update count + it->second = it->second + 1; + } + + count_mutex_.unlock(); +} + +void FilterCacheManager::update_count_recorder() { + count_mutex_.lock(); + + last_count_recorder_.clear(); + last_count_recorder_.insert(current_count_recorder_.begin(), current_count_recorder_.end()); + for (auto it = current_count_recorder_.begin(); it != current_count_recorder_.end(); it++) { + it->second = 0; + } + + count_mutex_.unlock(); +} + +void FilterCacheManager::inherit_count_recorder(std::vector& merged_segment_ids, std::vector& new_segment_ids, const uint32_t& level_0_base_count, + std::map>& inherit_infos_recorder) { + count_mutex_.lock(); + + std::map merged_last_count_recorder, merged_current_count_recorder; // cache merged segment count temporarily + for (uint32_t& merged_segment_id : merged_segment_ids) { + merged_last_count_recorder.insert(std::make_pair(merged_segment_id, last_count_recorder_[merged_segment_id])); + last_count_recorder_.erase(merged_segment_id); + merged_current_count_recorder.insert(std::make_pair(merged_segment_id, current_count_recorder_[merged_segment_id])); + current_count_recorder_.erase(merged_segment_id); + } + + std::map new_last_count_recorder, new_current_count_recorder; + for (auto infos_it = inherit_infos_recorder.begin(); infos_it != inherit_infos_recorder.end(); infos_it ++) { + double last_count = 0, current_count = 0; + std::unordered_map& info = infos_it->second; + for (auto info_it = info.begin(); info_it != info.end(); info_it ++) { + last_count = last_count + INHERIT_REMAIN_FACTOR * (merged_last_count_recorder[info_it->first] * info_it->second); + current_count = current_count + INHERIT_REMAIN_FACTOR * (merged_current_count_recorder[info_it->first] * info_it->second); + } + new_last_count_recorder.insert(std::make_pair(infos_it->first, uint32_t(last_count))); + new_current_count_recorder.insert(std::make_pair(infos_it->first, uint32_t(current_count))); + } + + for (uint32_t& new_segment_id : new_segment_ids) { + auto last_it = last_count_recorder_.find(new_segment_id); + uint32_t new_last_count = level_0_base_count; // level 0 segments init + if (new_last_count_recorder.count(new_segment_id) > 0) { + new_last_count = new_last_count_recorder[new_segment_id]; + } + if (last_it != last_count_recorder_.end()) { + last_it->second = last_it->second + new_last_count; + } else { + last_count_recorder_.insert(std::make_pair(new_segment_id, new_last_count)); + } + + auto current_it = current_count_recorder_.find(new_segment_id); + uint32_t new_current_count = level_0_base_count; // level 0 segments init + if (new_current_count_recorder.count(new_segment_id) > 0) { + new_current_count = new_current_count_recorder[new_segment_id]; + } + if (current_it != current_count_recorder_.end()) { + current_it->second = current_it->second + new_current_count; + } else { + 
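+            // segment not tracked in the current long period yet, so start it from the inherited count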
current_count_recorder_.insert(std::make_pair(new_segment_id, new_current_count)); + } + } + + count_mutex_.unlock(); +} + +void FilterCacheManager::estimate_counts_for_all(std::map& approximate_counts_recorder) { + const uint32_t long_period_total_count = TRAIN_PERIODS * PERIOD_COUNT; + uint32_t current_long_period_count = PERIOD_COUNT * (period_cnt_ % TRAIN_PERIODS) + get_cnt_; + double current_long_period_rate = std::min(double(current_long_period_count) / double(long_period_total_count), 1.0); + + approximate_counts_recorder.clear(); + approximate_counts_recorder.insert(current_count_recorder_.begin(), current_count_recorder_.end()); + auto approx_it = approximate_counts_recorder.begin(); + auto last_it = last_count_recorder_.begin(); + while (approx_it != approximate_counts_recorder.end() && last_it != last_count_recorder_.end()) { + if (approx_it->first > last_it->first) { + last_it ++; + } else if(approx_it->first < last_it->first) { + approx_it ++; + } else { + approx_it->second = approx_it->second + uint32_t((1 - current_long_period_rate) * last_it->second); + approx_it ++; + } + } + + // return nothing, already write result to approximate_counts_recorder +} + + +void FilterCacheManager::try_retrain_model(std::map& level_recorder, + std::map>& segment_ranges_recorder, + std::map& unit_size_recorder) { + // we should guarantee these 3 external recorder share the same keys set + // we need to do this job outside FilterCacheManager + assert(level_recorder.size() == segment_ranges_recorder.size()); + // assert(level_recorder.size() == unit_size_recorder.size()); + if (train_signal_ == false) { + return; + } + + // solve programming problem + std::map label_recorder; + std::map algo_infos; + /* + auto get_cnt_it = last_count_recorder_.begin(); + auto unit_size_it = unit_size_recorder.begin(); + while (unit_size_it != unit_size_recorder.end() && get_cnt_it != last_count_recorder_.end()) { + if (unit_size_it->first > get_cnt_it->first) { + get_cnt_it ++; + } else if (unit_size_it->first < get_cnt_it->first) { + unit_size_it ++; + } else { + algo_infos.insert(std::make_pair(unit_size_it->first, SegmentAlgoInfo(get_cnt_it->second, unit_size_it->second))); + unit_size_it ++; + } + } + greedy_algo_.solve(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0()); + */ + assert(unit_size_recorder.size() == 0); + auto get_cnt_it = last_count_recorder_.begin(); + while (get_cnt_it != last_count_recorder_.end()) { + // unit_size_recorder always empty, so we only use DEFAULT_UNIT_SIZE + algo_infos.insert(std::make_pair(get_cnt_it->first, SegmentAlgoInfo(get_cnt_it->second, DEFAULT_UNIT_SIZE))); + get_cnt_it ++; + } + greedy_algo_.solve(algo_infos, label_recorder, filter_cache_.cache_size_except_level_0()); + + // programming problem may include some merged segments, we need to ignore them + auto old_level_it = level_recorder.begin(); + auto old_range_it = segment_ranges_recorder.begin(); + auto old_label_it = label_recorder.begin(); + while (old_level_it != level_recorder.end() && + old_range_it != segment_ranges_recorder.end() && + old_label_it != label_recorder.end()) { + assert(old_level_it->first == old_range_it->first); + if (old_level_it->first < old_label_it->first) { + old_level_it = level_recorder.erase(old_label_it); + old_range_it = segment_ranges_recorder.erase(old_range_it); + } else if (old_level_it->first > old_label_it->first) { + old_label_it = label_recorder.erase(old_label_it); + } else { + old_level_it ++; + old_range_it ++; + old_label_it ++; + } + } + while 
(old_level_it != level_recorder.end() && + old_range_it != segment_ranges_recorder.end()) { + assert(old_level_it->first == old_range_it->first); + old_level_it = level_recorder.erase(old_label_it); + old_range_it = segment_ranges_recorder.erase(old_range_it); + } + while (old_label_it != label_recorder.end()) { + old_label_it = label_recorder.erase(old_label_it); + } + + std::vector buckets = heat_buckets_.buckets(); + std::vector> datas; + std::vector labels; + std::vector get_cnts; + + auto level_it = level_recorder.begin(); // key range id start with 0 + auto range_it = segment_ranges_recorder.begin(); + auto count_it = last_count_recorder_.begin(); + auto label_it = label_recorder.begin(); + while (level_it != level_recorder.end() && range_it != segment_ranges_recorder.end() && + count_it != last_count_recorder_.end() && label_it != label_recorder.end()) { + assert(level_it->first == range_it->first); + assert(level_it->first == label_it->first); + if (count_it->first < level_it->first) { + count_it ++; + } else if (count_it->first > level_it->first) { + level_it ++; + range_it ++; + label_it ++; + } else { + if (level_it->second > 0) { + // add data row + std::vector data; + std::sort((range_it->second).begin(), (range_it->second).end(), RangeRatePairGreatorComparor); + data.emplace_back(level_it->second); + for (RangeRatePair& pair : range_it->second) { + assert(pair.range_id >= 0 && pair.range_id < buckets.size()); + data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); + data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + } + datas.emplace_back(data); + // add label row + labels.emplace_back(label_it->second); + // add get cnt row + get_cnts.emplace_back(count_it->second); + } + + level_it ++; + range_it ++; + label_it ++; + } + } + + // check three vectors have same length + assert(datas.size() == labels.size()); + assert(get_cnts.size() == labels.size()); + + clf_model_.make_train(datas, labels, get_cnts); + + train_signal_ = false; +} + +void FilterCacheManager::update_cache_and_heap(std::map& level_recorder, + std::map>& segment_ranges_recorder) { + assert(level_recorder.size() == segment_ranges_recorder.size()); + std::vector segment_ids; + std::vector> datas; + std::vector preds; + std::unordered_map segment_units_num_recorder; + std::map current_units_num_limit_recorder; + std::vector buckets = heat_buckets_.buckets(); + + // build data rows into datas + auto level_it = level_recorder.begin(); + auto range_it = segment_ranges_recorder.begin(); + while (level_it != level_recorder.end() && range_it != segment_ranges_recorder.end()) { + if (level_it->first < range_it->first) { + level_it ++; + } else if (level_it->first > range_it->first) { + range_it ++; + } else { + assert(level_it->first == range_it->first); + + if (level_it->second > 0) { + // add data row + std::vector data; + std::sort((range_it->second).begin(), (range_it->second).end(), RangeRatePairGreatorComparor); + data.emplace_back(level_it->second); + for (RangeRatePair& pair : range_it->second) { + assert(pair.range_id >= 0 && pair.range_id < buckets.size()); + data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); + data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + } + datas.emplace_back(data); + } + + level_it ++; + range_it ++; + } + } + + // use datas to make prediction + clf_model_.make_predict(datas, preds); + assert(segment_ids.size() == 
preds.size()); + size_t idx = 0; + while (idx < segment_ids.size() && idx < preds.size()) { + segment_units_num_recorder.insert(std::make_pair(segment_ids[idx], preds[idx])); + current_units_num_limit_recorder.insert(std::make_pair(segment_ids[idx], preds[idx])); + idx = idx + 1; + } + + // update filter cache helper heaps + heap_manager_.sync_units_num_limit(current_units_num_limit_recorder); + + // update filter cache + std::set empty_level_0_segment_ids; // no level 0 segment in heaps and model data, dont worry + std::set empty_failed_segment_ids; + filter_cache_.update_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); +} + +void FilterCacheManager::remove_segments(std::vector& segment_ids, std::set& level_0_segment_ids) { + // update filter cache helper heaps + heap_manager_.batch_delete(segment_ids); + // update filter cache map + filter_cache_.release_for_segments(segment_ids, level_0_segment_ids); +} + +bool FilterCacheManager::adjust_cache_and_heap() { + if ((!is_ready_) || !filter_cache_.is_full()) { + return false; + } + FilterCacheModifyResult result; + /* + struct FilterCacheModifyResult { + uint32_t enable_segment_id; + uint32_t disable_segment_id; + uint16_t enable_segment_units_num; + uint16_t disable_segment_units_num; + uint16_t enable_segment_next_units_num; + uint16_t disable_segment_next_units_num; + double enable_benefit; + double disable_cost; + }; + */ + bool can_adjust = heap_manager_.try_modify(result); + if (can_adjust) { + std::unordered_map segment_units_num_recorder; + std::set empty_level_0_segment_ids; // no level 0 segment in heaps, dont worry + std::set empty_failed_segment_ids; // force to update segments' filter units group, so dont worry for cache space + segment_units_num_recorder.insert(std::make_pair(result.enable_segment_id, result.enable_segment_next_units_num)); + segment_units_num_recorder.insert(std::make_pair(result.disable_segment_id, result.disable_segment_next_units_num)); + filter_cache_.update_for_segments(segment_units_num_recorder, true, empty_level_0_segment_ids, empty_failed_segment_ids); + } + return can_adjust; +} + +void FilterCacheManager::insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, + std::map>& inherit_infos_recorder, + std::map& level_recorder, const uint32_t& level_0_base_count, + std::map>& segment_ranges_recorder) { + std::unordered_map segment_units_num_recorder; + std::map approximate_counts_recorder; + std::set failed_segment_ids; + std::vector new_segment_items; + std::set old_level_0_segment_ids, new_level_0_segment_ids; + std::vector buckets = heat_buckets_.buckets(); + std::sort(merged_segment_ids.begin(), merged_segment_ids.end()); + std::sort(new_segment_ids.begin(), new_segment_ids.end()); + + // pick up merged or new level 0 segments + // assume level_recorder keys set equals to merged_segment_ids + new_segment_ids + assert(new_segment_ids.size() == 0 || merged_segment_ids.size() + new_segment_ids.size() == level_recorder.size()); + auto level_it = level_recorder.begin(); + size_t merged_idx = 0, new_idx = 0; + while (level_it != level_recorder.end()) { + if (merged_idx < merged_segment_ids.size() && level_it->first == merged_segment_ids[merged_idx]) { + if (level_it->second == 0) { + old_level_0_segment_ids.insert(level_it->first); + } + merged_idx ++; + } else if (new_idx < new_segment_ids.size() && level_it->first == new_segment_ids[new_idx]) { + if (level_it->second == 0) { + new_level_0_segment_ids.insert(level_it->first); + 
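+                // level 0 segments are pinned at MAX_UNITS_NUM and never enter the heaps,
+                // so they are excluded from the two-heaps adjustment below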
segment_units_num_recorder.insert(std::make_pair(level_it->first, MAX_UNITS_NUM)); + } else { + // not a level 0 segment, set default units num + segment_units_num_recorder.insert(std::make_pair(level_it->first, DEFAULT_UNITS_NUM)); + } + new_idx ++; + } + level_it ++; + } + + if (!is_ready_) { + // if is_ready_ is false, no need to enable two-heaps adjustment, remember to update is_ready_ in the end + // remove merged segments' units in filter cache and nodes in filter heaps + heap_manager_.batch_delete(merged_segment_ids); + filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); + + // inherit merged segments' counts to new segments' counts + // ensure that new segments that are not in inherit_infos_recorder keys set are only level 0 segments + inherit_count_recorder(merged_segment_ids, new_segment_ids, level_0_base_count, inherit_infos_recorder); + estimate_counts_for_all(approximate_counts_recorder); + + // insert units into filter cache + filter_cache_.enable_for_segments(segment_units_num_recorder, false, new_level_0_segment_ids, failed_segment_ids); + + // insert nodes into filter heaps + for (uint32_t& new_segment_id : new_segment_ids) { + if (new_level_0_segment_ids.count(new_segment_id)) { + // no need to insert level 0 segment nodes into heap + continue; + } else if (failed_segment_ids.count(new_segment_id)) { + // failed to insert filter units + uint16_t units_num = segment_units_num_recorder[new_segment_id]; + new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], + 0, 0, units_num)); + } else { + // succeed to insert filter units + uint16_t units_num = segment_units_num_recorder[new_segment_id]; + new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], + units_num, 0, units_num)); + } + } + heap_manager_.batch_upsert(new_segment_items); + + // remember to update is_ready_ + if (filter_cache_.is_ready()) { + is_ready_ = true; + } + } else { + // is_ready_ is true, then we will not update is_ready_, that means is_ready_ will be always true + // remove merged segments' units in filter cache and nodes in filter heaps + heap_manager_.batch_delete(merged_segment_ids); + filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); + + // inherit merged segments' counts to new segments' counts + // ensure that new segments that are not in inherit_infos_recorder keys set are only level 0 segments + inherit_count_recorder(merged_segment_ids, new_segment_ids, level_0_base_count, inherit_infos_recorder); + estimate_counts_for_all(approximate_counts_recorder); + + // predict units num for new non level 0 segments and update segment_units_num_recorder + std::vector> pred_datas; + std::vector pred_segment_ids; + std::vector pred_results; + for (uint32_t& new_segment_id : new_segment_ids) { + if (new_level_0_segment_ids.count(new_segment_id)) { + // no need to predict for level 0 segments + continue; + } else { + pred_segment_ids.emplace_back(new_segment_id); + + std::vector pred_data; + pred_data.emplace_back(level_recorder[new_segment_id]); + for (RangeRatePair& pair : segment_ranges_recorder[new_segment_id]) { + assert(pair.range_id >= 0 && pair.range_id < buckets.size()); + pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); + pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + } + pred_datas.emplace_back(pred_data); + } + } + 
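+        // one feature row was built per non level 0 new segment; ids and rows must stay
+        // aligned index-by-index for the prediction step below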
assert(pred_datas.size() == pred_segment_ids.size()); + clf_model_.make_predict(pred_datas, pred_results); + assert(pred_datas.size() == pred_results.size()); + size_t pred_idx = 0; + while (pred_idx < pred_segment_ids.size() && pred_idx < pred_results.size()) { + segment_units_num_recorder[pred_segment_ids[pred_idx]] = pred_results[pred_idx]; + pred_idx = pred_idx + 1; + } + + // insert units into filter cache + filter_cache_.enable_for_segments(segment_units_num_recorder, false, new_level_0_segment_ids, failed_segment_ids); + + // insert nodes into filter heaps + for (uint32_t& new_segment_id : new_segment_ids) { + if (new_level_0_segment_ids.count(new_segment_id)) { + // no need to insert level 0 segment nodes into heap + continue; + } else if (failed_segment_ids.count(new_segment_id)) { + // failed to insert filter units + uint16_t units_num = segment_units_num_recorder[new_segment_id]; + new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], + 0, 0, units_num)); + } else { + // succeed to insert filter units + uint16_t units_num = segment_units_num_recorder[new_segment_id]; + new_segment_items.emplace_back(FilterCacheHeapItem(new_segment_id, approximate_counts_recorder[new_segment_id], + units_num, 0, units_num)); + } + } + heap_manager_.batch_upsert(new_segment_items); + } +} + +void FilterCacheManager::delete_segments(std::vector& merged_segment_ids, std::map& level_recorder) { + std::set old_level_0_segment_ids; + std::sort(merged_segment_ids.begin(), merged_segment_ids.end()); + + // level_recorder is a copy of global level_recorder + assert(merged_segment_ids.size() == level_recorder.size()); + auto level_it = level_recorder.begin(); + size_t merged_idx = 0; + while (level_it != level_recorder.end()) { + assert(merged_idx < merged_segment_ids.size() && level_it->first == merged_segment_ids[merged_idx]); + if (merged_idx < merged_segment_ids.size() && level_it->first == merged_segment_ids[merged_idx]) { + if (level_it->second == 0) { + old_level_0_segment_ids.insert(level_it->first); + } + merged_idx ++; + } + level_it ++; + } + + if (!is_ready_) { + // if is_ready_ is false, no need to enable two-heaps adjustment, remember to update is_ready_ in the end + // remove merged segments' units in filter cache and nodes in filter heaps + heap_manager_.batch_delete(merged_segment_ids); + filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); + + // remember to update is_ready_ + if (filter_cache_.is_ready()) { + is_ready_ = true; + } + } else { + // is_ready_ is true, then we will not update is_ready_, that means is_ready_ will be always true + // remove merged segments' units in filter cache and nodes in filter heaps + heap_manager_.batch_delete(merged_segment_ids); + filter_cache_.release_for_segments(merged_segment_ids, old_level_0_segment_ids); + } +} + +void FilterCacheManager::move_segments(std::vector& moved_segment_ids, + std::map& old_level_recorder, + std::map& move_level_recorder, + std::map>& move_segment_ranges_recorder) { + std::unordered_map segment_units_num_recorder; + std::map approximate_counts_recorder; + std::vector new_segment_items; + std::set old_level_0_segment_ids; + std::vector buckets = heat_buckets_.buckets(); + std::sort(moved_segment_ids.begin(), moved_segment_ids.end()); + + // pick up merged or new level 0 segments, but this type of compaction must not move to level 0, + // so we may only move level 0 to higher level + assert(moved_segment_ids.size() == 
old_level_recorder.size()); + assert(moved_segment_ids.size() == move_level_recorder.size()); + assert(moved_segment_ids.size() == move_segment_ranges_recorder.size()); + auto level_it = old_level_recorder.begin(); + size_t moved_idx = 0, new_idx = 0; + while (level_it != old_level_recorder.end()) { + assert(moved_idx < moved_segment_ids.size() && level_it->first == moved_segment_ids[moved_idx]); + if (moved_idx < moved_segment_ids.size() && level_it->first == moved_segment_ids[moved_idx]) { + if (level_it->second == 0) { + old_level_0_segment_ids.insert(level_it->first); + } + segment_units_num_recorder.insert(std::make_pair(level_it->first, DEFAULT_UNITS_NUM)); + // actually, we cannot move segments to level 0 in trivial move compaction (only flushing do this). + moved_idx ++; + } + level_it ++; + } + + if (!is_ready_) { + // firstly, delete moved segments + heap_manager_.batch_delete(moved_segment_ids); + filter_cache_.release_for_segments(moved_segment_ids, old_level_0_segment_ids); + + // inherit these segments' count + for (uint32_t& segment_id : moved_segment_ids) { + auto last_it = last_count_recorder_.find(segment_id); + auto current_it = current_count_recorder_.find(segment_id); + if (last_it != last_count_recorder_.end()) { + last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); + } + if (current_it != current_count_recorder_.end()) { + current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); + } + } + estimate_counts_for_all(approximate_counts_recorder); + + // insert units into filter cache + std::set empty_new_level_0_segment_ids, empty_failed_segment_ids; + filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_new_level_0_segment_ids, empty_failed_segment_ids); + + // insert nodes into filter heaps + for (uint32_t& segment_id : moved_segment_ids) { + assert(move_level_recorder[segment_id] > 0); + uint16_t units_num = segment_units_num_recorder[segment_id]; + new_segment_items.emplace_back(FilterCacheHeapItem(segment_id, approximate_counts_recorder[segment_id], + units_num, 0, units_num)); + } + heap_manager_.batch_upsert(new_segment_items); + + // remember to update is_ready_ + if (filter_cache_.is_ready()) { + is_ready_ = true; + } + } else { + // firstly, delete moved segments + heap_manager_.batch_delete(moved_segment_ids); + filter_cache_.release_for_segments(moved_segment_ids, old_level_0_segment_ids); + + // inherit these segments' count + for (uint32_t& segment_id : moved_segment_ids) { + auto last_it = last_count_recorder_.find(segment_id); + auto current_it = current_count_recorder_.find(segment_id); + if (last_it != last_count_recorder_.end()) { + last_it->second = INHERIT_REMAIN_FACTOR * (last_it->second); + } + if (current_it != current_count_recorder_.end()) { + current_it->second = INHERIT_REMAIN_FACTOR * (current_it->second); + } + } + estimate_counts_for_all(approximate_counts_recorder); + + // predict units num for new non level 0 segments and update segment_units_num_recorder + std::vector> pred_datas; + std::vector pred_segment_ids; + std::vector pred_results; + for (uint32_t& segment_id : moved_segment_ids) { + assert(move_level_recorder[segment_id] > 0); + pred_segment_ids.emplace_back(segment_id); + + std::vector pred_data; + pred_data.emplace_back(move_level_recorder[segment_id]); + for (RangeRatePair& pair : move_segment_ranges_recorder[segment_id]) { + assert(pair.range_id >= 0 && pair.range_id < buckets.size()); + pred_data.emplace_back(uint32_t(RATE_SIGNIFICANT_DIGITS_FACTOR * pair.rate_in_segment)); + 
pred_data.emplace_back(uint32_t(HOTNESS_SIGNIFICANT_DIGITS_FACTOR * buckets[pair.range_id].hotness_)); + } + } + assert(pred_datas.size() == pred_segment_ids.size()); + clf_model_.make_predict(pred_datas, pred_results); + assert(pred_datas.size() == pred_results.size()); + size_t pred_idx = 0; + while (pred_idx < pred_segment_ids.size() && pred_idx < pred_results.size()) { + segment_units_num_recorder[pred_segment_ids[pred_idx]] = pred_results[pred_idx]; + pred_idx = pred_idx + 1; + } + + // insert units into filter cache + std::set empty_new_level_0_segment_ids, empty_failed_segment_ids; + filter_cache_.enable_for_segments(segment_units_num_recorder, true, empty_new_level_0_segment_ids, empty_failed_segment_ids); + + // insert nodes into filter heaps + for (uint32_t& segment_id : moved_segment_ids) { + assert(move_level_recorder[segment_id] > 0); + uint16_t units_num = segment_units_num_recorder[segment_id]; + new_segment_items.emplace_back(FilterCacheHeapItem(segment_id, approximate_counts_recorder[segment_id], + units_num, 0, units_num)); + } + heap_manager_.batch_upsert(new_segment_items); + } +} + +} \ No newline at end of file diff --git a/db/art/filter_cache.h b/db/art/filter_cache.h new file mode 100644 index 000000000..5578e58f7 --- /dev/null +++ b/db/art/filter_cache.h @@ -0,0 +1,266 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "macros.h" +#include "greedy_algo.h" +#include "clf_model.h" +#include "heat_buckets.h" +#include "filter_cache_heap.h" +#include "filter_cache_item.h" + +namespace ROCKSDB_NAMESPACE { + +class FilterCache; +class FilterCacheManager; + +// FilterCache main component is a STL Map, key -- segment id, value -- Structure of Filter Units ( called FilterCacheItem) +// its main job is auto enable/disable filter units for one segment, and check whether one key exists in enabled units +// its work is below: +// 1. enable / disable units for a batch of segments (one segment may not exist in FilterCache) +// 2. check whether one given key exists in one segment +// 3. check whether filter cache is approximately full +// 4. check whether ready to train first model +// 5. 
release FilterCacheItem of these merged (outdated) segments
+class FilterCache {
+private:
+    std::map filter_cache_;
+    uint32_t used_space_size_;
+    uint32_t level_0_used_space_size_;
+    uint32_t cache_size_; // max size of cache
+    std::mutex filter_cache_mutex_;
+
+public:
+    FilterCache() { filter_cache_.clear(); cache_size_ = CACHE_SPACE_SIZE; used_space_size_ = 0; level_0_used_space_size_ = 0; }
+
+    ~FilterCache() { /* do nothing */ }
+
+    // total cache size available to levels other than level 0
+    uint32_t cache_size_except_level_0() { return cache_size_ * FULL_RATE - level_0_used_space_size_; }
+
+    // check whether one given key exists in one segment
+    bool check_key(const uint32_t& segment_id, const std::string& key);
+
+    // enable / disable units for a batch of segments (one segment may not exist in FilterCache)
+    // if the enabled units num exceeds the given units num, it will disable units
+    void enable_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced,
+                             std::set& level_0_segment_ids, std::set& failed_segment_ids);
+
+    // the only difference from enable_for_segments is:
+    // this func doesn't insert any filter units for segments that don't exist in the cache, while enable_for_segments does
+    void update_for_segments(std::unordered_map& segment_units_num_recorder, const bool& is_forced,
+                             std::set& level_0_segment_ids, std::set& failed_segment_ids);
+
+    // check whether the filter cache is approximately full
+    // actually, we leave (1-FULL_RATE) * cache_size_ space for emergency usage
+    bool is_full();
+
+    // check whether we are ready to train the first model
+    bool is_ready();
+
+    // release filter units of merged segments
+    void release_for_segments(std::vector& segment_ids, std::set& level_0_segment_ids);
+};
+
+// FilterCacheManager combines these components:
+// HeatBuckets, ClfModel, FilterCacheHeapManager, GreedyAlgo and FilterCache
+// its work is as follows:
+// 1. given a segment id and a target key, check whether the target key exists in this segment
+// 2. when one check is done, add 1 to the get cnt of this segment. we need to maintain get cnt records for the last long period and the current long period
+// 3. Inherit: convert get cnts of merged segments into get cnts of newly generated segments (get cnts in both the last and the current long period)
+// 4. use existing counts of segments in the last and current long period to estimate an approximate get cnt for any alive segment
+// 5. record the total get cnt and update short periods; note that TRAIN_PERIODS short periods make up one long period
+// 6. if a short period ends, update HeatBuckets
+// 7. if a long period ends, use the greedy algorithm to solve the filter units allocation problem and evaluate the old model with this solution. if the model doesn't work well, retrain it
+// 8. if the model still works well or has already been retrained, predict new ideal units nums for current segments, release unnecessary filter units, add necessary filter units and update FilterCacheHeap
+// 9. if old segments are merged, remove related filter units from FilterCache and FilterCacheHeap
+// 10. if new segments are generated, predict ideal units nums for them, insert filter units (if the cache has space left), calculate estimated counts and update FilterCacheHeap
+// 11. after one short period ends, estimate current segments' approximate get cnts, then use these estimated get cnts to update FilterCacheHeap
+// 12. before FilterCache becomes full for the first time, just set the default units num for every new segment and insert its filter units
+// 13.
After FilterCache becomes full for the first time, start a background thread to monitor FilterCacheHeap and use two-heaps adjustment to optimize FilterCache (this thread never ends)
+class FilterCacheManager {
+private:
+    // TODO: the mutexes can be optimized, or a message queue / thread pool can be used to reduce the time spent on locking
+    static FilterCache filter_cache_;
+    static HeatBuckets heat_buckets_;
+    static ClfModel clf_model_;
+    static GreedyAlgo greedy_algo_;
+    static FilterCacheHeapManager heap_manager_;
+    static uint32_t get_cnt_; // get cnt in the current period; when it exceeds PERIOD_COUNT, start the next period
+    static uint32_t period_cnt_; // period cnt; if period_cnt_ - last_long_period_ >= TRAIN_PERIODS, start to evaluate or retrain ClfModel
+    static uint32_t last_long_period_; // short period cnt at the end of the last long period
+    static uint32_t last_short_period_; // helper var for the update job when one short period ends
+    static std::mutex update_mutex_; // guarantee count records are only updated once
+    static bool train_signal_; // if true, try to retrain the model. one background thread monitors this flag and retrains
+    static std::map last_count_recorder_; // get cnt recorder of segments in the last long period
+    static std::map current_count_recorder_; // get cnt recorder of segments in the current long period
+    static std::mutex count_mutex_; // guarantee last_count_recorder_ and current_count_recorder_ are updated in order
+    static bool is_ready_; // whether we are ready to use adaptive filter assignment
+public:
+    FilterCacheManager() { get_cnt_ = 0; last_long_period_ = 0; last_short_period_ = 0; train_signal_ = false; }
+
+    ~FilterCacheManager();
+
+    // one background thread monitors this func; if it returns true, call try_retrain_model at once, wait for training to end, then call update_cache_and_heap
+    bool need_retrain() { return train_signal_; }
+
+    // one background thread monitors this func; if it returns true, call make_clf_model_ready first, then call try_retrain_model at once and wait for training to end.
+    // lastly call update_cache_and_heap. once all of that is done, stop this thread: if is_ready_ is true, it never changes back to false
+    bool ready_work() { return is_ready_; }
+
+    bool heat_buckets_ready() { return heat_buckets_.is_ready(); }
+
+    // given a segment id and a target key, check whether the target key exists in this segment
+    // return true when the target key may exist (false positives are possible)
+    // if there is no cache item for this segment, always return true
+    // normal bloom filter units query; can we put hit_count_recorder outside this func?
this will make get opt faster + // will be called by a get operation, this will block get operation + // remember to call hit_count_recorder in a background thread + bool check_key(const uint32_t& segment_id, const std::string& key); + + // add 1 to get cnt of specified segment in current long period + // will be called when calling check_key + // remember to move this func to a single background thread aside check_key + // because this func shouldn't block get operations + void hit_count_recorder(const uint32_t& segment_id); + + // copy counts to last_count_recorder and reset counts of current_count_recorder + void update_count_recorder(); + + // inherit counts of merged segments to counts of new segments and remove counts of merged segments + // inherit_infos_recorder: { {new segment 1: [{old segment 1: inherit rate 1}, {old segment 2: inherit rate 2}, ...]}, ...} + void inherit_count_recorder(std::vector& merged_segment_ids, std::vector& new_segment_ids, const uint32_t& level_0_base_count, + std::map>& inherit_infos_recorder); + + // estimate approximate get cnts for every alive segment + void estimate_counts_for_all(std::map& approximate_counts_recorder); + + // noticed that at the beginning, heat buckets need to sample put keys to init itself before heat buckets start to work + // segment_info_recorder is external variable that records every alive segments' min key and max key + // it is like { segment 1: [min_key_1, max_key_1], segment 2: [min_key_2, max_key_2], ... } + // return true when heat_buckets is ready, so no need to call this func again + // remember to be called when receiving put opt. Normally, we can make heat_buckets_ ready before YCSB load end, so we can use it in YCSB testing phase + // every put operation will call a background thread to call make_heat_buckets_ready + // after this func return true, no need to call this func in put operation + // remember to move this func to a single background thread aside put operations + bool make_heat_buckets_ready(const std::string& key, std::unordered_map>& segment_info_recorder); + + // clf_model_ need to determine feature nums before training + // actually, YCSB will load data before testing + // features_nums: [feature_num_1, feature_num_2, ...], it includes all feature_num of all non level 0 alive segments + // feature_num_k is 2 * (number of key ranges intersecting with segment k) + 1 + // return true when clf_model_ set to ready successfully + // we need to call make_clf_model_ready before we first call try_retrain_model + // simply, if ready_work return true, we call make_clf_model_ready at once + bool make_clf_model_ready(std::vector& features_nums); + + // add 1 to get cnt of target key range for every get operation + // update short periods if get cnt exceeds PERIOD_COUNT + // every get opt will make add 1 to only one heat bucket counter + // also need to update count records if one long period end + // also need to re-calcuate estimated count of current segments and update FilterCacheHeap if one short period end + // we should use one background thread to call this func in every get operation + void hit_heat_buckets(const std::string& key); + + // if one long period end, we need to check effectiveness of model. + // if model doesnt work well in current workload, we retrain this model + // 1. use greedy algorithm to solve filter units allocation problem (receive ideal enabled units num for every current segments) + // 2. write filter units nums and segment-related features to a csv file + // 3. 
python lightgbm server maintain latest model. it will read the csv file and use I/O cost metric to check effectiveness of this model + // 4. if effectiveness check not pass, retrain this model + // reminded that if this new model training not end, lightgbm still use old model to predict ideal units num for segments + // level_recorder: { segment 1: level_1, segment 2: level_2, ... }, level_k is the index of LSM-Tree level (0, 1, 2, ...) + // range_heat_recorder: { segment 1: range_id_1, ...}, ... }, + // unit_size_recorder: { segment 1: unit_size_1, segment 2: unit_size_2, ... } + // we assume for every segment, its ranges in range_heat_recorder value must be unique!!! + // all 3 recorders need to maintain all current non level 0 segments info, and their keys size and keys set should be the same (their keys are segments' ids) + // we ignore all level 0 segments !!! 3 recorders keys set should be the same ------ all alive segments' ids (except level 0) + // because of the time cost of writing csv file, we need to do this func with a background thread + // need real benchmark data to debug this func + void try_retrain_model(std::map& level_recorder, + std::map>& segment_ranges_recorder, + std::map& unit_size_recorder); + + // after one long period end, we may retrain model. when try_retrain_model func end, we need to predict units num for every segments + // then use these units num to update FilterCache, at last update units num limit in FilterCacheHeap + // we ignore all level 0 segments !!! + // level_recorder: { segment 1: level_1, segment 2: level_2, ... }, level_k is the index of LSM-Tree level (0, 1, 2, ...) + // range_heat_recorder: { segment 1: range_id_1, ...}, ... }, + // noticed that we only pick up those segments that are in both level_recorder and segment_ranges_recorder + // and update their filter units in filter cache and nodes in heap + // so level_recorder keys set and segment_ranges_recorder keys set can be different + // only be called after try_retrain_model (should be guaranteed) + // we can guarantee this by putting try_retrain_model and update_cache_and_heap into only one background thread + void update_cache_and_heap(std::map& level_recorder, + std::map>& segment_ranges_recorder); + + // remove merged segments' filter units in the filter cache + // also remove related items in FilterCacheHeap + // segment_ids: [level_1_segment_1, level_0_segment_1, ...] + // level_0_segment_ids: [level_0_segment_1, ...] 
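+    // a small illustrative sketch of the expected call shape (the ids below are made up,
+    // and the element types are assumed to be uint32_t segment ids, since the template
+    // arguments are elided in this header):
+    //   std::vector<uint32_t> merged_ids{21, 22, 7};   // 21, 22 merged from level 1, 7 from level 0
+    //   std::set<uint32_t> level_0_ids{7};             // the subset of merged_ids that sit on level 0
+    //   manager.remove_segments(merged_ids, level_0_ids);  // run from a background thread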
+ // should be called by one background thread + // this func will be called by insert_segments + // you can also call this func alone after segments are merged (not suggested) + void remove_segments(std::vector& segment_ids, std::set& level_0_segment_ids); + + // insert new segments into cache + // all level 0 segments must enable all filter units + // if is_ready_ is not true, set default filter units num (except level 0), insert into filter_cache_ and heaps + // if is_ready_ is true, predict filter units num and insert necessary filter units as much as possible + // then insert into heaps + // for level 0 segments, we only need to insert all units into cache, dont insert any nodes into heaps + // that means level 0 segments units num never be modified + // merged_segment_ids can be null when new segments in level 0 + // level_0_base_count is the default value of level 0 segments' count in current recorder and last recorder + // noticed that level 0 segments are from MemTable, we can compute total memtable get count and let the total count divided by new level 0 segments num + // it equals to the avg count of new level 0 segments. we use this avg count to simply init level 0 segments' counts in current recorder and last recorder + // merged_segment_ids: all merged segments' id + // new_segment_ids: all new segments' id + // inherit_infos_recorder: count inherit information (from merged segments to new segments), see inherit_count_recorder func + // level_recorder: include merged segments and new segments, like { segment 1 : level_num_1, ... } + // level_0_base_count: the initial count in last recorder and current recorder of new level 0 segments + // segment_ranges_recorder: only include new segments, see update_cache_and_heap + // level_recorder keys set and segment_ranges_recorder keys set can be different + // but should ensure all new segments are in both level_recorder and segment_ranges_recorder + // should be called by one background thread! 
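+    // worked example of the level_0_base_count rule above (numbers are made up):
+    // if the flushed memtable served 1200 get operations and the flush produced 3 new
+    // level 0 segments, the caller passes level_0_base_count = 1200 / 3 = 400, so every
+    // new level 0 segment starts with a count of 400 in both the last and the current recorder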
+ // when old segments are merged into some new segments, call this func in one background thread + void insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, + std::map>& inherit_infos_recorder, + std::map& level_recorder, const uint32_t& level_0_base_count, + std::map>& segment_ranges_recorder); + + // in func insert_segments above, we will also remove merged segments, this work well for normal compaction and flush + // but we found that WaLSM also do delete compaction (only delete segments) + // which not fit to func insert_segments, so we need a alone func delete_segments + // this func only delete merged segments + // we only need argument merged_segment_ids (all merged segments' ids) + // and level_recorder which only include merged segments' level + void delete_segments(std::vector& merged_segment_ids, std::map& level_recorder); + + // move segments to another level, used for trivial move compaction + void move_segments(std::vector& moved_segment_ids, + std::map& old_level_recorder, + std::map& move_level_recorder, + std::map>& move_segment_ranges_recorder); + + // make filter unit adjustment based on two heaps (benefit of enabling one unit & cost of disabling one unit) + // simply, we disable one unit of one segment and enable one unit of another segment and guarantee cost < benefit + // dont mind these two units size are not equal + // in YCSB Benchmark, sizes of filter units are very close + // return true when we successfully make one adjustment + // return false when we cannot make one adjustment + // one background should exec this func and never stop + bool adjust_cache_and_heap(); + + std::vector& range_seperators() { + return heat_buckets_.seperators(); + } +}; + +} \ No newline at end of file diff --git a/db/art/filter_cache_client.cc b/db/art/filter_cache_client.cc new file mode 100644 index 000000000..6dfd99e67 --- /dev/null +++ b/db/art/filter_cache_client.cc @@ -0,0 +1,155 @@ +#include "filter_cache_client.h" + +namespace ROCKSDB_NAMESPACE { + +task_thread_pool::task_thread_pool FilterCacheClient::pool_{FILTER_CACHE_THREADS_NUM}; +FilterCacheManager FilterCacheClient::filter_cache_manager_; +bool FilterCacheClient::heat_buckets_ready_; + +void FilterCacheClient::do_prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder) { + filter_cache_manager_.make_heat_buckets_ready(key, *segment_info_recorder); +} + +bool FilterCacheClient::prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder) { + heat_buckets_ready_ = filter_cache_manager_.heat_buckets_ready(); + if (!heat_buckets_ready_) { + // if heat_buckets_ready_ false + assert(segment_info_recorder->size() == 0); // should always empty + heat_buckets_ready_ = filter_cache_manager_.heat_buckets_ready(); + if (!heat_buckets_ready_) { + pool_.submit_detach(do_prepare_heat_buckets, key, segment_info_recorder); + heat_buckets_ready_ = filter_cache_manager_.heat_buckets_ready(); + } + } + return heat_buckets_ready_; +} + +void FilterCacheClient::do_retrain_or_keep_model(std::vector* const features_nums_except_level_0, + std::map* const level_recorder, + std::map>* const segment_ranges_recorder, + std::map* const unit_size_recorder) { + std::map level_copy; + std::map> segment_ranges_copy; + std::map unit_size_copy; + // if this func background monitor signal, how can it receive latest argument? input pointer! 
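+    // The recorders are shared as pointers so that each retrain pass below reads the
+    // callers' most recent contents; every pass copies them into the *_copy locals under
+    // global_recorder_mutex_ so training runs on a consistent snapshot while the originals
+    // keep changing. The two empty while loops below simply spin until the heat buckets and
+    // the manager report ready before the first model is trained.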
+ while (!filter_cache_manager_.heat_buckets_ready()); + while (!filter_cache_manager_.ready_work()); // wait for manager ready + assert(filter_cache_manager_.heat_buckets_ready()); // must guarantee that heat buckets ready before we make filter cache manager ready + + // actually we will load data before we test, so we can ensure that heat buckets ready first + filter_cache_manager_.make_clf_model_ready(*features_nums_except_level_0); + // lock and copy recorders + global_recorder_mutex_.lock(); + level_copy = *level_recorder; + segment_ranges_copy = *segment_ranges_recorder; + unit_size_copy = *unit_size_recorder; + global_recorder_mutex_.unlock(); + // train first time, before that, there is no model left + filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); + filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + + // retrain in long periods + while (true) { + // in one long period + while (!filter_cache_manager_.need_retrain()); // wait for long period end + // lock and copy recorders + global_recorder_mutex_.lock(); + level_copy = *level_recorder; + segment_ranges_copy = *segment_ranges_recorder; + unit_size_copy = *unit_size_recorder; + global_recorder_mutex_.unlock(); + // train first time, before that, there is no model left + filter_cache_manager_.try_retrain_model(level_copy, segment_ranges_copy, unit_size_copy); + filter_cache_manager_.update_cache_and_heap(level_copy, segment_ranges_copy); + } + // this loop never end +} + +void FilterCacheClient::retrain_or_keep_model(std::vector* const features_nums_except_level_0, + std::map* const level_recorder, + std::map>* const segment_ranges_recorder, + std::map* const unit_size_recorder) { + pool_.submit_detach(do_retrain_or_keep_model, features_nums_except_level_0, level_recorder, segment_ranges_recorder, unit_size_recorder); + // if first model training not end, python lgb_model server still return default units num + // then retrain model when every long period end. 
if the model still works well, keep it instead
+    // no need to return any value
+}
+
+void FilterCacheClient::do_hit_count_recorder(const uint32_t& segment_id) {
+    filter_cache_manager_.hit_count_recorder(segment_id);
+}
+
+bool FilterCacheClient::check_key(const uint32_t& segment_id, const std::string& key) {
+    bool result = filter_cache_manager_.check_key(segment_id, key);
+    pool_.submit_detach(do_hit_count_recorder, segment_id);
+    return result;
+}
+
+void FilterCacheClient::do_hit_heat_buckets(const std::string& key) {
+    filter_cache_manager_.hit_heat_buckets(key);
+}
+
+void FilterCacheClient::get_updating_work(const std::string& key) {
+    pool_.submit_detach(do_hit_heat_buckets, key);
+}
+
+void FilterCacheClient::do_make_adjustment() {
+    while (true) {
+        // never stop making heap adjustments
+        filter_cache_manager_.adjust_cache_and_heap();
+    }
+}
+
+void FilterCacheClient::make_adjustment() {
+    pool_.submit_detach(do_make_adjustment);
+}
+
+void FilterCacheClient::do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids,
+                                                 std::map>& inherit_infos_recorder,
+                                                 std::map& level_recorder, const uint32_t& level_0_base_count,
+                                                 std::map>& segment_ranges_recorder) {
+    // pass through the base count already resolved by the caller
+    filter_cache_manager_.insert_segments(merged_segment_ids, new_segment_ids, inherit_infos_recorder,
+                                          level_recorder, level_0_base_count, segment_ranges_recorder);
+}
+
+void FilterCacheClient::batch_insert_segments(std::vector merged_segment_ids, std::vector new_segment_ids,
+                                              std::map> inherit_infos_recorder,
+                                              std::map level_recorder, const uint32_t& level_0_base_count,
+                                              std::map> segment_ranges_recorder) {
+    assert(merged_segment_ids.size() > 0 && new_segment_ids.size() > 0);
+    assert(new_segment_ids.size() == inherit_infos_recorder.size());
+    assert(merged_segment_ids.size() + new_segment_ids.size() == level_recorder.size());
+    assert(new_segment_ids.size() == segment_ranges_recorder.size());
+    if (level_0_base_count == 0) {
+        pool_.submit_detach(do_batch_insert_segments, merged_segment_ids, new_segment_ids, inherit_infos_recorder, level_recorder, INIT_LEVEL_0_COUNT, segment_ranges_recorder);
+    } else {
+        pool_.submit_detach(do_batch_insert_segments, merged_segment_ids, new_segment_ids, inherit_infos_recorder, level_recorder, level_0_base_count, segment_ranges_recorder);
+    }
+}
+
+void FilterCacheClient::do_batch_delete_segments(std::vector& merged_segment_ids, std::map& level_recorder) {
+    filter_cache_manager_.delete_segments(merged_segment_ids, level_recorder);
+}
+
+void FilterCacheClient::batch_delete_segments(std::vector merged_segment_ids, std::map level_recorder) {
+    assert(merged_segment_ids.size() == level_recorder.size());
+    pool_.submit_detach(do_batch_delete_segments, merged_segment_ids, level_recorder);
+}
+
+void FilterCacheClient::do_batch_move_segments(std::vector& moved_segment_ids,
+                                               std::map& old_level_recorder,
+                                               std::map& move_level_recorder,
+                                               std::map>& move_segment_ranges_recorder) {
+    filter_cache_manager_.move_segments(moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder);
+}
+
+void FilterCacheClient::batch_move_segments(std::vector moved_segment_ids,
+                                            std::map old_level_recorder,
+                                            std::map move_level_recorder,
+                                            std::map> move_segment_ranges_recorder) {
+    assert(moved_segment_ids.size() == move_level_recorder.size());
+    assert(moved_segment_ids.size() == move_segment_ranges_recorder.size());
+    pool_.submit_detach(do_batch_move_segments, moved_segment_ids, old_level_recorder, move_level_recorder, move_segment_ranges_recorder);
+}
+
+}
\ No 
newline at end of file diff --git a/db/art/filter_cache_client.h b/db/art/filter_cache_client.h new file mode 100644 index 000000000..bb195f74a --- /dev/null +++ b/db/art/filter_cache_client.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include +#include "macros.h" +#include "filter_cache.h" + +namespace ROCKSDB_NAMESPACE { + +// global mutex to control global level recorder, ... +static std::mutex global_recorder_mutex_; + +class FilterCacheClient; + +class FilterCacheClient { +private: + static task_thread_pool::task_thread_pool pool_; + static FilterCacheManager filter_cache_manager_; + // we need heat_buckets_ready_ to become true before filter_cache_ready_ + // In YCSB benchmark, we first load data (insert key-value pairs) then may try get operation + // so we can guarantee that heat_buckets_ready_ become true before filter_cache_ready_ + static bool heat_buckets_ready_; // the same as FilterCacheManager.heat_buckets_.is_ready() + + // background thread part of prepare_heat_buckets + static void do_prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder); + + // background thread part of retrain_or_keep_model + static void do_retrain_or_keep_model(std::vector* const features_nums_except_level_0, + std::map* const level_recorder, + std::map>* const segment_ranges_recorder, + std::map* const unit_size_recorder); + + // background thread part of check_key + static void do_hit_count_recorder(const uint32_t& segment_id); + + // background thread part of get_updating_work + static void do_hit_heat_buckets(const std::string& key); + + // background thread part of make_adjustment + static void do_make_adjustment(); + + // background thread part of batch_insert_segments + static void do_batch_insert_segments(std::vector& merged_segment_ids, std::vector& new_segment_ids, + std::map>& inherit_infos_recorder, + std::map& level_recorder, const uint32_t& level_0_base_count, + std::map>& segment_ranges_recorder); + + // background thread part of batch_delete_segments + void do_batch_delete_segments(std::vector& merged_segment_ids, std::map& level_recorder); + + // background thread part of batch_move_segments + void do_batch_move_segments(std::vector& moved_segment_ids, + std::map& old_level_recorder, + std::map& move_level_recorder, + std::map>& move_segment_ranges_recorder); +public: + FilterCacheClient() { + heat_buckets_ready_ = false; + } + + std::vector& range_seperators() { + return filter_cache_manager_.range_seperators(); + } + + // corresponding to FilterCacheManager.make_heat_buckets_ready + // segment_info_recorder is a map that recorder min key and max key of every segment, its value be like: [min key, max key] + // because heat buckets is mainly used for model features, and model dont do any work on level 0 segments + // so segment_info_recorder only need to record min key and max key of non level 0 segments + // (we can modify micro SAMPLES_MAXCNT to fit in the YCSB load period, simply, SAMPLES_MAXCNT should be at least 50%-75% of load data num ???) + // set SAMPLES_MAXCNT < YCSB load kv nums, to make sure that we can make heat_buckets ready in YCSB load period + // if segment_info_recorder is empty, try default key ranges num and divide + // segment_info_recorder should be empty !!! 
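+    // a minimal call-site sketch (hypothetical names; the map's template arguments are
+    // elided above and are assumed here to be std::unordered_map<uint32_t, std::vector<std::string>>,
+    // i.e. segment id -> {min key, max key}):
+    //   static std::unordered_map<uint32_t, std::vector<std::string>> empty_info;  // stays empty
+    //   bool ready = filter_cache_client.prepare_heat_buckets(key, &empty_info);   // from the put path
+    //   // once ready is true, the put path can stop calling prepare_heat_buckets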
+ bool prepare_heat_buckets(const std::string& key, std::unordered_map>* const segment_info_recorder); + + // correspinding to FilterCacheManager work: monitor manager ready_work(), call manager make_clf_model_ready and train first model + // lastly call update_cache_and_heap + // features_nums recorders features num (key ranges num * 2 + 1) of every non level 0 segments + // other arguments see FilterCacheManager make_clf_model_ready, try_retrain_model and update_cache_and_heap + // please ensure that 3 recorders need to keep the same segments set, or error will occur in train func + // you can use mutex in compaction and flushing to guarantee this + // then when every long period end, try to retrain a new model or keep last model + // unit_size_recorder should be empty !!! + // features_nums_except_level_0 empty !!! + void retrain_or_keep_model(std::vector* const features_nums_except_level_0, + std::map* const level_recorder, + std::map>* const segment_ranges_recorder, + std::map* const unit_size_recorder); + + // correespinding to FilterCacheManager work: check_key and hit_count_recorder + // return FilterCacheManager.check_key() and leave hit_count_recorder to background + bool check_key(const uint32_t& segment_id, const std::string& key); + + // every db get operation need one hit_heat_buckets + void get_updating_work(const std::string& key); + + // heap based adjustment + void make_adjustment(); + + // batch insert segments into filter cache manager, will also delete merged segments + void batch_insert_segments(std::vector merged_segment_ids, std::vector new_segment_ids, + std::map> inherit_infos_recorder, + std::map level_recorder, const uint32_t& level_0_base_count, + std::map> segment_ranges_recorder); + + // batch delete segments from filter cache manager + void batch_delete_segments(std::vector merged_segment_ids, std::map level_recorder); + + // batch of moving segments to one level + void batch_move_segments(std::vector moved_segment_ids, + std::map old_level_recorder, + std::map move_level_recorder, + std::map> move_segment_ranges_recorder); +}; + +} diff --git a/db/art/filter_cache_heap.cc b/db/art/filter_cache_heap.cc new file mode 100644 index 000000000..fd2afb48c --- /dev/null +++ b/db/art/filter_cache_heap.cc @@ -0,0 +1,1036 @@ +#include "filter_cache_heap.h" +#include +#include + +namespace ROCKSDB_NAMESPACE { + +FilterCacheHeap FilterCacheHeapManager::benefit_heap_; +FilterCacheHeap FilterCacheHeapManager::cost_heap_; +std::map FilterCacheHeapManager::heap_visit_cnt_recorder_; +std::map FilterCacheHeapManager::units_num_limit_recorder_; +std::mutex FilterCacheHeapManager::manager_mutex_; + +FilterCacheHeapNode FilterCacheHeap::heap_top() { + // need lock heap, or we may retrive outdated node + // heap_mutex_.lock(); + + if (!heap_.empty()) { + return heap_[0]; + } else { + return nullptr; + } + + // heap_mutex_.unlock(); +} + +/* +void FilterCacheHeap::pop() { + // heap_mutex_.lock(); + + FilterCacheHeapNode node; + const size_t size = heap_.size(); + + assert(heap_type_ != UNKNOWN_HEAP); + if (heap_type_ == BENEFIT_HEAP) { + std::pop_heap(heap_.begin(), heap_.end(), FilterCacheHeapNodeLessComparor); + } else if (heap_type_ == COST_HEAP) { + std::pop_heap(heap_.begin(), heap_.end(), FilterCacheHeapNodeGreaterComparor); + } + node = heap_[size - 1]; + heap_.pop_back(); + + // remove node from heap_index_ + if (node == nullptr) { + return; + } + const uint32_t segment_id = node->segment_id; + auto it = heap_index_.find(segment_id); + if (it != heap_index_.end()) { + 
heap_index_.erase(node->segment_id); + } + if (node != nullptr) { + delete node; // remember to release node !!! + } + assert(heap_.size() == heap_index_.size()); + + // heap_mutex_.unlock(); +} +*/ + +/* +void FilterCacheHeap::push(FilterCacheHeapNode& node) { + if (node == nullptr) { + return; + } + + // heap_mutex_.lock(); + + heap_.emplace_back(node); + assert(heap_type_ != UNKNOWN_HEAP); + if (heap_type_ == BENEFIT_HEAP) { + std::push_heap(heap_.begin(), heap_.end(), FilterCacheHeapNodeLessComparor); + } else if (heap_type_ == COST_HEAP) { + std::push_heap(heap_.begin(), heap_.end(), FilterCacheHeapNodeGreaterComparor); + } + + // remember to upsert node into heap_index_ + // upsert(node); + if (node == nullptr) { + return; + } + const uint32_t segment_id = node->segment_id; + auto it = heap_index_.find(segment_id); + if (it != heap_index_.end()) { + it->second = node; // already exist in heap_index_, only update + } else { + heap_index_.insert(std::make_pair(segment_id, node)); // insert into heap_index_ + } + assert(heap_.size() == heap_index_.size()); + + // heap_mutex_.unlock(); +} +*/ + +void FilterCacheHeap::batch_query(std::vector& segment_ids, std::vector& return_nodes) { + // heap_mutex_.lock(); + + return_nodes.clear(); + for (uint32_t& segment_id : segment_ids) { + auto it = heap_index_.find(segment_id); + FilterCacheHeapNode return_node = nullptr; + // if node->is_alive is false, the segment already merged and never exists in storage + // so we should return null when query a merged segment id + if (it != heap_index_.end() && (it->second)->is_alive == true) { + return_node = it->second; // node exists in heap_index_ and segment alive + } + return_nodes.emplace_back(return_node); + } + + // heap_mutex_.unlock(); +} + +void FilterCacheHeap::batch_upsert(std::vector& nodes) { + // heap_mutex_.lock(); + + // we guarantee that if one node already exists in heap_index_, it must exist in heap + for (FilterCacheHeapNode& node : nodes) { + const uint32_t segment_id = node->segment_id; + auto it = heap_index_.find(segment_id); + if (it != heap_index_.end()) { + // exist in heap_index_ and heap_ + // we may query nodes from this heap, and update var in nodes, then upsert original nodes + // check it->second != node to make sure that we won't free a refered sapce + if (it->second != node) { + *(it->second) = *(node); // only copy content, this will update content of node in heap_index_ and heap_ + delete node; // remember to free unnecessary space! 
+ } + } else { + // not exist in heap_index_ and heap_ + heap_index_.insert(std::make_pair(segment_id, node)); // insert into heap_index_ + heap_.emplace_back(node); // push into heap_ + } + } + + // update or insert done, need to rebuild heap_ + rebuild_heap(); + + // heap_mutex_.unlock(); +} + +void FilterCacheHeap::batch_delete(std::vector& segment_ids) { + // heap_mutex_.lock(); + + // we guarantee that if one node not exist in heap_index_, it must not exist in heap + for (uint32_t& segment_id : segment_ids) { + auto it = heap_index_.find(segment_id); + if (it == heap_index_.end()) { + // not exist in heap_index_ and heap_ + // do nothing + } else { + // exist in heap_index_ and heap_ + // set is_alive to false and delete after that + it->second->is_alive = false; + } + } + + // delete nodes that is_alive == false + auto it = heap_.begin(); + FilterCacheHeapNode node = nullptr; + while (it != heap_.end()) { + node = (*it); + if (node->is_alive == false) { + // need delete + const uint32_t segment_id = node->segment_id; + // already delete node in heap_ + it = heap_.erase(it); // it will point to next available node + // already delete node in heap_index_ + heap_index_.erase(segment_id); + // remember to free node after that + delete node; + } else { + it ++; + } + } + + // delete done, need to rebuild heap_ + rebuild_heap(); + + // heap_mutex_.unlock(); +} + +void FilterCacheHeapManager::batch_delete(std::vector& segment_ids) { + manager_mutex_.lock(); + + for (uint32_t& segment_id : segment_ids) { + auto cnt_it = heap_visit_cnt_recorder_.find(segment_id); + auto limit_it = units_num_limit_recorder_.find(segment_id); + if (cnt_it != heap_visit_cnt_recorder_.end()) { + heap_visit_cnt_recorder_.erase(segment_id); + } + if (limit_it != units_num_limit_recorder_.end()) { + units_num_limit_recorder_.erase(segment_id); + } + } + + benefit_heap_.batch_delete(segment_ids); + cost_heap_.batch_delete(segment_ids); + + manager_mutex_.unlock(); +} + +void FilterCacheHeapManager::batch_upsert(std::vector& items) { + manager_mutex_.lock(); + + std::vector benefit_nodes, cost_nodes; + for (FilterCacheHeapItem& item : items) { + assert(item.current_units_num >= MIN_UNITS_NUM); + assert(item.current_units_num <= item.units_num_limit); + double benefit = StandardBenefitWithMaxBound(item.approx_visit_cnt, item.current_units_num, item.units_num_limit); + double cost = StandardCostWithMinBound(item.approx_visit_cnt, item.current_units_num, MIN_UNITS_NUM); + // item meets at least one conditions + // so that item always upsert into heap + // if item.approx_visit_cnt = 0, still push into heap + // we may modify its visit cnt in heap later + /* + if (item.current_units_num > MIN_UNITS_NUM) { + // make ready to upsert cost nodes + // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, const double& heap_value, const uint16_t& limit) + cost_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + cost, + item.units_num_limit) + ); + } + if (item.current_units_num < item.units_num_limit) { + // make ready to upsert benefit nodes + // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, const double& heap_value, const uint16_t& limit) + benefit_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + benefit, + item.units_num_limit) + ); + } + */ + + if (item.current_units_num <= item.units_num_limit) { + cost_nodes.emplace_back(new 
FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + cost, + item.units_num_limit) + ); + benefit_nodes.emplace_back(new FilterCacheHeapItem(item.segment_id, + item.approx_visit_cnt, + item.current_units_num, + benefit, + item.units_num_limit) + ); + } + + // update visit cnt, we need to keep recorder visit cnt and heap visit cnt the same + const uint32_t segment_id = item.segment_id; + const uint32_t visit_cnt = item.approx_visit_cnt; + const uint16_t units_limit = item.units_num_limit; + auto cnt_it = heap_visit_cnt_recorder_.find(segment_id); + auto limit_it = units_num_limit_recorder_.find(segment_id); + if (cnt_it != heap_visit_cnt_recorder_.end()) { + cnt_it->second = visit_cnt; + } else { + heap_visit_cnt_recorder_.insert(std::make_pair(segment_id, visit_cnt)); + } + if (limit_it != units_num_limit_recorder_.end()) { + limit_it->second = units_limit; + } else { + units_num_limit_recorder_.insert(std::make_pair(segment_id, units_limit)); + } + } + + // upsert nodes into heaps + benefit_heap_.batch_upsert(benefit_nodes); + cost_heap_.batch_upsert(cost_nodes); + + manager_mutex_.unlock(); +} + +bool FilterCacheHeapManager::try_modify(FilterCacheModifyResult& result) { + manager_mutex_.lock(); + + FilterCacheHeapNode benefit_node = benefit_heap_.heap_top(); + FilterCacheHeapNode cost_node = cost_heap_.heap_top(); + // if benefit heap or cost heap empty, no need to modify + if (benefit_node == nullptr || cost_node == nullptr) { + manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock + return false; + } + + if (benefit_node->is_alive == false || cost_node->is_alive == false) { + manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock + return false; + } + + const double benefit = benefit_node->benefit_or_cost; + const double cost = cost_node->benefit_or_cost; + // if benefit of enable one unit <= cost of disable one unit, no need to modify + if (benefit <= cost) { + manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock + return false; + } + + const uint32_t benefit_segment_id = benefit_node->segment_id; + const uint32_t cost_segment_id = cost_node->segment_id; + // if we will enable and disable one unit of the same segment, ignore it + if (benefit_segment_id == cost_segment_id) { + manager_mutex_.unlock(); // remember to unlock, or we will cause deadlock + return false; + } + + // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, const double& heap_value) + // we can try filter unit modification, reminded that this modification will modify units num of two segments + // so we need to upsert new nodes of these two segments into benefit heap and cost heap + std::vector new_benefit_nodes, new_cost_nodes; + + /* + if (benefit_node->current_units_num + 1 < benefit_node->units_num_limit) { + new_benefit_nodes.emplace_back(new FilterCacheHeapItem(benefit_node->segment_id, + benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1, + StandardBenefit(benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1 + ), + benefit_node->units_num_limit + ) + ); + } + // benefit node will enable one unit, so its units num will always > MIN_UNITS_NUM + new_cost_nodes.emplace_back(new FilterCacheHeapItem(benefit_node->segment_id, + benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1, + StandardCost(benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1 + ), + benefit_node->units_num_limit + ) + ); + + if 
(cost_node->current_units_num - 1 > MIN_UNITS_NUM) { + new_cost_nodes.emplace_back(new FilterCacheHeapItem(cost_node->segment_id, + cost_node->approx_visit_cnt, + cost_node->current_units_num - 1, + StandardCost(cost_node->approx_visit_cnt, + cost_node->current_units_num - 1 + ), + cost_node->units_num_limit + ) + ); + } + // cost node will disable one unit, so its units num will always < MAX_UNITS_NUM + new_benefit_nodes.emplace_back(new FilterCacheHeapItem(cost_node->segment_id, + cost_node->approx_visit_cnt, + cost_node->current_units_num - 1, + StandardBenefit(cost_node->approx_visit_cnt, + cost_node->current_units_num - 1 + ), + cost_node->units_num_limit + ) + ); + */ + // we set benefit of nodes (units num == units num limit) to 0.0 + // and cost of nodes (units num == 0) to Infinite + // these prevent modifying these nodes' units num + // so we dont need to check units num + new_benefit_nodes.emplace_back(new FilterCacheHeapItem(benefit_node->segment_id, + benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1, + StandardBenefitWithMaxBound(benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1, + benefit_node->units_num_limit + ), + benefit_node->units_num_limit + ) + ); + new_cost_nodes.emplace_back(new FilterCacheHeapItem(benefit_node->segment_id, + benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1, + StandardCostWithMinBound(benefit_node->approx_visit_cnt, + benefit_node->current_units_num + 1, + MIN_UNITS_NUM + ), + benefit_node->units_num_limit + ) + ); + new_cost_nodes.emplace_back(new FilterCacheHeapItem(cost_node->segment_id, + cost_node->approx_visit_cnt, + cost_node->current_units_num - 1, + StandardCostWithMinBound(cost_node->approx_visit_cnt, + cost_node->current_units_num - 1, + MIN_UNITS_NUM), + cost_node->units_num_limit + ) + ); + new_benefit_nodes.emplace_back(new FilterCacheHeapItem(cost_node->segment_id, + cost_node->approx_visit_cnt, + cost_node->current_units_num - 1, + StandardBenefitWithMaxBound(cost_node->approx_visit_cnt, + cost_node->current_units_num - 1, + cost_node->units_num_limit + ), + cost_node->units_num_limit + ) + ); + // already make ready for upsert + benefit_heap_.batch_upsert(new_benefit_nodes); + cost_heap_.batch_upsert(new_cost_nodes); + + // write result + result.enable_segment_id = benefit_node->segment_id; + result.disable_segment_id = cost_node->segment_id; + result.enable_segment_units_num = benefit_node->current_units_num; + result.disable_segment_units_num = cost_node->current_units_num; + result.enable_segment_next_units_num = benefit_node->current_units_num + 1; + result.disable_segment_next_units_num = cost_node->current_units_num - 1; + result.enable_benefit = benefit; + result.disable_cost = cost; + + // return nothing, result already written into var result + + manager_mutex_.unlock(); + + return true; +} + +void FilterCacheHeapManager::sync_visit_cnt(std::map& current_visit_cnt_recorder) { + manager_mutex_.lock(); + + std::vector sync_nodes; + std::vector sync_segment_ids; + + auto heap_it = heap_visit_cnt_recorder_.begin(); + auto current_it = current_visit_cnt_recorder.begin(); + while (heap_it != heap_visit_cnt_recorder_.end() && + current_it != current_visit_cnt_recorder.end()) { + if (heap_it->first < current_it->first) { + heap_it ++; + } else if (heap_it->first > current_it->first) { + current_it ++; + } else { + // heap_it->first == current_it->first + assert(heap_it->first == current_it->first); + int64_t old_visit_cnt = heap_it->second; + int64_t cur_visit_cnt = 
current_it->second; + if (std::abs(cur_visit_cnt-old_visit_cnt) > VISIT_CNT_UPDATE_BOUND) { + heap_it->second = current_it->second; // remember to update heap visit cnt recorder + sync_segment_ids.emplace_back(current_it->first); + } + // heap_it ++; + current_it ++; + } + } + + // query nodes in heap + std::vector sync_benefit_nodes, sync_cost_nodes; + benefit_heap_.batch_query(sync_segment_ids, sync_benefit_nodes); + cost_heap_.batch_query(sync_segment_ids, sync_cost_nodes); + + // update visit cnt and benefit/cost in these nodes + for (FilterCacheHeapNode& sync_benefit_node : sync_benefit_nodes) { + if (sync_benefit_node != nullptr) { + sync_benefit_node->approx_visit_cnt = current_visit_cnt_recorder[sync_benefit_node->segment_id]; + sync_benefit_node->benefit_or_cost = StandardBenefitWithMaxBound(sync_benefit_node->approx_visit_cnt, + sync_benefit_node->current_units_num, + sync_benefit_node->units_num_limit); + } + } + for (FilterCacheHeapNode& sync_cost_node : sync_cost_nodes) { + if (sync_cost_node != nullptr) { + sync_cost_node->approx_visit_cnt = current_visit_cnt_recorder[sync_cost_node->segment_id]; + sync_cost_node->benefit_or_cost = StandardCostWithMinBound(sync_cost_node->approx_visit_cnt, + sync_cost_node->current_units_num, + MIN_UNITS_NUM); + } + } + + // upsert nodes into benefit heap and cost heap + // benefit_heap_.batch_upsert(sync_benefit_nodes); + // cost_heap_.batch_upsert(sync_cost_nodes); + + + // notice that we already updated these nodes in heap, we only need to rebuild heap + // but heap.upsert include the step of checking whether these segments already in heap + // this will waste some time, can we rebuild heap directly? + benefit_heap_.rebuild_heap(); + cost_heap_.rebuild_heap(); + + manager_mutex_.unlock(); +} + +void FilterCacheHeapManager::sync_units_num_limit(std::map& current_units_num_limit_recorder) { + manager_mutex_.lock(); + + std::vector sync_nodes; + std::vector sync_segment_ids; + + auto origin_it = units_num_limit_recorder_.begin(); + auto current_it = current_units_num_limit_recorder.begin(); + while (origin_it != units_num_limit_recorder_.end() && + current_it != current_units_num_limit_recorder.end()) { + if (origin_it->first < current_it->first) { + origin_it ++; + } else if (origin_it->first > current_it->first) { + current_it ++; + } else { + // origin_it->first == current_it->first + assert(origin_it->first == current_it->first); + assert(current_it->second <= MAX_UNITS_NUM); + if (origin_it->second != current_it->second) { + origin_it->second = current_it->second; + sync_segment_ids.emplace_back(current_it->first); + } + current_it ++; + } + } + + // query nodes in heap + std::vector sync_benefit_nodes, sync_cost_nodes; + benefit_heap_.batch_query(sync_segment_ids, sync_benefit_nodes); + cost_heap_.batch_query(sync_segment_ids, sync_cost_nodes); + + // update units num limit, units num and benefit/cost in these nodes + for (FilterCacheHeapNode& sync_benefit_node : sync_benefit_nodes) { + if (sync_benefit_node != nullptr) { + sync_benefit_node->units_num_limit = current_units_num_limit_recorder[sync_benefit_node->segment_id]; + sync_benefit_node->current_units_num = std::min(sync_benefit_node->units_num_limit, + sync_benefit_node->current_units_num); + sync_benefit_node->benefit_or_cost = StandardBenefitWithMaxBound(sync_benefit_node->approx_visit_cnt, + sync_benefit_node->current_units_num, + sync_benefit_node->units_num_limit); + } + } + for (FilterCacheHeapNode& sync_cost_node : sync_cost_nodes) { + if (sync_cost_node != nullptr) { + 
sync_cost_node->units_num_limit = current_units_num_limit_recorder[sync_cost_node->segment_id]; + sync_cost_node->current_units_num = std::min(sync_cost_node->units_num_limit, + sync_cost_node->current_units_num); + sync_cost_node->benefit_or_cost = StandardCostWithMinBound(sync_cost_node->approx_visit_cnt, + sync_cost_node->current_units_num, + MIN_UNITS_NUM); + } + } + + // upsert nodes into benefit heap and cost heap + // benefit_heap_.batch_upsert(sync_benefit_nodes); + // cost_heap_.batch_upsert(sync_cost_nodes); + + + // notice that we already updated these nodes in heap, we only need to rebuild heap + // but heap.upsert include the step of checking whether these segments already in heap + // this will waste some time, can we rebuild heap directly? + benefit_heap_.rebuild_heap(); + cost_heap_.rebuild_heap(); + + manager_mutex_.unlock(); +} + +void FilterCacheHeapManager::debug() { + std::vector items; + std::vector segment_ids; + std::map current_visit_cnt_recorder; + std::map current_units_num_limit_recorder; + std::map b_heap_index; + std::vector b_heap; + std::map c_heap_index; + std::vector c_heap; + std::fstream f_heap; + f_heap.open("/pg_wal/ycc/heap.log", std::ios::out | std::ios::app); + // FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, + // const double& heap_value, const uint16_t& limit) + // 1. try to insert some new data + f_heap << "[DEBUG] debug step 1 : batch insert" << std::endl << std::endl; + for (uint32_t id = 0; id < 70; id++) { + items.emplace_back(id % 70, (id % 70) * 10, (id % 70) / 10, 0, MAX_UNITS_NUM); + } + batch_upsert(items); + benefit_heap_.heap_index(b_heap_index); + benefit_heap_.heap(b_heap); + cost_heap_.heap_index(c_heap_index); + cost_heap_.heap(c_heap); + f_heap << "[DEBUG] step1 b_heap_index : " << std::endl; + for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step1 b_heap : " << std::endl; + for (FilterCacheHeapNode& node : b_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step1 c_heap_index : " << std::endl; + for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step1 c_heap : " << std::endl; + for (FilterCacheHeapNode& node : c_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << 
node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step1 visit_cnt_recorder : " << std::endl; + for (auto it = heap_visit_cnt_recorder_.begin(); + it != heap_visit_cnt_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + f_heap << "[DEBUG] step1 units_limit_recorder : " << std::endl; + for (auto it = units_num_limit_recorder_.begin(); + it != units_num_limit_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + + // 2. try to update old data + f_heap << std::endl << std::endl<< "[DEBUG] debug step 2 : batch update (using upsert)" << std::endl << std::endl; + items.clear(); + for (uint32_t id = 0; id < 70; id++) { + items.emplace_back(id % 70, (id % 70) * std::pow(10, (id % 70) / 10), (id % 70) / 10, 0, MAX_UNITS_NUM); + } + batch_upsert(items); + benefit_heap_.heap_index(b_heap_index); + benefit_heap_.heap(b_heap); + cost_heap_.heap_index(c_heap_index); + cost_heap_.heap(c_heap); + f_heap << "[DEBUG] step2 b_heap_index : " << std::endl; + for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step2 b_heap : " << std::endl; + for (FilterCacheHeapNode& node : b_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step2 c_heap_index : " << std::endl; + for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step2 c_heap : " << std::endl; + for (FilterCacheHeapNode& node : c_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step2 visit_cnt_recorder : " << std::endl; + for (auto it = heap_visit_cnt_recorder_.begin(); + it != heap_visit_cnt_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + f_heap << "[DEBUG] step2 units_limit_recorder : " << std::endl; + for (auto it = units_num_limit_recorder_.begin(); + it != units_num_limit_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + + // 3. 
try to delete some data + f_heap << std::endl << std::endl<< "[DEBUG] debug step 3 : batch delete" << std::endl << std::endl; + items.clear(); + segment_ids.clear(); + for (uint32_t i = 0; i < 10; i++) { + segment_ids.emplace_back(i); + } + for (uint32_t i = 60; i < 100; i++) { + segment_ids.emplace_back(i); + } + batch_delete(segment_ids); + benefit_heap_.heap_index(b_heap_index); + benefit_heap_.heap(b_heap); + cost_heap_.heap_index(c_heap_index); + cost_heap_.heap(c_heap); + f_heap << "[DEBUG] step3 b_heap_index : " << std::endl; + for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step3 b_heap : " << std::endl; + for (FilterCacheHeapNode& node : b_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step3 c_heap_index : " << std::endl; + for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step3 c_heap : " << std::endl; + for (FilterCacheHeapNode& node : c_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step3 visit_cnt_recorder : " << std::endl; + for (auto it = heap_visit_cnt_recorder_.begin(); + it != heap_visit_cnt_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + f_heap << "[DEBUG] step3 units_limit_recorder : " << std::endl; + for (auto it = units_num_limit_recorder_.begin(); + it != units_num_limit_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + + // 4. 
try to sync visit cnt + f_heap << std::endl << std::endl<< "[DEBUG] debug step 4 : sync visit cnt " << std::endl << std::endl; + for (uint32_t id = 0; id < 40; id++) { + if (id % 2 == 0) { + current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); + } + } + for (uint32_t id = 40; id < 60; id++) { + current_visit_cnt_recorder.insert(std::make_pair(id, (id % 70) * std::pow(10, (id % 70) / 10) + 101010)); + } + sync_visit_cnt(current_visit_cnt_recorder); + benefit_heap_.heap_index(b_heap_index); + benefit_heap_.heap(b_heap); + cost_heap_.heap_index(c_heap_index); + cost_heap_.heap(c_heap); + f_heap << "[DEBUG] step4 b_heap_index : " << std::endl; + for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step4 b_heap : " << std::endl; + for (FilterCacheHeapNode& node : b_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step4 c_heap_index : " << std::endl; + for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step4 c_heap : " << std::endl; + for (FilterCacheHeapNode& node : c_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step4 visit_cnt_recorder : " << std::endl; + for (auto it = heap_visit_cnt_recorder_.begin(); + it != heap_visit_cnt_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + f_heap << "[DEBUG] step4 units_limit_recorder : " << std::endl; + for (auto it = units_num_limit_recorder_.begin(); + it != units_num_limit_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + + // 5. 
try to decrease units limit + f_heap << std::endl << std::endl<< "[DEBUG] debug step 5 : decrease units limit " << std::endl << std::endl; + for (uint32_t id = 0; id < 40; id++) { + if (id % 2 == 0) { + current_units_num_limit_recorder.insert(std::make_pair(id, 0)); + } else { + current_units_num_limit_recorder.insert(std::make_pair(id, 1)); + } + } + for (uint32_t id = 40; id < 50; id++) { + current_units_num_limit_recorder.insert(std::make_pair(id, 3)); + } + for (uint32_t id = 50; id < 70; id++) { + current_units_num_limit_recorder.insert(std::make_pair(id, 5)); + } + sync_units_num_limit(current_units_num_limit_recorder); + benefit_heap_.heap_index(b_heap_index); + benefit_heap_.heap(b_heap); + cost_heap_.heap_index(c_heap_index); + cost_heap_.heap(c_heap); + f_heap << "[DEBUG] step5 b_heap_index : " << std::endl; + for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step5 b_heap : " << std::endl; + for (FilterCacheHeapNode& node : b_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step5 c_heap_index : " << std::endl; + for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step5 c_heap : " << std::endl; + for (FilterCacheHeapNode& node : c_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step5 visit_cnt_recorder : " << std::endl; + for (auto it = heap_visit_cnt_recorder_.begin(); + it != heap_visit_cnt_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + f_heap << "[DEBUG] step5 units_limit_recorder : " << std::endl; + for (auto it = units_num_limit_recorder_.begin(); + it != units_num_limit_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + + // 6. 
try to increase units limit + f_heap << std::endl << std::endl<< "[DEBUG] debug step 6 : increase units limit " << std::endl << std::endl; + for (uint32_t id = 0; id < 40; id++) { + if (id % 2 == 0) { + current_units_num_limit_recorder[id] = 3; + } else { + current_units_num_limit_recorder[id] = 4; + } + } + for (uint32_t id = 40; id < 50; id++) { + current_units_num_limit_recorder[id] = 5; + } + for (uint32_t id = 50; id < 70; id++) { + current_units_num_limit_recorder[id] = 6; + } + sync_units_num_limit(current_units_num_limit_recorder); + benefit_heap_.heap_index(b_heap_index); + benefit_heap_.heap(b_heap); + cost_heap_.heap_index(c_heap_index); + cost_heap_.heap(c_heap); + f_heap << "[DEBUG] step6 b_heap_index : " << std::endl; + for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step6 b_heap : " << std::endl; + for (FilterCacheHeapNode& node : b_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step6 c_heap_index : " << std::endl; + for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step6 c_heap : " << std::endl; + for (FilterCacheHeapNode& node : c_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step6 visit_cnt_recorder : " << std::endl; + for (auto it = heap_visit_cnt_recorder_.begin(); + it != heap_visit_cnt_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + f_heap << "[DEBUG] step6 units_limit_recorder : " << std::endl; + for (auto it = units_num_limit_recorder_.begin(); + it != units_num_limit_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + + // 7. 
try to loop modification + f_heap << std::endl << std::endl<< "[DEBUG] debug step 7 : loop try_modify " << std::endl << std::endl; + f_heap << "[DEBUG] step7 loop start : " << std::endl; + FilterCacheModifyResult result; + while (try_modify(result)) { + f_heap << "enable segment -> " << "id : " << result.enable_segment_id; + f_heap << " , prev units num : " << result.enable_segment_units_num; + f_heap << " , benefit : " << result.enable_benefit << std::endl; + f_heap << "disable segment -> " << "id : " << result.disable_segment_id; + f_heap << " , prev units num : " << result.disable_segment_units_num; + f_heap << " , cost : " << result.disable_cost << std::endl; + } + // write final indexs and heaps + benefit_heap_.heap_index(b_heap_index); + benefit_heap_.heap(b_heap); + cost_heap_.heap_index(c_heap_index); + cost_heap_.heap(c_heap); + f_heap << "[DEBUG] step7 b_heap_index : " << std::endl; + for (auto it = b_heap_index.begin(); it != b_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step7 b_heap : " << std::endl; + for (FilterCacheHeapNode& node : b_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step7 c_heap_index : " << std::endl; + for (auto it = c_heap_index.begin(); it != c_heap_index.end(); it++) { + FilterCacheHeapNode node = it->second; + f_heap << it->first << " -> "; + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step7 c_heap : " << std::endl; + for (FilterCacheHeapNode& node : c_heap) { + f_heap << " id : " << node->segment_id; + f_heap << " , cnt : " << node->approx_visit_cnt; + f_heap << " , units : " << node->current_units_num; + f_heap << " , value : " << node->benefit_or_cost; + f_heap << " , limit : " << node->units_num_limit; + f_heap << " , alive : " << node->is_alive << std::endl; + } + f_heap << "[DEBUG] step7 visit_cnt_recorder : " << std::endl; + for (auto it = heap_visit_cnt_recorder_.begin(); + it != heap_visit_cnt_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + f_heap << "[DEBUG] step7 units_limit_recorder : " << std::endl; + for (auto it = units_num_limit_recorder_.begin(); + it != units_num_limit_recorder_.end(); it++) { + f_heap << it->first << " -> " << it->second << std::endl; + } + + f_heap.close(); +} + +} \ No newline at end of file diff --git a/db/art/filter_cache_heap.h b/db/art/filter_cache_heap.h new file mode 100644 index 000000000..ca9aea1aa --- /dev/null +++ b/db/art/filter_cache_heap.h @@ -0,0 +1,276 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "macros.h" + +namespace ROCKSDB_NAMESPACE { +// when 
filter cache is full , +// we need to use heap manager to +// clear some space and insert new filter units +// for these coming new segments +// or we may need to use heap manager to adjust filter cache +// to reduce extra I/O caused by false positive + +struct FilterCacheHeapItem; +typedef FilterCacheHeapItem* FilterCacheHeapNode; +struct FilterCacheModifyResult; +class FilterCacheHeap; +class FilterCacheHeapManager; +inline bool FilterCacheHeapNodeLessComparor(const FilterCacheHeapNode& node_1, const FilterCacheHeapNode& node_2); +inline bool FilterCacheHeapNodeGreaterComparor(const FilterCacheHeapNode& node_1, const FilterCacheHeapNode& node_2); +inline double StandardBenefitWithMaxBound(const uint32_t& visit_cnt, const uint16_t& units_num, const uint16_t& max_bound); +inline double StandardCostWithMinBound(const uint32_t& visit_cnt, const uint16_t& units_num, const uint16_t& min_bound); + +struct FilterCacheHeapItem { + uint32_t segment_id; + uint32_t approx_visit_cnt; // estimated visit cnt + uint16_t current_units_num; // enabled units num for this segment + double benefit_or_cost; // can represent enable benefit or disable cost + uint16_t units_num_limit; // units num prediction model predict maximum units num for every segment + bool is_alive; // sign whether this item still used, if false, that means this segment already merged and freed + // default set heap_value = 0, we will compuate benefit or cost in batch upsert func + FilterCacheHeapItem(const uint32_t& id, const uint32_t& cnt, const uint16_t& units, const double& heap_value, const uint16_t& limit) { + segment_id = id; + approx_visit_cnt = cnt; + current_units_num = units; + benefit_or_cost = heap_value; + units_num_limit = limit; + is_alive = true; + assert(current_units_num >= MIN_UNITS_NUM); + assert(current_units_num <= units_num_limit); + } + /* + FilterCacheHeapItem(const FilterCacheHeapItem& item) { + segment_id = item.segment_id; + approx_visit_cnt = item.approx_visit_cnt; + current_units_num = item.current_units_num; + benefit_or_cost = item.benefit_or_cost; + units_num_limit = item.units_num_limit; + is_alive = item.is_alive; + } + */ +}; + +struct FilterCacheModifyResult { + uint32_t enable_segment_id; + uint32_t disable_segment_id; + uint16_t enable_segment_units_num; + uint16_t disable_segment_units_num; + uint16_t enable_segment_next_units_num; + uint16_t disable_segment_next_units_num; + double enable_benefit; + double disable_cost; +}; + +inline bool FilterCacheHeapNodeLessComparor(const FilterCacheHeapNode& node_1, const FilterCacheHeapNode& node_2) { + return node_1->benefit_or_cost < node_2->benefit_or_cost; +} + +inline bool FilterCacheHeapNodeGreaterComparor(const FilterCacheHeapNode& node_1, const FilterCacheHeapNode& node_2) { + return node_1->benefit_or_cost > node_2->benefit_or_cost; +} + +inline double StandardBenefitWithMaxBound(const uint32_t& visit_cnt, const uint16_t& units_num, const uint16_t& max_bound) { + int bits_per_key = BITS_PER_KEY_PER_UNIT; + // We intentionally round down to reduce probing cost a little bit + int num_probes = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (num_probes < 1) num_probes = 1; + if (num_probes > 30) num_probes = 30; + + // compute false positive rate of one filter unit + double rate_per_unit = std::pow(1.0 - std::exp(-double(num_probes) / double(bits_per_key)), num_probes); + + assert(max_bound >= MIN_UNITS_NUM); + assert(max_bound <= MAX_UNITS_NUM); + if (units_num >= max_bound) { + return 0.0; // 0.0 is the lowest value of benefit (benefit >= 
0.0) + } + + uint16_t next_units_num = units_num + 1; + double rate = std::pow(rate_per_unit, units_num); + double next_rate = std::pow(rate_per_unit, next_units_num); + + double benefit = double(visit_cnt) * (rate - next_rate); + /* + std::cout << "visit_cnt : " << visit_cnt + << " , rate : " << rate + << " , next_rate : " << next_rate + << " . rate_per_unit : " << rate_per_unit + << std::endl; + */ + assert(benefit >= 0); + return benefit; +} + +inline double StandardCostWithMinBound(const uint32_t& visit_cnt, const uint16_t& units_num, const uint16_t& min_bound) { + int bits_per_key = BITS_PER_KEY_PER_UNIT; + // We intentionally round down to reduce probing cost a little bit + int num_probes = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (num_probes < 1) num_probes = 1; + if (num_probes > 30) num_probes = 30; + + // compute false positive rate of one filter unit + double rate_per_unit = std::pow(1.0 - std::exp(-double(num_probes) / double(bits_per_key)), num_probes); + + assert(min_bound >= MIN_UNITS_NUM); + assert(min_bound <= MAX_UNITS_NUM); + if (units_num <= min_bound) { + return __DBL_MAX__; + } + + uint16_t next_units_num = units_num - 1; + double rate = std::pow(rate_per_unit, units_num); + double next_rate = std::pow(rate_per_unit, next_units_num); + + double cost = double(visit_cnt) * (next_rate - rate); + /* + std::cout << "visit_cnt : " << visit_cnt + << " , rate : " << rate + << " , next_rate : " << next_rate + << " . rate_per_unit : " << rate_per_unit + << std::endl; + */ + assert(cost >= 0); + return cost; +} + +class FilterCacheHeap { +private: + int heap_type_; + // map, use this map to fastly locate node in heap + std::map heap_index_; + // use make_heap, push_heap, pop_heap to manage heap + std::vector heap_; + // std::mutex heap_mutex_; + +public: + FilterCacheHeap() { + heap_type_ = UNKNOWN_HEAP; + heap_index_.clear(); + heap_.clear(); + } + + ~FilterCacheHeap() { + // do nothing + } + + void set_type(const int type) { + heap_type_ = type; + } + + // only rebuild heap_, do nothing to heap_index_ + void rebuild_heap() { + // heap_mutex_.lock(); + + assert(heap_type_ != UNKNOWN_HEAP); + assert(heap_.size() == heap_index_.size()); + if (heap_type_ == BENEFIT_HEAP) { + std::make_heap(heap_.begin(), heap_.end(), FilterCacheHeapNodeLessComparor); + } else if (heap_type_ == COST_HEAP) { + std::make_heap(heap_.begin(), heap_.end(), FilterCacheHeapNodeGreaterComparor); + } + + // heap_mutex_.unlock(); + } + + // return heap top + FilterCacheHeapNode heap_top(); + + // pop one node with deleting node from heap_index_ + // void pop(); + + // push one node with upsert node into heap_index_ + // void push(FilterCacheHeapNode& node); + + // given a batch of segment id, return needed nodes. + // only support batch query and reminded that one return node may be null + // if segment not available or segment not exists in heap_index_ + // result will write into return_nodes + void batch_query(std::vector& segment_ids, std::vector& return_nodes); + + // upsert batch nodes into heap_index_ and heap_ + // only support batch upsert, if one node already exists in heap_index_, it must in heap + // so we only need to update the content of that existing node + void batch_upsert(std::vector& nodes); + + // delete batch nodes from heap_index_ and heap_ + // only support batch delete, if one node not exist in heap_index_, it must not exist in heap + // so we only need to delete these existing nodes + void batch_delete(std::vector& segment_ids); + + // only used in debug !!! 
+ void heap_index(std::map& heap_index) { + heap_index.clear(); + heap_index.insert(heap_index_.begin(), heap_index_.end()); + } + + // only used in debug !!! + void heap(std::vector& heap) { + heap.clear(); + heap.assign(heap_.begin(), heap_.end()); + } +}; + +class FilterCacheHeapManager { +private: + static FilterCacheHeap benefit_heap_; + static FilterCacheHeap cost_heap_; + // set heap node visit cnt = c_1, real estimated visit cnt = c_2 + // we only update c_1 when | c_1 - c_2 | >= VISIT_CNT_UPDATE_BOUND + // update c_1 means we need to update this recorder and heap + // heap_visit_cnt_recorder: map + // when filter cache call delete, this recorder will automately delete these merged segment ids + // when filter cache call upsert, this recorder will automately upsert these segment ids + static std::map heap_visit_cnt_recorder_; + static std::map units_num_limit_recorder_; + // TODO: mutex can be optimized + static std::mutex manager_mutex_; + +public: + FilterCacheHeapManager() { + benefit_heap_.set_type(BENEFIT_HEAP); + cost_heap_.set_type(COST_HEAP); + heap_visit_cnt_recorder_.clear(); + units_num_limit_recorder_.clear(); + } + + ~FilterCacheHeapManager() { + // do nothing + } + + // sync units_num_limit in heap and recorder + // reminded that we will not insert or delete nodes in this method + // we only update these nodes that already exist in two heaps + void sync_units_num_limit(std::map& current_units_num_limit_recorder); + + // sync visit cnt in heap and real estimated visit cnt + // reminded that we will not insert or delete nodes in this method + // we only update these nodes that already exist in two heaps + void sync_visit_cnt(std::map& current_visit_cnt_recorder); + + // try to read benefit_heap top and cost_heap top, then judge whether we need to modify units num in filter cache + // return true when we can modify units num of several segments, return false when we cannot + // reminded that this func only modify heap, we still need to update filter units in filter cache + bool try_modify(FilterCacheModifyResult& result); + + // delete batch segment nodes in benefit_heap and cost_heap, also need to update heap_visit_cnt_recorder_ + void batch_delete(std::vector& segment_ids); + + // upsert batch segment nodes in benefit_heap and cost_heap, also need to update heap_visit_cnt_recorder_ + // only input items, we will allocate space for nodes later + // reminded that we will also update heap_visit_cnt_recorder_ if we update a existed node + // because we need to keep heap visit cnt and recorder visit cnt the same + void batch_upsert(std::vector& items); + + // 1. try debug batch insert + // 2. 
try debug batch update(use batch_upsert) + void debug(); +}; + +} diff --git a/db/art/filter_cache_item.cc b/db/art/filter_cache_item.cc new file mode 100644 index 000000000..6f5cb1163 --- /dev/null +++ b/db/art/filter_cache_item.cc @@ -0,0 +1,5 @@ +#include "filter_cache_item.h" + +namespace ROCKSDB_NAMESPACE { + +} \ No newline at end of file diff --git a/db/art/filter_cache_item.h b/db/art/filter_cache_item.h new file mode 100644 index 000000000..8a591b88c --- /dev/null +++ b/db/art/filter_cache_item.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "macros.h" + +namespace ROCKSDB_NAMESPACE { + +// at first, every segment in the filter cache enables a total bits-per-key of 8 by default; as more segments are written, +// once a certain threshold (e.g. 80%) of the filter cache's maximum capacity is occupied, use GreedyAlgo to solve the allocation problem and train the model +// once the filter cache is full, switch to the double-heap adjustment of the filter cache: we only need to predict new segments with the model, +// insert the nodes of these new segments into the two heaps, and start a background thread that keeps adjusting the two heaps and returning adjustment results +// after receiving a result, we can immediately adjust which filter units are enabled, or save the results and adjust them in batches later +// see the design document for details + + +// remember to add the necessary English comments +// the filter cache is mainly a map whose key is the segment id (uint32_t) and whose value is a FilterCacheItem +// member functions need to be defined in filter_cache_item.cc +class FilterCacheItem { +private: + // define the necessary member variables here, keeping them private where possible + // they may store the handle, segment id and other information + // STL containers such as vector and map are allowed + // do we need a mutex to keep enabling/disabling filter units from conflicting with key checks that use these units? +public: + // constructor, may initialize member variables + FilterCacheItem(const uint32_t& segment_id); + + // clean up member variables to avoid memory leaks; anything allocated with new may need to be released here + ~FilterCacheItem(); + + // memory occupied by this item; estimating the total space used by its filter units is enough + // note: the returned size is the number of bits occupied, not bytes + uint32_t approximate_size(); + + // enable or disable filter units according to the number of units currently enabled + // given the target number of enabled units, decide whether to enable, disable or do nothing + // units_num : [MIN_UNITS_NUM, MAX_UNITS_NUM] + void enable_units(const uint32_t& units_num); + + // given a key, check whether it may exist + // concretely, check the units one by one starting from the first one; stop as soon as one unit reports the key is absent + // return true only if every unit reports the key may exist, otherwise return false + // if zero units are enabled, return true by default + bool check_key(const std::string& key); +}; + +} \ No newline at end of file diff --git a/db/art/greedy_algo.cc b/db/art/greedy_algo.cc new file mode 100644 index 000000000..9aff8beed --- /dev/null +++ b/db/art/greedy_algo.cc @@ -0,0 +1,91 @@ +#include "greedy_algo.h" +#include +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { + +// this func is not thread-safe, so make sure only one thread performs this algo!!!
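// A minimal usage sketch of this greedy solver, kept in comment form so it cannot affect the build.
// The local names and numbers below are illustrative only, and the map template arguments
// (segment id -> SegmentAlgoInfo, segment id -> chosen units num) are inferred from debug() in greedy_algo.h:
//
//   std::map<uint32_t, SegmentAlgoInfo> infos;
//   infos.insert(std::make_pair(1, SegmentAlgoInfo(/*visit_cnt=*/1000, /*size_per_unit=*/8 * 1024 * 8)));
//   infos.insert(std::make_pair(2, SegmentAlgoInfo(/*visit_cnt=*/50,   /*size_per_unit=*/8 * 1024 * 8)));
//   std::map<uint32_t, uint16_t> solution;
//   GreedyAlgo algo;
//   algo.solve(infos, solution, /*cache_size=*/CACHE_SPACE_SIZE);
//   // afterwards solution[1] >= solution[2]: a segment with more visits never receives fewer units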
+void GreedyAlgo::solve(std::map& segment_algo_infos, + std::map& algo_solution, const uint32_t& cache_size) { + assert(!segment_algo_infos.empty()); + // ready to perform algo + algo_solution.clear(); + std::vector segment_algo_helper_heap; + for (auto it = segment_algo_infos.begin(); it != segment_algo_infos.end(); it++) { + uint32_t segment_id = it->first; + SegmentAlgoInfo segment_algo_info = it->second; + algo_solution[segment_id] = 0; // init algo_solution + + SegmentAlgoHelper segment_algo_helper(segment_id, segment_algo_info); + segment_algo_helper_heap.emplace_back(segment_algo_helper); // init algo heap + } + assert(segment_algo_infos.size() == algo_solution.size()); + assert(segment_algo_infos.size() == segment_algo_helper_heap.size()); + std::make_heap(segment_algo_helper_heap.begin(), + segment_algo_helper_heap.end(), + CompareSegmentAlgoHelper); + + std::fstream f_algo; + f_algo.open("/pg_wal/ycc/algo.log", std::ios::out | std::ios::app); + f_algo << "[DEBUG] start to record algo : " << std::endl; + + // current used space size (bits) of filter cache + uint32_t current_cache_size = 0; + while (!segment_algo_helper_heap.empty()) { + // std::cout << "segment id : " << segment_algo_helper_heap[0].segment_id << std::endl; + + const size_t size = segment_algo_helper_heap.size(); + // heap top item moved to segment_algo_helper_heap[segment_algo_helper_heap.size()-1]; + std::pop_heap(segment_algo_helper_heap.begin(), + segment_algo_helper_heap.end(), + CompareSegmentAlgoHelper); + SegmentAlgoHelper segment_algo_helper_top = segment_algo_helper_heap[size-1]; + // check whether free space (in filter cache) is enough + uint32_t size_needed = segment_algo_helper_top.size_per_unit; + // if not enough, remove this segment helper from heap + // that means we will not consider this segment any longer + if (current_cache_size + size_needed > cache_size) { + segment_algo_helper_heap.pop_back(); + continue; + } + // SegmentAlgoHelper(const uint32_t& id, const uint32_t& cnt, const uint32_t& size, const uint16_t& units) + SegmentAlgoHelper segment_algo_helper_needed(segment_algo_helper_top.segment_id, + segment_algo_helper_top.visit_cnt, + segment_algo_helper_top.size_per_unit, + segment_algo_helper_top.units_num + 1); + // update enabled units + // noticed that if one segment visit cnt == 0, it still enable one unit + // so check visit num before update algo_solution + if (segment_algo_helper_needed.visit_cnt > 0) { + algo_solution[segment_algo_helper_needed.segment_id] = segment_algo_helper_needed.units_num; + current_cache_size += size_needed; + f_algo << "[DEBUG] segment " << segment_algo_helper_needed.segment_id + << " : " << segment_algo_helper_needed.units_num - 1 << " -> " + << segment_algo_helper_needed.units_num << " , cache space left : " + << cache_size - current_cache_size << " , recv benefit : " + << segment_algo_helper_top.enable_benifit << " , next benefit : " + << segment_algo_helper_needed.enable_benifit << std::endl; + } + assert(algo_solution[segment_algo_helper_needed.segment_id] <= MAX_UNITS_NUM); + // enable benefit == 0 means units_num == MAX_UNITS_NUM + // that means we cannot enable one unit for this segment, already enable all units + if (segment_algo_helper_needed.enable_benifit == 0) { + // assert(segment_algo_helper_needed.units_num >= MAX_UNITS_NUM); + segment_algo_helper_heap.pop_back(); + continue; + } + // we can push this new segment helper into heap + segment_algo_helper_heap[size-1] = segment_algo_helper_needed; + 
std::push_heap(segment_algo_helper_heap.begin(), + segment_algo_helper_heap.end(), + CompareSegmentAlgoHelper); + } + + f_algo << std::endl; + f_algo.close(); + // return nothing, all results should be written into algo_solution +} + +} \ No newline at end of file diff --git a/db/art/greedy_algo.h b/db/art/greedy_algo.h new file mode 100644 index 000000000..a1d03acc6 --- /dev/null +++ b/db/art/greedy_algo.h @@ -0,0 +1,157 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "macros.h" + +namespace ROCKSDB_NAMESPACE { + +struct SegmentAlgoInfo; +struct SegmentAlgoHelper; +class GreedyAlgo; + +inline double StandardBenefit(const uint32_t& visit_cnt, const uint16_t& units_num); +inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num); +inline bool CompareSegmentAlgoHelper(const SegmentAlgoHelper& helper_1, const SegmentAlgoHelper& helper_2); + +// contain visit counter of every segment in last long period +// also contain size of every segment's filter unit +// size of units belonging to one segment should be the same +// size equals to bits that one unit occupies +struct SegmentAlgoInfo { + uint32_t visit_cnt; + uint32_t size_per_unit; + SegmentAlgoInfo(const uint32_t& cnt, const uint32_t& size) { + assert(size > 0); + visit_cnt = cnt; size_per_unit = size; + } +}; + +// helper structure when performing this algo +// exactly, this structure will be the item of algo heap +struct SegmentAlgoHelper { + uint32_t visit_cnt; + uint16_t units_num; + uint32_t size_per_unit; + uint32_t segment_id; + double enable_benifit; + SegmentAlgoHelper(const uint32_t& id, const uint32_t& cnt, const uint32_t& size, const uint16_t& units) { + segment_id = id; visit_cnt = cnt; size_per_unit = size; units_num = units; + enable_benifit = StandardBenefit(visit_cnt, units_num); + // assert(units_num <= MAX_UNITS_NUM); + } + SegmentAlgoHelper(const uint32_t& id, SegmentAlgoInfo& segment_algo_info) { + segment_id = id; visit_cnt = segment_algo_info.visit_cnt; + size_per_unit = segment_algo_info.size_per_unit; units_num = 0; + enable_benifit = StandardBenefit(visit_cnt, units_num); + } +}; + +inline double StandardBenefit(const uint32_t& visit_cnt, const uint16_t& units_num) { + int bits_per_key = BITS_PER_KEY_PER_UNIT; + // We intentionally round down to reduce probing cost a little bit + int num_probes = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (num_probes < 1) num_probes = 1; + if (num_probes > 30) num_probes = 30; + + // compute false positive rate of one filter unit + double rate_per_unit = std::pow(1.0 - std::exp(-double(num_probes) / double(bits_per_key)), num_probes); + + if (units_num >= MAX_UNITS_NUM) { + return 0.0; + } + + uint16_t next_units_num = units_num + 1; + double rate = std::pow(rate_per_unit, units_num); + double next_rate = std::pow(rate_per_unit, next_units_num); + + double benefit = double(visit_cnt) * (rate - next_rate); + /* + std::cout << "visit_cnt : " << visit_cnt + << " , rate : " << rate + << " , next_rate : " << next_rate + << " . 
rate_per_unit : " << rate_per_unit + << std::endl; + */ + assert(benefit >= 0); + return benefit; +} + +inline double StandardCost(const uint32_t& visit_cnt, const uint16_t& units_num) { + int bits_per_key = BITS_PER_KEY_PER_UNIT; + // We intentionally round down to reduce probing cost a little bit + int num_probes = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (num_probes < 1) num_probes = 1; + if (num_probes > 30) num_probes = 30; + + // compute false positive rate of one filter unit + double rate_per_unit = std::pow(1.0 - std::exp(-double(num_probes) / double(bits_per_key)), num_probes); + + if (units_num <= MIN_UNITS_NUM) { + return __DBL_MAX__; + } + + uint16_t next_units_num = units_num - 1; + double rate = std::pow(rate_per_unit, units_num); + double next_rate = std::pow(rate_per_unit, next_units_num); + + double cost = double(visit_cnt) * (next_rate - rate); + /* + std::cout << "visit_cnt : " << visit_cnt + << " , rate : " << rate + << " , next_rate : " << next_rate + << " . rate_per_unit : " << rate_per_unit + << std::endl; + */ + assert(cost >= 0); + return cost; +} + +inline bool CompareSegmentAlgoHelper(const SegmentAlgoHelper& helper_1, const SegmentAlgoHelper& helper_2) { + return helper_1.enable_benifit < helper_2.enable_benifit; +} + +class GreedyAlgo { +public: + // segment_algo_infos: map + // algo_solution: map + // cache_size: total size of filter cache, we can left some space for emergency needs + // we can input 95% of real total size as arg cache_size, then left 5% of space for further needs + // full debug process of GreedyAlgo, not thread-secured + // so make sure that only called by one thread + void solve(std::map& segment_algo_infos, + std::map& algo_solution, const uint32_t& cache_size); + // full debug process of GreedyAlgo, not thread-secured + // so make sure that only called by one thread + void debug(std::map& algo_solution, const uint32_t& cache_size) { + // generate debug data + std::map segment_algo_infos; + segment_algo_infos.clear(); + uint32_t min_segment_id = 0, max_segment_id = 9999; + for (uint32_t segment_id = min_segment_id; segment_id <= max_segment_id; segment_id++) { + // SegmentAlgoInfo segment_algo_info(segment_id * 1000, 8 * 1024 * 8); // one unit is 8kb + // segment_algo_infos[segment_id] = SegmentAlgoInfo(segment_id * 1000, 8 * 1024 * 8); + // directly use '=' will cause bug, try use std::map.insert + segment_algo_infos.insert(std::make_pair(segment_id, + SegmentAlgoInfo(segment_id * std::pow(10, (segment_id / 3000) + 1), 8 * 1024 * 4))); // one unit 2 kb + } + assert(segment_algo_infos.size() == max_segment_id + 1); + + // already generate debug data, try perform algo + solve(segment_algo_infos, algo_solution, cache_size); + + // simple check results + // noticed that if segment a visit_cnt >= segment b visit_cnt + // then segment a units_num >= segment b units_num + for (uint32_t segment_id = min_segment_id; segment_id < max_segment_id; segment_id++) { + assert(algo_solution[segment_id] <= algo_solution[segment_id + 1]); + } + } +}; + + +} \ No newline at end of file diff --git a/db/art/heat_buckets.cc b/db/art/heat_buckets.cc new file mode 100644 index 000000000..dfc8b8ab2 --- /dev/null +++ b/db/art/heat_buckets.cc @@ -0,0 +1,325 @@ +#include "heat_buckets.h" +#include +#include + +namespace ROCKSDB_NAMESPACE { +std::vector HeatBuckets::seperators_; +std::vector HeatBuckets::buckets_; +uint32_t HeatBuckets::current_cnt_; // current get count in this period +std::vector> HeatBuckets::mutex_ptrs_; +std::mutex 
HeatBuckets::cnt_mutex_; +std::mutex HeatBuckets::sample_mutex_; +bool HeatBuckets::is_ready_; // identify whether HeatBuckets ready for hit +SamplesPool HeatBuckets::samples_; +bool HeatBuckets::updated_; // prevent from updating hotness more than once in a short time + + +Bucket::Bucket() { + hit_cnt_ = 0; + hotness_ = 0; + // keys_.clear(); +} + +Bucket::~Bucket() { + return; // destroy nothing +} + +void Bucket::update(const double& alpha, const uint32_t& period_cnt) { + // mutex_.lock(); + hotness_ = alpha * hotness_ + + (1 - alpha) * double(hit_cnt_) / double(period_cnt); + hit_cnt_ = 0; // remember to reset counter + // keys_.clear(); + // mutex_.unlock(); // remember to unlock!!! +} + +void Bucket::hit() { + // mutex_.lock(); + hit_cnt_ += 1; + // keys_.insert(key); + // mutex_.unlock(); // remember to unlock!!! +} + +HeatBuckets::HeatBuckets() { + seperators_.resize(0); + buckets_.resize(0); + current_cnt_ = 0; + mutex_ptrs_.resize(0); + is_ready_ = false; + samples_.clear(); + updated_ = false; +} + +HeatBuckets::~HeatBuckets() { + return; // destroy nothing +} + +void HeatBuckets::debug() { + std::cout << "[DEBUG] total cnt in this period: " << current_cnt_ << std::endl; + for (auto& bucket : buckets_) { + std::cout << "[DEBUG] "; + std::cout << "bucket hotness : " << bucket.hotness_; + std::cout << ", bucket hit cnt : " << bucket.hit_cnt_; + // std::cout << ", bucket keys cnt : " << bucket.keys_cnt(); + std::cout << std::endl; + } +} + +void HeatBuckets::update() { + // mark already updated, after current_cnt_ more than PERIOD_COUNT / MAGIC_FACTOR, updated_ will be reset to false; + // we need guarantee that in one period (one constant time span), db gets are much larger than PERIOD_COUNT / MAGIC_FACTOR; + // usually in server, exec get requests PERIOD_COUNT / MAGIC_FACTOR times only account for a very very short time. 
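// for a concrete sense of scale: with the macro values added in this patch (PERIOD_COUNT = 50000,
// MAGIC_FACTOR = 500), updated_ is re-armed once current_cnt_ reaches 50000 / 500 = 100 further gets,
// which is negligible compared with the 50000 gets that make up one full period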
+ if (updated_) + return; + + updated_ = true; + + assert(mutex_ptrs_.size() == buckets_.size()); + for (size_t i=0; ilock(); + } + + // TODO: use multiple threads to update hotness of all buckets + for (size_t i=0; iunlock(); + } + // remember to reset current_cnt_ counter + current_cnt_ = 0; +} + +uint32_t HeatBuckets::locate(const std::string& key) { + // we use locate method to locate the key range for one key + // reminded one key range -> [lower seperator, upper seperator) + // if we locate key k to idx i, then seperator i <= k < seperator i+1 + // equal to k in key range i + uint32_t left = 0, right = seperators_.size()-1; + while (left < right - 1){ + uint32_t mid = left + ((right-left) / 2); + if (seperators_[mid] > key) { + right = mid; + } else if (seperators_[mid] <= key) { + left = mid; + } + } + return left; +} + +void HeatBuckets::hit(const std::string& key, const bool& signal) { + assert(is_ready_); + // use binary search to find index i, making seperators_[i] <= key and seperators_[i+1] > i + // reminding we have set border guard, so dont worry about out of bounds error + // after we find the index i, we call buckets_[i].hit(), then add 1 to current_cnt_ + // if current_cnt_ >= period_cnt_, call update() to update hotness of all buckets and reset current cnt counter + + uint32_t idx = 0; + // last element is border guard + // means last element always bigger than key + // first element is border guard + // means first element always smaller than key + + // linear search version + /* + while (seperators_[index+1] <= key) { + index += 1; + } + */ + idx = locate(key); + + // std::cout << "debug seperators_ size : " << seperators_.size() << std::endl; + // std::cout << "debug buckets_ size : " << buckets_.size() << std::endl; + // std::cout << "debug mutex_ptrs_ size : " << mutex_ptrs_.size() << std::endl; + // std::cout << "debug period_cnt_ : " << period_cnt_ << std::endl; + // std::cout << "debug alpha_ : " << alpha_ << std::endl; + assert(buckets_.size() == mutex_ptrs_.size()); + assert(idx >= 0 && idx < buckets_.size()); + assert(seperators_[idx] <= key && key < seperators_[idx+1]); + + mutex_ptrs_[idx]->lock(); + buckets_[idx].hit(); // mutex only permits one write opr to one bucket + mutex_ptrs_[idx]->unlock(); + + cnt_mutex_.lock(); + current_cnt_ += 1; + + // use updated_ to prevent from updating hotness in a very short time span (due to multi-threads operation) + if (signal && !updated_) { + // debug(); + update(); + } + cnt_mutex_.unlock(); + + // remember to reset updated_ to false + if (updated_ && current_cnt_ >= PERIOD_COUNT / MAGIC_FACTOR) { + updated_ = false; + } +} + +SamplesPool::SamplesPool() { + samples_cnt_ = 0; + pool_.resize(0); + filter_.clear(); + + // because put opt will input duplicated keys, we need to guarantee SAMPLES_MAXCNT much larger than SAMPLES_LIMIT + // however std::set only remain deduplicated keys + // to collect good samples for previous put keys, we need a larger SAMPLES_MAXCNT + assert(SAMPLES_MAXCNT >= MAGIC_FACTOR * SAMPLES_LIMIT); +} + +void SamplesPool::clear() { + samples_cnt_ = 0; + pool_.resize(0); + filter_.clear(); +} + +void SamplesPool::sample(const std::string& key) { + assert(pool_.size() == filter_.size()); + // if already in pool, return + if (is_sampled(key)) { + return; + } + // pool not full + if (!is_full()) { + pool_.push_back(key); + filter_.insert(key); + } + // pool is full + else { + // need to generate random integer in [0, old samples_cnt_] (equal to [0, old samples_cnt_ + 1)) + // new samples_cnt_ = old 
samples_cnt_ + 1 + // if you want random integer in [a, b], use (rand() % (b-a+1))+a; + srand((unsigned)time(NULL)); + uint32_t idx = (rand() % (samples_cnt_ + 1)) + 0; + assert(idx <= samples_cnt_ && idx >= 0); + // idx in [0, samples_limit_) + // pool_ size may lightly more than samples_limit_; + if (idx < pool_.size()) { + // remove old key + std::string old_key = pool_[idx]; + filter_.erase(old_key); + + // update new key + pool_[idx] = key; + filter_.insert(key); + } + } + assert(pool_.size() == filter_.size()); + + // remember to update samples_cnt_ + samples_cnt_ += 1; +} + +void SamplesPool::prepare() { + std::string key_min = "user"; // defined min key for YCSB + std::string key_max = pool_[pool_.size()-1] + pool_[pool_.size()-1]; + if (!is_ready()) { + return; + } + sort(pool_.begin(), pool_.end()); + // add border guard + pool_.emplace(pool_.begin(), key_min); + pool_.emplace_back(key_max); +} + +void SamplesPool::divide(const uint32_t& k, std::vector& dst) { + // reminded we already add border guard to pool vector + std::string key_min = pool_[0]; // defined min key for YCSB + std::string key_max = pool_[pool_.size()-1]; + + dst.resize(0); + dst.emplace_back(key_min); + + // reminded we already add border guard to pool vector + // border guard in idx 0 and idx pool_.size()-1 + uint32_t idx = 1; + while (idx < pool_.size() - 1) { + dst.emplace_back(pool_[idx]); + idx += k; + } + + dst.emplace_back(key_max); +} + + +uint32_t SamplesPool::locate(const std::string& key) { + // pool must be sorted + // and we need to add border guard to pool + // after that, we can use locate(key) + uint32_t left = 0, right = pool_.size()-1; + while (left < right - 1){ + uint32_t mid = left + ((right-left) / 2); + if (pool_[mid] > key) { + right = mid; + } else if (pool_[mid] <= key) { + left = mid; + } + } + return left; +} + +uint32_t SamplesPool::determine_k(std::vector>& segments) { + // already add border guard to pool + uint32_t k = pool_.size() - 2; + // if segments is empty, use default k to debug + if (segments.empty()) { + k = (pool_.size() - 2) / DEFAULT_BUCKETS_NUM; + } + assert(k > 1); + for (auto& segment : segments) { + assert(segment.size() == 2); + assert(segment[0] < segment[1]); + uint32_t span = locate(segment[1]) - locate(segment[0]); + + assert(span > 1); + if (k > span) k = span; + } + // std::cout << "[DEBUG] samples divided with span k : " << k << std::endl; + return k; +} + +void HeatBuckets::sample(const std::string& key, std::vector>& segments) { + if (samples_.is_ready()) { + return; + } + sample_mutex_.lock(); + if (!samples_.is_ready()) { + samples_.sample(key); + if (samples_.is_ready()) { + init(segments); + } + } + sample_mutex_.unlock(); +} + +void HeatBuckets::init(std::vector>& segments) { + // compute proper k and determine key ranges + samples_.prepare(); + uint32_t k = samples_.determine_k(segments); + samples_.divide(k, seperators_); + + // std::cout << "[DEBUG] show key ranges below: " << std::endl; + for (size_t i=0; i(new std::mutex())); + } + assert(mutex_ptrs_.size() == buckets_.size()); + assert(seperators_.size() == buckets_.size()+1); + + is_ready_ = true; + + // debug + // std::cout << "[DEBUG] heat buckets size: " << buckets_.size() << std::endl; + // std::cout << "[DEBUG] key ranges init" << std::endl; +} +} \ No newline at end of file diff --git a/db/art/heat_buckets.h b/db/art/heat_buckets.h new file mode 100644 index 000000000..68a8277fe --- /dev/null +++ b/db/art/heat_buckets.h @@ -0,0 +1,105 @@ +#pragma once +#include +#include +#include 
+#include +#include "macros.h" +#include +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { + +class Bucket; +class HeatBuckets; +class SamplesPool; + +class Bucket { +public: + double hotness_; + uint32_t hit_cnt_; + + Bucket(); + ~Bucket(); + + // when one time period end, update hotness_, h_(i+1) = alpha * h_i + hit_cnt_ / period_cnt + void update(const double& alpha, const uint32_t& period_cnt); + void hit(); +}; + + +/* + first sample put keys using reservoir sampling. + If we collect enough keys, determine the common key num (k) for every key group + start with idx 0, add k continuously, get 0, k, 2k, ... + set KEY_MIN, samples[0], samples[k], samples[2k], ..., KEY_MAX as seperators + guarenteed that KEY_MIN < all keys and KEY_MAX > all keys + then we define key ranges (KEY_MIN, samples[0]), [samples[1], samples[2]), [samples[2], samples[3]), ..., [..., KEY_MAX) + one heat bucket corresponding to one key range + compute and update hotness of all heat buckets +*/ +class HeatBuckets { +private: + // TODO: mutex can be optimized + static std::vector seperators_; + static std::vector buckets_; + static uint32_t current_cnt_; // current get count in this period + static std::vector> mutex_ptrs_; + static std::mutex cnt_mutex_; + static std::mutex sample_mutex_; + static bool is_ready_; // identify whether HeatBuckets ready for hit + static SamplesPool samples_; + static bool updated_; + +public: + HeatBuckets(); + ~HeatBuckets(); + + uint32_t locate(const std::string& key); // helper func: locate which bucket hitted by this key + + const bool& is_ready() { return is_ready_; } + std::vector& seperators() { return seperators_; } + std::vector& buckets() { return buckets_; } + void sample(const std::string& key, std::vector>& segments); // before init buckets, we need to sample keys; + // input segment-related key range (segments), will use them when SamplesPool ready. 
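  // A rough sketch of the intended call pattern for this class, written as a comment; the names
  // put_key, get_key and segments below are illustrative, and the element type of segments
  // (std::vector<std::vector<std::string>>) is inferred from determine_k() / locate() rather than stated here:
  //
  //   HeatBuckets heat_buckets;
  //   std::vector<std::vector<std::string>> segments;   // low-level segments' key ranges, may be empty
  //   heat_buckets.sample(put_key, segments);           // write path: keep sampling until the pool is ready
  //   if (heat_buckets.is_ready()) {
  //     heat_buckets.hit(get_key, /*signal=*/false);    // read path: count the hit; pass true to also update hotness
  //   }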
+ + void init(std::vector>& segments); // if sample enough keys, ready to init heatbuckets + + void update(); // update hotness value of all buckets + void hit(const std::string& key, const bool& signal); // one key only hit one bucket (also mean only hit one key range) + // if signal is true, update hotness + void debug(); // output debug message in standard output +}; + +class SamplesPool { +private: + std::vector pool_; // using set to guarantee only store deduplicated samples + std::set filter_; // used to check whether new key already exist in pool + uint32_t samples_cnt_; // current sample tries num, need to update after every try +public: + SamplesPool(); + + ~SamplesPool() { return; } + + void clear(); + + // we can modify SAMPLES_MAXCNT to control the moment that starts init heat buckets + bool is_ready() { return samples_cnt_ >= SAMPLES_MAXCNT; } + bool is_full() { return pool_.size() >= SAMPLES_LIMIT; } + bool is_sampled(const std::string& key) { return filter_.count(key) > 0; } + + void sample(const std::string& key); + + void prepare(); + + // need call prepare() before + // generate seperators + void divide(const uint32_t& k, std::vector& dst); + + // determine k based on low-level segments' key range + uint32_t determine_k(std::vector>& segments); + uint32_t locate(const std::string& key); // helper func when determine k +}; + +} \ No newline at end of file diff --git a/db/art/macros.h b/db/art/macros.h index cae0408af..9d8a3e8e3 100644 --- a/db/art/macros.h +++ b/db/art/macros.h @@ -133,4 +133,84 @@ namespace ROCKSDB_NAMESPACE { #define CLWB(ptr, len) #endif +/* + * Macros for additional work on WaLSM -- WaLSM+ + */ + +// micros for HeatBuckets + +// hotness update formula +#define BUCKETS_ALPHA 0.2 +// samples pool max size, using reservoir sampling +#define SAMPLES_LIMIT 10000 +// if recv samples exceed SAMPLES_MAXCNT, end reservoir sampling and init Heat Buckets +#define SAMPLES_MAXCNT 5000000 +// short period get count, if get count equal to or exceed PERIOD_COUNT, +// end this short period and start next short period +#define PERIOD_COUNT 50000 +// number of heat buckets (number of key ranges, see hotness estimating in the paper) +#define DEFAULT_BUCKETS_NUM 500 +// magic number in class HeatBuckets +#define MAGIC_FACTOR 500 + +// micros for Model Train + +// long period = TRAIN_PERIODS * short period. 
if one long period end, evaluate model and retrain model if necessary +#define TRAIN_PERIODS 10 +// dataset csv file name +#define DATASET_NAME "dataset.csv" +// the path to save model txt file and train dataset csv file +#define MODEL_PATH "/pg_wal/ycc/" +// we cannot send hotness value (double) to model side, +// so we try multiple hotness value by HOTNESS_SIGNIFICANT_DIGITS_FACTOR, then send its integer part to model +// also we need to multiple key range rate by RATE_SIGNIFICANT_DIGITS_FACTOR +#define HOTNESS_SIGNIFICANT_DIGITS_FACTOR 1e6 +#define RATE_SIGNIFICANT_DIGITS_FACTOR 1e3 +// model feature num max limit : 2 * 45 + 1 +#define MAX_FEATURES_NUM 91 + +// config micro connecting to LightGBM server + +// we use Inet socket to connect server +#define HOST "127.0.0.1" +#define PORT "9090" +// max size of socket receive buffer size +#define BUFFER_SIZE 1024 +// socket message prefix +#define TRAIN_PREFIX "t " +#define PREDICT_PREFIX "p " + +// micros for filter cache + +// before model work, we enable DEFAULT_UNITS_NUM units for every segments +#define DEFAULT_UNITS_NUM 4 +// bits-per-key for every filter unit of every segment, +// found default bits-per-key = DEFAULT_UNITS_NUM * BITS_PER_KEY_PER_UNIT = 10 +// equal to primary value of paper benchmark config value +#define BITS_PER_KEY_PER_UNIT 4 +// max unit nums for every segment, we only generate MAX_UNITS_NUM units for every segment +#define MAX_UNITS_NUM 8 +// we enable 0 unit for coldest segments +#define MIN_UNITS_NUM 0 +// default max size of cache space : 8 * 1024 * 1024 * 128 = 1073741824 bit = 128 MB +#define CACHE_SPACE_SIZE 1073741824 +// fitler cache helper heap type +#define BENEFIT_HEAP 0 +#define COST_HEAP 1 +#define UNKNOWN_HEAP 2 +// visit cnt update bound +#define VISIT_CNT_UPDATE_BOUND 10 +// filter cache map threshold +#define FULL_RATE 0.95 +#define READY_RATE 0.60 +// default init L0 counts +#define INIT_LEVEL_0_COUNT 0 +// default size of one filter unit (bits) +#define DEFAULT_UNIT_SIZE 0 +// inherit remain factor +#define INHERIT_REMAIN_FACTOR 0.5 + +// filter cache client background threads num +#define FILTER_CACHE_THREADS_NUM 10 + } // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/builder.cc b/db/builder.cc index ddcf7a9c9..a5deebff9 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -352,6 +352,7 @@ Status BuildTable( return s; } +// TODO(WaLSM+): what to do with our info recorders? 
we need pass global recorders pointers first Status BuildTableFromArt( SingleCompactionJob *job, const std::string& dbname, Env* env, FileSystem* fs, @@ -420,6 +421,7 @@ Status BuildTableFromArt( ioptions.statistics, ioptions.listeners, ioptions.file_checksum_gen_factory)); + // TODO(WaLSM+): pass temp recorders ptrs to builder (should be blocked based table) builder = NewTableBuilder( ioptions, mutable_cf_options, internal_comparator, int_tbl_prop_collector_factories, column_family_id, @@ -440,6 +442,7 @@ Status BuildTableFromArt( } TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); + // TODO(WaLSM+): we will get temp recorders after finishing s = builder->Finish(); *io_status = builder->io_status(); if (s.ok()) { diff --git a/db/column_family.cc b/db/column_family.cc index 0385aac4a..d9344f4bb 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1255,6 +1255,7 @@ void ColumnFamilyData::InstallSuperVersion( return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_); } +// TODO: update filter cache (WaLSM+) void ColumnFamilyData::InstallSuperVersion( SuperVersionContext* sv_context, InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options) { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index e0bd1feed..3b1ee2ae0 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -560,6 +560,7 @@ void CompactionJob::GenSubcompactionBoundaries() { } } +// TODO(WaLSM+): pass temp recorders ptr and update Status CompactionJob::Run() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_RUN); @@ -581,7 +582,8 @@ Status CompactionJob::Run() { // Always schedule the first subcompaction (whether or not there are also // others) in the current thread to be efficient with resources - ProcessKeyValueCompaction(&compact_->sub_compact_states[0]); + // TODO(WaLSM+): pass temp recorders ptr and update + ProcessKeyValueCompaction(&compact_->sub_compact_states[0]); // Wait for all other threads (if there are any) to finish execution for (auto& thread : thread_pool) { @@ -845,6 +847,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { return status; } +// TODO(WaLSM+): pass temp recorders ptr and update void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact != nullptr); @@ -983,7 +986,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { break; } } - status = sub_compact->AddToBuilder(key, value); + // TODO(WaLSM+): pass temp recorders ptr and update + status = sub_compact->AddToBuilder(key, value); if (!status.ok()) { break; } @@ -1095,8 +1099,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // close the output file. 
if (sub_compact->builder != nullptr) { CompactionIterationStats range_del_out_stats; + // TODO(WaLSM+): pass temp recorders ptr and update Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg, - &range_del_out_stats); + &range_del_out_stats); if (!s.ok() && status.ok()) { status = s; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 6c72f1607..18ed841ae 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -24,6 +24,9 @@ #include #include #include +#include +#include +#include #include "db/art/timestamp.h" #include "db/art/logger.h" @@ -240,6 +243,33 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, closed_(false), error_handler_(this, immutable_db_options_, &mutex_), atomic_flush_install_cv_(&mutex_) { +// WaLSM+ +#ifdef ART_PLUS + /* + get_cnt_ = 0; + period_cnt_ = 0; + last_train_period_ = 0; + */ + segment_info_recorder_ = new std::unordered_map>; + level_recorder_ = new std::map; + level_0_base_count_ = 0; + + features_nums_except_level_0_ = new std::vector; + uint16_t features_num = MAX_FEATURES_NUM; + if (features_num > 0) { + features_nums_except_level_0_->emplace_back(features_num); + } + + segment_ranges_recorder_ = new std::map>; + + unit_size_recorder_ = new std::map; + + filter_cache_.retrain_or_keep_model(features_nums_except_level_0_, + level_recorder_, + segment_ranges_recorder_, + unit_size_recorder_); + filter_cache_.make_adjustment(); +#endif // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. assert(batch_per_txn_ || seq_per_batch_); @@ -299,6 +329,10 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, global_memtable_ = new GlobalMemtable( vlog_manager_, group_manager_, env_, recovery); + // heat_buckets_ = new HeatBuckets(); + // std::cout << "Buckets_ address : "; + // std::cout << std::hex << heat_buckets_ << std::endl; + Compactor::compaction_threshold_ = options.compaction_threshold; compactor_->SetDB(this); @@ -1628,6 +1662,7 @@ class GetWithTimestampReadCallback : public ReadCallback { }; } // namespace +// TODO: Modify GetImpl -- WaLSM+ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, GetImplOptions& get_impl_options) { assert(get_impl_options.value != nullptr || @@ -1737,8 +1772,84 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, std::string* timestamp = ts_sz > 0 ? get_impl_options.timestamp : nullptr; // Change + // std::cout << "ready for get" << std::endl; +// WaLSM+: add hotness estimating #ifdef ART +#ifdef ART_PLUS std::string art_key(key.data(), key.size()); + filter_cache_.get_updating_work(art_key); + // ready to estimate hotness, update heat buckets + /* + if (heat_buckets_.is_ready()) { + get_cnt_ += 1; + if (get_cnt_ >= PERIOD_COUNT) { + heat_buckets_.hit(art_key, true); + get_cnt_ = 0; + period_cnt_ += 1; + } else { + heat_buckets_.hit(art_key, false); + } + } + + if (heat_buckets_.is_ready() && period_cnt_ > 0 && + period_cnt_ - last_train_period_ >= TRAIN_PERIODS) { + bool need_train = false; + train_mutex_.lock(); + if (period_cnt_ - last_train_period_ >= TRAIN_PERIODS) { + need_train = true; + last_train_period_ = period_cnt_; + } + train_mutex_.unlock(); + // only one thread can train model. 
+ if (need_train) { + std::fstream f_model; + f_model.open("/pg_wal/ycc/model.log", std::ios::out | std::ios::app); + f_model << "[DEBUG] try to train models" << std::endl; + f_model << "[DEBUG] period_cnt_ : " << period_cnt_ << std::endl; + f_model << "[DEBUG] PERIOD_COUNT : " << PERIOD_COUNT << std::endl; + f_model << "[DEBUG] TRAIN_PERIODS : " << TRAIN_PERIODS << std::endl; + + if (!clf_model_.is_ready()) { + std::vector feature_nums; + clf_model_.make_ready(feature_nums); + } + + std::vector> datas; + std::vector tags; + std::vector get_cnts; + std::vector preds; + + // std::thread train_thread(make_train, clf_model_, datas, tags); + // train_thread.detach(); + clf_model_.make_train(datas, tags, get_cnts); + + clf_model_.make_predict(datas, preds); + + f_model << "[DEBUG] debug predict result: " << std::endl; + for (uint16_t& pred : preds) { + f_model << pred << " "; + } + f_model << std::endl << std::endl << std::endl; + f_model.close(); + + std::fstream f_algo; + f_algo.open("/pg_wal/ycc/algo.log", std::ios::out | std::ios::app); + f_algo << "[DEBUG] greedy algo debug results : " << std::endl; + std::map algo_solution; + uint32_t cache_size = CACHE_SPACE_SIZE; + GreedyAlgo greedy_algo; + greedy_algo.debug(algo_solution, cache_size); + for (auto it = algo_solution.begin(); it != algo_solution.end(); it++) { + f_algo << "[DEBUG] " << it->first << " -> " << it->second << std::endl; + } + f_algo.close(); + + filter_cache_heap_manager_.debug(); + } + } + */ + +#endif done = global_memtable_->Get(art_key, *get_impl_options.value->GetSelf(), &s); #else if (!skip_memtable) { @@ -1786,7 +1897,18 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); +#ifndef ART_PLUS + sv->current->Get( + read_options, lkey, get_impl_options.value, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr, + get_impl_options.get_value); +#else sv->current->Get( + filter_cache_, read_options, lkey, get_impl_options.value, timestamp, &s, &merge_context, &max_covering_tombstone_seq, get_impl_options.get_value ? get_impl_options.value_found : nullptr, @@ -1794,6 +1916,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, get_impl_options.get_value ? get_impl_options.callback : nullptr, get_impl_options.get_value ? 
get_impl_options.is_blob_index : nullptr, get_impl_options.get_value); +#endif RecordTick(stats_, MEMTABLE_MISS); get_in_ssd.fetch_add(1); } else { @@ -1835,6 +1958,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, return s; } +// TODO: WaLSM+ Benchmark dont use MultiGet interface std::vector DBImpl::MultiGet( const ReadOptions& read_options, const std::vector& column_family, diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 17dee4307..32209ea75 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -18,11 +18,19 @@ #include #include #include +#include #include "db/art/compactor.h" #include "db/art/heat_group_manager.h" #include "db/art/global_memtable.h" #include "db/art/vlog_manager.h" +#include "db/art/heat_buckets.h" +#include "db/art/clf_model.h" +#include "db/art/filter_cache_item.h" +#include "db/art/filter_cache_heap.h" +#include "db/art/filter_cache.h" +#include "db/art/filter_cache_client.h" +#include "db/art/greedy_algo.h" #include "db/column_family.h" #include "db/compaction/compaction_job.h" #include "db/dbformat.h" @@ -1513,6 +1521,7 @@ class DBImpl : public DB { WriteBatch* my_batch); // REQUIRES: mutex locked and in write thread. + // WaLSM+ Note: updating max memtable ID of every column families, then call schedule func Status ScheduleFlushes(WriteContext* context); void MaybeFlushStatsCF(autovector* cfds); @@ -1897,6 +1906,79 @@ class DBImpl : public DB { HeatGroupManager* group_manager_; +#ifdef ART_PLUS + // TODO: add necessary filter cache info structures + FilterCacheClient filter_cache_; // already contain FilterCacheManager + + // TODO: mutex for updating these recorders below + // will be locked when updating these recorders below, and unlock after updating ends + std::mutex filter_cache_mutex_; + + // these global recorders need to be latest after every flush or compaction: + // std::map* level_recorder_ + // std::map>* segment_ranges_recorder_ + // std::map* unit_size_recorder_ + // you may need filter_cache_.range_seperators() to receive key range seperators + // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i + // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key + + // segment_info_recorder save every segments' min key and max key + // but we only need to pass empty segment_info_recorder now + // TODO: it should contain all levels segments' min key and max key, then pass to filter cache client, but not used now + // this recorder will help decide the key ranges' num, but it dont work in current work + // you can try to modify macro DEFAULT_BUCKETS_NUM to decide the key ranges' num + std::unordered_map>* segment_info_recorder_; + + // record every alive segments' level + // TODO: need to be latest all the time + std::map* level_recorder_; + + // record features num of every segments + // we choose max features num to define model feature num + // if you want to use a default features num, set MAX_FEATURES_NUM to non-zero value + // then do not insert any entry into this vector later + // TODO: we dont use this vector, so we set MAX_FEATURES_NUM to non-zero value + std::vector* features_nums_except_level_0_; + + // should be based level 0 visit cnt in a total long period + // simply we set level_0_base_count to 0, and use macro INIT_LEVEL_0_COUNT + // we can set this macro to ( PERIOD_COUNT * TRAIN_PERIODS ) * ( level 0 sorted runs num ) / ( max level 0 segments num ) + // TODO: modify INIT_LEVEL_0_COUNT 
to proper value + uint32_t level_0_base_count_; + + // record interacting ranges and their rates of alive segments + // TODO: should be latest all the time + std::map>* segment_ranges_recorder_; + + // every segment's filter unit size is the same + // this recorder should hold all alive segment + // simply, you can also use default macro DEFAULT_UNIT_SIZE for all segments, just leave this recorder empty + // TODO: modify DEFAULT_UNIT_SIZE + std::map* unit_size_recorder_; + + /* + HeatBuckets heat_buckets_; + + ClfModel clf_model_; + + FilterCacheHeapManager filter_cache_heap_manager_; + + // monitor low-level segments min key and max key + std::vector> segments_info_; + + // record get cnt in current period, when equal to PERIOD_COUNT, start next period + uint32_t get_cnt_; + + // record period cnt, if period_cnt_ - last_train_period_ >= TRAIN_PERIODS, start to evaluate or retrain model- + uint32_t period_cnt_; + + // record in which period last model trained. + uint32_t last_train_period_; + + // train mutex, preventing model trained more than one time + std::mutex train_mutex_; + */ +#endif // Offset of last record written by leader writer. uint64_t last_record_offset_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 1c4b85911..846c262e6 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -686,6 +686,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( return s; } +// WaLSM+ Note: just notify that flush begin (no-op), do not intersect with flushes void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, int job_id) { @@ -737,6 +738,7 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, #endif // ROCKSDB_LITE } +// WaLSM+ Note: just notify that flush completed (no-op), do not intersect with flushes void DBImpl::NotifyOnFlushCompleted( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, std::list>* flush_jobs_info) { @@ -777,6 +779,7 @@ void DBImpl::NotifyOnFlushCompleted( #endif // ROCKSDB_LITE } +// WaLSM+ Note: Manual Compact method, not used in YCSB benchmark Status DBImpl::CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { @@ -969,6 +972,7 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } +// WaLSM+ Note: Manual Compact method, not used in YCSB benchmark Status DBImpl::CompactFiles(const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, @@ -1470,6 +1474,7 @@ int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { ->mutable_cf_options.level0_stop_writes_trigger; } +// WaLSM+ Note: Manual Flush method, not used in YCSB Status DBImpl::Flush(const FlushOptions& flush_options, ColumnFamilyHandle* column_family) { auto cfh = static_cast_with_check(column_family); @@ -1489,6 +1494,7 @@ Status DBImpl::Flush(const FlushOptions& flush_options, return s; } +// WaLSM+ Note: Manual Flush method, not used in YCSB Status DBImpl::Flush(const FlushOptions& flush_options, const std::vector& column_families) { Status s; @@ -1689,6 +1695,7 @@ void DBImpl::GenerateFlushRequest(const autovector& cfds, } } +// WaLSM+ Note: only used in start, shutdown and replication, not used in YCSB Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& flush_options, FlushReason flush_reason, bool writes_stopped) { @@ 
-2086,6 +2093,7 @@ void DBImpl::EnableManualCompaction() { manual_compaction_paused_.fetch_sub(1, std::memory_order_release); } +// WaLSM+ Note: schedule flush and compaction if possible void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); if (!opened_successfully_) { @@ -2502,6 +2510,7 @@ struct DBCompactionJob { SuperVersionContext* superversion_context; }; +// TODO(WaLSM+): check out NVMFlushJob()? try to pass temp info recorders and merge these recorders void DBImpl::SyncCallFlush(std::vector& jobs) { JobContext job_context(next_job_id_.fetch_add(1), true); @@ -2529,9 +2538,37 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { *default_cfd->GetLatestMutableCFOptions(); default_cfd->mem()->SetNextLogNumber(logfile_number_); + #ifdef ART_PLUS + // these global recorders need to be latest after every flush or compaction: + // std::map* level_recorder_ + // std::map>* segment_ranges_recorder_ + // std::map* unit_size_recorder_ + // you may need filter_cache_.range_seperators() to receive key range seperators + // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i + // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key + std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + std::map* new_level_recorder = new std::map; + std::map>* new_segment_ranges_recorder = new std::map>; + std::map* new_unit_size_recorder = new std::map; + std::vector& key_range_seperators = filter_cache_.range_seperators(); + std::set* new_segment_ids = new std::set; + std::map>* inherit_infos_recorder = new std::map>; + // TODO(WaLSM+): you can pass these var into NVMFlushJob and update them when flushing + #endif + int idx = 0; std::vector db_jobs; for (auto job : jobs) { + // TODO(WaLSM+): pass temp recorders into NVMFlushJob or NVMFlushJob.build()? + /* + std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + std::map* new_level_recorder = new std::map; + std::map>* new_segment_ranges_recorder = new std::map>; + std::map* new_unit_size_recorder = new std::map; + std::vector& key_range_seperators = filter_cache_.range_seperators(); + std::set* new_segment_ids = new std::set; + std::map>* inherit_infos_recorder = new std::map>; + */ num_running_flushes_++; auto nvm_flush_job = new NVMFlushJob( job, @@ -2627,6 +2664,98 @@ void DBImpl::SyncCallFlush(std::vector& jobs) { atomic_flush_install_cv_.SignalAll(); bg_cv_.SignalAll(); + + #ifdef ART_PLUS + // do new SSTs already exist in latest version? + // TODO(WaLSM+): if all ok, merge temp recorders into global DBImpl recorders. 
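The comments above (and in db_impl.h) describe how a key is mapped to a key range: key k belongs to range i when seperators[i] <= k < seperators[i+1], and HeatBuckets::locate() performs that lookup by binary search. As an illustrative aside (not part of this patch), a minimal sketch of that mapping over a sorted separator vector, with hypothetical names:

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Returns the index i such that seperators[i] <= key < seperators[i + 1].
// Assumes `seperators` is sorted and key is not smaller than seperators.front().
size_t LocateKeyRange(const std::vector<std::string>& seperators,
                      const std::string& key) {
  // First separator strictly greater than key, then step back one slot.
  auto it = std::upper_bound(seperators.begin(), seperators.end(), key);
  assert(it != seperators.begin());
  return static_cast<size_t>(it - seperators.begin()) - 1;
}
```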
+ // we need a mutex to guarantee these recorders modified by only one background thread at one time + filter_cache_mutex_.lock(); + // std::map merged_level_recorder; // actually when flushing, there is no merged segment + + // remove merged segments + assert(merged_segment_ids->empty()); + /* + auto level_it = level_recorder_->begin(); + auto range_it = segment_ranges_recorder_->begin(); + auto units_it = unit_size_recorder_->begin(); + while (level_it != level_recorder_->end()) { + if (merged_segment_ids->count(level_it->first) > 0) { + merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)) + level_it = level_recorder_->erase(level_it); + } else { + level_it ++; + } + } + while (range_it != segment_ranges_recorder_->end()) { + if (merged_segment_ids->count(range_it->first) > 0) { + range_it = segment_ranges_recorder_->erase(range_it); + } else { + range_it ++; + } + } + while (units_it != unit_size_recorder_->end()) { + if (merged_segment_ids->count(units_it->first) > 0) { + units_it = unit_size_recorder_->erase(units_it); + } else { + units_it ++; + } + } + */ + + // lock and update global recorders + global_recorder_mutex_.lock(); + // merge merge temp recorders into global DBImpl recorders. + assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); + auto new_level_it = new_level_recorder->begin(); + auto new_range_it = new_segment_ranges_recorder->begin(); + auto new_units_it = new_unit_size_recorder->begin(); + while (new_level_it != new_level_recorder->end()) { + level_recorder_.insert(std::make_pair(new_level_it->first, new_level_it->second)); + new_level_it ++; + } + while (new_range_it != new_segment_ranges_recorder->end()) { + segment_ranges_recorder_.insert(std::make_pair(new_range_it->first, new_range_it->second)); + new_range_it ++; + } + while (new_units_it != new_unit_size_recorder->end()) { + // unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); + // we only use DEFAULT_UNIT_SIZE + new_units_it ++; + } + global_recorder_mutex_.unlock(); + + // call filter cache client DBImpl::filter_cache_ update work + assert(merged_segment_ids->empty()); + assert(inherit_infos_recorder->empty()); + std::vector merged_segment_ids_vec, new_segment_ids_vec; + merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); + new_segment_ids_vec.assign(new_segment_ids.begin(), new_segment_ids.end()); + filter_cache_.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder, + *new_level_recorder, 0, *new_segment_ranges_recorder); + + // temp recorders below: + // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + // std::map* new_level_recorder = new std::map; + // std::map>* new_segment_ranges_recorder = new std::map>; + // std::map* new_unit_size_recorder = new std::map; + // std::vector& key_range_seperators = filter_cache_.range_seperators(); + // std::set* new_segment_ids = new std::set; + // std::map>* inherit_infos_recorder = new std::map>; + // these global recorders need to be latest after every flush or compaction: + // std::map* level_recorder_ + // std::map>* segment_ranges_recorder_ + // std::map* unit_size_recorder_ + + // release temp recorders? 
+ delete merged_segment_ids; + delete new_level_recorder; + delete new_segment_ranges_recorder; + delete new_unit_size_recorder; + delete new_segment_ids; + delete inherit_infos_recorder; + + filter_cache_mutex_.unlock(); + #endif } } @@ -2746,6 +2875,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, } } +// TODO(WaLSM+): maybe we can pass temp recorders ptrs and merge at last? Status DBImpl::BackgroundCompaction(bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, @@ -2942,6 +3072,25 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } } +#ifdef ART_PLUS + // these global recorders need to be latest after every flush or compaction: + // std::map* level_recorder_ + // std::map>* segment_ranges_recorder_ + // std::map* unit_size_recorder_ + // you may need filter_cache_.range_seperators() to receive key range seperators + // exactly, if key k < seperators[i+1] and key k >= seperators[i], then key k hit key range i + // HeatBuckets::locate(const std::string& key) will tell you how to binary search corresponding key range for one key + std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + std::map* new_level_recorder = new std::map; + std::map>* new_segment_ranges_recorder = new std::map>; + std::map* new_unit_size_recorder = new std::map; + std::vector& key_range_seperators = filter_cache_.range_seperators(); + std::set* new_segment_ids = new std::set; + std::map>* inherit_infos_recorder = new std::map>; + // TODO(WaLSM+): you can pass these var into NVMFlushJob and update them when compacting + int compaction_flag = 0; // 0 = not defined, 1 = delete compaction, 2 = trivial compaction, 3 = other +#endif + IOStatus io_s; if (!c) { // Nothing to do @@ -2960,7 +3109,16 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); - + + // TODO(WaLSM+): no new SST generated, we only record deleted segment id? + // maybe we need to record segment ids for every SST for convience? + #ifdef ART_PLUS + compaction_flag = 1; + #endif + /* + std::set* merged_segment_ids = new std::set; + // the merged segments' id, we need to delete them from these 3 global recorders + */ for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } @@ -2996,11 +3154,23 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // Move files to next level int32_t moved_files = 0; int64_t moved_bytes = 0; + #ifdef ART_PLUS + compaction_flag = 2; // sign for TrivialMove + #endif for (unsigned int l = 0; l < c->num_input_levels(); l++) { if (c->level(l) == c->output_level()) { continue; } for (size_t i = 0; i < c->num_input_files(l); i++) { + // TODO(WaLSM+): no new SST generated and no SST merged, just move segments(from different levels) to target levels + // we can copy moved segment ids into merged_segment_ids. + // then record these moved segments' new level to new_level_recorder + // maybe we need to record segment ids for every SST for convience? 
+ /* + std::set* merged_segment_ids; + // the merged segments' id, we need to delete them from these 3 global recorders + std::map* new_level_recorder = new std::map; + */ FileMetaData* f = c->input(l, i); c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), @@ -3092,6 +3262,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); + #ifdef ART_PLUS + compaction_flag = 3; // sign for normal compaction (merge -> split) + #endif + // TODO(WaLSM): pass temp recorders into CompactionJob or CompactionJob.Run()? it is normal compaction + // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + // std::map* new_level_recorder = new std::map; + // std::map>* new_segment_ranges_recorder = new std::map>; + // std::map* new_unit_size_recorder = new std::map; // you can left this recorder aside and do nothing + // std::vector& key_range_seperators = filter_cache_.range_seperators(); + // std::set* new_segment_ids = new std::set; + // std::map>* inherit_infos_recorder = new std::map>; CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, @@ -3126,6 +3307,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, *made_progress = true; TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); + + } if (status.ok() && !io_s.ok()) { @@ -3230,7 +3413,286 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } m->in_progress = false; // not being processed anymore } + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish"); + + if (!status.ok()) { + return status; + } + +#ifdef ART_PLUS + // do new SSTs already exist in latest version? + // TODO(WaLSM+): if all ok, merge temp recorders into global DBImpl recorders. + // we need a mutex to guarantee these recorders modified by only one background thread at one time + filter_cache_mutex_.lock(); + assert(compaction_flag >= 0 && compaction_flag <= 3); + if (compaction_flag == 1) { + // lock and update global recorders + global_recorder_mutex_.lock(); + // remove merged segments + auto level_it = level_recorder_->begin(); + auto range_it = segment_ranges_recorder_->begin(); + auto units_it = unit_size_recorder_->begin(); + std::map merged_level_recorder; + while (level_it != level_recorder_->end()) { + if (merged_segment_ids->count(level_it->first) > 0) { + merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); + level_it = level_recorder_->erase(level_it); + } else { + level_it ++; + } + } + while (range_it != segment_ranges_recorder_->end()) { + if (merged_segment_ids->count(range_it->first) > 0) { + range_it = segment_ranges_recorder_->erase(range_it); + } else { + range_it ++; + } + } + while (units_it != unit_size_recorder_->end()) { + if (merged_segment_ids->count(units_it->first) > 0) { + units_it = unit_size_recorder_->erase(units_it); + } else { + units_it ++; + } + } + global_recorder_mutex_.unlock(); + + // merge merge temp recorders into global DBImpl recorders. 
+ assert(new_level_recorder->empty()); + assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); + // delete compaction only delete segments, not generate new segments + /* + auto new_level_it = new_level_recorder->begin(); + auto new_range_it = new_segment_ranges_recorder->begin(); + auto new_units_it = new_unit_size_recorder->begin(); + while (new_level_it != new_level_recorder->end()) { + level_recorder_.insert(std::make_pair(new_level_it->first, new_level_it->second)); + new_level_it ++; + } + while (new_range_it != new_segment_ranges_recorder->end()) { + segment_ranges_recorder_.insert(std::make_pair(new_range_it->first, new_range_it->second)); + new_range_it ++; + } + while (new_units_it != new_unit_size_recorder->end()) { + unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); + new_units_it ++; + } + */ + + // call filter cache client DBImpl::filter_cache_ update work + assert(new_segment_ids->empty()); + assert(inherit_infos_recorder->empty()); + assert(new_level_recorder->empty()); + assert(new_segment_ranges_recorder->empty()); + // new segments id empty, that will not fit in batch_insert_segments + // we need a new method batch_delete_segments to only delete merge segments + std::vector merged_segment_ids_vec; + merged_segment_ids_vec.assign(merged_segment_ids.begin(), merged_segment_ids.end()); + filter_cache_.batch_delete_segments(merged_segment_ids_vec, merged_level_recorder); + + // temp recorders below: + // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + // std::map* new_level_recorder = new std::map; + // std::map>* new_segment_ranges_recorder = new std::map>; + // std::map* new_unit_size_recorder = new std::map; + // std::vector& key_range_seperators = filter_cache_.range_seperators(); + // std::set* new_segment_ids = new std::set; + // std::map>* inherit_infos_recorder = new std::map>; + // these global recorders need to be latest after every flush or compaction: + // std::map* level_recorder_ + // std::map>* segment_ranges_recorder_ + // std::map* unit_size_recorder_ + + // release temp recorders? 
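Each branch here repeats the same pattern: walk a recorder map, save and erase the entries whose segment id is in merged_segment_ids, then splice in the new entries. Purely as an illustration (not part of this patch, and assuming segment ids are uint32_t), the erase half of that pattern could be factored into one helper:

```cpp
#include <cstdint>
#include <map>
#include <set>

// Erase every entry whose segment id appears in `victims`; optionally copy
// the removed entries into `saved` (e.g. merged_level_recorder) first.
template <typename V>
void EraseMergedSegments(std::map<uint32_t, V>* recorder,
                         const std::set<uint32_t>& victims,
                         std::map<uint32_t, V>* saved = nullptr) {
  for (auto it = recorder->begin(); it != recorder->end();) {
    if (victims.count(it->first) > 0) {
      if (saved != nullptr) {
        saved->emplace(it->first, it->second);
      }
      it = recorder->erase(it);  // erase returns the next valid iterator
    } else {
      ++it;
    }
  }
}
```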
+ delete merged_segment_ids; + delete new_level_recorder; + delete new_segment_ranges_recorder; + delete new_unit_size_recorder; + delete new_segment_ids; + delete inherit_infos_recorder; + + } else if (compaction_flag == 2) { + // lock and update global recorders + global_recorder_mutex_.lock(); + // modify segments' level + auto level_it = level_recorder_->begin(); + auto range_it = segment_ranges_recorder_->begin(); + assert(new_level_recorder->size() > 0); + assert(merged_segment_ids->size() == new_level_recorder->size()); + std::map old_level_recorder; + while (level_it != level_recorder_->end()) { + if (merged_segment_ids->count(level_it->first) > 0) { + old_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); + level_it = level_recorder_->erase(level_it); + } else { + level_it ++; + } + } + while (range_it != segment_ranges_recorder_->end()) { + if (merged_segment_ids->count(range_it->first) > 0) { + new_segment_ranges_recorder->insert(std::make_pair(range_it->first, range_it->second)); + range_it = segment_ranges_recorder_->erase(range_it); + } else { + range_it ++; + } + } + + assert(unit_size_recorder_->empty()); + /* + while (units_it != unit_size_recorder_->end()) { + if (merged_segment_ids->count(units_it->first) > 0) { + units_it = unit_size_recorder_->erase(units_it); + } else { + units_it ++; + } + } + */ + + assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); + auto new_level_it = new_level_recorder->begin(); + auto new_range_it = new_segment_ranges_recorder->begin(); + auto new_units_it = new_unit_size_recorder->begin(); + while (new_level_it != new_level_recorder->end()) { + level_recorder_->insert(std::make_pair(new_level_it->first, new_level_it->second)); + new_level_it ++; + } + while (new_range_it != new_segment_ranges_recorder->end()) { + segment_ranges_recorder_->insert(std::make_pair(new_range_it->first, new_range_it->second)); + new_range_it ++; + } + while (new_units_it != new_unit_size_recorder->end()) { + // unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); + new_units_it ++; + } + global_recorder_mutex_.unlock(); + + // call filter cache client DBImpl::filter_cache_ update work + // we need a new filter cache operation to support moving segments to a new level + std::vector merged_segment_ids_vec; + merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + filter_cache_.batch_move_segments(merged_segment_ids_vec, old_level_recorder, *new_level_recorder, *new_segment_ranges_recorder); + + // temp recorders below: + // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + // std::map* new_level_recorder = new std::map; + // std::map>* new_segment_ranges_recorder = new std::map>; + // std::map* new_unit_size_recorder = new std::map; + // std::vector& key_range_seperators = filter_cache_.range_seperators(); + // std::set* new_segment_ids = new std::set; + // std::map>* inherit_infos_recorder = new std::map>; + // these global recorders need to be latest after every flush or compaction: + // std::map* level_recorder_ + // std::map>* segment_ranges_recorder_ + // std::map* unit_size_recorder_ + + // release temp recorders?
+ delete merged_segment_ids; + delete new_level_recorder; + delete new_segment_ranges_recorder; + delete new_unit_size_recorder; + delete new_segment_ids; + delete inherit_infos_recorder; + + } else if (compaction_flag == 3) { + // it is normal compaction (merge->split) + std::map merged_level_recorder; + + // lock and update global recorders + global_recorder_mutex_.lock(); + // remove merged segments + assert(!(merged_segment_ids->empty())); + auto level_it = level_recorder_->begin(); + auto range_it = segment_ranges_recorder_->begin(); + auto units_it = unit_size_recorder_->begin(); + while (level_it != level_recorder_->end()) { + if (merged_segment_ids->count(level_it->first) > 0) { + merged_level_recorder.insert(std::make_pair(level_it->first, level_it->second)); + level_it = level_recorder_->erase(level_it); + } else { + level_it ++; + } + } + while (range_it != segment_ranges_recorder_->end()) { + if (merged_segment_ids->count(range_it->first) > 0) { + range_it = segment_ranges_recorder_->erase(range_it); + } else { + range_it ++; + } + } + while (units_it != unit_size_recorder_->end()) { + if (merged_segment_ids->count(units_it->first) > 0) { + units_it = unit_size_recorder_->erase(units_it); + } else { + units_it ++; + } + } + + // merge temp recorders into global DBImpl recorders. + assert(new_segment_ids->size() > 0 && new_segment_ids->size() == new_level_recorder->size()); + assert(new_level_recorder->size() == new_segment_ranges_recorder->size()); + auto new_level_it = new_level_recorder->begin(); + auto new_range_it = new_segment_ranges_recorder->begin(); + auto new_units_it = new_unit_size_recorder->begin(); + while (new_level_it != new_level_recorder->end()) { + level_recorder_->insert(std::make_pair(new_level_it->first, new_level_it->second)); + new_level_it ++; + } + while (new_range_it != new_segment_ranges_recorder->end()) { + segment_ranges_recorder_->insert(std::make_pair(new_range_it->first, new_range_it->second)); + new_range_it ++; + } + while (new_units_it != new_unit_size_recorder->end()) { + // unit_size_recorder_.insert(std::make_pair(new_units_it->first, new_units_it->second)); + // we only use DEFAULT_UNIT_SIZE + new_units_it ++; + } + global_recorder_mutex_.unlock(); + + // make sure that we also input merged segments' level + // batch_insert_segments argument needs both merged and new segments' level + auto merged_it = merged_level_recorder.begin(); + while (merged_it != merged_level_recorder.end()) { + assert(new_level_recorder->find(merged_it->first) == new_level_recorder->end()); + new_level_recorder->insert(std::make_pair(merged_it->first, merged_it->second)); + merged_it ++; + } + assert(new_level_recorder->size() == new_segment_ids->size() + merged_segment_ids->size()); + + // call filter cache client DBImpl::filter_cache_ update work + assert(inherit_infos_recorder->size() == new_segment_ids->size()); + std::vector merged_segment_ids_vec, new_segment_ids_vec; + merged_segment_ids_vec.assign(merged_segment_ids->begin(), merged_segment_ids->end()); + new_segment_ids_vec.assign(new_segment_ids->begin(), new_segment_ids->end()); + filter_cache_.batch_insert_segments(merged_segment_ids_vec, new_segment_ids_vec, *inherit_infos_recorder, + *new_level_recorder, 0, *new_segment_ranges_recorder); + + // temp recorders below: + // std::set* merged_segment_ids = new std::set; // the merged segments' id, we need to delete them from these 3 global recorders + // std::map* new_level_recorder = new std::map; + // std::map>* new_segment_ranges_recorder = new std::map>;
+ // std::map* new_unit_size_recorder = new std::map; + // std::vector& key_range_seperators = filter_cache_.range_seperators(); + // std::set* new_segment_ids = new std::set; + // std::map>* inherit_infos_recorder = new std::map>; + // these global recorders need to be latest after every flush or compaction: + // std::map* level_recorder_ + // std::map>* segment_ranges_recorder_ + // std::map* unit_size_recorder_ + + // release temp recorders? + delete merged_segment_ids; + delete new_level_recorder; + delete new_segment_ranges_recorder; + delete new_unit_size_recorder; + delete new_segment_ids; + delete inherit_infos_recorder; + + } else { + assert(compaction_flag == 0); + } + filter_cache_mutex_.unlock(); +#endif return status; } diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 07db5edf2..7838c62b2 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -20,6 +20,13 @@ namespace ROCKSDB_NAMESPACE { // Convenience methods Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { +// WaLSM+: first sample put keys into pool, then generate key ranges for computing hotness +#ifdef ART_PLUS + // heat_buckets not ready, still sample into pool + // if ready, the prepare func returns immediately and does nothing + std::string art_key(key.data(), key.size()); + filter_cache_.prepare_heat_buckets(art_key, segment_info_recorder_); +#endif return DB::Put(o, column_family, key, val); } diff --git a/db/db_test3.cc b/db/db_test3.cc index eb2bc8475..c616a1b0f 100644 --- a/db/db_test3.cc +++ b/db/db_test3.cc @@ -373,7 +373,7 @@ void DoTest(std::string test_name) { options.use_direct_io_for_flush_and_compaction = true; options.use_direct_reads = true; options.enable_pipelined_write = true; - options.nvm_path = "/mnt/chen/nodememory"; + options.nvm_path = "/pg_wal/ycc/memory_art"; options.compression = rocksdb::kNoCompression; options.IncreaseParallelism(16); diff --git a/db/dbformat.h b/db/dbformat.h index 81c852ac4..38fea61ed 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -174,6 +174,14 @@ inline Slice ExtractUserKey(const Slice& internal_key) { return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes); } +#ifdef ART_PLUS +// Returns the internal bytes portion of an internal key. (WaLSM+) +inline Slice ExtractInternalBytes(const Slice& internal_key) { + assert(internal_key.size() >= kNumInternalBytes); + return Slice(internal_key.data() + internal_key.size() - kNumInternalBytes, kNumInternalBytes); +} +#endif + inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, size_t ts_sz) { assert(internal_key.size() >= kNumInternalBytes + ts_sz); diff --git a/db/flush_job.cc b/db/flush_job.cc index cc0be0c23..404f945a4 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -308,6 +308,7 @@ void FlushJob::Cancel() { base_->Unref(); } +// we may also need to call the filter cache func to update the filter cache after BuildTable() succeeds?
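For the dbformat.h helpers above: an internal key is the user key followed by kNumInternalBytes (8 bytes in stock RocksDB) packing the sequence number and value type, so ExtractUserKey() and ExtractInternalBytes() are simply the two halves of one split. A small illustrative usage sketch (the SplitInternalKey helper is hypothetical, not part of this patch):

```cpp
#include <cassert>
#include <utility>
#include "db/dbformat.h"  // ExtractUserKey, ExtractInternalBytes, kNumInternalBytes

// Splits an internal key into (user_key, trailer), where the trailer is the
// packed (sequence << 8 | value_type) field at the end of the key.
inline std::pair<rocksdb::Slice, rocksdb::Slice> SplitInternalKey(
    const rocksdb::Slice& internal_key) {
  assert(internal_key.size() >= rocksdb::kNumInternalBytes);
  return {rocksdb::ExtractUserKey(internal_key),
          rocksdb::ExtractInternalBytes(internal_key)};
}
```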
Status FlushJob::WriteLevel0Table() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_FLUSH_WRITE_L0); diff --git a/db/nvm_flush_job.cc b/db/nvm_flush_job.cc index 3a7198635..2059f2afb 100644 --- a/db/nvm_flush_job.cc +++ b/db/nvm_flush_job.cc @@ -50,6 +50,7 @@ namespace ROCKSDB_NAMESPACE { +// WaLSM+ Note: copy of FlushJob, add nvm reading NVMFlushJob::NVMFlushJob(SingleCompactionJob* job, const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, @@ -146,6 +147,7 @@ void NVMFlushJob::Preprocess() { } } +// TODO(WaLSM+): we can pass info recorders to BuildTableFromArt()? void NVMFlushJob::Build() { Status s; { @@ -176,7 +178,7 @@ void NVMFlushJob::Build() { uint64_t creation_time = meta_.oldest_ancester_time; IOStatus io_s; - s = BuildTableFromArt( + s = BuildTableFromArt( // TODO(WaLSM+): pass temp recorders ptrs, update these recorders when building job_, dbname_, db_options_.env, db_options_.fs.get(), *cfd_->ioptions(), mutable_cf_options_, file_options_, cfd_->table_cache(), &meta_, diff --git a/db/table_cache.cc b/db/table_cache.cc index e685bb5f0..663ce8a94 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -470,6 +470,97 @@ Status TableCache::Get(const ReadOptions& options, return s; } +#ifdef ART_PLUS +Status TableCache::Get(FilterCacheClient& filter_cache, + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, + GetContext* get_context, + const SliceTransform* prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, + int level, size_t max_file_size_for_l0_meta_pin) { + auto& fd = file_meta.fd; + std::string* row_cache_entry = nullptr; + bool done = false; +#ifndef ROCKSDB_LITE + IterKey row_cache_key; + std::string row_cache_entry_buffer; + + // Check row cache if enabled. Since row cache does not currently store + // sequence numbers, we cannot use it if we need to fetch the sequence. + if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { + auto user_key = ExtractUserKey(k); + CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); + done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), + get_context); + if (!done) { + row_cache_entry = &row_cache_entry_buffer; + } + } +#endif // ROCKSDB_LITE + Status s; + TableReader* t = fd.table_reader; + Cache::Handle* handle = nullptr; + if (!done) { + assert(s.ok()); + if (t == nullptr) { + s = FindTable(options, file_options_, internal_comparator, fd, &handle, + prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin); + if (s.ok()) { + t = GetTableReaderFromHandle(handle); + } + } + SequenceNumber* max_covering_tombstone_seq = + get_context->max_covering_tombstone_seq(); + if (s.ok() && max_covering_tombstone_seq != nullptr && + !options.ignore_range_deletions) { + std::unique_ptr range_del_iter( + t->NewRangeTombstoneIterator(options)); + if (range_del_iter != nullptr) { + *max_covering_tombstone_seq = std::max( + *max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k))); + } + } + if (s.ok()) { + get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. 
+ // only add filter_cache argument + s = t->Get(filter_cache, options, k, get_context, prefix_extractor, skip_filters); + get_context->SetReplayLog(nullptr); + } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { + // Couldn't find Table in cache but treat as kFound if no_io set + get_context->MarkKeyMayExist(); + s = Status::OK(); + done = true; + } + } + +#ifndef ROCKSDB_LITE + // Put the replay log in row cache only if something was found. + if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) { + size_t charge = + row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string); + void* row_ptr = new std::string(std::move(*row_cache_entry)); + // If row cache is full, it's OK to continue. + ioptions_.row_cache + ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry) + .PermitUncheckedError(); + } +#endif // ROCKSDB_LITE + + if (handle != nullptr) { + ReleaseHandle(handle); + } + return s; +} + +#endif + Status TableCache::InitFileTableReader(const ReadOptions& options, const InternalKeyComparator& internal_comparator, FileMetaData& file_meta) { @@ -491,6 +582,7 @@ Status TableCache::InitFileTableReader(const ReadOptions& options, } // Batched version of TableCache::MultiGet. +// TODO: WaLSM+ Benchmark dont use MultiGet interface Status TableCache::MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, diff --git a/db/table_cache.h b/db/table_cache.h index d0933483f..6a1ed0e90 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -14,6 +14,7 @@ #include #include +#include "db/art/filter_cache_client.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "options/cf_options.h" @@ -96,6 +97,17 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1, size_t max_file_size_for_l0_meta_pin = 0); +#ifdef ART_PLUS + Status Get(FilterCacheClient& filter_cache, + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, + GetContext* get_context, + const SliceTransform* prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, size_t max_file_size_for_l0_meta_pin = 0); +#endif + Status InitFileTableReader(const ReadOptions& options, const InternalKeyComparator& internal_comparator, FileMetaData& file_meta); @@ -117,6 +129,7 @@ class TableCache { // in the embedded GetContext // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" + // TODO: WaLSM+ Benchmark dont use MultiGet interface Status MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, diff --git a/db/version_set.cc b/db/version_set.cc index 710f21867..799e1ca2d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -645,6 +645,7 @@ Version::~Version() { assert(f->refs > 0); f->refs--; if (f->refs <= 0) { + // TODO: update filter cache (WaLSM+) assert(cfd_ != nullptr); uint32_t path_id = f->fd.GetPathId(); assert(path_id < cfd_->ioptions()->cf_paths.size()); @@ -1831,6 +1832,183 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } } +#ifdef ART_PLUS +void Version::Get(FilterCacheClient& filter_cache, + const ReadOptions& read_options, const LookupKey& k, + PinnableSlice* value, std::string* timestamp, Status* status, + MergeContext* merge_context, + 
SequenceNumber* max_covering_tombstone_seq, bool* value_found, + bool* key_exists, SequenceNumber* seq, ReadCallback* callback, + bool* is_blob, bool do_merge) { + Slice ikey = k.internal_key(); + Slice user_key = k.user_key(); + + assert(status->ok() || status->IsMergeInProgress()); + + if (key_exists != nullptr) { + // will falsify below if not found + *key_exists = true; + } + + PinnedIteratorsManager pinned_iters_mgr; + uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_get_id = vset_->block_cache_tracer_->NextGetId(); + } + // determine hit partition + auto* hit_partition = storage_info_.GetHitPartition(user_key); + GetContext get_context( + user_comparator(), merge_operator_, info_log_, db_statistics_, nullptr, + status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, + do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found, + merge_context, do_merge, max_covering_tombstone_seq, this->env_, seq, + merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, + tracing_get_id); + + // Pin blocks that we read to hold merge operands + if (merge_operator_) { + pinned_iters_mgr.StartPinning(); + } + + std::vector hit_files[storage_info_.num_levels_]; + hit_files[0] = storage_info_.files_[0]; + for (int i = 1; i < storage_info_.num_levels_; i++) { + hit_files[i] = hit_partition->files_[i]; + } + FilePicker fp(hit_files, user_key, ikey, &storage_info_.level_files_brief_, + static_cast(storage_info_.num_levels_), + &storage_info_.file_indexer_, user_comparator(), + internal_comparator()); + FileMetaData* f = fp.GetNextFile(); + + int prev_level = 0; + while (f != nullptr) { + if (fp.GetCurrentLevel() != prev_level) { + prev_level = fp.GetCurrentLevel(); + hit_partition->queries[prev_level]++; + } + if (*max_covering_tombstone_seq > 0) { + // The remaining files we look at will only contain covered keys, so we + // stop here. 
+ break; + } + if (get_context.sample()) { + sample_file_read_inc(f); + } + + // set counter + if (hit_partition->is_compaction_work[fp.GetCurrentLevel()]) { + get_context.SetSearchCount( + &hit_partition->search_counter[fp.GetCurrentLevel()]); + } else { + get_context.SetSearchCount(nullptr); + } + + bool timer_enabled = + GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && + get_perf_context()->per_level_perf_context_enabled; + StopWatchNano timer(env_, timer_enabled /* auto_start */); + // we only add filter_cache argument in this new Get method + *status = table_cache_->Get( + filter_cache, + read_options, *internal_comparator(), *f, ikey, &get_context, + mutable_cf_options_.prefix_extractor.get(), + cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), + IsFilterSkipped(static_cast(fp.GetHitFileLevel()), + fp.IsHitFileLastInLevel()), + fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_); + // TODO: examine the behavior for corrupted key + if (timer_enabled) { + PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), + fp.GetHitFileLevel()); + } + if (!status->ok()) { + return; + } + + // report the counters before returning + if (get_context.State() != GetContext::kNotFound && + get_context.State() != GetContext::kMerge && + db_statistics_ != nullptr) { + get_context.ReportCounters(); + } + if (db_statistics_ != nullptr) { + if (fp.GetCurrentLevel() == 0) { + RecordTick(db_statistics_, GET_MISS_L0); + } else if (fp.GetCurrentLevel() == 1) { + RecordTick(db_statistics_, GET_MISS_L1); + } else if (fp.GetCurrentLevel() >= 2) { + RecordTick(db_statistics_, GET_MISS_L2_AND_UP); + } + } + switch (get_context.State()) { + case GetContext::kNotFound: + // Keep searching in other files + break; + case GetContext::kMerge: + // TODO: update per-level perfcontext user_key_return_count for kMerge + break; + case GetContext::kFound: + if (fp.GetHitFileLevel() == 0) { + RecordTick(db_statistics_, GET_HIT_L0); + } else if (fp.GetHitFileLevel() == 1) { + RecordTick(db_statistics_, GET_HIT_L1); + } else if (fp.GetHitFileLevel() >= 2) { + RecordTick(db_statistics_, GET_HIT_L2_AND_UP); + } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, + fp.GetHitFileLevel()); + return; + case GetContext::kDeleted: + // Use empty error message for speed + *status = Status::NotFound(); + return; + case GetContext::kCorrupt: + *status = Status::Corruption("corrupted key for ", user_key); + return; + case GetContext::kBlobIndex: + ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); + *status = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + return; + } + f = fp.GetNextFile(); + } + if (db_statistics_ != nullptr) { + get_context.ReportCounters(); + } + if (GetContext::kMerge == get_context.State()) { + if (!do_merge) { + *status = Status::OK(); + return; + } + if (!merge_operator_) { + *status = Status::InvalidArgument( + "merge_operator is not properly initialized."); + return; + } + // merge_operands are in saver and we hit the beginning of the key history + // do a final merge of nullptr and operands; + std::string* str_value = value != nullptr ? 
value->GetSelf() : nullptr; + *status = MergeHelper::TimedFullMerge( + merge_operator_, user_key, nullptr, merge_context->GetOperands(), + str_value, info_log_, db_statistics_, env_, + nullptr /* result_operand */, true); + if (LIKELY(value != nullptr)) { + value->PinSelf(); + } + } else { + if (key_exists != nullptr) { + *key_exists = false; + } + *status = Status::NotFound(); // Use an empty error message for speed + } +} +#endif + +// TODO: WaLSM+ Benchmark dont use MultiGet interface void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, ReadCallback* callback, bool* is_blob) { PinnedIteratorsManager pinned_iters_mgr; diff --git a/db/version_set.h b/db/version_set.h index 05abcc26c..e18bb5e6a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -31,6 +31,7 @@ #include #include +#include "db/art/filter_cache_client.h" #include "db/blob/blob_file_meta.h" #include "db/column_family.h" #include "db/compaction/compaction.h" @@ -892,6 +893,16 @@ class Version { SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, bool* is_blob = nullptr, bool do_merge = true); +#ifdef ART_PLUS + void Get(FilterCacheClient& filter_cache, + const ReadOptions&, const LookupKey& key, PinnableSlice* value, + std::string* timestamp, Status* status, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + bool* value_found = nullptr, bool* key_exists = nullptr, + SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, + bool* is_blob = nullptr, bool do_merge = true); +#endif + // TODO: WaLSM+ Benchmark dont use MultiGet interface void MultiGet(const ReadOptions&, MultiGetRange* range, ReadCallback* callback = nullptr, bool* is_blob = nullptr); diff --git a/examples/custom.cc b/examples/custom.cc index e0635801f..4c509686f 100644 --- a/examples/custom.cc +++ b/examples/custom.cc @@ -548,7 +548,7 @@ void DoTest(double zipf) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/mnt/chen/nodememory"; + options.nvm_path = "/pg_wal/ycc/memory_art"; options.IncreaseParallelism(16); std::string db_path = "/tmp/db_old_custom"; diff --git a/examples/fp_rate_example.cc b/examples/fp_rate_example.cc new file mode 100644 index 000000000..679386648 --- /dev/null +++ b/examples/fp_rate_example.cc @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include + +class BloomMath { + public: + // False positive rate of a standard Bloom filter, for given ratio of + // filter memory bits to added keys, and number of probes per operation. + // (The false positive rate is effectively independent of scale, assuming + // the implementation scales OK.) + static double StandardFpRate(double bits_per_key, int num_probes) { + // Standard very-good-estimate formula. See + // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives + return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes); + } + + // False positive rate of a "blocked"/"shareded"/"cache-local" Bloom filter, + // for given ratio of filter memory bits to added keys, number of probes per + // operation (all within the given block or cache line size), and block or + // cache line size. + static double CacheLocalFpRate(double bits_per_key, int num_probes, + int cache_line_bits) { + double keys_per_cache_line = cache_line_bits / bits_per_key; + // A reasonable estimate is the average of the FP rates for one standard + // deviation above and below the mean bucket occupancy. 
See + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math + double keys_stddev = std::sqrt(keys_per_cache_line); + double crowded_fp = StandardFpRate( + cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes); + double uncrowded_fp = StandardFpRate( + cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes); + return (crowded_fp + uncrowded_fp) / 2; + } + + // False positive rate of querying a new item against `num_keys` items, all + // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes + // themselves are stored losslessly. See Section 4 of + // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf) + static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) { + double inv_fingerprint_space = std::pow(0.5, fingerprint_bits); + // Base estimate assumes each key maps to a unique fingerprint. + // Could be > 1 in extreme cases. + double base_estimate = num_keys * inv_fingerprint_space; + // To account for potential overlap, we choose between two formulas + if (base_estimate > 0.0001) { + // A very good formula assuming we don't construct a floating point + // number extremely close to 1. Always produces a probability < 1. + return 1.0 - std::exp(-base_estimate); + } else { + // A very good formula when base_estimate is far below 1. (Subtract + // away the integral-approximated sum that some key has same hash as + // one coming before it in a list.) + return base_estimate - (base_estimate * base_estimate * 0.5); + } + } + + // Returns the probably of either of two independent(-ish) events + // happening, given their probabilities. (This is useful for combining + // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate + // for a hash-efficient Bloom filter's FP rate. See Section 4 of + // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf) + static double IndependentProbabilitySum(double rate1, double rate2) { + // Use formula that avoids floating point extremely close to 1 if + // rates are extremely small. + return rate1 + rate2 - (rate1 * rate2); + } +}; + +template +class LegacyBloomImpl { + public: + static void CompareFpRate(size_t keys, int bits_per_key) { + size_t bytes = keys * bits_per_key / 8; + EstimatedFpRate(keys, bytes, ChooseNumProbes(bits_per_key)); + } + + // NOTE: this has only been validated to enough accuracy for producing + // reasonable warnings / user feedback, not for making functional decisions. + static void EstimatedFpRate(size_t keys, size_t bytes, int num_probes) { + double bits_per_key = 8.0 * bytes / keys; + double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes, + /*cache line bits*/ 512); + if (!ExtraRotates) { + // Good estimate of impact of flaw in index computation. + // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key. + // The + 22 shifts it nicely to fit for lower bits/key. 
+ filter_rate += 0.1 / (bits_per_key * 0.75 + 22); + } else { + // Not yet validated + assert(false); + } + // Always uses 32-bit hash + double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32); + double combined_rate = BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate); + double standard_rate = BloomMath::StandardFpRate(bits_per_key, num_probes); + + std::cout << "keys : " << keys << ", bytes : " << bytes << ", bits per key : " << bits_per_key << std::endl; + std::cout << "standard FPR : " << standard_rate << std::endl; + std::cout << "cache local FPR : " << filter_rate << std::endl; + std::cout << "fingerprint FPR : " << fingerprint_rate << std::endl; + std::cout << "combined FPR : " << combined_rate << std::endl; + std::cout << std::endl << std::endl; + } + + static inline int ChooseNumProbes(int bits_per_key) { + // We intentionally round down to reduce probing cost a little bit + int num_probes = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (num_probes < 1) num_probes = 1; + if (num_probes > 30) num_probes = 30; + return num_probes; + } +}; + +int main() { + LegacyBloomImpl Bloom; + Bloom.CompareFpRate(1024 * 1024, 16); +} \ No newline at end of file diff --git a/examples/mini_benchmark.cc b/examples/mini_benchmark.cc index fce793efe..c841fd6dd 100644 --- a/examples/mini_benchmark.cc +++ b/examples/mini_benchmark.cc @@ -593,7 +593,7 @@ int main(int argc, char* argv[]) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/mnt/chen/nodememory"; + options.nvm_path = "/pg_wal/ycc/memory_art"; options.IncreaseParallelism(16); std::remove(options.nvm_path.c_str()); diff --git a/examples/rw_example.cc b/examples/rw_example.cc index 97b0980f7..97ac580ee 100644 --- a/examples/rw_example.cc +++ b/examples/rw_example.cc @@ -258,7 +258,7 @@ int main() { options.use_direct_io_for_flush_and_compaction = true; options.use_direct_reads = true; options.enable_pipelined_write = true; - options.nvm_path = "/mnt/chen/nodememory"; + options.nvm_path = "/pg_wal/ycc/memory_art"; options.compression = rocksdb::kNoCompression; DB* db; diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 908f60a2f..6bca594e5 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -388,7 +388,8 @@ void ParseOptions(Options& options) { options.use_direct_io_for_flush_and_compaction = true; options.use_direct_reads = true; options.enable_pipelined_write = true; - options.OptimizeLevelStyleCompaction(); + // options.OptimizeLevelStyleCompaction(); + options.OptimizeUniversalStyleCompaction(); std::ifstream option_file("options.txt", std::ios::in); std::string line; @@ -425,7 +426,7 @@ void DoTest(std::string test_name) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/mnt/chen/nodememory"; + options.nvm_path = "/mnt/walsm/node_memory"; options.IncreaseParallelism(16); std::string db_path = "/tmp/tmp_data/db_test_" + test_name; diff --git a/examples/ycsb.cc b/examples/ycsb.cc index 046e10f3d..5bca080dd 100644 --- a/examples/ycsb.cc +++ b/examples/ycsb.cc @@ -419,7 +419,7 @@ void DoTest(double zipf, double read_ratio) { options.use_direct_reads = true; options.enable_pipelined_write = true; options.compression = rocksdb::kNoCompression; - options.nvm_path = "/mnt/chen/nodememory"; + options.nvm_path = "/pg_wal/ycc/memory_art"; options.IncreaseParallelism(16); 
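fp_rate_example.cc above drives these estimates from main() with 1 MiB keys at 16 bits/key, which ChooseNumProbes() turns into 11 probes. As a quick sanity check (an approximation, not output copied from the tool), the standard Bloom formula (1 - e^(-k/b))^k gives:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  double bits_per_key = 16.0;
  int num_probes = static_cast<int>(bits_per_key * 0.69);  // 11, same rounding as ChooseNumProbes()
  // Standard Bloom filter estimate: (1 - e^(-k/b))^k
  double fp = std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
  std::printf("standard FPR ~= %g\n", fp);  // roughly 4.6e-4 for 16 bits/key, 11 probes
  return 0;
}
```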
BlockBasedTableOptions table_options; diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 53a46ad33..605923f10 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include "rocksdb/rocksdb_namespace.h" @@ -134,4 +135,10 @@ extern const Comparator* BytewiseComparator(); // ordering. extern const Comparator* ReverseBytewiseComparator(); +#ifdef ART_PLUS +// Create a comparator that uses the given comparator to perform the comparison +// but ignoring the last 4 bytes of the given key. (WaLSM+) +extern std::unique_ptr SegmentIdRemovingComparator(const Comparator* real_comparator); +#endif + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 3cd85a226..7eb9e79e2 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -21,6 +21,9 @@ #include +#include +#include +#include #include #include #include @@ -38,6 +41,9 @@ struct ConfigOptions; // A class that takes a bunch of keys, then generates filter class FilterBitsBuilder { public: + #ifdef ART_PLUS + int filter_count_{1}; + #endif virtual ~FilterBitsBuilder() {} // Add Key to filter, you could use any way to store the key. @@ -50,6 +56,18 @@ class FilterBitsBuilder { // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; + #ifdef ART_PLUS + // Generate the filter using the keys that are added, and the specified hash + // function id. The return value of this function would be the filter bits, The + // ownership of actual data is set to buf (WaLSM+) + virtual Slice FinishWithId(std::unique_ptr* buf, const int /* filter_id */) { + buf->reset(); + fprintf(stderr, "error call FilterBitsBuilder::Finish(buf, filter_id)\n"); + exit(1); + return Slice(); + } + #endif + // Calculate num of keys that can be added and generate a filter // <= the specified number of bytes. #if defined(_MSC_VER) @@ -84,6 +102,21 @@ class FilterBitsReader { may_match[i] = MayMatch(*keys[i]); } } + + #ifdef ART_PLUS + // Check if the entry match the bits in filter using the specified hash function (WaLSM+) + virtual bool MayMatchWithId(const Slice& /* entry */, const int /* hash_id */) { + fprintf(stderr, "Error call FilterBitsReader::MayMatch(entry, hash_id)"); + exit(1); + return true; + } + + // Check if an array of entries match the bits in filter using the specified hash function (WaLSM+) + virtual void MayMatchWithId(int /* num_keys */, Slice** /* keys */, bool* /* may_match */, const int /* hash_id */) { + fprintf(stderr, "Error call FilterBitsReader::MayMatch(num_keys, keys, may_match, hash_id)"); + exit(1); + } + #endif }; // Contextual information passed to BloomFilterPolicy at filter building time. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 544078d8e..9a11c0220 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1224,7 +1224,7 @@ struct DBOptions { bool enable_rewrite = true; // Path for nvm file, don't pass directory. 
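comparator.h above only declares SegmentIdRemovingComparator(); the stated intent is to compare keys while ignoring their last 4 bytes (the appended segment id). A minimal sketch of such a wrapper, assuming the bundled implementation elsewhere in this change may differ in detail:

```cpp
#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

namespace {

// Compares keys with their trailing 4 bytes stripped, delegating to the
// real comparator for the remaining prefix.
class SegmentIdRemovingComparatorImpl : public rocksdb::Comparator {
 public:
  explicit SegmentIdRemovingComparatorImpl(const rocksdb::Comparator* real)
      : real_(real) {}

  const char* Name() const override { return "SegmentIdRemovingComparator"; }

  int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const override {
    rocksdb::Slice a_trim(a.data(), a.size() >= 4 ? a.size() - 4 : 0);
    rocksdb::Slice b_trim(b.data(), b.size() >= 4 ? b.size() - 4 : 0);
    return real_->Compare(a_trim, b_trim);
  }

  // Conservative defaults: perform no key shortening.
  void FindShortestSeparator(std::string*, const rocksdb::Slice&) const override {}
  void FindShortSuccessor(std::string*) const override {}

 private:
  const rocksdb::Comparator* real_;
};

}  // namespace
```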
- std::string nvm_path = "/mnt/chen/nodememory"; + std::string nvm_path = "/pg_wal/ycc/memory_art"; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/lgb_server/README.md b/lgb_server/README.md new file mode 100644 index 000000000..d74eb4c83 --- /dev/null +++ b/lgb_server/README.md @@ -0,0 +1,5 @@ +## lgb_server + +Because the trouble of LightGBM C++ API, we seperate LightGBM train and predict job to a single socket server (python3.12) + +once WaLSM need to train new model or predict for new segments, it need to init socket client, send this socket server message and wait for response if necessary (especially for predict job) \ No newline at end of file diff --git a/lgb_server/__init__.py b/lgb_server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lgb_server/model.py b/lgb_server/model.py new file mode 100644 index 000000000..b8a235a0e --- /dev/null +++ b/lgb_server/model.py @@ -0,0 +1,108 @@ +import pandas as pd +import lightgbm +import numpy +import math + +model_path = '/pg_wal/ycc/' +# model_path = '' + +class LGBModel(): + def __init__(self) -> None: + self.__model = None + # one unit is 4 bits-per-key, class = 2 mean bits-per-key = 4 * 2 = 8 + # the default bits-per-key value of previous benchmark is 10 + self.__default_class = 4 + self.__bits_per_key = 4 # bits_per_key for one filter unit + self.__num_probes = math.floor(self.__bits_per_key * 0.69) # 4 * 0.69 = 2.76 -> 2 + self.__rate_per_unit = math.pow(1.0 - math.exp(-self.__num_probes/self.__bits_per_key), self.__num_probes) # false positive rate of one unit + self.__cost_rate_line = 0.10 # we can torelate deviation that is no more than self.__cost_rate_line * (best I/O cost) (compared to best I/O cost) + self.__model_name = 'model.txt' + # self.__host = '127.0.0.1' + # self.__port = '6666' + # self.__sock = None + # self.__server = None + # normally, one data row will not exceed 1024 Bytes + # we will check this out in WaLSM c++ client + # self.__bufsize = 1024 + # self.__conn = 8 + + def __evaluate_model(self, X: pd.DataFrame, y: pd.Series, c: pd.Series) -> bool: + # if model still work well, return true + count_list = list(c) + class_list = list(y) + + preds_list = list() + for i in range(0, len(X)): + preds_list.append(int(self.predict(pd.DataFrame([X.loc[i]])))) + + assert len(count_list) == len(class_list) + assert len(preds_list) == len(class_list) + + best_cost = 0.0 + pred_cost = 0.0 + for i in range(0, len(class_list)): + best_cost += math.pow(self.__rate_per_unit, class_list[i]) * count_list[i] + pred_cost += math.pow(self.__rate_per_unit, preds_list[i]) * count_list[i] + + # print("best cost : " + str(best_cost) + ", pred cost: " + str(pred_cost)) + return math.fabs((pred_cost-best_cost)/best_cost) < self.__cost_rate_line + + def train(self, dataset: str) -> str: + df = pd.read_csv(dataset) + y = df['Target'] + c = df['Count'] # used to check I/O cost metric + X = df.drop(columns=['Target', 'Count']) + if self.__model is not None and self.__evaluate_model(X, y, c): + # still work well + return + # clf = lightgbm.LGBMClassifier(min_child_samples=1, n_estimators=1, objective="multiclass") + clf = lightgbm.LGBMClassifier() + clf.fit(X, y) + # if we directly set self.__model = clf, then self.__model always predict class 0 + # we need save clf to txt file, then read this model to init self.__model + clf.booster_.save_model(model_path + self.__model_name) + self.__model = lightgbm.Booster(model_file=model_path+self.__model_name) + # print('load a new model') + 
return 'new model trained' + + def predict(self, datas: pd.DataFrame) -> str: + # currently, only support one data row + assert len(datas) == 1 + if self.__model is not None: + result = self.__model.predict(datas) + return str(numpy.argmax(result[0])) + else: + return str(self.__default_class) + + ''' + def __close(self) -> None: + if self.__sock is not None: + self.__sock.close() + ''' + + ''' + def start(self) -> None: + self.__sock = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM) + self.__sock.bind((self.__host, self.__port)) + self.__sock.listen(self.__conn) + ''' + + ''' + def serve(self) -> None: + while True: + client, _ = self.__sock.accept() + msg = self.__sock.recv(self.__bufsize) + decoded = lgb_util.parse_msg(msg) + + if type(decoded) is str: + # send client nothing + self.__train(decoded) + elif type(decoded) is list: + decoded = lgb_util.prepare_data(decoded) + # self.__predict(decoded) + # send client target class str (like '0' or '1' or ... ) + client.send(self.__predict(decoded)) + else: + print('msg type unknown, LGBServer exit') + self.__close() + ''' \ No newline at end of file diff --git a/lgb_server/server.py b/lgb_server/server.py new file mode 100644 index 000000000..c068ff6cf --- /dev/null +++ b/lgb_server/server.py @@ -0,0 +1,55 @@ +import utils +import model +import socketserver +import time + +clf = model.LGBModel() +host = '127.0.0.1' +port = 9090 +bufsize = 1024 + +class LGBhandler(socketserver.BaseRequestHandler): + def handle(self): + try: + while True: + # for example + # if one compaction happen, the new sstable is consisted of 10000 segments + # we can divide these segments into some groups, + # and prefetch filter units for these segments using multi-thread + # that means in rocksdb, one client thread may need to + # predict class for a group of segments + # that means we need keep this connection until all segments of this group done + # use 'while True' and TCP protocol to keep connection + msg = self.request.recv(bufsize).decode('UTF-8', 'ignore').strip() + if not msg: + break + + decoded = utils.parse_msg(msg) + if type(decoded) is str: # mean it is train msg + # send client nothing + result = clf.train(decoded).encode('UTF-8') + # result is a simple str, just means training end + self.request.send(result) + elif type(decoded) is list: # mean it is pred msg, need to send client predict result + # print(decoded) + decoded = utils.prepare_data(decoded) + # send client target class str (like '0' or '1' or ... 
) + result = clf.predict(decoded).encode('UTF-8') + # print(clf.predict(decoded)) + self.request.send(result) + except ConnectionResetError: + print('one connection close: ' + + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + +# server = socketserver.ThreadingTCPServer((host,port), LGBhandler) +if __name__ == '__main__': + print('LGBServer start') + assert utils.dataset_path == model.model_path + server = socketserver.ThreadingTCPServer((host,port), LGBhandler) + try: + server.serve_forever() + except KeyboardInterrupt: + print('\nLGBServer end') + + # should not end during benchmark + # print('LGBServer end') \ No newline at end of file diff --git a/lgb_server/test.sh b/lgb_server/test.sh new file mode 100644 index 000000000..c54ead816 --- /dev/null +++ b/lgb_server/test.sh @@ -0,0 +1,3 @@ +rm dataset.csv +gcc test_client.cc -o test -std=c++11 -lstdc++ -lsocket++ +./test \ No newline at end of file diff --git a/lgb_server/test_client.cc b/lgb_server/test_client.cc new file mode 100644 index 000000000..50ed1e41b --- /dev/null +++ b/lgb_server/test_client.cc @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// you need to modify features num in lgb_server and filter cache!!! + +void write_debug_dataset(std::string& path) { + // ready for writer + std::ofstream stream(path); + csv2::Writer> writer(stream); + + // init hotness values + std::map hotness_map; + double base_hotness = 0.01; + for (int i = 0; i < 200; i ++) { + float r = static_cast (rand()) / static_cast (RAND_MAX) + base_hotness; + hotness_map[i] = r; + } + + // init header vector + std::vector> rows; + std::vector header; + header.emplace_back("Level"); + for (int i = 0; i < 20; i ++) { + header.emplace_back("Range_" + std::to_string(i)); + header.emplace_back("Hotness_" + std::to_string(i)); + } + header.emplace_back("Target"); + rows.emplace_back(header); + + // ready for shuffling + std::vector ids; + for(int i = 0; i < 200; i ++) { + ids.emplace_back(i); + } + + // generate values + for (int i = 0; i < 1000; i ++) { + // std::vector value; + std::vector values; + uint32_t level = i / 200; + uint32_t target = 5 - level; + float r = static_cast (rand()) / static_cast (RAND_MAX); + if (r > 0.10 * level) { + target -= 1; + } + + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::shuffle(ids.begin(), ids.end(), std::default_random_engine(seed)); + values.emplace_back(std::to_string(level)); + for (int j = 0; j < 20; j ++) { + values.emplace_back(std::to_string(ids[j])); + values.emplace_back(std::to_string(uint32_t(1e6 * hotness_map[ids[j]]))); + } + values.emplace_back(std::to_string(target)); + + rows.emplace_back(values); + } + + writer.write_rows(rows); + stream.close(); +} + +void make_predict_samples(std::string& path, std::vector>& datas) { + datas.clear(); + csv2::Reader, + csv2::quote_character<'"'>, + csv2::first_row_is_header, + csv2::trim_policy::trim_whitespace> csv; + + if (csv.mmap(path)) { + const auto header = csv.header(); + // int cnt = 0; + for (auto row : csv) { + + /* + if ((++cnt) > 10) { + break; + } + */ + + // cnt ++; + std::vector data; + for (auto cell : row) { + std::string value; + cell.read_value(value); + data.emplace_back(stoul(value)); + } + if (!data.empty()) { + data.pop_back(); + } + datas.emplace_back(data); + } + } +} + +void build_message(std::vector& data, std::string& message) { + message.clear(); + message = std::to_string(data[0]); + for (size_t i = 1; i < data.size(); i ++) { + message = 
message + " " + std::to_string(data[i]); + } +} + +int main() { + std::string host = "127.0.0.1"; + std::string port = "9090"; + std::string recv; + size_t recv_size = 1024; + std::string file = "dataset.csv"; + + recv.resize(recv_size); + + write_debug_dataset(file); + + libsocket::inet_stream t1_sock(host, port, LIBSOCKET_IPv4); + std::string msg = "t " + file; + // already write dataset, send dataset path to server + // should not receive any message from server + t1_sock << msg; + + // t1_sock.shutdown(); + + std::vector> datas; + + make_predict_samples(file, datas); + libsocket::inet_stream p1_sock(host, port, LIBSOCKET_IPv4); + for (std::vector& data : datas) { + if (!data.empty()) { + build_message(data, msg); + msg = "p " + msg; + p1_sock << msg; + p1_sock >> recv; + // train model need enough time, so should always receive 5 (default class) + std::cout << "receive " << recv.size() << " bytes : " << recv << std::endl; + } + } + + constexpr int sleep_time = 10000; + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); + libsocket::inet_stream p2_sock(host, port, LIBSOCKET_IPv4); + for (std::vector& data : datas) { + if (!data.empty()) { + build_message(data, msg); + msg = "p " + msg; + p2_sock << msg; + p2_sock >> recv; + // already train model, receive data class predicted by the model + std::cout << "receive " << recv.size() << " bytes : " << recv << std::endl; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); + write_debug_dataset(file); + libsocket::inet_stream t2_sock(host, port, LIBSOCKET_IPv4); + msg = "t " + file; + // already write dataset, send dataset path to server + // should not re + t2_sock << msg; + + return 0; +} \ No newline at end of file diff --git a/lgb_server/utils.py b/lgb_server/utils.py new file mode 100644 index 000000000..3d61825cb --- /dev/null +++ b/lgb_server/utils.py @@ -0,0 +1,42 @@ +from typing import Union +import pandas as pd +import sys + +dataset_path = '/pg_wal/ycc/' +# dataset_path = '' + +# msg should be like 'dataset1.csv' +def parse_train_msg(msg: str) -> str: + assert type(msg) is str + msg_list = msg.split(' ', -1) + assert len(msg_list) == 1 + + return dataset_path + msg_list[0] + +# msg should be like '0 4 12345678 2 2345678', consisted of integer and ' ' +# reminded that that msg shouldn't end with ' ' or start with ' ' +# and every integer should be seperated with single ' ' +def parse_pred_msg(msg: str) -> list[int]: + assert type(msg) is str + assert msg[-1] != ' ' and msg[0] != ' ' + msg_list = msg.split(' ', -1) + return [ int(item) for item in msg_list] + +# build predict data row from list[int] +def prepare_data(data: list[int]) -> pd.DataFrame: + assert type(data) is list and type(data[0]) is int + datas = pd.DataFrame([data]) + return datas + +# socket input should be like 't dataset1.csv' or 'p 0 4 12345678 2 2345678' +def parse_msg(msg: str) -> Union[str, list[int]]: + assert msg[0] == 't' or msg[0] == 'p' + assert msg[1] == ' ' + assert msg[2] != ' ' + # print('new message : ' + msg[2:]) + if msg[0] == 't': + return parse_train_msg(msg[2:]) + elif msg[0] == 'p': + return parse_pred_msg(msg[2:]) + else: + return None \ No newline at end of file diff --git a/models/README.md b/models/README.md new file mode 100644 index 000000000..dbbf845d8 --- /dev/null +++ b/models/README.md @@ -0,0 +1,15 @@ +## models + +used to be called by rocksdb through c++ Python interface(Python.h) + +**deprecated** in latest implement of WaLSM. 
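For reference, the socket messages accepted by the new lgb_server (see lgb_server/utils.py and lgb_server/test_client.cc earlier in this patch) follow a simple space-separated text format: "t <csv-file>" triggers training, "p <ints...>" requests one prediction. The sketch below shows a client-side composer under those assumptions; the helper names are illustrative and not part of the patch, and the real feature vector is one Level plus twenty (Range, Hotness) pairs with hotness scaled by 1e6, as in test_client.cc.

```cpp
// Illustrative only: message layout inferred from lgb_server/utils.py and
// lgb_server/test_client.cc in this patch; helper names are not the patch's API.
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// "t <csv-file>": ask the server to (re)train on a dataset it can read locally.
std::string BuildTrainMsg(const std::string& csv_name) {
  return "t " + csv_name;
}

// "p <level> <range_0> <hotness_0> ... <range_19> <hotness_19>": one prediction.
// Integers are single-space separated with no leading/trailing blank
// (utils.py asserts exactly that before parsing).
std::string BuildPredictMsg(uint32_t level,
                            const std::vector<std::pair<uint32_t, double>>& ranges) {
  std::string msg = "p " + std::to_string(level);
  for (const auto& rh : ranges) {
    msg += " " + std::to_string(rh.first);                   // range id
    msg += " " + std::to_string(uint32_t(1e6 * rh.second));  // scaled hotness, as in test_client.cc
  }
  return msg;
}

int main() {
  std::cout << BuildTrainMsg("dataset.csv") << "\n";
  // Truncated to two (range, hotness) pairs for brevity; the server expects twenty.
  std::cout << BuildPredictMsg(0, {{77, 0.8385}, {86, 0.3289}}) << "\n";
  return 0;
}
```

In the patch itself these strings are written to the server over a libsocket++ `inet_stream`, exactly as test_client.cc does.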
+ +in latest version, We decide to use client-server architecture to train and predict + +## Files + + - lgb.cc: simple c++ demo of calling c++ Python interface + + - lgb.py: simple python func for training or predicting + + - lgb.sh: simple shell for running this c++ demo \ No newline at end of file diff --git a/models/lgb.cc b/models/lgb.cc new file mode 100644 index 000000000..afa662f20 --- /dev/null +++ b/models/lgb.cc @@ -0,0 +1,254 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +void generate_samples() { + // ready for writer + std::ofstream stream("lgb.csv"); + csv2::Writer> writer(stream); + + // init hotness values + std::map hotness_map; + double base_hotness = 0.1; + for (int i=0; i<200; i++) { + float r = static_cast (rand()) / static_cast (RAND_MAX) + base_hotness; + hotness_map[i] = r; + } + + // init header vector + std::vector> rows; + std::vector header; + header.emplace_back("Level"); + for (int i=0; i<20; i++) { + header.emplace_back("Range_" + std::to_string(i)); + header.emplace_back("Hotness_" + std::to_string(i)); + } + header.emplace_back("Target"); + rows.emplace_back(header); + + // ready for shuffling + std::vector ids; + for(int i=0; i<200; i++) { + ids.emplace_back(i); + } + + // generate values + for (int i=0; i<1000; i++) { + // std::vector value; + std::vector values; + int level = i / 200; + int target = 5 - level; + float r = static_cast (rand()) / static_cast (RAND_MAX); + if (r > 0.10 * level) { + target -= 1; + } + + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::shuffle(ids.begin(), ids.end(), std::default_random_engine(seed)); + values.emplace_back(std::to_string(level)); + for (int i=0; i<20; i++) { + values.emplace_back(std::to_string(ids[i])); + values.emplace_back(std::to_string(int(1e8 * hotness_map[ids[i]]))); + } + values.emplace_back(std::to_string(target)); + + rows.emplace_back(values); + } + + writer.write_rows(rows); + stream.close(); +} + +void read_samples() { + csv2::Reader, + csv2::quote_character<'"'>, + csv2::first_row_is_header, + csv2::trim_policy::trim_whitespace> csv; + + if (csv.mmap("lgb.csv")) { + const auto header = csv.header(); + for (const auto row: csv) { + for (const auto cell: row) { + std::string value; + cell.read_value(value); + std::cout << value << " "; + } + std::cout << std::endl; + } + } + +} + +void train() { + PyObject* pModule = PyImport_ImportModule("lgb"); + if( pModule == nullptr ){ + std::cout <<"module not found" << std::endl; + exit(EXIT_FAILURE); + } + + PyObject* pFunc = PyObject_GetAttrString(pModule, "train"); + if( !pFunc || !PyCallable_Check(pFunc)){ + std::cout <<"not found function" << std::endl; + exit(EXIT_FAILURE); + } + + PyObject* pArg = PyTuple_New(2); + PyTuple_SetItem(pArg, 0, Py_BuildValue("s", "lgb.csv")); + PyTuple_SetItem(pArg, 1, Py_BuildValue("s", "lgb.txt")); + + PyObject_CallObject(pFunc, pArg); + + Py_DECREF(pModule); + Py_DECREF(pFunc); + Py_DECREF(pArg); +} + +/* +uint16_t predict_one() { + std::vector sample = { + 0,77,83853435,86,32896816,164,109999358,88, + 45036017,191,97761380,192,84780931,40,62674498, + 71,13928034,187,85729384,85,43033713,95, + 102396976,93,95867633,185,19964006,154,62021011,21, + 34288677,161,85558086,181,65248507,162,15193881, + 136,22547489,99,101097202 + }; + + PyObject* pModule = PyImport_ImportModule("lgb"); + if( pModule == nullptr ){ + std::cout <<"module not found" << std::endl; + exit(EXIT_FAILURE); + } + + PyObject* pFunc = 
PyObject_GetAttrString(pModule, "predict"); + if( !pFunc || !PyCallable_Check(pFunc)){ + std::cout <<"not found function" << std::endl; + exit(EXIT_FAILURE); + } + + PyObject* pArg = PyTuple_New(2); + PyTuple_SetItem(pArg, 0, Py_BuildValue("s", "lgb.txt")); + + PyObject* pData = PyList_New(0); + for (int feature : sample) { + PyList_Append(pData, Py_BuildValue("i", feature)); + } + PyTuple_SetItem(pArg, 1, pData); + + PyObject* pReturn = PyObject_CallObject(pFunc, pArg); + + int nResult; + PyArg_Parse(pReturn, "i", &nResult); + + // std::cout << "return result is " << nResult << std::endl; + return nResult; +} +*/ + +void predict(std::vector& results) { + results.clear(); + std::vector> samples = { + { + 0,77,83853435,86,32896816,164,109999358,88, + 45036017,191,97761380,192,84780931,40,62674498, + 71,13928034,187,85729384,85,43033713,95, + 102396976,93,95867633,185,19964006,154,62021011,21, + 34288677,161,85558086,181,65248507,162,15193881, + 136,22547489,99,101097202 + }, + { + 2,113,32610663,147,83265441,100,58249068,136,22547489, + 166,98995566,141,105010402,99,101097202,146,89779806, + 102,105025231,21,34288677,49,104932701,126,78444504,25, + 50094437,48,16975528,1,49438291,191,97761380,31, + 93911224,107,53195345,129,46866354,111,40745785 + }, + { + 4,125,103500401,33,39603161,64,36666575,75,82095235,182, + 67943000,42,50022864,96,49843665,148,75656366,18,24160255, + 57,12002304,110,88600212,185,19964006,8,37777471,16,73571175, + 26,22979043,153,23490241,104,24766001,100,58249068, + 137,89347040,69,76772379 + } + }; + + PyObject* pModule = PyImport_ImportModule("lgb"); + if( pModule == nullptr ){ + std::cout <<"module not found" << std::endl; + exit(EXIT_FAILURE); + } + + PyObject* pFunc = PyObject_GetAttrString(pModule, "predict"); + if( !pFunc || !PyCallable_Check(pFunc)){ + std::cout <<"not found function" << std::endl; + exit(EXIT_FAILURE); + } + + PyObject* pArg = PyTuple_New(2); + PyTuple_SetItem(pArg, 0, Py_BuildValue("s", "lgb.txt")); + + PyObject* pDatas = PyList_New(0); + PyObject* pData = nullptr; + size_t cnt = 0; + for (std::vector& sample : samples) { + pData = PyList_New(0); + for (uint32_t& feature : sample) { + PyList_Append(pData, Py_BuildValue("i", feature)); + } + PyList_Append(pDatas, pData); + cnt += 1; + } + + PyTuple_SetItem(pArg, 1, pDatas); + + PyObject* pReturn = PyObject_CallObject(pFunc, pArg); // should return list + + for (size_t i = 0; i < cnt; i ++) { + int nResult = 0; + PyArg_Parse(PyList_GetItem(pReturn, i), "i", &nResult); + results.emplace_back(nResult); + } + + Py_DECREF(pModule); + Py_DECREF(pFunc); + Py_DECREF(pArg); + Py_DECREF(pDatas); + Py_DECREF(pReturn); +} + +int main() { + Py_Initialize(); + if(!Py_IsInitialized()){ + std::cout << "python init fail" << std::endl; + exit(EXIT_FAILURE); + } + + PyRun_SimpleString("import sys"); + PyRun_SimpleString("sys.path.append('.')"); + + generate_samples(); + train(); + + std::vector results; + predict(results); + for (uint16_t result : results) { + std::cout << result << " " << std::endl; + } + std::cout << std::endl; + //read_samples(); + + Py_Finalize(); + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/models/lgb.py b/models/lgb.py new file mode 100644 index 000000000..17d3c9756 --- /dev/null +++ b/models/lgb.py @@ -0,0 +1,53 @@ +import pandas as pd +import lightgbm +import numpy +import sys +from io import StringIO + +# dataset: train dataset path +# output: saved model path +def train(dataset: str, output: str): + # sys.stdout = StringIO() + df = pd.read_csv(dataset) + y = 
df['Target'] + X = df.drop(columns=['Target']) + clf = lightgbm.LGBMClassifier(n_estimators=2, num_leaves=16, max_depth=8) + clf.fit(X, y) + # val_pred = clf.predict(X_test) + clf.booster_.save_model(output) + +# model_file: saved model path +# data: predicted data point +''' +def predict_one(model_file: str, data: list[int]): + # df = pd.read_csv("lgb.csv") + # y = df['Target'] + # X = df.drop(columns=['Target']) + # print(model_file) + assert type(data) is list + data = pd.DataFrame(data).T + model = lightgbm.Booster(model_file=model_file) + result = model.predict(data) + # print(result) + return numpy.argmax(result[0]) +''' + +# model_file: saved model path +# data: predicted data batch +def predict(model_file: str, datas: list[list[int]]): + # df = pd.read_csv("lgb.csv") + # y = df['Target'] + # X = df.drop(columns=['Target']) + # print(model_file) + assert type(datas) is list + datas = pd.DataFrame(datas) + model = lightgbm.Booster(model_file=model_file) + results = model.predict(datas) + # print(result) + # print(results) + return [ numpy.argmax(result) for result in results ] + + +if __name__ == '__main__': + train() + predict() diff --git a/models/lgb.sh b/models/lgb.sh new file mode 100644 index 000000000..3eb066780 --- /dev/null +++ b/models/lgb.sh @@ -0,0 +1,3 @@ +rm lgb lgb.txt log.txt +gcc lgb.cc -o lgb -l_lightgbm -std=c++11 -lstdc++ -lpython3.12 -I/home/ycc/miniconda3/include/python3.12 -L/home/ycc/miniconda3/lib +./lgb > log.txt \ No newline at end of file diff --git a/src.mk b/src.mk index d75a1ee98..f39f6809c 100644 --- a/src.mk +++ b/src.mk @@ -30,6 +30,13 @@ LIB_SOURCES = \ db/art/art_node.cc \ db/art/compactor.cc \ db/art/global_memtable.cc \ + db/art/heat_buckets.cc \ + db/art/clf_model.cc \ + db/art/filter_cache_heap.cc \ + db/art/filter_cache_item.cc \ + db/art/filter_cache.cc \ + db/art/filter_cache_client.cc \ + db/art/greedy_algo.cc \ db/art/heat_group.cc \ db/art/lock.cc \ db/art/logger.cc \ diff --git a/table/block_based/block.h b/table/block_based/block.h index df6c77e59..a76c35616 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -37,6 +37,9 @@ class DataBlockIter; class IndexBlockIter; class BlockPrefixIndex; +// Block not suitable for Filter +// so these classes under this line not related to our work WaLSM+ + // BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data // bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of // bytes in the Block we update the bitmap and increment diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 9d4f2d67c..7bd6eb3ca 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -247,6 +247,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector bool prefix_filtering_; }; +// WaLSM+ Note: init filter_builder struct BlockBasedTableBuilder::Rep { const ImmutableCFOptions ioptions; const MutableCFOptions moptions; diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 2e3081d26..38ad948af 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -64,6 +64,7 @@ class BlockBasedTableBuilder : public TableBuilder { // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. 
// REQUIRES: Finish(), Abandon() have not been called + // WaLSM+ Note: call filter_builder->add() void Add(const Slice& key, const Slice& value) override; // Return non-ok iff some error has been detected. @@ -75,6 +76,7 @@ class BlockBasedTableBuilder : public TableBuilder { // Finish building the table. Stops using the file passed to the // constructor after this function returns. // REQUIRES: Finish(), Abandon() have not been called + // WaLSM+ Note: call WriteFilterBlock() Status Finish() override; // Indicate that the contents of this builder should be abandoned. Stops @@ -115,6 +117,7 @@ class BlockBasedTableBuilder : public TableBuilder { // Transition state from buffered to unbuffered. See `Rep::State` API comment // for details of the states. // REQUIRES: `rep_->state == kBuffered` + // WaLSM+ Note: call filter_builder->add() void EnterUnbuffered(); // Call block's Finish() method @@ -122,6 +125,8 @@ class BlockBasedTableBuilder : public TableBuilder { void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); // Compress and write block content to the file. + // WaLSM+ Note: call filter_builder->StartBlock() + // however StartBlock() not implemented in FullFilterBlock void WriteBlock(const Slice& block_contents, BlockHandle* handle, bool is_data_block); // Directly write data to the file. @@ -130,7 +135,7 @@ class BlockBasedTableBuilder : public TableBuilder { Status InsertBlockInCache(const Slice& block_contents, const CompressionType type, const BlockHandle* handle); - + // WaLSM+ Note: call filter_builder->Finish() and write to SST void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle); @@ -171,6 +176,7 @@ class BlockBasedTableBuilder : public TableBuilder { CompressionType* result_compression_type, Status* out_status); // Get compressed blocks from BGWorkCompression and write them into SST + // WaLSM+ Note: call filter_builder->StartBlock() and call filter_builder->add() void BGWorkWriteRawBlock(); }; diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h index a7120f854..ddaa13215 100644 --- a/table/block_based/block_based_table_factory.h +++ b/table/block_based/block_based_table_factory.h @@ -39,6 +39,7 @@ class TailPrefetchStats { size_t num_records_ = 0; }; +// only init TableReader and TableBuilder class BlockBasedTableFactory : public TableFactory { public: explicit BlockBasedTableFactory( diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index e1f150573..c3c83ffd8 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -15,6 +15,7 @@ namespace ROCKSDB_NAMESPACE { // Iterates over the contents of BlockBasedTable. 
+// iterator for index, not related to our work WaLSM+ class BlockBasedTableIterator : public InternalIteratorBase { // compaction_readahead_size: its value will only be used if for_compaction = // true diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index f02a706f1..2257d10ea 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2194,6 +2194,48 @@ bool BlockBasedTable::FullFilterKeyMayMatch( return may_match; } +#ifdef ART_PLUS +bool BlockBasedTable::FullFilterKeyMayMatch( + FilterCacheClient& filter_cache, + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const const_ikey_ptr = &internal_key; + bool may_match = true; + if (rep_->whole_key_filtering) { + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + // add filter_cache argument here + may_match = + filter->KeyMayMatch(filter_cache, + user_key_without_ts, prefix_extractor, kNotValid, + no_io, const_ikey_ptr, get_context, lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0 && + prefix_extractor->InDomain(user_key) && + !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { + // WaLSM+ dont use prefix filter, so this branch will not reach + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } + return may_match; +} +#endif + +// TODO: not used in WaLSM+ Benchmark, meybe used in MultiGet interface ? void BlockBasedTable::FullFilterKeysMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, MultiGetRange* range, const bool no_io, @@ -2291,14 +2333,202 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { IndexValue v = iiter->value(); - + // WaLSM+ dont use BlockBasedFilter, so not_exist_in_filter always false bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), prefix_extractor, v.handle.offset(), no_io, /*const_ikey_ptr=*/nullptr, get_context, &lookup_context); + + if (not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. If a user key cannot + // cross one data block, we should be fine. + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + break; + } + + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. 
+ break; + } + + BlockCacheLookupContext lookup_data_block_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr}; + bool does_referenced_key_exist = false; + DataBlockIter biter; + uint64_t referenced_data_size = 0; + NewDataBlockIterator( + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); + + if (no_io && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + bool may_exist = biter.SeekForGet(key); + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. + if (!may_exist && ts_sz == 0) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (ParseInternalKey(biter.key(), &parsed_key) != Status::OK()) { + s = Status::Corruption(Slice()); + } + + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + } + done = true; + break; + } + } + s = biter.status(); + } + // Write the block cache access record. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + // TODO: Should handle status here? 
+ block_cache_tracer_ + ->WriteBlockAccess(access_record, + lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key) + .PermitUncheckedError(); + } + + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok() && !iiter->status().IsNotFound()) { + s = iiter->status(); + } + } + + return s; +} + +#ifdef ART_PLUS +Status BlockBasedTable::Get(FilterCacheClient& filter_cache, + const ReadOptions& read_options, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= kNumInternalBytes); // key must be internal key + assert(get_context != nullptr); + Status s; + const bool no_io = read_options.read_tier == kBlockCacheTier; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter + // If full filter not useful, Then go into each block + uint64_t tracing_get_id = get_context->get_tracing_get_id(); + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } + TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch"); + // add filter_cache argument here + const bool may_match = + FullFilterKeyMayMatch(filter_cache, + read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); + TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch"); + if (!may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } else { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. + bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + get_context, &lookup_context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + bool matched = false; // if such user key matched a key in SST + bool done = false; + for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + IndexValue v = iiter->value(); + // WaLSM+ dont use BlockBasedFilter, so not_exist_in_filter always false + bool not_exist_in_filter = + filter != nullptr && filter->IsBlockBased() == true && + !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), + prefix_extractor, v.handle.offset(), no_io, + /*const_ikey_ptr=*/nullptr, get_context, + &lookup_context); + if (not_exist_in_filter) { // Not found // TODO: think about interaction with Merge. 
If a user key cannot @@ -2419,7 +2649,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, return s; } +#endif +// TODO: WaLSM+ Benchmark dont use MultiGet interface using MultiGetRange = MultiGetContext::Range; void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 9cd0c24ac..ce7c4ed8a 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -9,8 +9,11 @@ #pragma once +#include +#include "db/art/filter_cache_client.h" #include "db/range_tombstone_fragmenter.h" #include "file/filename.h" +#include "rocksdb/comparator.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_type.h" #include "table/block_based/cachable_entry.h" @@ -87,6 +90,7 @@ class BlockBasedTable : public TableReader { // are set. // @param force_direct_prefetch if true, always prefetching to RocksDB // buffer, rather than calling RandomAccessFile::Prefetch(). + // WaLSM+ Note: call new_table->PrefetchIndexAndFilterBlocks() static Status Open(const ReadOptions& ro, const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, @@ -103,7 +107,7 @@ class BlockBasedTable : public TableReader { TailPrefetchStats* tail_prefetch_stats = nullptr, BlockCacheTracer* const block_cache_tracer = nullptr, size_t max_file_size_for_l0_meta_pin = 0); - + // WaLSM+ Note: call filter->RangeMayExist() bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, const SliceTransform* options_prefix_extractor, @@ -128,10 +132,22 @@ class BlockBasedTable : public TableReader { const ReadOptions& read_options) override; // @param skip_filters Disables loading/accessing the filter block + // WaLSM+ Note: call FullFilterKeyMayMatch() method in this file + // PERF count False Positive in the end Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; +#ifdef ART_PLUS + Status Get(FilterCacheClient& filter_cache, + const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; +#endif + + // WaLSM+ Note: call FullFilterKeyMayMatch() method in this file + // PERF count False Positive in the end + // TODO: WaLSM+ Benchmark dont use MultiGet interface void MultiGet(const ReadOptions& readOptions, const MultiGetContext::Range* mget_range, const SliceTransform* prefix_extractor, @@ -172,9 +188,11 @@ class BlockBasedTable : public TableReader { std::shared_ptr GetTableProperties() const override; + // WaLSM+ Note: call filter->ApproximateMemoryUsage() size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form + // WaLSM+ Note: call filter->ToString() Status DumpTable(WritableFile* out_file) override; Status VerifyChecksum(const ReadOptions& readOptions, @@ -266,6 +284,7 @@ class BlockBasedTable : public TableReader { static std::atomic next_cache_key_id_; BlockCacheTracer* const block_cache_tracer_; + // WaLSM+ Note: update filter cache event cnt void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const; void UpdateCacheMissMetrics(BlockType block_type, @@ -392,13 +411,24 @@ class BlockBasedTable : public TableReader { BlockCacheLookupContext* 
lookup_context, std::unique_ptr* index_reader); + // WaLSM+ Note: filter->PrefixesMayMatch() or filter->KeyMayMatch() bool FullFilterKeyMayMatch(const ReadOptions& read_options, FilterBlockReader* filter, const Slice& user_key, const bool no_io, const SliceTransform* prefix_extractor, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; - +#ifdef ART_PLUS + bool FullFilterKeyMayMatch(FilterCacheClient& filter_cache, + const ReadOptions& read_options, + FilterBlockReader* filter, const Slice& user_key, + const bool no_io, + const SliceTransform* prefix_extractor, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; +#endif + // TODO: not used in WaLSM+ Benchmark, meybe used in MultiGet interface ? + // WaLSM+ Note: filter->PrefixesMayMatch() or filter->KeyMayMatch() void FullFilterKeysMayMatch(const ReadOptions& read_options, FilterBlockReader* filter, MultiGetRange* range, const bool no_io, @@ -429,6 +459,7 @@ class BlockBasedTable : public TableReader { InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context); + // WaLSM+ Note: filter->CacheDependencies() Status PrefetchIndexAndFilterBlocks( const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, @@ -443,6 +474,7 @@ class BlockBasedTable : public TableReader { InternalIteratorBase* index_iter); // Create the filter from the filter block. + // WaLSM+ Note: FullFilterBlockReader::Create() std::unique_ptr CreateFilterBlockReader( const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, bool pin, @@ -516,6 +548,9 @@ struct BlockBasedTable::Rep { table_options(_table_opt), filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()), internal_comparator(_internal_comparator), + #ifdef ART_PLUS + segment_id_removing_comparator(SegmentIdRemovingComparator(_internal_comparator.user_comparator())), + #endif filter_type(FilterType::kNoFilter), index_type(BlockBasedTableOptions::IndexType::kBinarySearch), hash_index_allow_collision(false), @@ -531,6 +566,9 @@ struct BlockBasedTable::Rep { const BlockBasedTableOptions table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; + #ifdef ART_PLUS + std::unique_ptr segment_id_removing_comparator; + #endif Status status; std::unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h index e3fcfc2ec..ebe196a4f 100644 --- a/table/block_based/block_builder.h +++ b/table/block_based/block_builder.h @@ -17,6 +17,8 @@ namespace ROCKSDB_NAMESPACE { +// seems for kv to build index? 
+// not related to our work WaLSM+ class BlockBuilder { public: BlockBuilder(const BlockBuilder&) = delete; diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h index ee3b61f5c..7906c5a75 100644 --- a/table/block_based/block_prefetcher.h +++ b/table/block_based/block_prefetcher.h @@ -10,6 +10,8 @@ #include "table/block_based/block_based_table_reader.h" namespace ROCKSDB_NAMESPACE { +// only prefetch data block, for scans operator +// not related to our work WaLSM+ class BlockPrefetcher { public: explicit BlockPrefetcher(size_t compaction_readahead_size) diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h index 598f1ef57..8b34ada54 100644 --- a/table/block_based/cachable_entry.h +++ b/table/block_based/cachable_entry.h @@ -43,6 +43,7 @@ class CachableEntry { public: CachableEntry() = default; + // init method CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle, bool own_value) : value_(value) @@ -56,9 +57,11 @@ class CachableEntry { assert(!cache_handle_ || !own_value_); } + // disable copy assignment CachableEntry(const CachableEntry&) = delete; CachableEntry& operator=(const CachableEntry&) = delete; - + + // enable move assignment CachableEntry(CachableEntry&& rhs) : value_(rhs.value_) , cache_(rhs.cache_) @@ -72,7 +75,7 @@ class CachableEntry { rhs.ResetFields(); } - + // enable move assignment CachableEntry& operator=(CachableEntry&& rhs) { if (UNLIKELY(this == &rhs)) { return *this; @@ -98,28 +101,29 @@ class CachableEntry { ~CachableEntry() { ReleaseResource(); } - + // return true when all member vars are empty bool IsEmpty() const { return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr && !own_value_; } - + // check whether cached bool IsCached() const { assert(!!cache_ == !!cache_handle_); return cache_handle_ != nullptr; } - + // return member vars, + // T is actually value in a cache entry if it own this value T* GetValue() const { return value_; } Cache* GetCache() const { return cache_; } Cache::Handle* GetCacheHandle() const { return cache_handle_; } bool GetOwnValue() const { return own_value_; } - + // reset method void Reset() { ReleaseResource(); ResetFields(); } - + // transfer to cleanable entry and make ready for cleanup job void TransferTo(Cleanable* cleanable) { if (cleanable) { if (cache_handle_ != nullptr) { @@ -132,7 +136,8 @@ class CachableEntry { ResetFields(); } - + // set value + // set own_value_ to true -> this entry has ownership of this value void SetOwnedValue(T* value) { assert(value != nullptr); @@ -146,7 +151,8 @@ class CachableEntry { value_ = value; own_value_ = true; } - + // set value + // set own_value_ to true -> this entry does not have ownership of this value void SetUnownedValue(T* value) { assert(value != nullptr); @@ -160,7 +166,8 @@ class CachableEntry { value_ = value; assert(!own_value_); } - + // set value, cache, cache_handle + // set own_value_ to true -> this entry does not have ownership of this value void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { assert(value != nullptr); assert(cache != nullptr); @@ -180,6 +187,7 @@ class CachableEntry { } private: + // release cache entry in cache or release owned value void ReleaseResource() { if (LIKELY(cache_handle_ != nullptr)) { assert(cache_ != nullptr); @@ -188,14 +196,14 @@ class CachableEntry { delete value_; } } - + // reset all member fields to null void ResetFields() { value_ = nullptr; cache_ = nullptr; cache_handle_ = nullptr; own_value_ = false; } - + // 
release cache entry in cache static void ReleaseCacheHandle(void* arg1, void* arg2) { Cache* const cache = static_cast(arg1); assert(cache); @@ -206,6 +214,7 @@ class CachableEntry { cache->Release(cache_handle); } + // delete value liked arg1 static void DeleteValue(void* arg1, void* /* arg2 */) { delete static_cast(arg1); } diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h index a18bc5449..bcd4588d2 100644 --- a/table/block_based/filter_block_reader_common.h +++ b/table/block_based/filter_block_reader_common.h @@ -22,6 +22,7 @@ class FilePrefetchBuffer; template class FilterBlockReaderCommon : public FilterBlockReader { public: + // init method FilterBlockReaderCommon(const BlockBasedTable* t, CachableEntry&& filter_block) : table_(t), filter_block_(std::move(filter_block)) { @@ -29,22 +30,32 @@ class FilterBlockReaderCommon : public FilterBlockReader { } protected: + // input table, call table->RetrieveBlock() and return status static Status ReadFilterBlock(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, bool use_cache, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* filter_block); - + // return table_ const BlockBasedTable* table() const { return table_; } + // not used in our work const SliceTransform* table_prefix_extractor() const; + // return table_->get_rep()->whole_key_filtering; bool whole_key_filtering() const; + // return table_->get_rep()->table_options.cache_index_and_filter_blocks; + // return true when cache filter in block cache + // should return false in WaLSM+, we will design new cache space for filter bool cache_filter_blocks() const; - + // if cached, because filter_block_ already own the cache value + // we only need to call filter_block->SetUnownedValue(filter_block_.GetValue()); + // so filter_block only have the reference of cached entry + // if not cached, we read from block, load into filter block(owned) + // and return status. Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* filter_block) const; - + // if GetOwnValue() = true,return Usage,otherwise 0 size_t ApproximateFilterBlockMemoryUsage() const; private: diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 31eb6b90d..da03f1d5f 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -8,7 +8,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
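The ownership rules annotated in cachable_entry.h and filter_block_reader_common.h above reduce to a single flag: a holder frees its value only when it owns it, and memory usage is charged to the holder only in that case. Before the filter_policy.cc changes below, here is a toy illustration of that cached/unowned versus read/owned split; all types and names are simplified stand-ins, not the RocksDB classes.

```cpp
// Simplified stand-ins for CachableEntry<T> ownership; not the RocksDB classes.
#include <cstddef>
#include <iostream>

template <typename T>
class Holder {
 public:
  ~Holder() { if (own_value_) delete value_; }    // only an owned value is freed here
  void SetOwnedValue(T* v)   { value_ = v; own_value_ = true;  }  // this holder frees it
  void SetUnownedValue(T* v) { value_ = v; own_value_ = false; }  // someone else (the cache) frees it
  T* GetValue() const { return value_; }
  bool GetOwnValue() const { return own_value_; }
  size_t ApproximateMemoryUsage() const { return own_value_ ? sizeof(T) : 0; }
 private:
  T* value_ = nullptr;
  bool own_value_ = false;
};

struct FilterBlock { int dummy = 0; };

// Mirrors the two paths described in the notes above: hand out a reference to the
// already-cached block, or load a fresh block that the out-parameter then owns.
void GetOrReadFilterBlock(Holder<FilterBlock>* cached, Holder<FilterBlock>* out) {
  if (cached->GetValue() != nullptr) {
    out->SetUnownedValue(cached->GetValue());  // reference only; memory charged to the cache
  } else {
    out->SetOwnedValue(new FilterBlock());     // "read from the SST"; this holder must free it
  }
}

int main() {
  Holder<FilterBlock> cache_entry;  // empty cache -> the read/owned path
  Holder<FilterBlock> result;
  GetOrReadFilterBlock(&cache_entry, &result);
  std::cout << "owned: " << result.GetOwnValue()
            << " usage: " << result.ApproximateMemoryUsage() << "\n";
  return 0;
}
```

In the real classes the release additionally goes through `Cache::Release()` whenever a cache handle is held, as the `ReleaseResource()` / `ReleaseCacheHandle()` code above shows.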
#include +#include +#include #include +#include +#include #include "rocksdb/filter_policy.h" @@ -341,10 +345,12 @@ class FastLocalBloomBitsReader : public FilterBitsReader { const uint32_t len_bytes_; }; +// default use Cache Local Bloom Filter Impl, see util/bloom_impl.h using LegacyBloomImpl = LegacyLocalityBloomImpl; class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { public: + // input bits_per_key, init builder explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log); // No Copy allowed @@ -353,23 +359,35 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { ~LegacyBloomBitsBuilder() override; + // hash to one value and push into hash_entries_ + // noticed that Hash use double hashing, we only need one hash value h + // then use double hashing void AddKey(const Slice& key) override; + // already collect hash values, just write to filter, + // return slice(real filter bits + num_probes(1 bit) + num_lines(4 bits)) Slice Finish(std::unique_ptr* buf) override; + // input used bytes, and output keys num int CalculateNumEntry(const uint32_t bytes) override; + // actually only need num_entry to determine used space uint32_t CalculateSpace(const int num_entry) override { uint32_t dont_care1; uint32_t dont_care2; return CalculateSpace(num_entry, &dont_care1, &dont_care2); } + // simply estimate FPR double EstimatedFpRate(size_t keys, size_t bytes) override { return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, num_probes_); } + #ifdef ART_PLUS + int hash_id_; + #endif + private: int bits_per_key_; int num_probes_; @@ -379,21 +397,28 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { // Get totalbits that optimized for cpu cache line uint32_t GetTotalBitsForLocality(uint32_t total_bits); + // first calcute space and allocate space for filter // Reserve space for new filter char* ReserveSpace(const int num_entry, uint32_t* total_bits, uint32_t* num_lines); + // calcuate space : real bloom filter size (key size * bits per key) + metadata size (5) // Implementation-specific variant of public CalculateSpace uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, uint32_t* num_lines); + // data is real filter array, input hash value h and set bits in data // Assuming single threaded access to this function. 
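For readers new to the double-hashing trick mentioned in the comments above, the sketch below is a toy version of the multi-unit scheme this hunk introduces (the `hash_id_` member here, plus `MultiLegacyBloomBitsBuilder` and `MayMatchWithId` further down): the same key set goes into several independent Bloom units, each seeded by its hash id, so a reader holding only some of the units can still answer queries at a higher false-positive rate. Everything below is illustrative; `BloomToyHash`, `ToyBloomUnit`, and the constants are stand-ins, not the patch's `BloomHashId` or `LegacyBloomImpl`.

```cpp
// Toy sketch of per-hash-id Bloom units; not the patch's implementation.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static uint32_t BloomToyHash(const std::string& key, int hash_id) {
  // FNV-1a folded with the unit id; an illustrative stand-in for BloomHashId().
  uint32_t h = 2166136261u ^ (static_cast<uint32_t>(hash_id) * 0x9e3779b9u);
  for (unsigned char c : key) { h ^= c; h *= 16777619u; }
  return h;
}

class ToyBloomUnit {
 public:
  ToyBloomUnit(size_t bits, int num_probes, int hash_id)
      : bits_(bits), num_probes_(num_probes), hash_id_(hash_id), data_(bits, false) {}

  void AddKey(const std::string& key) { Probe(key, /*set=*/true); }
  bool MayMatch(const std::string& key) { return Probe(key, /*set=*/false); }

 private:
  // Double hashing in the spirit of the legacy builder: one 32-bit hash and a
  // rotated delta generate every probe position, so each key is hashed once.
  bool Probe(const std::string& key, bool set) {
    uint32_t h = BloomToyHash(key, hash_id_);
    uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes_; ++i, h += delta) {
      size_t pos = h % bits_;
      if (set) data_[pos] = true;
      else if (!data_[pos]) return false;
    }
    return true;
  }

  size_t bits_;
  int num_probes_;
  int hash_id_;
  std::vector<bool> data_;
};

int main() {
  const int filter_count = 3;  // the patch currently hard-codes 10 in GetBuilderWithContext()
  std::vector<ToyBloomUnit> units;
  for (int id = 0; id < filter_count; ++id) units.emplace_back(1024, 6, id);

  for (const std::string& key : {"user100", "user200", "user300"}) {
    for (auto& u : units) u.AddKey(key);  // every unit sees every key, as in AddKey() below
  }

  // A reader that has only a prefix of the units loaded probes just those;
  // probing more units lowers the false-positive rate.
  std::cout << units[0].MayMatch("user100") << " " << units[1].MayMatch("user999") << "\n";
  return 0;
}
```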
void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); }; LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log) - : bits_per_key_(bits_per_key), + : + #ifdef ART_PLUS + hash_id_(0), + #endif + bits_per_key_(bits_per_key), num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)), info_log_(info_log) { assert(bits_per_key_); @@ -402,7 +427,12 @@ LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} void LegacyBloomBitsBuilder::AddKey(const Slice& key) { + #ifdef ART_PLUS + uint32_t hash = BloomHashId(key, hash_id_); + #else uint32_t hash = BloomHash(key); + #endif + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { hash_entries_.push_back(hash); } @@ -525,8 +555,72 @@ inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, folly::constexpr_log2(CACHE_LINE_SIZE)); } +#ifdef ART_PLUS +class MultiLegacyBloomBitsBuilder : public FilterBitsBuilder { + public: + explicit MultiLegacyBloomBitsBuilder(const size_t filter_count, + const int bits_per_key, + Logger* info_log); + ~MultiLegacyBloomBitsBuilder(); + + // No copy allowed + MultiLegacyBloomBitsBuilder(const MultiLegacyBloomBitsBuilder&) = delete; + void operator=(const MultiLegacyBloomBitsBuilder&) = delete; + + virtual void AddKey(const Slice& key) override; + virtual Slice Finish(std::unique_ptr* buf) override; + virtual Slice FinishWithId(std::unique_ptr* buf, + const int hash_id) override; + + private: + std::vector bits_builders_; + + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; + +MultiLegacyBloomBitsBuilder::MultiLegacyBloomBitsBuilder( + const size_t filter_count, const int bits_per_key, Logger* info_log) { + filter_count_ = filter_count; + bits_builders_.reserve(filter_count); + + for (size_t i = 0; i < filter_count; ++i) { + // TODO determine num_probes + LegacyBloomBitsBuilder* bits_builder = + new LegacyBloomBitsBuilder(bits_per_key, info_log); + bits_builder->hash_id_ = i; + bits_builders_.push_back(bits_builder); + } +} + +MultiLegacyBloomBitsBuilder::~MultiLegacyBloomBitsBuilder() { + for (size_t i = 0; i < bits_builders_.size(); ++i) { + delete bits_builders_[i]; + bits_builders_[i] = nullptr; + } +} + +void MultiLegacyBloomBitsBuilder::AddKey(const Slice& key) { + for (size_t i = 0; i < bits_builders_.size(); ++i) { + bits_builders_[i]->AddKey(key); + } +} + +Slice MultiLegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { + buf->reset(); + fprintf(stderr, "error call MultiLegacyBloomBitsBuilder::Finish(buf)\n"); + exit(1); + return Slice(); +} + +Slice MultiLegacyBloomBitsBuilder::FinishWithId(std::unique_ptr* buf, + int hash_id) { + return bits_builders_[hash_id]->Finish(buf); +} +#endif + class LegacyBloomBitsReader : public FilterBitsReader { public: + // init func LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, uint32_t log2_cache_line_size) : data_(data), @@ -540,6 +634,7 @@ class LegacyBloomBitsReader : public FilterBitsReader { ~LegacyBloomBitsReader() override {} + // check whether key is in filter array // "contents" contains the data built by a preceding call to // FilterBitsBuilder::Finish. MayMatch must return true if the key was // passed to FilterBitsBuilder::AddKey. 
This method may return true or false @@ -554,6 +649,7 @@ class LegacyBloomBitsReader : public FilterBitsReader { hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); } + // check whether keys is in filter array virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { std::array hashes; std::array byte_offsets; @@ -570,6 +666,40 @@ class LegacyBloomBitsReader : public FilterBitsReader { } } + #ifdef ART_PLUS + // check whether key is in filter array + // "contents" contains the data built by a preceding call to + // FilterBitsBuilder::Finish. MayMatch must return true if the key was + // passed to FilterBitsBuilder::AddKey. This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. (WaLSM+) + bool MayMatchWithId(const Slice& key, const int hash_id) override { + uint32_t hash = BloomHashId(key, hash_id); + uint32_t byte_offset; + LegacyBloomImpl::PrepareHashMayMatch( + hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_); + return LegacyBloomImpl::HashMayMatchPrepared( + hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); + } + + // check whether keys is in filter array (WaLSM+) + virtual void MayMatchWithId(int num_keys, Slice** keys, bool* may_match, const int hash_id) override { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHashId(*keys[i], hash_id); + LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_, + /*out*/ &byte_offsets[i], + log2_cache_line_size_); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = LegacyBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i], + log2_cache_line_size_); + } + } + #endif + private: const char* data_; const int num_probes_; @@ -602,6 +732,8 @@ const std::vector BloomFilterPolicy::kAllUserModes = { kAuto, }; +// init BloomFilterPolicy, only used for old Block Filter Format, +// BloomFilterPolicy not used in our work -- WaLSM and WaLSM+ BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) : mode_(mode), warned_(false), aggregate_rounding_balance_(0) { // Sanitize bits_per_key @@ -628,6 +760,7 @@ const char* BloomFilterPolicy::Name() const { return "rocksdb.BuiltinBloomFilter"; } +// not used in our work -- WaLSM and WaLSM+ void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, std::string* dst) const { // We should ideally only be using this deprecated interface for @@ -657,6 +790,7 @@ void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, } } +// not used in our work -- WaLSM and WaLSM+ bool BloomFilterPolicy::KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { const size_t len = bloom_filter.size(); @@ -680,6 +814,7 @@ bool BloomFilterPolicy::KeyMayMatch(const Slice& key, array); } +// not used in our work -- WaLSM and WaLSM+ FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { // This code path should no longer be used, for the built-in // BloomFilterPolicy. 
Internal to RocksDB and outside @@ -692,6 +827,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions())); } +// we use default format version(4)and Kauto, should return LegacyBloomBitsBuilder +// benchmark use default format version(4)and Kauto, do not set parameter -- format version FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( const FilterBuildingContext& context) const { Mode cur = mode_; @@ -731,14 +868,24 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( "with format_version>=5.", whole_bits_per_key_, adjective); } + + #ifndef ART_PLUS return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); + #else + // TODO: determine filter_count, + // and maybe move this property to some kind of options (WaLSM+) + const int filter_count = 10; + return new MultiLegacyBloomBitsBuilder(filter_count, whole_bits_per_key_, context.info_log); + #endif } } assert(false); return nullptr; // something legal } +// only return FilterBuilder, +// return LegacyBloomBitsBuilder in our work WaLSM and WaLSM+ FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( const FilterBuildingContext& context) { if (context.table_options.filter_policy) { @@ -748,6 +895,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( } } +// return LegacyBloomBitsReader or FastLocalBloomBitsReader +// return only LegacyBloomBitsReader in our work // Read metadata to determine what kind of FilterBitsReader is needed // and return a new one. FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( @@ -824,6 +973,8 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( log2_cache_line_size); } +// return only FastLocalBloomBitsReader (new implementation) +// not used in our work // For newer Bloom filter implementations FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( const Slice& contents) const { @@ -884,6 +1035,7 @@ FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( return new AlwaysTrueFilter(); } +// used for benchmark parameter const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, bool use_block_based_builder) { BloomFilterPolicy::Mode m; diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index a104bec47..f6ecbeb1c 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -92,6 +92,20 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, return Slice(); } +#ifdef ART_PLUS +FullFilterBlockReader::FullFilterBlockReader( + const BlockBasedTable* t, + CachableEntry&& filter_block, + const int hash_id) + : FilterBlockReaderCommon(t, std::move(filter_block)), + hash_id_(hash_id) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { + full_length_enabled_ = + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); + } +} +#else FullFilterBlockReader::FullFilterBlockReader( const BlockBasedTable* t, CachableEntry&& filter_block) @@ -102,6 +116,7 @@ FullFilterBlockReader::FullFilterBlockReader( prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); } } +#endif bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, @@ -115,7 +130,12 @@ bool FullFilterBlockReader::KeyMayMatch( if (!whole_key_filtering()) { return true; } + + #ifdef ART_PLUS + return MayMatch(key, no_io, get_context, lookup_context, hash_id_); + #else return MayMatch(key, 
no_io, get_context, lookup_context); + #endif } std::unique_ptr FullFilterBlockReader::Create( @@ -154,9 +174,44 @@ bool FullFilterBlockReader::PrefixMayMatch( (void)block_offset; #endif assert(block_offset == kNotValid); + + #ifdef ART_PLUS + return MayMatch(prefix, no_io, get_context, lookup_context, hash_id_); + #else return MayMatch(prefix, no_io, get_context, lookup_context); + #endif } +#ifdef ART_PLUS +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, const int hash_id) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (filter_bits_reader) { + if (filter_bits_reader->MayMatchWithId(entry, hash_id)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } + return true; // remain the same with block_based filter +} +#else bool FullFilterBlockReader::MayMatch( const Slice& entry, bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { @@ -185,6 +240,7 @@ bool FullFilterBlockReader::MayMatch( } return true; // remain the same with block_based filter } +#endif void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, @@ -199,7 +255,12 @@ void FullFilterBlockReader::KeysMayMatch( // present return; } + #ifdef ART_PLUS + MayMatch(range, no_io, nullptr, lookup_context, hash_id_); + #else MayMatch(range, no_io, nullptr, lookup_context); + #endif + } void FullFilterBlockReader::PrefixesMayMatch( @@ -213,9 +274,15 @@ void FullFilterBlockReader::PrefixesMayMatch( MayMatch(range, no_io, prefix_extractor, lookup_context); } +#ifdef ART_PLUS +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, const int hash_id) const { +#else void FullFilterBlockReader::MayMatch( MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, BlockCacheLookupContext* lookup_context) const { +#endif CachableEntry filter_block; const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, @@ -254,7 +321,11 @@ void FullFilterBlockReader::MayMatch( } } + #ifdef ART_PLUS + filter_bits_reader->MayMatchWithId(num_keys, &keys[0], &may_match[0], hash_id); + #else filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); + #endif int i = 0; for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 42f4dbbc3..deda30c6f 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -37,6 +37,8 @@ class FilterBitsReader; // class FullFilterBlockBuilder : public FilterBlockBuilder { public: + // when format version < 5, use LegacyBloomBitsBuilder + // In our work, format version = 4 explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder); @@ -48,18 +50,29 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // directly. 
and be deleted here ~FullFilterBlockBuilder() {} + // default return false virtual bool IsBlockBased() override { return false; } + // not implemented in FullFilterBlock virtual void StartBlock(uint64_t /*block_offset*/) override {} + // if not use prefix bloom, only call AddKey(key) virtual void Add(const Slice& key) override; + // return num_added_, num of keys virtual size_t NumAdded() const override { return num_added_; } + // only return the slice from LegacyBloomBitsBuilder(format version < 5) virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; protected: + // call LegacyBloomBitsBuilder(format version < 5), add key and num_added_ virtual void AddKey(const Slice& key); + // LegacyBloomBitsBuilder(format version < 5) std::unique_ptr filter_bits_builder_; + // set last_prefix_recorded_ and last_whole_key_recorded_ to false virtual void Reset(); + // not used when disable prefix bloom void AddPrefix(const Slice& key); + // return prefix_extractor, however we donot use prefix bloom + // no need for prefix_extractor const SliceTransform* prefix_extractor() { return prefix_extractor_; } private: @@ -83,38 +96,48 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { class FullFilterBlockReader : public FilterBlockReaderCommon { public: + // set prefix_extractor if needed + // In our work, dont use prefix_extractor + #ifdef ART_PLUS + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block, + const int hash_id = 0); + #else FullFilterBlockReader(const BlockBasedTable* t, CachableEntry&& filter_block); - + #endif + // call FullFilterBlockReader() to return std::unique_ptr static std::unique_ptr Create( const BlockBasedTable* table, const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, bool pin, BlockCacheLookupContext* lookup_context); - + // always return false bool IsBlockBased() override { return false; } - + // call MayMatch(key, no_io, get_context, lookup_context) bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; - + // not used in our work bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; - + // range check, call MayMatch(range, no_io, nullptr, lookup_context); void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) override; - + // not used in our work void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) override; + // call ApproximateFilterBlockMemoryUsage(), return Memory Usage size_t ApproximateMemoryUsage() const override; + // when disable prefix bloom, never call this method bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, const SliceTransform* prefix_extractor, const Comparator* comparator, @@ -123,17 +146,33 @@ class FullFilterBlockReader BlockCacheLookupContext* lookup_context) override; private: + #ifdef ART_PLUS + // Get From Cache Or Read From SST, to get filter, then check whether entry hit + bool MayMatch(const Slice& entry, bool no_io, GetContext* 
get_context, + BlockCacheLookupContext* lookup_context, const int hash_id = 0) const; + // range is the key range in the SST, check out these keys may fit in the filter + void MayMatch(MultiGetRange* range, bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, + const int hash_id = 0) const; + #else bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; + // range is the key range in the SST, check out these keys may fit in the filter void MayMatch(MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, BlockCacheLookupContext* lookup_context) const; + #endif + // when disable prefix bloom, never call this method bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, const Comparator* comparator) const; private: bool full_length_enabled_; size_t prefix_extractor_full_length_; + #ifdef ART_PLUS + const int hash_id_; + #endif }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/parsed_full_filter_block.h b/table/block_based/parsed_full_filter_block.h index 36c619921..c1b342538 100644 --- a/table/block_based/parsed_full_filter_block.h +++ b/table/block_based/parsed_full_filter_block.h @@ -17,19 +17,24 @@ class FilterPolicy; // The sharable/cachable part of the full filter. class ParsedFullFilterBlock { public: + // mainly get FilterBitsReader from filter_policy ParsedFullFilterBlock(const FilterPolicy* filter_policy, BlockContents&& contents); ~ParsedFullFilterBlock(); + // noticed unique_ptr point to FilterBitsReader, + // thus this method return the FilterBitsReader filter_bits_reader_ pointed to FilterBitsReader* filter_bits_reader() const { return filter_bits_reader_.get(); } + // not implemented // TODO: consider memory usage of the FilterBitsReader size_t ApproximateMemoryUsage() const { return block_contents_.ApproximateMemoryUsage(); } - + // not implemented + // TODO bool own_bytes() const { return block_contents_.own_bytes(); } private: diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index dc25abbea..e077603f4 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -5,18 +5,30 @@ #include "table/block_based/partitioned_filter_block.h" +#include +#include +#include #include +#include "db/dbformat.h" #include "file/file_util.h" #include "monitoring/perf_context_imp.h" #include "port/malloc.h" #include "port/port.h" +#include "rocksdb/comparator.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { +#ifdef ART_PLUS +Slice generate_modified_internal_key(std::unique_ptr& buf, + Slice original_internal_key, + int filter_index, int segment_id); +#endif PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const SliceTransform* _prefix_extractor, bool whole_key_filtering, @@ -55,6 +67,13 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( } } } + + #ifdef ART_PLUS + filter_count_ = filter_bits_builder->filter_count_; + filter_gc.resize(filter_count_); + filters.resize(filter_count_); + finishing_filter_index_ = 0; + #endif } PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} @@ -70,7 +89,9 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( if 
(!p_index_builder_->ShouldCutFilterBlock()) { return; } + #ifndef ART_PLUS filter_gc.push_back(std::unique_ptr(nullptr)); + #endif // Add the prefix of the next key before finishing the partition. This hack, // fixes a bug with format_verison=3 where seeking for the prefix would lead @@ -81,9 +102,18 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( FullFilterBlockBuilder::AddPrefix(*next_key); } + #ifdef ART_PLUS + for (int i = 0; i < filter_count_; ++i) { + filter_gc[i].push_back(std::unique_ptr(nullptr)); + Slice filter = filter_bits_builder_->FinishWithId(&filter_gc[i].back(), i); + std::string& index_key = p_index_builder_->GetPartitionKey(); + filters[i].push_back({index_key, filter, segment_id_base_.fetch_add(1, std::memory_order_relaxed)}); + } + #else Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); std::string& index_key = p_index_builder_->GetPartitionKey(); filters.push_back({index_key, filter}); + #endif keys_added_to_partition_ = 0; Reset(); } @@ -102,7 +132,11 @@ Slice PartitionedFilterBlockBuilder::Finish( const BlockHandle& last_partition_block_handle, Status* status) { if (finishing_filters == true) { // Record the handle of the last written filter block in the index + #ifdef ART_PLUS + FilterEntry& last_entry = filters[finishing_filter_index_].front(); + #else FilterEntry& last_entry = filters.front(); + #endif std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); std::string handle_delta_encoding; @@ -111,6 +145,21 @@ Slice PartitionedFilterBlockBuilder::Finish( last_partition_block_handle.size() - last_encoded_handle_.size()); last_encoded_handle_ = last_partition_block_handle; const Slice handle_delta_encoding_slice(handle_delta_encoding); + + #ifdef ART_PLUS + std::unique_ptr modified_key_buf; + Slice modified_key = generate_modified_internal_key( + modified_key_buf, last_entry.key, finishing_filter_index_, + last_entry.segment_id); + index_on_filter_block_builder_.Add(modified_key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(modified_key), handle_encoding, + &handle_delta_encoding_slice); + } + filters[finishing_filter_index_].pop_front(); + #else index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { @@ -119,12 +168,24 @@ Slice PartitionedFilterBlockBuilder::Finish( &handle_delta_encoding_slice); } filters.pop_front(); + #endif } else { MaybeCutAFilterBlock(nullptr); } // If there is no filter partition left, then return the index on filter // partitions + #ifdef ART_PLUS + if (UNLIKELY(filters[finishing_filter_index_].empty())) { + finishing_filter_index_++; + if (finishing_filter_index_ < filter_count_) { + *status = Status::Incomplete(); + finishing_filters = true; + return filters[finishing_filter_index_].front().filter; + } + #else if (UNLIKELY(filters.empty())) { + #endif + *status = Status::OK(); if (finishing_filters) { if (p_index_builder_->seperator_is_key_plus_seq()) { @@ -141,7 +202,11 @@ Slice PartitionedFilterBlockBuilder::Finish( // indicate we expect more calls to Finish *status = Status::Incomplete(); finishing_filters = true; + #ifdef ART_PLUS + return filters[finishing_filter_index_].front().filter; + #else return filters.front().filter; + #endif } } @@ -191,6 +256,26 @@ bool PartitionedFilterBlockReader::KeyMayMatch( &FullFilterBlockReader::KeyMayMatch); } +#ifdef 
ART_PLUS +bool PartitionedFilterBlockReader::KeyMayMatch( + FilterCacheClient& filter_cache, + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context) { + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return true; + } + + return MayMatch(filter_cache, + key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); +} +#endif + +// TODO: not used in WaLSM+ Benchmark, meybe used in MultiGet interface ? void PartitionedFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, @@ -204,6 +289,7 @@ void PartitionedFilterBlockReader::KeysMayMatch( &FullFilterBlockReader::KeysMayMatch); } +// not use prefix filter in WaLSM+ Benchmark bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, @@ -235,10 +321,18 @@ void PartitionedFilterBlockReader::PrefixesMayMatch( BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const CachableEntry& filter_block, const Slice& entry) const { IndexBlockIter iter; + #ifdef ART_PLUS + const Comparator* const segment_id_removing_comparator = table()->get_rep()->segment_id_removing_comparator.get(); + #else const InternalKeyComparator* const comparator = internal_comparator(); + #endif Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( + #ifdef ART_PLUS + segment_id_removing_comparator, + #else comparator->user_comparator(), + #endif table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter, kNullStats, true /* total_order_seek */, false /* have_first_key */, index_key_includes_seq(), index_value_is_full()); @@ -256,6 +350,7 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } +// TODO: retrieve filter block from filter cache (WaLSM+) Status PartitionedFilterBlockReader::GetFilterPartitionBlock( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, bool no_io, GetContext* get_context, @@ -289,6 +384,7 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock( return s; } +// TODO: retrieve filter block from filter cache (WaLSM+) bool PartitionedFilterBlockReader::MayMatch( const Slice& slice, const SliceTransform* prefix_extractor, uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, @@ -306,11 +402,22 @@ bool PartitionedFilterBlockReader::MayMatch( return true; } + #ifdef ART_PLUS + // find key "0 original_internal key". filter_index=segment_id=0. (WaLSM+) + // segment_id itself is useless in comparison, + // but must be appended otherwise the extracted user key will be incorrect. 
+ std::unique_ptr modified_key_buf; + Slice modified_key = + generate_modified_internal_key(modified_key_buf, *const_ikey_ptr, 0, 0); + auto filter_handle = GetFilterPartitionHandle(filter_block, modified_key); + #else auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + #endif if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range return false; } + // TODO: get some filter blocks from the filter cache and check (WaLSM+) CachableEntry filter_partition_block; s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, no_io, get_context, lookup_context, @@ -322,11 +429,80 @@ bool PartitionedFilterBlockReader::MayMatch( FullFilterBlockReader filter_partition(table(), std::move(filter_partition_block)); + // initialize the reader with hash_id (WaLSM+) + // FullFilterBlockReader filter_partition(table(), + // std::move(filter_partition_block), + // 1); return (filter_partition.*filter_function)( slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, lookup_context); } +#ifdef ART_PLUS +bool PartitionedFilterBlockReader::MayMatch( + FilterCacheClient& filter_cache, + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + /* + simple example of filter cache object: + uint32_t segment_id = 100; + std::string key = "k"; + bool result = filter_cache.check_key(segment_id, k); + */ + // TODO: leave filter unit data or filter unit reader into filter_cache, so block cache only need to cache filter index? + CachableEntry filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + #ifdef ART_PLUS + // find key "0 original_internal key". filter_index=segment_id=0. (WaLSM+) + // segment_id itself is useless in comparison, + // but must be appended otherwise the extracted user key will be incorrect. 
+ std::unique_ptr modified_key_buf; + Slice modified_key = + generate_modified_internal_key(modified_key_buf, *const_ikey_ptr, 0, 0); + auto filter_handle = GetFilterPartitionHandle(filter_block, modified_key); + #else + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + #endif + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + // TODO: get some filter blocks from the filter cache and check (WaLSM+) + CachableEntry filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + // initialize the reader with hash_id (WaLSM+) + // FullFilterBlockReader filter_partition(table(), + // std::move(filter_partition_block), + // 1); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); +} +#endif + +// TODO: used when calling MultiGet, but we dont use MultiGet in WaLSM+ Benchmark +// TODO: retrieve filter block from filter cache (WaLSM+) void PartitionedFilterBlockReader::MayMatch( MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, bool no_io, BlockCacheLookupContext* lookup_context, @@ -350,9 +526,20 @@ void PartitionedFilterBlockReader::MayMatch( // share block cache lookup and use full filter multiget on the partition // filter. for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) { + #ifdef ART_PLUS + // find key "0 original_internal key". filter_index=segment_id=0. (WaLSM+) + // segment_id itself is useless in comparison, + // but must be appended otherwise the extracted user key will be incorrect. 
+ std::unique_ptr modified_key_buf; + Slice modified_key = + generate_modified_internal_key(modified_key_buf, iter->ikey, 0, 0); // TODO: re-use one top-level index iterator + BlockHandle this_filter_handle = + GetFilterPartitionHandle(filter_block, modified_key); + #else BlockHandle this_filter_handle = GetFilterPartitionHandle(filter_block, iter->ikey); + #endif if (!prev_filter_handle.IsNull() && this_filter_handle != prev_filter_handle) { MultiGetRange subrange(*range, start_iter_same_handle, iter); @@ -380,6 +567,7 @@ void PartitionedFilterBlockReader::MayMatch( } } +// TODO: retrieve filter block from filter cache (WaLSM+) void PartitionedFilterBlockReader::MayMatchPartition( MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, BlockHandle filter_handle, bool no_io, @@ -394,6 +582,10 @@ void PartitionedFilterBlockReader::MayMatchPartition( return; // Any/all may match } + // initialize the reader with hash_id (WaLSM+) + // FullFilterBlockReader filter_partition(table(), + // std::move(filter_partition_block), + // 1); FullFilterBlockReader filter_partition(table(), std::move(filter_partition_block)); (filter_partition.*filter_function)(range, prefix_extractor, block_offset, @@ -438,10 +630,19 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, assert(filter_block.GetValue()); IndexBlockIter biter; + #ifdef ART_PLUS + const Comparator* const segment_id_removing_comparator = rep->segment_id_removing_comparator.get(); + #else const InternalKeyComparator* const comparator = internal_comparator(); + #endif Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( - comparator->user_comparator(), rep->get_global_seqno(BlockType::kFilter), + #ifdef ART_PLUS + segment_id_removing_comparator, + #else + comparator->user_comparator(), + #endif + rep->get_global_seqno(BlockType::kFilter), &biter, kNullStats, true /* total_order_seek */, false /* have_first_key */, index_key_includes_seq(), index_value_is_full()); @@ -512,4 +713,30 @@ bool PartitionedFilterBlockReader::index_value_is_full() const { return table()->get_rep()->index_value_is_full; } +#ifdef ART_PLUS +std::atomic PartitionedFilterBlockBuilder::segment_id_base_{0}; +#endif + +#ifdef ART_PLUS +Slice generate_modified_internal_key(std::unique_ptr& buf, Slice original_internal_key, int filter_index, int segment_id) { + // calculate modified_key (WaLSM+) + // +--------------+------------------------------------+------------+-------------------------+ + // | filter_index | original_user_key | segment_id | original_internal_bytes | + // | 4 bytes | (key.size() - kInternalBytes) bytes| 4 bytes | kInternalBytes bytes | + // +--------------+------------------------------------+------------+-------------------------+ + size_t modified_key_buf_size = 4 + original_internal_key.size() + 4; + char *modified_key_buf = new char[modified_key_buf_size]; + Slice original_user_key = ExtractUserKey(original_internal_key); + Slice original_internal_bytes = ExtractInternalBytes(original_internal_key); + EncodeFixed32R(modified_key_buf, filter_index); + std::memcpy(modified_key_buf + 4, original_user_key.data(), original_user_key.size()); + EncodeFixed32R(modified_key_buf + 4 + original_user_key.size(), segment_id); + std::memcpy(modified_key_buf + 4 + original_user_key.size() + 4, original_internal_bytes.data_, original_internal_bytes.size()); + Slice modified_key = Slice(modified_key_buf, modified_key_buf_size); + + buf.reset(modified_key_buf); + return modified_key; +} +#endif 
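// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the inverse of
// generate_modified_internal_key() above, decoding a modified key back into
// its parts.  It assumes the layout drawn above, the usual 8-byte RocksDB
// internal-key footer (kInternalBytes in db/dbformat.h), and that
// util/coding.h, rocksdb/slice.h and <cassert> are already included; the
// struct and function names below are hypothetical.  filter_index and
// segment_id are written with EncodeFixed32R (big-endian), presumably so
// that bytewise comparison orders modified keys by filter_index before the
// user key.
struct DecodedModifiedKey {   // hypothetical helper type, illustration only
  uint32_t filter_index;
  Slice user_key;             // original user key
  uint32_t segment_id;
  Slice internal_bytes;       // 8-byte sequence-number/type footer
};

inline DecodedModifiedKey DecodeModifiedInternalKey(const Slice& modified_key) {
  constexpr size_t kFooterBytes = 8;  // kInternalBytes
  assert(modified_key.size() >= 4 + 4 + kFooterBytes);
  const char* p = modified_key.data();
  DecodedModifiedKey out;
  out.filter_index = DecodeFixed32R(p);                       // first 4 bytes
  const size_t user_key_size = modified_key.size() - 4 - 4 - kFooterBytes;
  out.user_key = Slice(p + 4, user_key_size);
  out.segment_id = DecodeFixed32R(p + 4 + user_key_size);     // next 4 bytes
  out.internal_bytes = Slice(p + 4 + user_key_size + 4, kFooterBytes);
  return out;
}
// ---------------------------------------------------------------------------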
+
 } // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 2ccc8f8bc..0d970a7a6 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -5,10 +5,14 @@
 #pragma once
+#include
+#include
 #include
 #include
 #include
+#include
 #include "db/dbformat.h"
+#include "db/art/filter_cache_client.h"
 #include "index_builder.h"
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
@@ -45,10 +49,19 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   struct FilterEntry {
     std::string key;
     Slice filter;
+    #ifdef ART_PLUS
+    uint32_t segment_id;
+    #endif
   };
+  #ifdef ART_PLUS
+  std::vector<std::list<FilterEntry>> filters;  // lists of partitioned filters and their keys, one per filter builder
+  std::unique_ptr<const char[]> value;
+  std::vector<std::vector<std::unique_ptr<const char[]>>> filter_gc;
+  #else
   std::list<FilterEntry> filters;  // list of partitioned indexes and their keys
   std::unique_ptr<const char[]> value;
   std::vector<std::unique_ptr<const char[]>> filter_gc;
+  #endif
   bool finishing_filters = false;  // true if Finish is called once but not complete yet.
   // The policy of when cut a filter block and Finish it
@@ -63,6 +76,14 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   // The number of keys added to the last partition so far
   uint32_t keys_added_to_partition_;
   BlockHandle last_encoded_handle_;
+
+  #ifdef ART_PLUS
+  // The number of filter builders (hash functions) for each segment. (WaLSM+)
+  int filter_count_;
+  // When Finish() is called, return filters[finishing_filter_index_].front() (WaLSM+)
+  int finishing_filter_index_;
+  static std::atomic<uint32_t> segment_id_base_;
+  #endif
 };

 class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
  public:
@@ -80,11 +101,20 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon {
                    uint64_t block_offset, const bool no_io,
                    const Slice* const const_ikey_ptr, GetContext* get_context,
                    BlockCacheLookupContext* lookup_context) override;
+#ifdef ART_PLUS
+  bool KeyMayMatch(FilterCacheClient& filter_cache,
+                   const Slice& key, const SliceTransform* prefix_extractor,
+                   uint64_t block_offset, const bool no_io,
+                   const Slice* const const_ikey_ptr, GetContext* get_context,
+                   BlockCacheLookupContext* lookup_context);
+#endif
+
+  // TODO: not used in the WaLSM+ benchmark; maybe used by the MultiGet interface?
void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) override; + // not use prefix filter in WaLSM+ experiments bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, @@ -117,6 +147,15 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon { GetContext* get_context, BlockCacheLookupContext* lookup_context, FilterFunction filter_function) const; +#ifdef ART_PLUS + bool MayMatch(FilterCacheClient& filter_cache, + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; +#endif + // TODO: used when calling MultiGet, but we dont use MultiGet in WaLSM+ Benchmark using FilterManyFunction = void (FullFilterBlockReader::*)( MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, diff --git a/table/get_context.h b/table/get_context.h index 30a877799..5ca4147ce 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -20,6 +20,7 @@ class PinnedIteratorsManager; // Data structure for accumulating statistics during a point lookup. At the // end of the point lookup, the corresponding ticker stats are updated. This // avoids the overhead of frequent ticker stats updates +// WaLSM+ Note: related filter counter struct GetContextStats { uint64_t num_cache_hit = 0; uint64_t num_cache_index_hit = 0; diff --git a/thread_demo/README.md b/thread_demo/README.md new file mode 100644 index 000000000..6e7c21919 --- /dev/null +++ b/thread_demo/README.md @@ -0,0 +1,3 @@ +# thread_demo + +this is a small test demo for thread pool lib : https://github.com/alugowski/task-thread-pool diff --git a/thread_demo/test.sh b/thread_demo/test.sh new file mode 100644 index 000000000..10e7766e3 --- /dev/null +++ b/thread_demo/test.sh @@ -0,0 +1,3 @@ +rm thread +gcc thread.cc -o thread -std=c++11 -lstdc++ -lpthread +./thread \ No newline at end of file diff --git a/thread_demo/thread.cc b/thread_demo/thread.cc new file mode 100644 index 000000000..1c9d5ae31 --- /dev/null +++ b/thread_demo/thread.cc @@ -0,0 +1,132 @@ +#include +#include +// Use #include "task_thread_pool.hpp" for relative path, +// and #include if installed in include path +#include "task_thread_pool.hpp" + +// func outside class with arguments input and return value +bool outerFunc1(const bool& arg1, const bool& arg2) { + bool arg = arg1 && arg2; + std::cout << "call to outerFunc1: " << (arg) << std::endl; + return arg; +} + +// func outside class with return value +bool outerFunc2() { + bool arg = true; + std::cout << "call to outerFunc2: " << (arg) << std::endl; + return arg; +} + +// func outside class with arguments input +void outerFunc3(const bool& arg1, const bool& arg2) { + bool arg = arg1 && arg2; + std::cout << "call to outerFunc3: " << (arg) << std::endl; + return; +} + +// func outside class +void outerFunc4() { + bool arg = false; + std::cout << "call to outerFunc4: " << (arg) << std::endl; + return; +} + +class TestClass { +public: + int cnt; + TestClass() { + cnt = 0; + } + void add() { + cnt ++; + } +}; + +class ThreadDemo { +private: + static task_thread_pool::task_thread_pool pool_; + + // func in class should be "static" + // func inside class with arguments input and return value + static bool innerFunc1(const bool& arg1, const 
bool& arg2) { + bool arg = arg1 && arg2; + std::cout << "call to innerFunc1: " << (arg) << std::endl; + return arg; + } + + // func inside class with return value + static bool innerFunc2() { + bool arg = true; + std::cout << "call to innerFunc2: " << (arg) << std::endl; + return arg; + } + + // func inside class with arguments input + static void innerFunc3(const bool& arg1, const bool& arg2) { + bool arg = arg1 && arg2; + std::cout << "call to innerFunc3: " << (arg) << std::endl; + return; + } + + // func inside class + static void innerFunc4() { + bool arg = false; + std::cout << "call to innerFunc4: " << (arg) << std::endl; + return; + } + + static void testFunc(TestClass*& test) { + int cnt = 0; + while (cnt++ < 100) { + (*test).add(); + //break; + } + } + + static void monitor(TestClass*& test) { + int cnt = 0; + while (cnt++ < 100) { + std::cout << (*test).cnt << std::endl; + //break; + } + } + +public: + void thread_test() { + std::future func_future_1, func_future_2, func_future_3, func_future_4; + func_future_1 = pool_.submit(outerFunc1, true, true); + func_future_2 = pool_.submit(outerFunc2); + pool_.submit_detach(outerFunc3, true, false); + pool_.submit_detach(outerFunc4); + func_future_3 = pool_.submit(innerFunc1, true, true); + func_future_4 = pool_.submit(innerFunc2); + pool_.submit_detach(innerFunc3, true, false); + pool_.submit_detach(innerFunc4); + + pool_.wait_for_tasks(); + } + + void test(TestClass* test1) { + pool_.pause(); + pool_.submit_detach(testFunc, test1); + pool_.submit_detach(monitor, test1); + pool_.submit_detach(testFunc, test1); + pool_.submit_detach(monitor, test1); + pool_.unpause(); + // pool_.submit_detach(testFunc, test1); + // monitor(); + // testFunc(test1); + // monitor(test1); + } +}; + +task_thread_pool::task_thread_pool ThreadDemo::pool_; + +int main() { + ThreadDemo demo; + TestClass* test1 = new TestClass(); + // demo.thread_test(); + demo.test(test1); + return 0; +} \ No newline at end of file diff --git a/util/coding.h b/util/coding.h index d7d83d1b5..03ea9689b 100644 --- a/util/coding.h +++ b/util/coding.h @@ -99,6 +99,7 @@ extern int VarintLength(uint64_t v); // REQUIRES: dst has enough space for the value being written extern void EncodeFixed16(char* dst, uint16_t value); extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed32R(char* dst, uint32_t value); extern void EncodeFixed64(char* dst, uint64_t value); // Lower-level versions of Put... 
that write directly into a character buffer
@@ -136,6 +137,13 @@ inline uint32_t DecodeFixed32(const char* ptr) {
   }
 }

+inline uint32_t DecodeFixed32R(const char* ptr) {
+  return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])))
+          | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 8)
+          | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 16)
+          | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])) << 24));
+}
+
 inline uint64_t DecodeFixed64(const char* ptr) {
   if (port::kLittleEndian) {
     // Load the raw bytes
@@ -187,6 +195,13 @@ inline void EncodeFixed32(char* buf, uint32_t value) {
   }
 }

+inline void EncodeFixed32R(char* buf, uint32_t value) {
+  buf[3] = value & 0xff;
+  buf[2] = (value >> 8) & 0xff;
+  buf[1] = (value >> 16) & 0xff;
+  buf[0] = (value >> 24) & 0xff;
+}
+
 inline void EncodeFixed64(char* buf, uint64_t value) {
   if (port::kLittleEndian) {
     memcpy(buf, &value, sizeof(value));
diff --git a/util/comparator.cc b/util/comparator.cc
index 44d45732a..f82a6dd14 100644
--- a/util/comparator.cc
+++ b/util/comparator.cc
@@ -207,6 +207,58 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
 };
 }// namespace

+#ifdef ART_PLUS
+// Remove the last 4 bytes (the segment id) of each key
+// before performing the comparison (WaLSM+)
+class SegmentIdRemovingComparatorImpl : public Comparator {
+ public:
+  SegmentIdRemovingComparatorImpl(const Comparator* comparator)
+      : real_comparator(comparator) {}
+
+  const char* Name() const override {
+    return "walsmplus.SegmentIdRemovingComparator";
+  }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return real_comparator->Compare(Slice(a.data(), a.size() - 4),
+                                    Slice(b.data(), b.size() - 4));
+  }
+
+  bool Equal(const Slice& a, const Slice& b) const override {
+    return real_comparator->Equal(Slice(a.data(), a.size() - 4),
+                                  Slice(b.data(), b.size() - 4));
+  }
+
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    real_comparator->FindShortestSeparator(start, limit);
+  }
+
+  void FindShortSuccessor(std::string* key) const override {
+    real_comparator->FindShortSuccessor(key);
+  }
+
+  bool IsSameLengthImmediateSuccessor(const Slice& s,
+                                      const Slice& t) const override {
+    return real_comparator->IsSameLengthImmediateSuccessor(
+        Slice(s.data(), s.size() - 4), Slice(t.data(), t.size() - 4));
+  }
+
+  bool CanKeysWithDifferentByteContentsBeEqual() const override {
+    return real_comparator->CanKeysWithDifferentByteContentsBeEqual();
+  }
+
+  int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                              bool b_has_ts) const override {
+    // strip the segment id before delegating, mirroring Compare()
+    return real_comparator->CompareWithoutTimestamp(
+        Slice(a.data(), a.size() - 4), a_has_ts,
+        Slice(b.data(), b.size() - 4), b_has_ts);
+  }
+
+ private:
+  const Comparator* real_comparator;
+};
+#endif
+
 const Comparator* BytewiseComparator() {
   static BytewiseComparatorImpl bytewise;
   return &bytewise;
@@ -217,4 +269,12 @@ const Comparator* ReverseBytewiseComparator() {
   return &rbytewise;
 }

+#ifdef ART_PLUS
+std::unique_ptr<const Comparator> SegmentIdRemovingComparator(
+    const Comparator* real_comparator) {
+  return std::unique_ptr<const Comparator>(
+      new SegmentIdRemovingComparatorImpl(real_comparator));
+}
+#endif
+
 } // namespace ROCKSDB_NAMESPACE
diff --git a/util/hash.h b/util/hash.h
index a07fc4631..ba79c3b96 100644
--- a/util/hash.h
+++ b/util/hash.h
@@ -21,6 +21,7 @@
 #include
 #include
+#include <iostream>
 #include "rocksdb/slice.h"
 #include "util/fastrange.h"
@@ -58,6 +59,38 @@ inline uint32_t BloomHash(const Slice& key) {
   return Hash(key.data(), key.size(), 0xbc9f1d34);
 }

+inline uint32_t BloomHashId(const Slice& key, int id) {
+  switch(id){
+    case 0:
+      return Hash(key.data(), key.size(), 0xbc9f1d34);
+    case 1:
      return Hash(key.data(), key.size(), 0x34f1d34b);
+    case 2:
+      return Hash(key.data(), key.size(), 0x251d34bc);
+    case 3:
+      return Hash(key.data(), key.size(), 0x01d34bc9);
+    case 4:
+      return Hash(key.data(), key.size(), 0x1934bc9f);
+    case 5:
+      return Hash(key.data(), key.size(), 0x934bc9f1);
+    case 6:
+      return Hash(key.data(), key.size(), 0x4bc9f193);
+    case 7:
+      return Hash(key.data(), key.size(), 0x51c2578a);
+    case 8:
+      return Hash(key.data(), key.size(), 0xda23562f);
+    case 9:
+      return Hash(key.data(), key.size(), 0x135254f2);
+    case 10:
+      return Hash(key.data(), key.size(), 0xea1e4a48);
+    case 11:
+      return Hash(key.data(), key.size(), 0x567925f1);
+    default:
+      std::cout << "BloomHash id error" << std::endl;
+      exit(1);
+  }
+}
+
 inline uint64_t GetSliceHash64(const Slice& key) {
   return Hash64(key.data(), key.size());
 }
diff --git a/workloads/README.md b/workloads/README.md
new file mode 100644
index 000000000..0c7bccbf9
--- /dev/null
+++ b/workloads/README.md
@@ -0,0 +1,80 @@
+## Introduction
+
+Author: Guo Teng. Email: PRCguoteng@gmail.com
+
+**Deprecated.** This tool dumps the read workload to a file while YCSB runs and divides that workload into key ranges.
+
+**We now sample the Put keys to generate key ranges directly, so this tool is no longer needed.**
+
+## Build and run
+
+A simple build and run:
+
+``` shell
+cd workloads
+g++ generator.cc -o generator
+./generator
+```
+
+## Arguments
+
+Three arguments need to be defined; see ```generator.cc```:
+
+``` c++
+// simple args, avoid using command line arg or config file
+const std::string input_workload = "workload";
+const std::string output_seperators = "seperators";
+const int key_range_num = 1000;
+```
+
+## Workload
+
+We use YCSB to generate the workload file; the macro USE_WORKLOAD needs to be enabled when compiling.
+
+Modify this code in the ```RocksdbDB::ReadSingle(...)``` function of ```YCSB/rocksdb/rocksdb_db.cc```:
+
+``` c++
+DB::Status RocksdbDB::ReadSingle(const std::string &table, const std::string &key,
+                                 const std::vector<std::string> *fields,
+                                 std::vector<Field> &result) {
+  std::string data;
+  rocksdb::Status s = db_->Get(rocksdb::ReadOptions(), key, &data);
+  #ifdef GEN_WORKLOAD
+  std::fstream f;
+  f.open("../workload/workload", std::ios::out | std::ios::app);
+  f << key << std::endl;
+  f.close();
+  #endif
+  if (s.IsNotFound()) {
+    return kNotFound;
+  } else if (!s.ok()) {
+    throw utils::Exception(std::string("RocksDB Get: ") + s.ToString());
+  }
+  if (fields != nullptr) {
+    DeserializeRowFilter(&result, data, *fields);
+  } else {
+    DeserializeRow(&result, data);
+    assert(result.size() == static_cast<size_t>(fieldcount_));
+  }
+  return kOK;
+}
+```
+
+Then run one workload under YCSB/workloads, using only **one thread**.
+
+After that, you will find the new workload file in the workload directory.
+
+## Use Workload
+
+First, add -DWALSM_PLUS when compiling the WaLSM code to enable heat_buckets.
+
+Then check the related macros in ```db/art/macros.h```:
+
+``` c++
+// macros for HeatBuckets
+#define SEPERATORS_PATH "/home/ycc/WaLSM/workload/seperators"
+#define BUCKETS_PERIOD 20000
+#define BUCKETS_ALPHA 0.2
+```
+
+Modify ```SEPERATORS_PATH``` to point to your own generated seperators file.
\ No newline at end of file
diff --git a/workloads/generator.cc b/workloads/generator.cc
new file mode 100644
index 000000000..3b8f49a8a
--- /dev/null
+++ b/workloads/generator.cc
@@ -0,0 +1,85 @@
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <string>
+#include <vector>
+
+int main() {
+  // simple args, avoid using command line arg or config file
+  const std::string input_workload = "workload";
+  const std::string output_seperators = "seperators";
+  const int key_range_num = 500;
+
+  std::set<std::string> keys;
+  std::string key;
+
+  std::ifstream in;
+  in.open(input_workload.c_str(), std::ios::in);
+
+  if (!in.is_open()) {
+    std::cout << "failed to open input file, please check whether the file exists" << std::endl;
+    return 0;
+  }
+
+  std::cout << "successfully opened input file" << std::endl;
+  while (std::getline(in, key)) {
+    // use a std::set to deduplicate keys
+    keys.insert(key);
+  }
+  std::cout << "unique key count : " << keys.size() << std::endl;
+  in.close();
+
+  // convert to a vector so that key range borders can be located by index
+  std::vector<std::string> selector;
+  selector.assign(keys.begin(), keys.end());
+  std::cout << "selector size : " << selector.size() << std::endl;
+
+  const int keys_size = selector.size();
+  int one_key_range_count = 0;
+  // note that the last key range may contain more than one_key_range_count keys, but that does not matter
+  one_key_range_count = keys_size / key_range_num;
+
+  std::cout << "one key range count : " << one_key_range_count << std::endl;
+  std::cout << "key range num : " << key_range_num << std::endl;
+
+  // each key range is [k_m, k_M); we need to locate (key_range_num-1) seperators k1, k2, ...
+  // when a key k arrives, we can find the key range index i such that (k >= seperators[i] && k < seperators[i+1])
+  // we introduce two guard borders, k_min = "user" and k_max = keys.back() + "MAXGUARD"
+  // note that k_min and k_max need to be adjusted for your own workload
+  std::vector<std::string> seperators;
+  seperators.push_back("user");  // min guard
+  assert(seperators[0] < selector[0]);
+  for (int i = 1; i <= key_range_num - 1; i++) {  // need (key_range_num-1) seperators
+    seperators.push_back(selector[i * one_key_range_count]);
+  }
+  seperators.push_back(selector[selector.size() - 1] + "MAXGUARD");  // max guard
+
+  std::ofstream out;
+  out.open(output_seperators.c_str(), std::ios::out);
+
+  if (!out.is_open()) {
+    std::cout << "failed to open output file, please rerun to create the output file" << std::endl;
+    return 0;
+  }
+  std::cout << "successfully opened output file" << std::endl;
+
+  for (const auto &seperator : seperators) {
+    // std::cout << seperator << std::endl;
+    out << seperator << std::endl;
+  }
+  out.close();
+
+  /*
+  std::ifstream valid;
+  valid.open("seperators", std::ios::in);
+
+  while (std::getline(valid, key)) {
+    std::cout << key << std::endl;
+  }
+  valid.close();
+  */
+
+  return 0;
+}
\ No newline at end of file
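As a usage illustration for the seperators file that generator.cc emits, the sketch below (not part of the patch; the file name "seperators" and the key "user4567" are just examples) maps an incoming key to its key range index with std::upper_bound, following the semantics described in the generator comments: index i satisfies (k >= seperators[i] && k < seperators[i+1]).

``` c++
// Illustrative sketch: consume the generated seperators file and map a key
// to its key range index.  Assumes the file holds the min guard, the
// (key_range_num-1) seperators and the max guard, one per line, sorted.
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> seperators;
  std::ifstream in("seperators");
  std::string line;
  while (std::getline(in, line)) {
    seperators.push_back(line);
  }

  const std::string key = "user4567";  // example YCSB-style key
  // first seperator strictly greater than key; the range index is one before it
  auto it = std::upper_bound(seperators.begin(), seperators.end(), key);
  if (it == seperators.begin() || it == seperators.end()) {
    std::cout << "key falls outside the guard borders" << std::endl;
  } else {
    std::cout << "key range index: " << (it - seperators.begin() - 1) << std::endl;
  }
  return 0;
}
```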