Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions quickwit/quickwit-common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,43 @@ pub fn split_file(split_id: impl Display) -> String {
format!("{split_id}.split")
}

/// Returns the storage path for a split given its ID and key prefix.
///
/// - Empty `prefix`: legacy flat scheme `{split_id}.split`
/// - Non-empty `prefix`: `{prefix}/{split_id}.split` — the `/` separator is inserted here, so the
/// `prefix` itself must NOT contain a trailing `/`
///
/// The prefix is computed once at split creation time and stored in `SplitMetadata`.
pub fn split_storage_path(split_id: &str, prefix: &str) -> String {
if prefix.is_empty() {
return format!("{split_id}.split");
}
format!("{prefix}/{split_id}.split")

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the / as a separator is not great because the AWS UI and S3 CLI is going to show them as a directories.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok changing that for - then?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually I cannot change this without breaking yahoo.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can delete the splits if needed

}

#[cfg(test)]
mod split_path_tests {
use super::*;

const SAMPLE_ULID: &str = "01ARZ3NDEKTSV4RRFFQ69G5FAV";

#[test]
fn test_split_storage_path_empty_prefix() {
assert_eq!(
split_storage_path(SAMPLE_ULID, ""),
"01ARZ3NDEKTSV4RRFFQ69G5FAV.split"
);
}

#[test]
fn test_split_storage_path_with_prefix() {
assert_eq!(
split_storage_path(SAMPLE_ULID, "TS"),
"TS/01ARZ3NDEKTSV4RRFFQ69G5FAV.split"
);
}
}

fn get_from_env_opt_aux<T: Debug>(
key: &str,
parse_fn: impl FnOnce(&str) -> Option<T>,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,12 @@ impl MergeSplitDownloader {
let _protect_guard = ctx.protect_zone();
let tantivy_dir = self
.split_store
.fetch_and_open_split(split.split_id(), download_directory, &io_controls)
.fetch_and_open_split(
split.split_id(),
&split.prefix,
download_directory,
&io_controls,
)
.await
.map_err(|error| {
let split_id = split.split_id();
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-indexing/src/actors/uploader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ impl Handler<PackagedSplitBatch> for Uploader {
report_splits.push(ReportSplit {
storage_uri: split_store.remote_uri().to_string(),
split_id: packaged_split.split_id().to_string(),
prefix: split_metadata.prefix.clone(),
});

split_metadata_list.push(split_metadata);
Expand Down
2 changes: 2 additions & 0 deletions quickwit/quickwit-indexing/src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ mod publisher_message;
mod raw_doc_batch;
mod shard_positions;
mod split_attrs;
mod split_prefix;

pub use indexed_split::{
CommitTrigger, EmptySplit, IndexedSplit, IndexedSplitBatch, IndexedSplitBatchBuilder,
Expand All @@ -48,6 +49,7 @@ pub use raw_doc_batch::RawDocBatch;
pub(crate) use shard_positions::LocalShardPositionsUpdate;
pub use shard_positions::ShardPositionsService;
pub use split_attrs::{SplitAttrs, create_split_metadata};
pub(crate) use split_prefix::compute_split_key_prefix;

#[derive(Debug)]
pub struct NewPublishToken(pub PublishToken);
1 change: 1 addition & 0 deletions quickwit/quickwit-indexing/src/models/split_attrs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ pub fn create_split_metadata(
footer_offsets,
delete_opstamp: split_attrs.delete_opstamp,
num_merge_ops: split_attrs.num_merge_ops,
prefix: super::compute_split_key_prefix(&split_attrs.split_id),
}
}

Expand Down
115 changes: 115 additions & 0 deletions quickwit/quickwit-indexing/src/models/split_prefix.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Copyright 2021-Present Datadog, Inc.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having this file located under actors is surprising because this is not an actor.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok moved it to models.

//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Computes the S3 key prefix used to shard split object keys across S3 partitions and avoid
//! hotspots. See [`compute_split_key_prefix`].

use std::sync::LazyLock;

use quickwit_common::rate_limited_warn;
use tracing::warn;

/// Maximum prefix length: the ULID random portion (positions 10–25) is 16 characters long.
const MAX_SPLIT_KEY_PREFIX_LEN: u8 = 16;

/// Returns the number of characters from the ULID random portion used as an S3 key prefix
/// directory.
///
/// Controlled by the `QW_SPLIT_KEY_PREFIX_LEN` environment variable (default: 0, i.e. the legacy
/// flat scheme). Setting this to `2` creates 32^2 = 1024 buckets, which is sufficient for most
/// workloads. Values above [`MAX_SPLIT_KEY_PREFIX_LEN`] are clamped.
///
/// The value is read from the environment once and cached for the lifetime of the process.
fn split_key_prefix_len() -> u8 {
static SPLIT_KEY_PREFIX_LEN: LazyLock<u8> = LazyLock::new(|| {
let prefix_len: u8 = quickwit_common::get_from_env("QW_SPLIT_KEY_PREFIX_LEN", 0u8, false);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we should harcode the len to a reasonable default and forget about it reducing the amount of boilerplate in the config. We used to used 4 chars at Airbnb: we would hash the original path and use the first four char of the hash.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

split ulid are base32. 5 bits per chars. a prefix of 2 already gives us 1024 buckets too (and then the regular ulid).

the reason why I didn't do more was, on the off change that we need to find all splits within a range it can be done in 1024 requests. If we don't care then ok for 4. Let me know what you think.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was more trying to make a point about hardcoding, not about the length specifically. Any length that works for you works for me.

if prefix_len > MAX_SPLIT_KEY_PREFIX_LEN {
warn!(
prefix_len,
max = MAX_SPLIT_KEY_PREFIX_LEN,
"`QW_SPLIT_KEY_PREFIX_LEN` exceeds maximum; clamping"
);
return MAX_SPLIT_KEY_PREFIX_LEN;
}
prefix_len
});
*SPLIT_KEY_PREFIX_LEN
}

/// Computes the S3 key prefix for a split from its ULID, using the process-wide
/// `QW_SPLIT_KEY_PREFIX_LEN` configuration (see [`split_key_prefix_len`]).
///
/// Extracts the configured number of characters starting at position 10 of the ULID (the first
/// characters of the random portion, after the 10-character timestamp). Returns an empty string
/// when the configured length is 0 (legacy flat scheme).
///
/// The returned prefix does NOT contain the trailing `/` separator; that separator is added by
/// [`quickwit_common::split_storage_path`] when building the full storage path.
///
/// Hidden contract: `split_id` must be a valid 26-character ULID. If it is too short to extract
/// the requested prefix, a rate-limited warning is logged and an empty string is returned, falling
/// back to the legacy flat scheme.
pub(crate) fn compute_split_key_prefix(split_id: &str) -> String {
let prefix_len = split_key_prefix_len();
if prefix_len == 0 {
return String::new();
}
let end = 10 + prefix_len as usize;
if split_id.len() < end {
rate_limited_warn!(
limit_per_min = 1,
split_id = split_id,
prefix_len = prefix_len,
"split ID is too short to extract prefix; falling back to flat storage path"
);
return String::new();
}
split_id[10..end].to_string()
}

#[cfg(test)]
mod tests {
use super::*;

const SAMPLE_ULID: &str = "01ARZ3NDEKTSV4RRFFQ69G5FAV";

// The tests below mutate the `QW_SPLIT_KEY_PREFIX_LEN` environment variable, which feeds a
// process-wide cached `LazyLock`. They rely on `cargo nextest` running each test in its own
// process so the cache (and the environment) does not leak between tests.

#[test]
fn test_split_key_prefix_len_disabled_by_default() {
assert_eq!(split_key_prefix_len(), 0);
}

#[test]
fn test_compute_split_key_prefix_disabled_by_default() {
assert_eq!(compute_split_key_prefix(SAMPLE_ULID), "");
}

#[test]
fn test_compute_split_key_prefix_extracts_random_portion() {
// SAFETY: under nextest each test runs in its own process, so this does not race.
unsafe { std::env::set_var("QW_SPLIT_KEY_PREFIX_LEN", "4") };
// Characters 10–13 of the ULID are the first 4 characters of the random portion.
assert_eq!(compute_split_key_prefix(SAMPLE_ULID), "TSV4");
}

#[test]
fn test_compute_split_key_prefix_too_short_falls_back() {
// SAFETY: under nextest each test runs in its own process, so this does not race.
unsafe { std::env::set_var("QW_SPLIT_KEY_PREFIX_LEN", "4") };
assert_eq!(compute_split_key_prefix("01ARZ"), "");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ impl IndexingSplitStore {
self.inner.remote_storage.uri()
}

fn split_path(&self, split_id: &str) -> PathBuf {
PathBuf::from(quickwit_common::split_file(split_id))
fn split_path(&self, split_id: &str, prefix: &str) -> PathBuf {
PathBuf::from(quickwit_common::split_storage_path(split_id, prefix))
}

/// Stores a split.
Expand All @@ -116,7 +116,7 @@ impl IndexingSplitStore {
let start = Instant::now();
let split_num_bytes = put_payload.len();

let key = self.split_path(split.split_id());
let key = self.split_path(split.split_id(), &split.prefix);
let is_mature = split.is_mature(OffsetDateTime::now_utc());
self.inner
.remote_storage
Expand Down Expand Up @@ -179,10 +179,13 @@ impl IndexingSplitStore {
pub async fn fetch_and_open_split(
&self,
split_id: &str,
prefix: &str,
output_dir_path: &Path,
io_controls: &IoControls,
) -> StorageResult<Box<dyn Directory>> {
let path = PathBuf::from(quickwit_common::split_file(split_id));
// The local filename is always flat (no prefix directory) — the prefix only affects the
// remote S3 key.
let local_filename = quickwit_common::split_file(split_id);
if let Some(split_path) = self
.inner
.split_cache
Expand All @@ -198,13 +201,14 @@ impl IndexingSplitStore {
} else {
tracing::Span::current().record("cache_hit", false);
}
let dest_filepath = output_dir_path.join(&path);
let remote_path = self.split_path(split_id, prefix);
let dest_filepath = output_dir_path.join(&local_filename);
let dest_file = tokio::fs::File::create(&dest_filepath).await?;
let mut dest_file_with_write_limit = io_controls.clone().wrap_write(dest_file);
self.inner
.remote_storage
.copy_to(&path, &mut dest_file_with_write_limit)
.instrument(info_span!("fetch_split_from_remote_storage", path=?path))
.copy_to(&remote_path, &mut dest_file_with_write_limit)
.instrument(info_span!("fetch_split_from_remote_storage", path=?remote_path))
.await?;
get_tantivy_directory_from_split_bundle(&dest_filepath)
}
Expand Down Expand Up @@ -314,7 +318,7 @@ mod tests {
{
let output = tempfile::tempdir()?;
let split1 = split_store
.fetch_and_open_split(&split_id1, output.path(), &io_controls)
.fetch_and_open_split(&split_id1, "", output.path(), &io_controls)
.await?;
let local_store_stats = split_store.inspect_split_cache().await;
assert_eq!(local_store_stats.len(), 1);
Expand All @@ -323,7 +327,7 @@ mod tests {
{
let output = tempfile::tempdir()?;
let split2 = split_store
.fetch_and_open_split(&split_id2, output.path(), &io_controls)
.fetch_and_open_split(&split_id2, "", output.path(), &io_controls)
.await?;
let local_store_stats = split_store.inspect_split_cache().await;
assert_eq!(local_store_stats.len(), 0);
Expand Down Expand Up @@ -410,15 +414,15 @@ mod tests {
// get from remote storage because split_id1 was evicted by split_id2
let output = tempfile::tempdir()?;
let _split1 = split_store
.fetch_and_open_split(&split_id1, output.path(), &io_controls)
.fetch_and_open_split(&split_id1, "", output.path(), &io_controls)
.await?;
assert_eq!(io_controls.num_bytes(), split_payload1.len());
}
{
// get from cache
let output = tempfile::tempdir()?;
let _split2 = split_store
.fetch_and_open_split(&split_id2, output.path(), &io_controls)
.fetch_and_open_split(&split_id2, "", output.path(), &io_controls)
.await?;
// the number of downloaded by didn't change (still the size of split_payload1)
assert_eq!(io_controls.num_bytes(), split_payload1.len());
Expand All @@ -427,7 +431,7 @@ mod tests {
// get from remote because getting from cache removes the split from the cache
let output = tempfile::tempdir()?;
let _split2 = split_store
.fetch_and_open_split(&split_id2, output.path(), &io_controls)
.fetch_and_open_split(&split_id2, "", output.path(), &io_controls)
.await?;
assert_eq!(
io_controls.num_bytes(),
Expand Down
15 changes: 14 additions & 1 deletion quickwit/quickwit-metastore/src/split_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,17 @@ pub struct SplitMetadata {
/// Doc mapping UID used when creating this split. This split may only be merged with other
/// splits using the same doc mapping UID.
pub doc_mapping_uid: DocMappingUid,

/// S3 key prefix directory used when this split was stored (e.g., `"ND"`).
///
/// An empty string means the legacy flat scheme: `{split_id}.split` directly under the index
/// URI. A non-empty prefix (with no trailing `/`) means the split is stored at
/// `{prefix}/{split_id}.split`.
///
/// This value is set once at split creation time from the `QW_SPLIT_KEY_PREFIX_LEN`
/// environment variable and never changed afterwards.
#[serde(default)]
pub prefix: String,
}

impl fmt::Debug for SplitMetadata {
Expand Down Expand Up @@ -229,7 +240,7 @@ impl SplitMetadata {

/// Converts the split metadata into a [`SplitInfo`].
pub fn as_split_info(&self) -> SplitInfo {
let file_name = quickwit_common::split_file(self.split_id());
let file_name = quickwit_common::split_storage_path(self.split_id(), &self.prefix);

SplitInfo {
uncompressed_docs_size_bytes: ByteSize(self.uncompressed_docs_size_in_bytes),
Expand Down Expand Up @@ -281,6 +292,7 @@ impl quickwit_config::TestableForRegression for SplitMetadata {
footer_offsets: 1000..2000,
num_merge_ops: 3,
doc_mapping_uid: DocMappingUid::default(),
prefix: String::new(),
}
}

Expand Down Expand Up @@ -421,6 +433,7 @@ mod tests {
delete_opstamp: 0,
num_merge_ops: 0,
doc_mapping_uid: DocMappingUid::default(),
prefix: String::new(),
};

let expected_output = "SplitMetadata { split_id: \"split-1\", index_uid: IndexUid { \
Expand Down
6 changes: 6 additions & 0 deletions quickwit/quickwit-metastore/src/split_metadata_version.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ pub(crate) struct SplitMetadataV0_8 {
// splits before when updates first appeared are compatible with each other.
#[serde(default)]
doc_mapping_uid: DocMappingUid,

#[serde(default)]
#[serde(skip_serializing_if = "String::is_empty")]
pub prefix: String,
}

impl From<SplitMetadataV0_8> for SplitMetadata {
Expand Down Expand Up @@ -128,6 +132,7 @@ impl From<SplitMetadataV0_8> for SplitMetadata {
footer_offsets: v8.footer_offsets,
num_merge_ops: v8.num_merge_ops,
doc_mapping_uid: v8.doc_mapping_uid,
prefix: v8.prefix,
}
}
}
Expand All @@ -150,6 +155,7 @@ impl From<SplitMetadata> for SplitMetadataV0_8 {
footer_offsets: split.footer_offsets,
num_merge_ops: split.num_merge_ops,
doc_mapping_uid: split.doc_mapping_uid,
prefix: split.prefix,
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions quickwit/quickwit-proto/protos/quickwit/search.proto
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ message ReportSplit {
string split_id = 2;
// The storage uri. This URI does NOT include the split id.
string storage_uri = 1;
// S3 key prefix directory with no trailing "/" (e.g. "ND"). Empty string means the legacy flat scheme.
string prefix = 3;
}

message ReportSplitsRequest {
Expand Down Expand Up @@ -517,6 +519,8 @@ message SplitIdAndFooterOffsets {
optional int64 timestamp_end = 5;
// The number of docs in the split
uint64 num_docs = 6;
// S3 key prefix directory with no trailing "/" (e.g. "ND"). Empty string means the legacy flat scheme.
string prefix = 7;
}

// Hits returned by a FetchDocRequest.
Expand Down
Loading
Loading