quickwit-oss · fulmicoton-dd · Jun 19, 2026 · Jun 22, 2026 · guilload · Jun 22, 2026
@@ -112,6 +112,43 @@ pub fn split_file(split_id: impl Display) -> String {
     format!("{split_id}.split")
 }
 
+/// Returns the storage path for a split given its ID and key prefix.
+///
+/// - Empty `prefix`: legacy flat scheme `{split_id}.split`
+/// - Non-empty `prefix`: `{prefix}/{split_id}.split` — the `/` separator is inserted here, so the
+///   `prefix` itself must NOT contain a trailing `/`
+///
+/// The prefix is computed once at split creation time and stored in `SplitMetadata`.
+pub fn split_storage_path(split_id: &str, prefix: &str) -> String {
+    if prefix.is_empty() {
+        return format!("{split_id}.split");
+    }
+    format!("{prefix}/{split_id}.split")
+}
+
+#[cfg(test)]
+mod split_path_tests {
+    use super::*;
+
+    const SAMPLE_ULID: &str = "01ARZ3NDEKTSV4RRFFQ69G5FAV";
+
+    #[test]
+    fn test_split_storage_path_empty_prefix() {
+        assert_eq!(
+            split_storage_path(SAMPLE_ULID, ""),
+            "01ARZ3NDEKTSV4RRFFQ69G5FAV.split"
+        );
+    }
+
+    #[test]
+    fn test_split_storage_path_with_prefix() {
+        assert_eq!(
+            split_storage_path(SAMPLE_ULID, "TS"),
+            "TS/01ARZ3NDEKTSV4RRFFQ69G5FAV.split"
+        );
+    }
+}
+
 fn get_from_env_opt_aux<T: Debug>(
     key: &str,
     parse_fn: impl FnOnce(&str) -> Option<T>,

@@ -114,7 +114,12 @@ impl MergeSplitDownloader {
             let _protect_guard = ctx.protect_zone();
             let tantivy_dir = self
                 .split_store
-                .fetch_and_open_split(split.split_id(), download_directory, &io_controls)
+                .fetch_and_open_split(
+                    split.split_id(),
+                    &split.prefix,
+                    download_directory,
+                    &io_controls,
+                )
                 .await
                 .map_err(|error| {
                     let split_id = split.split_id();

@@ -338,6 +338,7 @@ impl Handler<PackagedSplitBatch> for Uploader {
                     report_splits.push(ReportSplit {
                         storage_uri: split_store.remote_uri().to_string(),
                         split_id: packaged_split.split_id().to_string(),
+                        prefix: split_metadata.prefix.clone(),
                     });
 
                     split_metadata_list.push(split_metadata);

@@ -27,6 +27,7 @@ mod publisher_message;
 mod raw_doc_batch;
 mod shard_positions;
 mod split_attrs;
+mod split_prefix;
 
 pub use indexed_split::{
     CommitTrigger, EmptySplit, IndexedSplit, IndexedSplitBatch, IndexedSplitBatchBuilder,
@@ -48,6 +49,7 @@ pub use raw_doc_batch::RawDocBatch;
 pub(crate) use shard_positions::LocalShardPositionsUpdate;
 pub use shard_positions::ShardPositionsService;
 pub use split_attrs::{SplitAttrs, create_split_metadata};
+pub(crate) use split_prefix::compute_split_key_prefix;
 
 #[derive(Debug)]
 pub struct NewPublishToken(pub PublishToken);
@@ -125,6 +125,7 @@ pub fn create_split_metadata(
         footer_offsets,
         delete_opstamp: split_attrs.delete_opstamp,
         num_merge_ops: split_attrs.num_merge_ops,
+        prefix: super::compute_split_key_prefix(&split_attrs.split_id),
     }
 }
 

@@ -0,0 +1,115 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Computes the S3 key prefix used to shard split object keys across S3 partitions and avoid
+//! hotspots. See [`compute_split_key_prefix`].
+
+use std::sync::LazyLock;
+
+use quickwit_common::rate_limited_warn;
+use tracing::warn;
+
+/// Maximum prefix length: the ULID random portion (positions 10–25) is 16 characters long.
+const MAX_SPLIT_KEY_PREFIX_LEN: u8 = 16;
+
+/// Returns the number of characters from the ULID random portion used as an S3 key prefix
+/// directory.
+///
+/// Controlled by the `QW_SPLIT_KEY_PREFIX_LEN` environment variable (default: 0, i.e. the legacy
+/// flat scheme). Setting this to `2` creates 32^2 = 1024 buckets, which is sufficient for most
+/// workloads. Values above [`MAX_SPLIT_KEY_PREFIX_LEN`] are clamped.
+///
+/// The value is read from the environment once and cached for the lifetime of the process.
+fn split_key_prefix_len() -> u8 {
+    static SPLIT_KEY_PREFIX_LEN: LazyLock<u8> = LazyLock::new(|| {
+        let prefix_len: u8 = quickwit_common::get_from_env("QW_SPLIT_KEY_PREFIX_LEN", 0u8, false);
+        if prefix_len > MAX_SPLIT_KEY_PREFIX_LEN {
+            warn!(
+                prefix_len,
+                max = MAX_SPLIT_KEY_PREFIX_LEN,
+                "`QW_SPLIT_KEY_PREFIX_LEN` exceeds maximum; clamping"
+            );
+            return MAX_SPLIT_KEY_PREFIX_LEN;
+        }
+        prefix_len
+    });
+    *SPLIT_KEY_PREFIX_LEN
+}
+
+/// Computes the S3 key prefix for a split from its ULID, using the process-wide
+/// `QW_SPLIT_KEY_PREFIX_LEN` configuration (see [`split_key_prefix_len`]).
+///
+/// Extracts the configured number of characters starting at position 10 of the ULID (the first
+/// characters of the random portion, after the 10-character timestamp). Returns an empty string
+/// when the configured length is 0 (legacy flat scheme).
+///
+/// The returned prefix does NOT contain the trailing `/` separator; that separator is added by
+/// [`quickwit_common::split_storage_path`] when building the full storage path.
+///
+/// Hidden contract: `split_id` must be a valid 26-character ULID. If it is too short to extract
+/// the requested prefix, a rate-limited warning is logged and an empty string is returned, falling
+/// back to the legacy flat scheme.
+pub(crate) fn compute_split_key_prefix(split_id: &str) -> String {
+    let prefix_len = split_key_prefix_len();
+    if prefix_len == 0 {
+        return String::new();
+    }
+    let end = 10 + prefix_len as usize;
+    if split_id.len() < end {
+        rate_limited_warn!(
+            limit_per_min = 1,
+            split_id = split_id,
+            prefix_len = prefix_len,
+            "split ID is too short to extract prefix; falling back to flat storage path"
+        );
+        return String::new();
+    }
+    split_id[10..end].to_string()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const SAMPLE_ULID: &str = "01ARZ3NDEKTSV4RRFFQ69G5FAV";
+
+    // The tests below mutate the `QW_SPLIT_KEY_PREFIX_LEN` environment variable, which feeds a
+    // process-wide cached `LazyLock`. They rely on `cargo nextest` running each test in its own
+    // process so the cache (and the environment) does not leak between tests.
+
+    #[test]
+    fn test_split_key_prefix_len_disabled_by_default() {
+        assert_eq!(split_key_prefix_len(), 0);
+    }
+
+    #[test]
+    fn test_compute_split_key_prefix_disabled_by_default() {
+        assert_eq!(compute_split_key_prefix(SAMPLE_ULID), "");
+    }
+
+    #[test]
+    fn test_compute_split_key_prefix_extracts_random_portion() {
+        // SAFETY: under nextest each test runs in its own process, so this does not race.
+        unsafe { std::env::set_var("QW_SPLIT_KEY_PREFIX_LEN", "4") };
+        // Characters 10–13 of the ULID are the first 4 characters of the random portion.
+        assert_eq!(compute_split_key_prefix(SAMPLE_ULID), "TSV4");
+    }
+
+    #[test]
+    fn test_compute_split_key_prefix_too_short_falls_back() {
+        // SAFETY: under nextest each test runs in its own process, so this does not race.
+        unsafe { std::env::set_var("QW_SPLIT_KEY_PREFIX_LEN", "4") };
+        assert_eq!(compute_split_key_prefix("01ARZ"), "");
+    }
+}
@@ -93,8 +93,8 @@ impl IndexingSplitStore {
         self.inner.remote_storage.uri()
     }
 
-    fn split_path(&self, split_id: &str) -> PathBuf {
-        PathBuf::from(quickwit_common::split_file(split_id))
+    fn split_path(&self, split_id: &str, prefix: &str) -> PathBuf {
+        PathBuf::from(quickwit_common::split_storage_path(split_id, prefix))
     }
 
     /// Stores a split.
@@ -116,7 +116,7 @@ impl IndexingSplitStore {
         let start = Instant::now();
         let split_num_bytes = put_payload.len();
 
-        let key = self.split_path(split.split_id());
+        let key = self.split_path(split.split_id(), &split.prefix);
         let is_mature = split.is_mature(OffsetDateTime::now_utc());
         self.inner
             .remote_storage
@@ -179,10 +179,13 @@ impl IndexingSplitStore {
     pub async fn fetch_and_open_split(
         &self,
         split_id: &str,
+        prefix: &str,
         output_dir_path: &Path,
         io_controls: &IoControls,
     ) -> StorageResult<Box<dyn Directory>> {
-        let path = PathBuf::from(quickwit_common::split_file(split_id));
+        // The local filename is always flat (no prefix directory) — the prefix only affects the
+        // remote S3 key.
+        let local_filename = quickwit_common::split_file(split_id);
         if let Some(split_path) = self
             .inner
             .split_cache
@@ -198,13 +201,14 @@ impl IndexingSplitStore {
         } else {
             tracing::Span::current().record("cache_hit", false);
         }
-        let dest_filepath = output_dir_path.join(&path);
+        let remote_path = self.split_path(split_id, prefix);
+        let dest_filepath = output_dir_path.join(&local_filename);
         let dest_file = tokio::fs::File::create(&dest_filepath).await?;
         let mut dest_file_with_write_limit = io_controls.clone().wrap_write(dest_file);
         self.inner
             .remote_storage
-            .copy_to(&path, &mut dest_file_with_write_limit)
-            .instrument(info_span!("fetch_split_from_remote_storage", path=?path))
+            .copy_to(&remote_path, &mut dest_file_with_write_limit)
+            .instrument(info_span!("fetch_split_from_remote_storage", path=?remote_path))
             .await?;
         get_tantivy_directory_from_split_bundle(&dest_filepath)
     }
@@ -314,7 +318,7 @@ mod tests {
         {
             let output = tempfile::tempdir()?;
             let split1 = split_store
-                .fetch_and_open_split(&split_id1, output.path(), &io_controls)
+                .fetch_and_open_split(&split_id1, "", output.path(), &io_controls)
                 .await?;
             let local_store_stats = split_store.inspect_split_cache().await;
             assert_eq!(local_store_stats.len(), 1);
@@ -323,7 +327,7 @@ mod tests {
         {
             let output = tempfile::tempdir()?;
             let split2 = split_store
-                .fetch_and_open_split(&split_id2, output.path(), &io_controls)
+                .fetch_and_open_split(&split_id2, "", output.path(), &io_controls)
                 .await?;
             let local_store_stats = split_store.inspect_split_cache().await;
             assert_eq!(local_store_stats.len(), 0);
@@ -410,15 +414,15 @@ mod tests {
             // get from remote storage because split_id1 was evicted by split_id2
             let output = tempfile::tempdir()?;
             let _split1 = split_store
-                .fetch_and_open_split(&split_id1, output.path(), &io_controls)
+                .fetch_and_open_split(&split_id1, "", output.path(), &io_controls)
                 .await?;
             assert_eq!(io_controls.num_bytes(), split_payload1.len());
         }
         {
             // get from cache
             let output = tempfile::tempdir()?;
             let _split2 = split_store
-                .fetch_and_open_split(&split_id2, output.path(), &io_controls)
+                .fetch_and_open_split(&split_id2, "", output.path(), &io_controls)
                 .await?;
             // the number of downloaded by didn't change (still the size of split_payload1)
             assert_eq!(io_controls.num_bytes(), split_payload1.len());
@@ -427,7 +431,7 @@ mod tests {
             // get from remote because getting from cache removes the split from the cache
             let output = tempfile::tempdir()?;
             let _split2 = split_store
-                .fetch_and_open_split(&split_id2, output.path(), &io_controls)
+                .fetch_and_open_split(&split_id2, "", output.path(), &io_controls)
                 .await?;
             assert_eq!(
                 io_controls.num_bytes(),

@@ -132,6 +132,17 @@ pub struct SplitMetadata {
     /// Doc mapping UID used when creating this split. This split may only be merged with other
     /// splits using the same doc mapping UID.
     pub doc_mapping_uid: DocMappingUid,
+
+    /// S3 key prefix directory used when this split was stored (e.g., `"ND"`).
+    ///
+    /// An empty string means the legacy flat scheme: `{split_id}.split` directly under the index
+    /// URI. A non-empty prefix (with no trailing `/`) means the split is stored at
+    /// `{prefix}/{split_id}.split`.
+    ///
+    /// This value is set once at split creation time from the `QW_SPLIT_KEY_PREFIX_LEN`
+    /// environment variable and never changed afterwards.
+    #[serde(default)]
+    pub prefix: String,
 }
 
 impl fmt::Debug for SplitMetadata {
@@ -229,7 +240,7 @@ impl SplitMetadata {
 
     /// Converts the split metadata into a [`SplitInfo`].
     pub fn as_split_info(&self) -> SplitInfo {
-        let file_name = quickwit_common::split_file(self.split_id());
+        let file_name = quickwit_common::split_storage_path(self.split_id(), &self.prefix);
 
         SplitInfo {
             uncompressed_docs_size_bytes: ByteSize(self.uncompressed_docs_size_in_bytes),
@@ -281,6 +292,7 @@ impl quickwit_config::TestableForRegression for SplitMetadata {
             footer_offsets: 1000..2000,
             num_merge_ops: 3,
             doc_mapping_uid: DocMappingUid::default(),
+            prefix: String::new(),
         }
     }
 
@@ -421,6 +433,7 @@ mod tests {
             delete_opstamp: 0,
             num_merge_ops: 0,
             doc_mapping_uid: DocMappingUid::default(),
+            prefix: String::new(),
         };
 
         let expected_output = "SplitMetadata { split_id: \"split-1\", index_uid: IndexUid { \

@@ -92,6 +92,10 @@ pub(crate) struct SplitMetadataV0_8 {
     // splits before when updates first appeared are compatible with each other.
     #[serde(default)]
     doc_mapping_uid: DocMappingUid,
+
+    #[serde(default)]
+    #[serde(skip_serializing_if = "String::is_empty")]
+    pub prefix: String,
 }
 
 impl From<SplitMetadataV0_8> for SplitMetadata {
@@ -128,6 +132,7 @@ impl From<SplitMetadataV0_8> for SplitMetadata {
             footer_offsets: v8.footer_offsets,
             num_merge_ops: v8.num_merge_ops,
             doc_mapping_uid: v8.doc_mapping_uid,
+            prefix: v8.prefix,
         }
     }
 }
@@ -150,6 +155,7 @@ impl From<SplitMetadata> for SplitMetadataV0_8 {
             footer_offsets: split.footer_offsets,
             num_merge_ops: split.num_merge_ops,
             doc_mapping_uid: split.doc_mapping_uid,
+            prefix: split.prefix,
         }
     }
 }

@@ -105,6 +105,8 @@ message ReportSplit {
   string split_id = 2;
   // The storage uri. This URI does NOT include the split id.
   string storage_uri = 1;
+  // S3 key prefix directory with no trailing "/" (e.g. "ND"). Empty string means the legacy flat scheme.
+  string prefix = 3;
 }
 
 message ReportSplitsRequest {
@@ -517,6 +519,8 @@ message SplitIdAndFooterOffsets {
   optional int64 timestamp_end = 5;
   // The number of docs in the split
   uint64 num_docs = 6;
+  // S3 key prefix directory with no trailing "/" (e.g. "ND"). Empty string means the legacy flat scheme.
+  string prefix = 7;
 }
 
 // Hits returned by a FetchDocRequest.