Skip to content

Commit 5a709f1

Browse files
authored
Validate doc mapping update (#5988)
1 parent 4f038e3 commit 5a709f1

File tree

8 files changed

+266
-88
lines changed

8 files changed

+266
-88
lines changed

quickwit/quickwit-config/src/index_config/mod.rs

Lines changed: 150 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
pub(crate) mod serialize;
1616

17+
use std::collections::HashSet;
1718
use std::hash::{Hash, Hasher};
1819
use std::num::NonZeroUsize;
1920
use std::str::FromStr;
@@ -565,11 +566,67 @@ pub(super) fn validate_index_config(
565566
Ok(())
566567
}
567568

569+
/// Returns the updated doc mapping and a boolean indicating whether a mutation occurred.
570+
///
571+
/// The logic goes as follows:
572+
/// 1. If the new doc mapping is the same as the current doc mapping, ignoring their UIDs, returns
573+
/// the current doc mapping and `false`, indicating that no mutation occurred.
574+
/// 2. If the new doc mapping is different from the current doc mapping, verifies the following
575+
/// constraints before returning the new doc mapping and `true`, indicating that a mutation
576+
/// occurred:
577+
/// - The doc mapping UID should differ from the current one
578+
/// - The timestamp field should remain the same
579+
/// - The tokenizers should be a superset of the current tokenizers
580+
/// - A doc mapper can be built from the new doc mapping
581+
pub fn prepare_doc_mapping_update(
582+
mut new_doc_mapping: DocMapping,
583+
current_doc_mapping: &DocMapping,
584+
search_settings: &SearchSettings,
585+
) -> anyhow::Result<(DocMapping, bool)> {
586+
// Save the new doc mapping UID in a temporary variable and override it with the current doc
587+
// mapping UID to compare the two doc mappings, ignoring their UIDs.
588+
let new_doc_mapping_uid = new_doc_mapping.doc_mapping_uid;
589+
new_doc_mapping.doc_mapping_uid = current_doc_mapping.doc_mapping_uid;
590+
591+
if new_doc_mapping == *current_doc_mapping {
592+
return Ok((new_doc_mapping, false));
593+
}
594+
// Restore the new doc mapping UID.
595+
new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid;
596+
597+
ensure!(
598+
new_doc_mapping.doc_mapping_uid != current_doc_mapping.doc_mapping_uid,
599+
"new doc mapping UID should differ from the current one, current UID `{}`, new UID `{}`",
600+
current_doc_mapping.doc_mapping_uid,
601+
new_doc_mapping.doc_mapping_uid,
602+
);
603+
let new_timestamp_field = new_doc_mapping.timestamp_field.as_deref();
604+
let current_timestamp_field = current_doc_mapping.timestamp_field.as_deref();
605+
ensure!(
606+
new_timestamp_field == current_timestamp_field,
607+
"updating timestamp field is not allowed, current timestamp field `{}`, new timestamp \
608+
field `{}`",
609+
current_timestamp_field.unwrap_or("none"),
610+
new_timestamp_field.unwrap_or("none"),
611+
);
612+
// TODO: Unsure this constraint is required, should we relax it?
613+
let new_tokenizers: HashSet<_> = new_doc_mapping.tokenizers.iter().collect();
614+
let current_tokenizers: HashSet<_> = current_doc_mapping.tokenizers.iter().collect();
615+
ensure!(
616+
new_tokenizers.is_superset(&current_tokenizers),
617+
"updating tokenizers is allowed only if adding new tokenizers, current tokenizers \
618+
`{current_tokenizers:?}`, new tokenizers `{new_tokenizers:?}`",
619+
);
620+
build_doc_mapper(&new_doc_mapping, search_settings).context("invalid doc mapping")?;
621+
Ok((new_doc_mapping, true))
622+
}
623+
568624
#[cfg(test)]
569625
mod tests {
570626

571627
use cron::TimeUnitSpec;
572-
use quickwit_doc_mapper::ModeType;
628+
use quickwit_doc_mapper::{Mode, ModeType, TokenizerEntry};
629+
use quickwit_proto::types::DocMappingUid;
573630

574631
use super::*;
575632
use crate::ConfigFormat;
@@ -981,4 +1038,96 @@ mod tests {
9811038
let error = serde_yaml::from_str::<IngestSettings>(settings_yaml).unwrap_err();
9821039
assert!(error.to_string().contains("expected a nonzero"));
9831040
}
1041+
1042+
#[test]
1043+
fn test_prepare_doc_mapping_update() {
1044+
let current_index_config = IndexConfig::for_test("test-index", "s3://test-index");
1045+
let mut current_doc_mapping = current_index_config.doc_mapping;
1046+
let search_settings = current_index_config.search_settings;
1047+
1048+
let tokenizer_json = r#"
1049+
{
1050+
"name": "breton-tokenizer",
1051+
"type": "regex",
1052+
"pattern": "crêpes*"
1053+
}
1054+
"#;
1055+
let tokenizer: TokenizerEntry = serde_json::from_str(tokenizer_json).unwrap();
1056+
1057+
current_doc_mapping.tokenizers.push(tokenizer.clone());
1058+
1059+
// The new doc mapping should have a different doc mapping UID.
1060+
let mut new_doc_mapping = current_doc_mapping.clone();
1061+
new_doc_mapping.store_source = false; // This is set to `true` for the current doc mapping.
1062+
let error =
1063+
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
1064+
.unwrap_err()
1065+
.to_string();
1066+
assert!(error.contains("doc mapping UID should differ"));
1067+
1068+
// The new doc mapping should not change the timestamp field.
1069+
let mut new_doc_mapping = current_doc_mapping.clone();
1070+
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
1071+
new_doc_mapping.timestamp_field = Some("ts".to_string()); // This is set to `timestamp` for the current doc mapping.
1072+
let error =
1073+
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
1074+
.unwrap_err()
1075+
.to_string();
1076+
assert!(error.contains("timestamp field"));
1077+
1078+
// The new doc mapping should not remove the timestamp field.
1079+
let mut new_doc_mapping = current_doc_mapping.clone();
1080+
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
1081+
new_doc_mapping.timestamp_field = None;
1082+
let error =
1083+
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
1084+
.unwrap_err()
1085+
.to_string();
1086+
assert!(error.contains("timestamp field"));
1087+
1088+
// The new doc mapping should not remove tokenizers.
1089+
let mut new_doc_mapping = current_doc_mapping.clone();
1090+
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
1091+
new_doc_mapping.tokenizers.clear();
1092+
let error =
1093+
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
1094+
.unwrap_err()
1095+
.to_string();
1096+
assert!(error.contains("tokenizers"));
1097+
1098+
// The new doc mapping should be "buildable" into a doc mapper.
1099+
let mut new_doc_mapping = current_doc_mapping.clone();
1100+
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
1101+
new_doc_mapping.tokenizers.push(tokenizer);
1102+
let error =
1103+
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
1104+
.unwrap_err()
1105+
.source()
1106+
.unwrap()
1107+
.to_string();
1108+
assert!(error.contains("duplicated custom tokenizer"));
1109+
1110+
let mut new_doc_mapping = current_doc_mapping.clone();
1111+
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
1112+
let (updated_doc_mapping, mutation_occurred) =
1113+
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
1114+
.unwrap();
1115+
assert!(!mutation_occurred);
1116+
assert_eq!(
1117+
updated_doc_mapping.doc_mapping_uid,
1118+
current_doc_mapping.doc_mapping_uid
1119+
);
1120+
assert_eq!(updated_doc_mapping, current_doc_mapping);
1121+
1122+
let mut new_doc_mapping = current_doc_mapping.clone();
1123+
let new_doc_mapping_uid = DocMappingUid::random();
1124+
new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid;
1125+
new_doc_mapping.mode = Mode::Strict;
1126+
let (updated_doc_mapping, mutation_occurred) =
1127+
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
1128+
.unwrap();
1129+
assert!(mutation_occurred);
1130+
assert_eq!(updated_doc_mapping.doc_mapping_uid, new_doc_mapping_uid);
1131+
assert_eq!(updated_doc_mapping.mode, Mode::Strict);
1132+
}
9841133
}

quickwit/quickwit-config/src/index_config/serialize.rs

Lines changed: 7 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,16 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
use std::collections::HashSet;
16-
1715
use anyhow::{Context, ensure};
1816
use quickwit_common::uri::Uri;
19-
use quickwit_doc_mapper::DocMapperBuilder;
2017
use quickwit_proto::types::{DocMappingUid, IndexId};
2118
use serde::{Deserialize, Serialize};
2219
use tracing::info;
2320

2421
use super::{IngestSettings, validate_index_config};
2522
use crate::{
2623
ConfigFormat, DocMapping, IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings,
27-
validate_identifier,
24+
prepare_doc_mapping_update, validate_identifier,
2825
};
2926

3027
/// Alias for the latest serialization format.
@@ -92,63 +89,12 @@ pub fn load_index_config_update(
9289
current_index_config.index_uri,
9390
new_index_config.index_uri
9491
);
95-
96-
// verify the new mapping is coherent
97-
let doc_mapper_builder = DocMapperBuilder {
98-
doc_mapping: new_index_config.doc_mapping.clone(),
99-
default_search_fields: new_index_config
100-
.search_settings
101-
.default_search_fields
102-
.clone(),
103-
legacy_type_tag: None,
104-
};
105-
doc_mapper_builder
106-
.try_build()
107-
.context("invalid mapping update")?;
108-
109-
{
110-
let new_mapping_uid = new_index_config.doc_mapping.doc_mapping_uid;
111-
// we verify whether they are equal ignoring the mapping uid as it is generated at random:
112-
// we don't want to record a mapping change when nothing really happened.
113-
new_index_config.doc_mapping.doc_mapping_uid =
114-
current_index_config.doc_mapping.doc_mapping_uid;
115-
if new_index_config.doc_mapping != current_index_config.doc_mapping {
116-
new_index_config.doc_mapping.doc_mapping_uid = new_mapping_uid;
117-
ensure!(
118-
current_index_config.doc_mapping.doc_mapping_uid
119-
!= new_index_config.doc_mapping.doc_mapping_uid,
120-
"`doc_mapping_doc_mapping_uid` must change when the doc mapping is updated",
121-
);
122-
ensure!(
123-
current_index_config.doc_mapping.timestamp_field
124-
== new_index_config.doc_mapping.timestamp_field,
125-
"`doc_mapping.timestamp_field` cannot be updated, current value {}, new expected \
126-
value {}",
127-
current_index_config
128-
.doc_mapping
129-
.timestamp_field
130-
.as_deref()
131-
.unwrap_or("<none>"),
132-
new_index_config
133-
.doc_mapping
134-
.timestamp_field
135-
.as_deref()
136-
.unwrap_or("<none>"),
137-
);
138-
// TODO: i'm not sure this is necessary, we can relax this requirement once we know
139-
// for sure
140-
let current_tokenizers: HashSet<_> =
141-
current_index_config.doc_mapping.tokenizers.iter().collect();
142-
let new_tokenizers: HashSet<_> =
143-
new_index_config.doc_mapping.tokenizers.iter().collect();
144-
ensure!(
145-
new_tokenizers.is_superset(&current_tokenizers),
146-
"`.doc_mapping.tokenizers` must be a superset of previously available tokenizers"
147-
);
148-
} else {
149-
// the docmapping is unchanged, keep the old uid
150-
}
151-
}
92+
let (updated_doc_mapping, _mutation_occurred) = prepare_doc_mapping_update(
93+
new_index_config.doc_mapping,
94+
&current_index_config.doc_mapping,
95+
&new_index_config.search_settings,
96+
)?;
97+
new_index_config.doc_mapping = updated_doc_mapping;
15298

15399
Ok(new_index_config)
154100
}

quickwit/quickwit-config/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ use index_config::serialize::{IndexConfigV0_8, VersionedIndexConfig};
4747
pub use index_config::{
4848
IndexConfig, IndexingResources, IndexingSettings, IngestSettings, RetentionPolicy,
4949
SearchSettings, build_doc_mapper, load_index_config_from_user_config, load_index_config_update,
50+
prepare_doc_mapping_update,
5051
};
5152
pub use quickwit_doc_mapper::DocMapping;
5253
use serde::Serialize;

quickwit/quickwit-doc-mapper/src/doc_mapping.rs

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -166,20 +166,6 @@ impl DocMapping {
166166
pub fn default_max_num_partitions() -> NonZeroU32 {
167167
NonZeroU32::new(200).unwrap()
168168
}
169-
170-
/// Returns whether the `other` doc mapping is equal to `self` leaving their respective doc
171-
/// mapping UIDs out of the comparison.
172-
pub fn eq_ignore_doc_mapping_uid(&self, other: &Self) -> bool {
173-
let doc_mapping_uid = DocMappingUid::default();
174-
175-
let mut left = self.clone();
176-
left.doc_mapping_uid = doc_mapping_uid;
177-
178-
let mut right = other.clone();
179-
right.doc_mapping_uid = doc_mapping_uid;
180-
181-
left == right
182-
}
183169
}
184170

185171
#[cfg(test)]

quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ impl FileBackedIndex {
221221
ingest_settings: IngestSettings,
222222
search_settings: SearchSettings,
223223
retention_policy_opt: Option<RetentionPolicy>,
224-
) -> bool {
224+
) -> MetastoreResult<bool> {
225225
self.metadata.update_index_config(
226226
doc_mapping,
227227
indexing_settings,

quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,7 @@ impl MetastoreService for FileBackedMetastore {
575575
ingest_settings,
576576
search_settings,
577577
retention_policy_opt,
578-
);
578+
)?;
579579
let index_metadata = index.metadata().clone();
580580

581581
if mutation_occurred {

0 commit comments

Comments
 (0)