|
14 | 14 |
|
15 | 15 | pub(crate) mod serialize; |
16 | 16 |
|
| 17 | +use std::collections::HashSet; |
17 | 18 | use std::hash::{Hash, Hasher}; |
18 | 19 | use std::num::NonZeroUsize; |
19 | 20 | use std::str::FromStr; |
@@ -565,11 +566,67 @@ pub(super) fn validate_index_config( |
565 | 566 | Ok(()) |
566 | 567 | } |
567 | 568 |
|
| 569 | +/// Returns the updated doc mapping and a boolean indicating whether a mutation occurred. |
| 570 | +/// |
| 571 | +/// The logic goes as follows: |
| 572 | +/// 1. If the new doc mapping is the same as the current doc mapping, ignoring their UIDs, returns |
| 573 | +/// the current doc mapping and `false`, indicating that no mutation occurred. |
| 574 | +/// 2. If the new doc mapping is different from the current doc mapping, verifies the following |
| 575 | +/// constraints before returning the new doc mapping and `true`, indicating that a mutation |
| 576 | +/// occurred: |
| 577 | +/// - The doc mapping UID should differ from the current one |
| 578 | +/// - The timestamp field should remain the same |
| 579 | +/// - The tokenizers should be a superset of the current tokenizers |
| 580 | +/// - A doc mapper can be built from the new doc mapping |
| 581 | +pub fn prepare_doc_mapping_update( |
| 582 | + mut new_doc_mapping: DocMapping, |
| 583 | + current_doc_mapping: &DocMapping, |
| 584 | + search_settings: &SearchSettings, |
| 585 | +) -> anyhow::Result<(DocMapping, bool)> { |
| 586 | + // Save the new doc mapping UID in a temporary variable and override it with the current doc |
| 587 | + // mapping UID to compare the two doc mappings, ignoring their UIDs. |
| 588 | + let new_doc_mapping_uid = new_doc_mapping.doc_mapping_uid; |
| 589 | + new_doc_mapping.doc_mapping_uid = current_doc_mapping.doc_mapping_uid; |
| 590 | + |
| 591 | + if new_doc_mapping == *current_doc_mapping { |
| 592 | + return Ok((new_doc_mapping, false)); |
| 593 | + } |
| 594 | + // Restore the new doc mapping UID. |
| 595 | + new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid; |
| 596 | + |
| 597 | + ensure!( |
| 598 | + new_doc_mapping.doc_mapping_uid != current_doc_mapping.doc_mapping_uid, |
| 599 | + "new doc mapping UID should differ from the current one, current UID `{}`, new UID `{}`", |
| 600 | + current_doc_mapping.doc_mapping_uid, |
| 601 | + new_doc_mapping.doc_mapping_uid, |
| 602 | + ); |
| 603 | + let new_timestamp_field = new_doc_mapping.timestamp_field.as_deref(); |
| 604 | + let current_timestamp_field = current_doc_mapping.timestamp_field.as_deref(); |
| 605 | + ensure!( |
| 606 | + new_timestamp_field == current_timestamp_field, |
| 607 | + "updating timestamp field is not allowed, current timestamp field `{}`, new timestamp \ |
| 608 | + field `{}`", |
| 609 | + current_timestamp_field.unwrap_or("none"), |
| 610 | + new_timestamp_field.unwrap_or("none"), |
| 611 | + ); |
| 612 | + // TODO: Unsure this constraint is required, should we relax it? |
| 613 | + let new_tokenizers: HashSet<_> = new_doc_mapping.tokenizers.iter().collect(); |
| 614 | + let current_tokenizers: HashSet<_> = current_doc_mapping.tokenizers.iter().collect(); |
| 615 | + ensure!( |
| 616 | + new_tokenizers.is_superset(¤t_tokenizers), |
| 617 | + "updating tokenizers is allowed only if adding new tokenizers, current tokenizers \ |
| 618 | + `{current_tokenizers:?}`, new tokenizers `{new_tokenizers:?}`", |
| 619 | + ); |
| 620 | + build_doc_mapper(&new_doc_mapping, search_settings).context("invalid doc mapping")?; |
| 621 | + Ok((new_doc_mapping, true)) |
| 622 | +} |
| 623 | + |
568 | 624 | #[cfg(test)] |
569 | 625 | mod tests { |
570 | 626 |
|
571 | 627 | use cron::TimeUnitSpec; |
572 | | - use quickwit_doc_mapper::ModeType; |
| 628 | + use quickwit_doc_mapper::{Mode, ModeType, TokenizerEntry}; |
| 629 | + use quickwit_proto::types::DocMappingUid; |
573 | 630 |
|
574 | 631 | use super::*; |
575 | 632 | use crate::ConfigFormat; |
@@ -981,4 +1038,96 @@ mod tests { |
981 | 1038 | let error = serde_yaml::from_str::<IngestSettings>(settings_yaml).unwrap_err(); |
982 | 1039 | assert!(error.to_string().contains("expected a nonzero")); |
983 | 1040 | } |
| 1041 | + |
| 1042 | + #[test] |
| 1043 | + fn test_prepare_doc_mapping_update() { |
| 1044 | + let current_index_config = IndexConfig::for_test("test-index", "s3://test-index"); |
| 1045 | + let mut current_doc_mapping = current_index_config.doc_mapping; |
| 1046 | + let search_settings = current_index_config.search_settings; |
| 1047 | + |
| 1048 | + let tokenizer_json = r#" |
| 1049 | + { |
| 1050 | + "name": "breton-tokenizer", |
| 1051 | + "type": "regex", |
| 1052 | + "pattern": "crêpes*" |
| 1053 | + } |
| 1054 | + "#; |
| 1055 | + let tokenizer: TokenizerEntry = serde_json::from_str(tokenizer_json).unwrap(); |
| 1056 | + |
| 1057 | + current_doc_mapping.tokenizers.push(tokenizer.clone()); |
| 1058 | + |
| 1059 | + // The new doc mapping should have a different doc mapping UID. |
| 1060 | + let mut new_doc_mapping = current_doc_mapping.clone(); |
| 1061 | + new_doc_mapping.store_source = false; // This is set to `true` for the current doc mapping. |
| 1062 | + let error = |
| 1063 | + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) |
| 1064 | + .unwrap_err() |
| 1065 | + .to_string(); |
| 1066 | + assert!(error.contains("doc mapping UID should differ")); |
| 1067 | + |
| 1068 | + // The new doc mapping should not change the timestamp field. |
| 1069 | + let mut new_doc_mapping = current_doc_mapping.clone(); |
| 1070 | + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); |
| 1071 | + new_doc_mapping.timestamp_field = Some("ts".to_string()); // This is set to `timestamp` for the current doc mapping. |
| 1072 | + let error = |
| 1073 | + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) |
| 1074 | + .unwrap_err() |
| 1075 | + .to_string(); |
| 1076 | + assert!(error.contains("timestamp field")); |
| 1077 | + |
| 1078 | + // The new doc mapping should not remove the timestamp field. |
| 1079 | + let mut new_doc_mapping = current_doc_mapping.clone(); |
| 1080 | + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); |
| 1081 | + new_doc_mapping.timestamp_field = None; |
| 1082 | + let error = |
| 1083 | + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) |
| 1084 | + .unwrap_err() |
| 1085 | + .to_string(); |
| 1086 | + assert!(error.contains("timestamp field")); |
| 1087 | + |
| 1088 | + // The new doc mapping should not remove tokenizers. |
| 1089 | + let mut new_doc_mapping = current_doc_mapping.clone(); |
| 1090 | + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); |
| 1091 | + new_doc_mapping.tokenizers.clear(); |
| 1092 | + let error = |
| 1093 | + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) |
| 1094 | + .unwrap_err() |
| 1095 | + .to_string(); |
| 1096 | + assert!(error.contains("tokenizers")); |
| 1097 | + |
| 1098 | + // The new doc mapping should be "buildable" into a doc mapper. |
| 1099 | + let mut new_doc_mapping = current_doc_mapping.clone(); |
| 1100 | + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); |
| 1101 | + new_doc_mapping.tokenizers.push(tokenizer); |
| 1102 | + let error = |
| 1103 | + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) |
| 1104 | + .unwrap_err() |
| 1105 | + .source() |
| 1106 | + .unwrap() |
| 1107 | + .to_string(); |
| 1108 | + assert!(error.contains("duplicated custom tokenizer")); |
| 1109 | + |
| 1110 | + let mut new_doc_mapping = current_doc_mapping.clone(); |
| 1111 | + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); |
| 1112 | + let (updated_doc_mapping, mutation_occurred) = |
| 1113 | + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) |
| 1114 | + .unwrap(); |
| 1115 | + assert!(!mutation_occurred); |
| 1116 | + assert_eq!( |
| 1117 | + updated_doc_mapping.doc_mapping_uid, |
| 1118 | + current_doc_mapping.doc_mapping_uid |
| 1119 | + ); |
| 1120 | + assert_eq!(updated_doc_mapping, current_doc_mapping); |
| 1121 | + |
| 1122 | + let mut new_doc_mapping = current_doc_mapping.clone(); |
| 1123 | + let new_doc_mapping_uid = DocMappingUid::random(); |
| 1124 | + new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid; |
| 1125 | + new_doc_mapping.mode = Mode::Strict; |
| 1126 | + let (updated_doc_mapping, mutation_occurred) = |
| 1127 | + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) |
| 1128 | + .unwrap(); |
| 1129 | + assert!(mutation_occurred); |
| 1130 | + assert_eq!(updated_doc_mapping.doc_mapping_uid, new_doc_mapping_uid); |
| 1131 | + assert_eq!(updated_doc_mapping.mode, Mode::Strict); |
| 1132 | + } |
984 | 1133 | } |
0 commit comments