diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java new file mode 100644 index 00000000000..c17dada5eb1 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.Base64; +import java.util.Collection; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.function.Predicate; +import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.component.RealTimeGetComponent; +import org.apache.solr.handler.component.RealTimeGetComponent.Resolution; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.TextField; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An implementation of {@link UpdateRequestProcessor} which computes a hash of field values, and + * uses this hash to reject/accept document updates. + * + * + * + *

Depending on {#dropSameDocuments} value, this processor may drop or accept document updates. + * This implementation can be used for monitoring or dropping no-op updates (updates that do not + * change the Solr document content). + * + *

Note: the hash is computed using {@link Lookup3Signature} and must be stored in a field with + * docValues enabled for retrieval. + * + * @see Lookup3Signature + */ +public class ContentHashVersionProcessor extends UpdateRequestProcessor { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final SchemaField hashField; + private final SolrQueryResponse rsp; + private final SolrCore core; + private final Predicate includedFields; // Matcher for included fields in hash + private final Predicate excludedFields; // Matcher for excluded fields from hash + private boolean dropSameDocuments; + private int sameCount = 0; + private int differentCount = 0; + + public ContentHashVersionProcessor( + Predicate hashIncludedFields, + Predicate hashExcludedFields, + String hashFieldName, + SolrQueryRequest req, + SolrQueryResponse rsp, + UpdateRequestProcessor next) { + super(next); + this.core = req.getCore(); + this.hashField = new SchemaField(hashFieldName, new TextField()); + this.rsp = rsp; + this.includedFields = hashIncludedFields; + this.excludedFields = hashExcludedFields; + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { + SolrInputDocument newDoc = cmd.getSolrInputDocument(); + String newHash = computeDocHash(newDoc); + newDoc.setField(hashField.getName(), newHash); + int i = 0; + + if (!isHashAcceptable(cmd.getIndexedId(), newHash)) { + return; + } + + while (true) { + logOverlyFailedRetries(i, cmd); + try { + super.processAdd(cmd); + return; + } catch (SolrException e) { + if (e.code() != 409) { + throw e; + } + ++i; + } + } + } + + @Override + public void finish() throws IOException { + try { + super.finish(); + } finally { + // Only log when there are updates to existing documents + int totalUpdates = sameCount + differentCount; + if (totalUpdates > 0) { + if (dropSameDocuments) { + rsp.addToLog("contentHash.duplicatesDropped", sameCount); + 
rsp.addToLog("contentHash.duplicatesDetected", sameCount); + } else { + rsp.addToLog("contentHash.duplicatesDropped", 0); + rsp.addToLog("contentHash.duplicatesDetected", sameCount); + } + rsp.addToLog("contentHash.changed", differentCount); + } else { + rsp.addToLog("contentHash.duplicatesDropped", 0); + rsp.addToLog("contentHash.duplicatesDetected", 0); + rsp.addToLog("contentHash.changed", 0); + } + } + } + + private static void logOverlyFailedRetries(int i, UpdateCommand cmd) { + if ((i & 255) == 255) { + log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd); + } + } + + void setDropSameDocuments(boolean dropSameDocuments) { + this.dropSameDocuments = dropSameDocuments; + } + + private boolean isHashAcceptable(BytesRef indexedDocId, String newHash) throws IOException { + assert null != indexedDocId; + + Optional oldDocHash = getOldDocHash(indexedDocId); + if (oldDocHash.isPresent()) { + String oldHash = oldDocHash.get(); // No hash: might want to keep track of these too + if (Objects.equals(newHash, oldHash)) { + sameCount++; + return !dropSameDocuments; + } else { + differentCount++; + return true; + } + } + return true; // Doc not found + } + + /** Retrieves the hash value from the old document identified by the given ID. 
*/ + private Optional getOldDocHash(BytesRef indexedDocId) throws IOException { + SolrInputDocument oldDoc = + RealTimeGetComponent.getInputDocument( + core, indexedDocId, indexedDocId, null, Set.of(hashField.getName()), Resolution.DOC); + if (oldDoc == null) { + return Optional.empty(); + } + Object o = oldDoc.getFieldValue(hashField.getName()); + return Optional.ofNullable(o).map(String::valueOf); + } + + String computeDocHash(SolrInputDocument doc) { + final Signature sig = new Lookup3Signature(); + + // Stream field names, filter, sort, and process in a single pass + doc.getFieldNames().stream() + .filter(includedFields) // Keep fields that match 'included fields' matcher + .filter(excludedFields.negate()) // Exclude fields that match 'excluded fields' matcher + .sorted() // Sort to ensure consistent field order across different doc field orders + .forEach( + fieldName -> { + sig.add(fieldName); + Object o = doc.getFieldValue(fieldName); + if (o instanceof Collection) { + for (Object oo : (Collection) o) { + sig.add(String.valueOf(oo)); + } + } else { + sig.add(String.valueOf(o)); + } + }); + + // Signature, depending on implementation, may return 8-byte or 16-byte value + byte[] signature = sig.getSignature(); + return Base64.getEncoder().encodeToString(signature); + } +} diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java new file mode 100644 index 00000000000..bdff30f062d --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.update.processor; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.core.SolrCore; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.plugin.SolrCoreAware; + +/** Factory for {@link ContentHashVersionProcessor} instances. 
*/ +public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory + implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { + private static final char SEPARATOR = ','; // Separator for included/excluded fields + private List includeFields = List.of("*"); // Included fields defaults to 'all' + private List excludeFields = new ArrayList<>(); + private String hashFieldName; // Must be explicitly configured + private boolean dropSameDocuments = true; + + public ContentHashVersionProcessorFactory() {} + + public void init(NamedList args) { + Object tmp = args.remove("includeFields"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'includeFields' must be configured as a "); + } + // Include fields support comma separated list of fields (e.g. "field1,field2,field3"). + // Also supports "*" to include all fields + this.includeFields = + StrUtils.splitSmart((String) tmp, SEPARATOR).stream() + .map(String::trim) + .collect(Collectors.toList()); + } + tmp = args.remove("hashFieldName"); + if (tmp == null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "'hashFieldName' is required and must be explicitly configured"); + } + if (!(tmp instanceof String)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); + } + this.hashFieldName = (String) tmp; + + tmp = args.remove("excludeFields"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' must be configured as a "); + } + if ("*".equals(((String) tmp).trim())) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' can't exclude all fields."); + } + // Exclude fields support comma separated list of fields (e.g. + // "excluded_field1,excluded_field2"). 
+ // Also supports "*" to exclude all fields + this.excludeFields = + StrUtils.splitSmart((String) tmp, SEPARATOR).stream() + .map(String::trim) + .collect(Collectors.toList()); + } + excludeFields.add(hashFieldName); // Hash field name is excluded from hash computation + + tmp = args.remove("hashCompareStrategy"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "'hashCompareStrategy' must be configured as a "); + } + String value = ((String) tmp).toLowerCase(Locale.ROOT); + if ("drop".equalsIgnoreCase(value)) { + dropSameDocuments = true; + } else if ("log".equalsIgnoreCase(value)) { + dropSameDocuments = false; + } else { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Value '" + + value + + "' is unsupported for 'hashCompareStrategy', only 'drop' and 'log' are supported."); + } + } + + super.init(args); + } + + public UpdateRequestProcessor getInstance( + SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + ContentHashVersionProcessor processor = + new ContentHashVersionProcessor( + buildFieldMatcher(includeFields), + buildFieldMatcher(excludeFields), + hashFieldName, + req, + rsp, + next); + processor.setDropSameDocuments(dropSameDocuments); + return processor; + } + + public void inform(SolrCore core) { + if (core.getLatestSchema().getUniqueKeyField() == null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "schema must have uniqueKey defined."); + } + } + + public String getHashFieldName() { + return hashFieldName; + } + + public List getIncludeFields() { + return includeFields; + } + + public List getExcludeFields() { + return excludeFields; + } + + public boolean dropSameDocuments() { + return dropSameDocuments; + } + + static Predicate buildFieldMatcher(List fieldNames) { + return fieldName -> { + for (String currentFieldName : fieldNames) { + if ("*".equals(currentFieldName)) { + return true; + } + if 
(fieldName.equals(currentFieldName)) { + return true; + } + if (currentFieldName.length() > 1 + && currentFieldName.endsWith("*") + && fieldName.startsWith(currentFieldName.substring(0, currentFieldName.length() - 1))) { + return true; + } + } + return false; + }; + } +} diff --git a/solr/core/src/test-files/solr/collection1/conf/schema16.xml b/solr/core/src/test-files/solr/collection1/conf/schema16.xml new file mode 100644 index 00000000000..3cfdcc119fe --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/schema16.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + _id + diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-contenthashversion.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-contenthashversion.xml new file mode 100644 index 00000000000..5afe4b167cf --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-contenthashversion.xml @@ -0,0 +1,66 @@ + + + + + + + ${tests.luceneMatchVersion:LATEST} + + + + + ${solr.data.dir:} + + + + + + ${solr.ulog.dir:} + + + + + + + _hash_ + _id + + + + + + + _hash_ + _id + log + + + + + + + _hash_ + _id + drop + + + + + + + diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java new file mode 100644 index 00000000000..807a1987264 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import static org.apache.solr.SolrTestCaseJ4.assumeWorkingMockito; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.List; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.NamedList; +import org.junit.BeforeClass; +import org.junit.Test; + +public class ContentHashVersionProcessorFactoryTest { + + @BeforeClass + public static void beforeClass() throws Exception { + assumeWorkingMockito(); + } + + @Test + public void shouldHaveSensibleDefaultValues() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + assertEquals(List.of("*"), factory.getIncludeFields()); + assertTrue(factory.dropSameDocuments()); + } + + @Test + public void shouldInitWithHashFieldName() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "_hash_field_"); + factory.init(args); + + assertEquals("_hash_field_", factory.getHashFieldName()); + } + + @Test + public void shouldInitWithAllField() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); + args.add("includeFields", "*"); + factory.init(args); + + assertEquals(1, factory.getIncludeFields().size()); + assertEquals("*", factory.getIncludeFields().getFirst()); + } + + @Test + public void 
shouldInitWithIncludedFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); + args.add("includeFields", " field1,field2 , field3 "); + factory.init(args); + + assertEquals(3, factory.getIncludeFields().size()); + assertEquals(List.of("field1", "field2", "field3"), factory.getIncludeFields()); + } + + @Test + public void shouldInitWithExcludedFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); + args.add("excludeFields", " field1,field2 , field3 "); + factory.init(args); + + assertEquals(4, factory.getExcludeFields().size()); + assertEquals(List.of("field1", "field2", "field3", "content_hash"), factory.getExcludeFields()); + } + + @Test + public void shouldSelectDropStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); + args.add("hashCompareStrategy", "drop"); + factory.init(args); + + assertTrue(factory.dropSameDocuments()); + } + + @Test + public void shouldSelectLogStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); + args.add("hashCompareStrategy", "log"); + factory.init(args); + + assertFalse(factory.dropSameDocuments()); + } + + @Test(expected = SolrException.class) + public void shouldSelectUnsupportedStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); + args.add("hashCompareStrategy", "unsupported value"); + factory.init(args); + } + + @Test(expected = SolrException.class) + public void 
shouldRejectExcludeAllFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); + args.add("excludeFields", "*"); + factory.init(args); + } + + @Test(expected = SolrException.class) + public void shouldRequireExplicitHashFieldName() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + // Intentionally not setting hashFieldName + factory.init(args); + } + + @Test + public void shouldAutoExcludeHashFieldFromHashComputation() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "my_hash_field"); + args.add("excludeFields", "field1,field2"); + factory.init(args); + + // Hash field should be automatically added to excludeFields + assertEquals(3, factory.getExcludeFields().size()); + assertTrue( + "Should contain explicitly excluded field1", factory.getExcludeFields().contains("field1")); + assertTrue( + "Should contain explicitly excluded field2", factory.getExcludeFields().contains("field2")); + assertTrue( + "Should auto-exclude hash field name", + factory.getExcludeFields().contains("my_hash_field")); + } +} diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java new file mode 100644 index 00000000000..6b9c908cbfe --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import static org.mockito.Mockito.mock; + +import java.util.List; +import java.util.UUID; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.jspecify.annotations.NonNull; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +public class ContentHashVersionProcessorTest extends UpdateProcessorTestBase { + + public static final String ID_FIELD = "_id"; + public static final String FIRST_FIELD = "field1"; + public static final String SECOND_FIELD = "field2"; + public static final String THIRD_FIELD = "docField3"; + public static final String FOURTH_FIELD = "field4"; + + public static final String INITIAL_DOC_ID = "1"; + public static final String INITIAL_FIELD1_VALUE = "Initial values used to compute initial hash"; + public static final String INITIAL_FIELD2_VALUE = + "This a constant value for testing include/exclude fields"; + public static final String[] INITIAL_DOC = + new String[] { + ID_FIELD, INITIAL_DOC_ID, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE + }; + private String initialDocHash; + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-contenthashversion.xml", "schema16.xml"); + } + + @Before + public void setUp() throws 
Exception { + super.setUp(); + assertU(delQ("*:*")); + addDoc(adoc(INITIAL_DOC), "contenthashversion-default"); + assertU(commit()); + + // Query for the document and extract _hash_ field value + initialDocHash = getHashFieldValue(INITIAL_DOC_ID); + } + + private static @NonNull String getHashFieldValue(String docId) throws Exception { + String response = h.query(req("q", ID_FIELD + ":" + docId, "fl", "_hash_")); + + // Parse XML response to extract _hash_ field value + // Response format: value + String hashPattern = ""; + int startIdx = response.indexOf(hashPattern); + if (startIdx == -1) { + fail("Hash field not found in document " + docId); + } + startIdx += hashPattern.length(); + int endIdx = response.indexOf("", startIdx); + if (endIdx == -1) { + fail("Hash field closing tag not found"); + } + return response.substring(startIdx, endIdx); + } + + private ContentHashVersionProcessor getContentHashVersionProcessor( + List includedFields, List excludedFields) { + return new ContentHashVersionProcessor( + ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), + ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), + "_hash_", + mock(SolrQueryRequest.class), + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class)); + } + + @Test + public void shouldUseExcludedFieldsWildcard() { + // Given + ContentHashVersionProcessor processor = + getContentHashVersionProcessor(List.of("*"), List.of("field*")); + + // Given (doc for update) + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, UUID.randomUUID().toString()), + f(THIRD_FIELD, "constant to have a constant hash"), + f(FOURTH_FIELD, UUID.randomUUID().toString())); + + // Then (only ID and THIRD_FIELD is used in hash, other fields contain random values) + assertEquals( + "bwE8Zjq0aOs=", processor.computeDocHash(inputDocument)); // Hash if only ID field was used + } + + @Test + public void 
shouldUseIncludedFieldsWildcard() { + // Given + ContentHashVersionProcessor processor = + getContentHashVersionProcessor(List.of("field*"), List.of(THIRD_FIELD)); + + // Given (doc for update) + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4")); + + // Then + assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + } + + @Test + public void shouldUseIncludedFieldsWildcard2() { + // Given (variant of previous shouldUseIncludedFieldsWildcard, without the excludedField config) + ContentHashVersionProcessor processor = + getContentHashVersionProcessor(List.of("field*"), List.of()); + + // Given (doc for update) + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4")); + + // Then + assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + } + + @Test + public void shouldDedupIncludedFields() { + // Given (processor to include field1 and field2 only) + ContentHashVersionProcessor processorWithDuplicatedFieldName = + getContentHashVersionProcessor(List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), List.of()); + ContentHashVersionProcessor processorWithWildcard = + getContentHashVersionProcessor( + List.of( // Also change order of config (test reorder of field names) + SECOND_FIELD, FIRST_FIELD, "field1*"), + List.of()); + + // Given (doc for update) + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a 
constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4")); + + // Then + assertEquals("XavrOYGlkXM=", processorWithDuplicatedFieldName.computeDocHash(inputDocument)); + assertEquals("XavrOYGlkXM=", processorWithWildcard.computeDocHash(inputDocument)); + } + + @Test + public void shouldCreateSignatureForNewDoc() throws Exception { + // When (update) + final String newDocId = UUID.randomUUID().toString(); + assertU( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE)); + assertU(commit()); + + // Then + final String hashFieldValueForNewDoc = getHashFieldValue(newDocId); + assertEquals(initialDocHash, hashFieldValueForNewDoc); + } + + @Test + public void shouldAddToResponseLog() throws Exception { + // Given (command to update existing doc) + final String newDocId = UUID.randomUUID().toString(); + final SolrQueryResponse update1 = + addDocWithResponse( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE), + "contenthashversion-default"); + final SolrQueryResponse update2 = + addDocWithResponse( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, "This is a doc with values", + SECOND_FIELD, "that differs from stored doc, so it's considered new"), + "contenthashversion-default"); + assertU(commit()); + + // Then + assertResponse(update1, 0, 0, 0); + assertResponse(update2, 0, 0, 1); + } + + @Test + public void shouldKeepDuplicateDocumentsInLogMode() throws Exception { + // Given: Use log chain which detects but does NOT drop duplicates + final String docId = UUID.randomUUID().toString(); + + // When: Add a document + addDoc( + adoc( + ID_FIELD, docId, + FIRST_FIELD, "original value", + SECOND_FIELD, "original value 2"), + "contenthashversion-log"); + assertU(commit()); + String originalHash = getHashFieldValue(docId); + + // When: Try to add the same content again (duplicate) + 
SolrQueryResponse duplicateResponse = + addDocWithResponse( + adoc( + ID_FIELD, docId, + FIRST_FIELD, "original value", + SECOND_FIELD, "original value 2"), + "contenthashversion-log"); + assertU(commit()); + + // Then: Response should show duplicate was detected but NOT dropped + assertResponse(duplicateResponse, 0, 1, 0); + + // Then: Document should still exist in index + assertQ(req("q", ID_FIELD + ":" + docId), "//result[@numFound='1']"); + + // Then: Document hash should remain unchanged (duplicate was processed) + String currentHash = getHashFieldValue(docId); + assertEquals("Hash should remain unchanged for duplicate", originalHash, currentHash); + + // When: Update with different content + SolrQueryResponse changedResponse = + addDocWithResponse( + adoc( + ID_FIELD, docId, + FIRST_FIELD, "changed value", + SECOND_FIELD, "changed value 2"), + "contenthashversion-log"); + assertU(commit()); + + // Then: Response should show content changed + assertResponse(changedResponse, 0, 0, 1); + + // Then: Hash should be updated + String newHash = getHashFieldValue(docId); + assertNotEquals("Hash should change for different content", originalHash, newHash); + } + + @Test + public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws Exception { + // Given (update using URP chain WITHOUT drop doc (log mode)) + final String newDocId = UUID.randomUUID().toString(); + addDoc( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE), + "contenthashversion-default"); + assertU(commit()); + + // Then + final String hashFieldValue = getHashFieldValue(newDocId); + assertEquals(initialDocHash, hashFieldValue); + } + + @Test + public void shouldCommitWithDropModeEnabled() throws Exception { + // Initial document already exists from setUp() + // When: Try to add the same document again (duplicate content) using URP chain WITH drop doc + // (drop mode) + SolrQueryResponse solrQueryResponse = + addDocWithResponse( + adoc( + 
+            ID_FIELD, INITIAL_DOC_ID,
+            FIRST_FIELD, INITIAL_FIELD1_VALUE,
+            SECOND_FIELD, INITIAL_FIELD2_VALUE),
+        "contenthashversion-drop");
+    assertU(commit());
+
+    // Then: Verify response shows duplicate was dropped
+    assertResponse(solrQueryResponse, 1, 1, 0);
+
+    // Then: Verify document was NOT actually added/updated (still only 1 doc in index)
+    assertQ(req("q", "*:*"), "//result[@numFound='1']");
+
+    // Verify the stored document still carries the original hash value
+    String currentHash = getHashFieldValue(INITIAL_DOC_ID);
+    assertEquals("Document hash should not have changed", initialDocHash, currentHash);
+  }
+
+  @Test
+  public void shouldHandleDocumentWithOnlyIdField() {
+    // Given: Document with only ID field (no other fields to hash)
+    ContentHashVersionProcessor processor =
+        getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD));
+
+    // When: Compute hash for document with only ID
+    SolrInputDocument doc = doc(f(ID_FIELD, "only-id-doc"));
+
+    // Then: Should still compute a non-empty hash even though the hashed field set is empty
+    String hash = processor.computeDocHash(doc);
+    assertNotNull("Hash should not be null for ID-only document", hash);
+    assertFalse("Hash should not be empty", hash.isEmpty());
+  }
+
+  @Test
+  public void shouldHandleMultiValueFields() {
+    // Given: Processor that includes multi-value fields
+    ContentHashVersionProcessor processor =
+        getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD));
+
+    // When: Document with multi-value field
+    SolrInputDocument doc1 = doc(f(ID_FIELD, "doc1"), f(FIRST_FIELD, "value1", "value2", "value3"));
+
+    // Then: Should compute consistent hash
+    String hash1 = processor.computeDocHash(doc1);
+    assertNotNull(hash1);
+
+    // Same values in same order should produce same hash (IDs differ, so ID is excluded from the hash)
+    SolrInputDocument doc2 = doc(f(ID_FIELD, "doc2"), f(FIRST_FIELD, "value1", "value2", "value3"));
+    String hash2 = processor.computeDocHash(doc2);
+    assertEquals("Same multi-value field should produce same hash", hash1, hash2);
+
+    // Different order should produce different hash (collection order matters)
+    SolrInputDocument doc3 = doc(f(ID_FIELD, "doc3"), f(FIRST_FIELD, "value3", "value1", "value2"));
+    String hash3 = processor.computeDocHash(doc3);
+    assertNotEquals("Different order should produce different hash", hash1, hash3);
+  }
+
+  @Test
+  public void shouldHandleNullFieldValues() {
+    // Given: Processor that handles null values
+    ContentHashVersionProcessor processor =
+        getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD));
+
+    // When: Document with null field value (represented as "null" string)
+    SolrInputDocument doc = doc(f(ID_FIELD, "null-doc"), f(FIRST_FIELD, (Object) null));
+
+    // Then: Should compute hash without error
+    String hash = processor.computeDocHash(doc);
+    assertNotNull("Should handle null values", hash);
+    assertFalse("Hash should not be empty", hash.isEmpty());
+  }
+
+  @Test
+  public void shouldProduceSameHashRegardlessOfFieldOrder() {
+    // Given: Documents with same fields in different order
+    ContentHashVersionProcessor processor =
+        getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD));
+
+    // When: Create docs with fields in different order
+    SolrInputDocument doc1 =
+        doc(
+            f(ID_FIELD, "doc1"),
+            f(FIRST_FIELD, "value1"),
+            f(SECOND_FIELD, "value2"),
+            f(THIRD_FIELD, "value3"));
+
+    SolrInputDocument doc2 =
+        doc(
+            f(ID_FIELD, "doc2"),
+            f(THIRD_FIELD, "value3"),
+            f(FIRST_FIELD, "value1"),
+            f(SECOND_FIELD, "value2"));
+
+    // Then: Hashes should be identical (fields are sorted before hashing)
+    String hash1 = processor.computeDocHash(doc1);
+    String hash2 = processor.computeDocHash(doc2);
+    assertEquals("Hash should be same regardless of field order", hash1, hash2);
+  }
+
+  @Test
+  public void shouldHandleEmptyFieldValues() {
+    // Given: Document with empty string values
+    ContentHashVersionProcessor processor =
+        getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD));
+
+    SolrInputDocument doc1 = doc(f(ID_FIELD, "empty-doc"), f(FIRST_FIELD, ""), f(SECOND_FIELD, ""));
+
+    // When: Compute hash
+    String hash1 = processor.computeDocHash(doc1);
+
+    // Then: Should produce valid hash
+    assertNotNull("Should handle empty values", hash1);
+    assertFalse("Hash should not be empty", hash1.isEmpty());
+
+    // Empty strings should produce different hash than no fields at all
+    SolrInputDocument doc2 = doc(f(ID_FIELD, "empty-doc"));
+    String hash2 = processor.computeDocHash(doc2);
+    assertNotEquals("Empty string fields should differ from no fields", hash1, hash2);
+  }
+
+  // Asserts the contentHash.* counters the processor writes into the response's toLog map.
+  private static void assertResponse(
+      SolrQueryResponse solrQueryResponse,
+      int droppedDocCount,
+      int duplicateDocCount,
+      int changedDocCount) {
+    assertNotNull(solrQueryResponse.getToLog().get("contentHash.duplicatesDropped"));
+    assertNotNull(solrQueryResponse.getToLog().get("contentHash.duplicatesDetected"));
+    assertNotNull(solrQueryResponse.getToLog().get("contentHash.changed"));
+
+    int droppedDocs = (int) solrQueryResponse.getToLog().get("contentHash.duplicatesDropped");
+    int duplicateDocs = (int) solrQueryResponse.getToLog().get("contentHash.duplicatesDetected");
+    int changedDocs = (int) solrQueryResponse.getToLog().get("contentHash.changed");
+    assertEquals(droppedDocCount, droppedDocs);
+    assertEquals(duplicateDocCount, duplicateDocs);
+    assertEquals(changedDocCount, changedDocs);
+  }
+}
diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java
index 3ff7e74aaa2..9df7e06cb6c 100644
--- a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java
+++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java
@@ -1161,6 +1161,11 @@ public static String adoc(SolrInputDocument sdoc) {
   }
 
   public static void addDoc(String doc, String updateRequestProcessorChain) throws Exception {
+    addDocWithResponse(doc, updateRequestProcessorChain); // delegate; response intentionally discarded
+  }
+
+  public static SolrQueryResponse addDocWithResponse(String doc, String updateRequestProcessorChain)
+      throws Exception { // returns the response so tests can inspect rsp.getToLog()
     Map params = new HashMap<>();
     MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
     params.put(UpdateParams.UPDATE_CHAIN, new String[] {updateRequestProcessorChain});
@@ -1169,8 +1174,11 @@ public static void addDoc(String doc, String updateRequestProcessorChain) throws
     UpdateRequestHandler handler = new UpdateRequestHandler();
     handler.init(null);
     req.setContentStreams(List.of(new ContentStreamBase.StringStream(doc)));
-    handler.handleRequestBody(req, new SolrQueryResponse());
+    final SolrQueryResponse rsp = new SolrQueryResponse();
+    handler.handleRequestBody(req, rsp);
     req.close();
+
+    return rsp;
+  }
 
   /**