Store keyword fields that trip ignore_above in binary doc values (#137483)

Kubik42 · web-flow · commit 67831350281b · 2025-11-25T14:55:43.000-06:00
* Store keyword fields that trip ignore_above in binary doc values

* Addressed feedback

* Moved ignore values doc value field fetcher inside of existing fetcher function
diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java
@@ -14,6 +14,8 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
@@ -30,6 +32,7 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
+import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.text.UTF8DecodingReader;
 import org.elasticsearch.common.unit.Fuzziness;
@@ -39,6 +42,7 @@
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.fielddata.FieldDataContext;
 import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
 import org.elasticsearch.index.fielddata.SourceValueFetcherSortedBinaryIndexFieldData;
 import org.elasticsearch.index.fielddata.StoredFieldSortedBinaryIndexFieldData;
 import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
@@ -297,12 +301,17 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
 
             if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
                 && keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
-                final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName();
                 if (parent.isStored()) {
-                    return storedFieldFetcher(parentFieldName, parentFallbackFieldName);
+                    return combineFieldFetchers(
+                        storedFieldFetcher(parentFieldName),
+                        ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
+                    );
                 } else if (parent.hasDocValues()) {
                     var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
-                    return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName));
+                    return combineFieldFetchers(
+                        docValuesFieldFetcher(ifd),
+                        ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
+                    );
                 }
             }
 
@@ -325,22 +334,16 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
             final KeywordFieldMapper.KeywordFieldType keywordDelegate
         ) {
             if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) {
-                // because we don't know whether the delegate field will be ignored during parsing, we must also check the current field
-                String fieldName = name();
-                String fallbackName = syntheticSourceFallbackFieldName();
-
-                // delegate field names
                 String delegateFieldName = keywordDelegate.name();
-                String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName();
+                // bc we don't know whether the delegate will ignore a value, we must also check the fallback field created by this
+                // match_only_text field
+                String fallbackName = syntheticSourceFallbackFieldName();
 
                 if (keywordDelegate.isStored()) {
-                    return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName);
+                    return storedFieldFetcher(delegateFieldName, fallbackName);
                 } else if (keywordDelegate.hasDocValues()) {
                     var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH);
-                    return combineFieldFetchers(
-                        docValuesFieldFetcher(ifd),
-                        storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName)
-                    );
+                    return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName));
                 }
             }
 
@@ -355,25 +358,34 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
             }
         }
 
-        private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(
-            IndexFieldData<?> ifd
+        private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(IndexFieldData<?> ifd) {
+            return context -> {
+                SortedBinaryDocValues indexedValuesDocValues = ifd.load(context).getBytesValues();
+                return docId -> getValuesFromDocValues(indexedValuesDocValues, docId);
+            };
+        }
+
+        private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> ignoredValuesDocValuesFieldFetcher(
+            String fieldName
         ) {
             return context -> {
-                var sortedBinaryDocValues = ifd.load(context).getBytesValues();
-                return docId -> {
-                    if (sortedBinaryDocValues.advanceExact(docId)) {
-                        var values = new ArrayList<>(sortedBinaryDocValues.docValueCount());
-                        for (int i = 0; i < sortedBinaryDocValues.docValueCount(); i++) {
-                            values.add(sortedBinaryDocValues.nextValue().utf8ToString());
-                        }
-                        return values;
-                    } else {
-                        return List.of();
-                    }
-                };
+                CustomBinaryDocValues ignoredValuesDocValues = new CustomBinaryDocValues(DocValues.getBinary(context.reader(), fieldName));
+                return docId -> getValuesFromDocValues(ignoredValuesDocValues, docId);
             };
         }
 
+        private List<Object> getValuesFromDocValues(SortedBinaryDocValues docValues, int docId) throws IOException {
+            if (docValues.advanceExact(docId)) {
+                var values = new ArrayList<>(docValues.docValueCount());
+                for (int i = 0; i < docValues.docValueCount(); i++) {
+                    values.add(docValues.nextValue().utf8ToString());
+                }
+                return values;
+            } else {
+                return List.of();
+            }
+        }
+
         private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> storedFieldFetcher(String... names) {
             var loader = StoredFieldLoader.create(false, Set.of(names));
             return context -> {
@@ -779,4 +791,46 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
 
         return fieldLoader;
     }
+
+    /**
+     * A wrapper around {@link BinaryDocValues} that exposes some quality of life functions. Note, these values are not sorted.
+     */
+    private static class CustomBinaryDocValues extends SortedBinaryDocValues {
+
+        private final BinaryDocValues binaryDocValues;
+        private final ByteArrayStreamInput stream;
+
+        private int docValueCount = 0;
+
+        CustomBinaryDocValues(BinaryDocValues binaryDocValues) {
+            this.binaryDocValues = binaryDocValues;
+            this.stream = new ByteArrayStreamInput();
+        }
+
+        @Override
+        public BytesRef nextValue() throws IOException {
+            // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
+            return stream.readBytesRef();
+        }
+
+        @Override
+        public boolean advanceExact(int docId) throws IOException {
+            // if document has a value, read underlying bytes
+            if (binaryDocValues.advanceExact(docId)) {
+                BytesRef docValuesBytes = binaryDocValues.binaryValue();
+                stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+                docValueCount = stream.readVInt();
+                return true;
+            }
+
+            // otherwise there is nothing to do
+            docValueCount = 0;
+            return false;
+        }
+
+        @Override
+        public int docValueCount() {
+            return docValueCount;
+        }
+    }
 }
diff --git a/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml b/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml
@@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
         id: "1"
         refresh: true
         body:
-          foo: [ "Apache Lucene powers Elasticsearch", "Apache" ]
+          foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ]
 
   - do:
       search:
@@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
 
   - match: { "hits.total.value": 1 }
   - match:
-      hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ]
+      hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ]
 
 ---
 synthetic_source match_only_text as multi-field with stored keyword as parent:
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java b/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
+import org.elasticsearch.xcontent.XContentBuilder;
+
+import java.io.IOException;
+
+public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {
+
+    private final String fieldName;
+
+    // the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
+    // the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
+    private final ByteArrayStreamInput stream;
+    private int valueCount;
+
+    public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
+        this.fieldName = fieldName;
+        this.stream = new ByteArrayStreamInput();
+    }
+
+    @Override
+    public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
+        BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
+
+        // there are no values associated with this field
+        if (docValues == null) {
+            valueCount = 0;
+            return null;
+        }
+
+        return docId -> {
+            // there are no more documents to process
+            if (docValues.advanceExact(docId) == false) {
+                valueCount = 0;
+                return false;
+            }
+
+            // otherwise, extract the doc values into a stream to later read from
+            BytesRef docValuesBytes = docValues.binaryValue();
+            stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
+            valueCount = stream.readVInt();
+
+            return hasValue();
+        };
+    }
+
+    @Override
+    public void write(XContentBuilder b) throws IOException {
+        for (int i = 0; i < valueCount; i++) {
+            // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
+            BytesRef valueBytes = stream.readBytesRef();
+            b.value(valueBytes.utf8ToString());
+        }
+    }
+
+    @Override
+    public boolean hasValue() {
+        return valueCount > 0;
+    }
+
+    @Override
+    public long valueCount() {
+        return valueCount;
+    }
+
+    @Override
+    public String fieldName() {
+        return fieldName;
+    }
+
+}
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -40,6 +40,8 @@
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
 import org.apache.lucene.util.automaton.Operations;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -87,6 +89,7 @@
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -1153,7 +1156,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
                 var utfBytes = value.bytes();
                 var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
                 final String fieldName = fieldType().syntheticSourceFallbackFieldName();
-                context.doc().add(new StoredField(fieldName, bytesRef));
+
+                // store the value in a binary doc values field, create one if it doesn't exist
+                MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
+                if (field == null) {
+                    field = new MultiValuedBinaryDocValuesField(fieldName);
+                    context.doc().addWithKey(fieldName, field);
+                }
+                field.add(bytesRef);
             }
 
             return false;
@@ -1316,15 +1326,56 @@ protected BytesRef preserve(BytesRef value) {
         // extra copy of the field for supporting synthetic source. This layer will check that copy.
         if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
             final String fieldName = fieldType().syntheticSourceFallbackFieldName();
-            layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
-                @Override
-                protected void writeValue(Object value, XContentBuilder b) throws IOException {
-                    BytesRef ref = (BytesRef) value;
-                    b.utf8Value(ref.bytes, ref.offset, ref.length);
-                }
-            });
+            layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
         }
 
         return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
     }
+
+    /**
+     * A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique
+     * binary doc values for fields with multiple values per document.
+     */
+    private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
+
+        private final Set<BytesRef> uniqueValues;
+        private int docValuesByteCount = 0;
+
+        MultiValuedBinaryDocValuesField(String name) {
+            super(name);
+            // linked hash set to maintain insertion order of elements
+            uniqueValues = new LinkedHashSet<>();
+        }
+
+        public void add(final BytesRef value) {
+            if (uniqueValues.add(value)) {
+                // might as well track these on the go as opposed to having to loop through all entries later
+                docValuesByteCount += value.length;
+            }
+        }
+
+        /**
+         * Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
+         * the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
+         */
+        @Override
+        public BytesRef binaryValue() {
+            int docValuesCount = uniqueValues.size();
+            // the + 1 is for the total doc values count, which is prefixed at the start of the array
+            int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
+
+            try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
+                out.writeVInt(docValuesCount);
+                for (BytesRef value : uniqueValues) {
+                    int valueLength = value.length;
+                    out.writeVInt(valueLength);
+                    out.writeBytes(value.bytes, value.offset, valueLength);
+                }
+                return out.bytes().toBytesRef();
+            } catch (IOException e) {
+                throw new ElasticsearchException("Failed to get binary value", e);
+            }
+        }
+
+    }
 }
diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java
@@ -52,6 +52,7 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
             .endObject()
             .endObject()
             .endObject();
+
         // Note values that would be ignored are added at the end of arrays,
         // this makes testing easier as ignored values are always synthesized after regular values:
         var arrayValues = new Object[][] {
@@ -60,7 +61,16 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
             new Object[] { "123", "1234", "12345" },
             new Object[] { null, null, null, "blabla" },
             new Object[] { "1", "2", "3", "blabla" } };
-        verifySyntheticArray(arrayValues, mapping, "_id", "field._original");
+
+        // values in the original array should be deduplicated
+        var expectedArrayValues = new Object[][] {
+            new Object[] { null, "a", "ab", "abc", "abcd", null, "abcde" },
+            new Object[] { "12345" },
+            new Object[] { "123", "1234", "12345" },
+            new Object[] { null, null, null, "blabla" },
+            new Object[] { "1", "2", "3", "blabla" } };
+
+        verifySyntheticArray(arrayValues, expectedArrayValues, mapping, "_id");
     }
 
     public void testSynthesizeObjectArray() throws Exception {
diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java
diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java