Skip to content

Commit 6783135

Browse files
authored
Store keyword fields that trip ignore_above in binary doc values (#137483)
* Store keyword fields that trip ignore_above in binary doc values
* Addressed feedback
* Moved ignore values doc value field fetcher inside of existing fetcher function
1 parent 7425bf7 commit 6783135

File tree

7 files changed

+250
-97
lines changed

7 files changed

+250
-97
lines changed

modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java

Lines changed: 82 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import org.apache.lucene.document.Field;
1515
import org.apache.lucene.document.FieldType;
1616
import org.apache.lucene.document.StoredField;
17+
import org.apache.lucene.index.BinaryDocValues;
18+
import org.apache.lucene.index.DocValues;
1719
import org.apache.lucene.index.IndexOptions;
1820
import org.apache.lucene.index.LeafReaderContext;
1921
import org.apache.lucene.index.Term;
@@ -30,6 +32,7 @@
3032
import org.apache.lucene.util.BytesRef;
3133
import org.apache.lucene.util.IOFunction;
3234
import org.elasticsearch.common.CheckedIntFunction;
35+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
3336
import org.elasticsearch.common.lucene.Lucene;
3437
import org.elasticsearch.common.text.UTF8DecodingReader;
3538
import org.elasticsearch.common.unit.Fuzziness;
@@ -39,6 +42,7 @@
3942
import org.elasticsearch.index.analysis.NamedAnalyzer;
4043
import org.elasticsearch.index.fielddata.FieldDataContext;
4144
import org.elasticsearch.index.fielddata.IndexFieldData;
45+
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
4246
import org.elasticsearch.index.fielddata.SourceValueFetcherSortedBinaryIndexFieldData;
4347
import org.elasticsearch.index.fielddata.StoredFieldSortedBinaryIndexFieldData;
4448
import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
@@ -297,12 +301,17 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
297301

298302
if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
299303
&& keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
300-
final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName();
301304
if (parent.isStored()) {
302-
return storedFieldFetcher(parentFieldName, parentFallbackFieldName);
305+
return combineFieldFetchers(
306+
storedFieldFetcher(parentFieldName),
307+
ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
308+
);
303309
} else if (parent.hasDocValues()) {
304310
var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
305-
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName));
311+
return combineFieldFetchers(
312+
docValuesFieldFetcher(ifd),
313+
ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
314+
);
306315
}
307316
}
308317

@@ -325,22 +334,16 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
325334
final KeywordFieldMapper.KeywordFieldType keywordDelegate
326335
) {
327336
if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) {
328-
// because we don't know whether the delegate field will be ignored during parsing, we must also check the current field
329-
String fieldName = name();
330-
String fallbackName = syntheticSourceFallbackFieldName();
331-
332-
// delegate field names
333337
String delegateFieldName = keywordDelegate.name();
334-
String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName();
338+
// because we don't know whether the delegate will ignore a value, we must also check the fallback field created by this
339+
// match_only_text field
340+
String fallbackName = syntheticSourceFallbackFieldName();
335341

336342
if (keywordDelegate.isStored()) {
337-
return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName);
343+
return storedFieldFetcher(delegateFieldName, fallbackName);
338344
} else if (keywordDelegate.hasDocValues()) {
339345
var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH);
340-
return combineFieldFetchers(
341-
docValuesFieldFetcher(ifd),
342-
storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName)
343-
);
346+
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName));
344347
}
345348
}
346349

@@ -355,25 +358,34 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
355358
}
356359
}
357360

358-
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(
359-
IndexFieldData<?> ifd
361+
private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(IndexFieldData<?> ifd) {
362+
return context -> {
363+
SortedBinaryDocValues indexedValuesDocValues = ifd.load(context).getBytesValues();
364+
return docId -> getValuesFromDocValues(indexedValuesDocValues, docId);
365+
};
366+
}
367+
368+
private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> ignoredValuesDocValuesFieldFetcher(
369+
String fieldName
360370
) {
361371
return context -> {
362-
var sortedBinaryDocValues = ifd.load(context).getBytesValues();
363-
return docId -> {
364-
if (sortedBinaryDocValues.advanceExact(docId)) {
365-
var values = new ArrayList<>(sortedBinaryDocValues.docValueCount());
366-
for (int i = 0; i < sortedBinaryDocValues.docValueCount(); i++) {
367-
values.add(sortedBinaryDocValues.nextValue().utf8ToString());
368-
}
369-
return values;
370-
} else {
371-
return List.of();
372-
}
373-
};
372+
CustomBinaryDocValues ignoredValuesDocValues = new CustomBinaryDocValues(DocValues.getBinary(context.reader(), fieldName));
373+
return docId -> getValuesFromDocValues(ignoredValuesDocValues, docId);
374374
};
375375
}
376376

377+
private List<Object> getValuesFromDocValues(SortedBinaryDocValues docValues, int docId) throws IOException {
378+
if (docValues.advanceExact(docId)) {
379+
var values = new ArrayList<>(docValues.docValueCount());
380+
for (int i = 0; i < docValues.docValueCount(); i++) {
381+
values.add(docValues.nextValue().utf8ToString());
382+
}
383+
return values;
384+
} else {
385+
return List.of();
386+
}
387+
}
388+
377389
private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> storedFieldFetcher(String... names) {
378390
var loader = StoredFieldLoader.create(false, Set.of(names));
379391
return context -> {
@@ -779,4 +791,46 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
779791

780792
return fieldLoader;
781793
}
794+
795+
/**
796+
* A wrapper around {@link BinaryDocValues} that exposes some quality of life functions. Note, these values are not sorted.
797+
*/
798+
private static class CustomBinaryDocValues extends SortedBinaryDocValues {
799+
800+
private final BinaryDocValues binaryDocValues;
801+
private final ByteArrayStreamInput stream;
802+
803+
private int docValueCount = 0;
804+
805+
CustomBinaryDocValues(BinaryDocValues binaryDocValues) {
806+
this.binaryDocValues = binaryDocValues;
807+
this.stream = new ByteArrayStreamInput();
808+
}
809+
810+
@Override
811+
public BytesRef nextValue() throws IOException {
812+
// this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
813+
return stream.readBytesRef();
814+
}
815+
816+
@Override
817+
public boolean advanceExact(int docId) throws IOException {
818+
// if document has a value, read underlying bytes
819+
if (binaryDocValues.advanceExact(docId)) {
820+
BytesRef docValuesBytes = binaryDocValues.binaryValue();
821+
stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
822+
docValueCount = stream.readVInt();
823+
return true;
824+
}
825+
826+
// otherwise there is nothing to do
827+
docValueCount = 0;
828+
return false;
829+
}
830+
831+
@Override
832+
public int docValueCount() {
833+
return docValueCount;
834+
}
835+
}
782836
}

modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
465465
id: "1"
466466
refresh: true
467467
body:
468-
foo: [ "Apache Lucene powers Elasticsearch", "Apache" ]
468+
foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ]
469469

470470
- do:
471471
search:
@@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
477477

478478
- match: { "hits.total.value": 1 }
479479
- match:
480-
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ]
480+
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ]
481481

482482
---
483483
synthetic_source match_only_text as multi-field with stored keyword as parent:
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.mapper;
11+
12+
import org.apache.lucene.index.BinaryDocValues;
13+
import org.apache.lucene.index.LeafReader;
14+
import org.apache.lucene.util.BytesRef;
15+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
16+
import org.elasticsearch.xcontent.XContentBuilder;
17+
18+
import java.io.IOException;
19+
20+
public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {
21+
22+
private final String fieldName;
23+
24+
// the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
25+
// the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
26+
private final ByteArrayStreamInput stream;
27+
private int valueCount;
28+
29+
public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
30+
this.fieldName = fieldName;
31+
this.stream = new ByteArrayStreamInput();
32+
}
33+
34+
@Override
35+
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
36+
BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
37+
38+
// there are no values associated with this field
39+
if (docValues == null) {
40+
valueCount = 0;
41+
return null;
42+
}
43+
44+
return docId -> {
45+
// there are no more documents to process
46+
if (docValues.advanceExact(docId) == false) {
47+
valueCount = 0;
48+
return false;
49+
}
50+
51+
// otherwise, extract the doc values into a stream to later read from
52+
BytesRef docValuesBytes = docValues.binaryValue();
53+
stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length);
54+
valueCount = stream.readVInt();
55+
56+
return hasValue();
57+
};
58+
}
59+
60+
@Override
61+
public void write(XContentBuilder b) throws IOException {
62+
for (int i = 0; i < valueCount; i++) {
63+
// this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt()
64+
BytesRef valueBytes = stream.readBytesRef();
65+
b.value(valueBytes.utf8ToString());
66+
}
67+
}
68+
69+
@Override
70+
public boolean hasValue() {
71+
return valueCount > 0;
72+
}
73+
74+
@Override
75+
public long valueCount() {
76+
return valueCount;
77+
}
78+
79+
@Override
80+
public String fieldName() {
81+
return fieldName;
82+
}
83+
84+
}

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
import org.apache.lucene.util.automaton.CompiledAutomaton;
4141
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
4242
import org.apache.lucene.util.automaton.Operations;
43+
import org.elasticsearch.ElasticsearchException;
44+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
4345
import org.elasticsearch.common.lucene.BytesRefs;
4446
import org.elasticsearch.common.lucene.Lucene;
4547
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -87,6 +89,7 @@
8789
import java.util.Arrays;
8890
import java.util.Collection;
8991
import java.util.Collections;
92+
import java.util.LinkedHashSet;
9093
import java.util.List;
9194
import java.util.Locale;
9295
import java.util.Map;
@@ -1153,7 +1156,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
11531156
var utfBytes = value.bytes();
11541157
var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
11551158
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1156-
context.doc().add(new StoredField(fieldName, bytesRef));
1159+
1160+
// store the value in a binary doc values field, create one if it doesn't exist
1161+
MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
1162+
if (field == null) {
1163+
field = new MultiValuedBinaryDocValuesField(fieldName);
1164+
context.doc().addWithKey(fieldName, field);
1165+
}
1166+
field.add(bytesRef);
11571167
}
11581168

11591169
return false;
@@ -1316,15 +1326,56 @@ protected BytesRef preserve(BytesRef value) {
13161326
// extra copy of the field for supporting synthetic source. This layer will check that copy.
13171327
if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
13181328
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
1319-
layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
1320-
@Override
1321-
protected void writeValue(Object value, XContentBuilder b) throws IOException {
1322-
BytesRef ref = (BytesRef) value;
1323-
b.utf8Value(ref.bytes, ref.offset, ref.length);
1324-
}
1325-
});
1329+
layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
13261330
}
13271331

13281332
return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
13291333
}
1334+
1335+
/**
1336+
* A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique
1337+
* binary doc values for fields with multiple values per document.
1338+
*/
1339+
private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {
1340+
1341+
private final Set<BytesRef> uniqueValues;
1342+
private int docValuesByteCount = 0;
1343+
1344+
MultiValuedBinaryDocValuesField(String name) {
1345+
super(name);
1346+
// linked hash set to maintain insertion order of elements
1347+
uniqueValues = new LinkedHashSet<>();
1348+
}
1349+
1350+
public void add(final BytesRef value) {
1351+
if (uniqueValues.add(value)) {
1352+
// might as well track these on the go as opposed to having to loop through all entries later
1353+
docValuesByteCount += value.length;
1354+
}
1355+
}
1356+
1357+
/**
1358+
* Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes
1359+
* the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
1360+
*/
1361+
@Override
1362+
public BytesRef binaryValue() {
1363+
int docValuesCount = uniqueValues.size();
1364+
// the + 1 is for the total doc values count, which is prefixed at the start of the array
1365+
int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES;
1366+
1367+
try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) {
1368+
out.writeVInt(docValuesCount);
1369+
for (BytesRef value : uniqueValues) {
1370+
int valueLength = value.length;
1371+
out.writeVInt(valueLength);
1372+
out.writeBytes(value.bytes, value.offset, valueLength);
1373+
}
1374+
return out.bytes().toBytesRef();
1375+
} catch (IOException e) {
1376+
throw new ElasticsearchException("Failed to get binary value", e);
1377+
}
1378+
}
1379+
1380+
}
13301381
}

server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
5252
.endObject()
5353
.endObject()
5454
.endObject();
55+
5556
// Note values that would be ignored are added at the end of arrays,
5657
// this makes testing easier as ignored values are always synthesized after regular values:
5758
var arrayValues = new Object[][] {
@@ -60,7 +61,16 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
6061
new Object[] { "123", "1234", "12345" },
6162
new Object[] { null, null, null, "blabla" },
6263
new Object[] { "1", "2", "3", "blabla" } };
63-
verifySyntheticArray(arrayValues, mapping, "_id", "field._original");
64+
65+
// values in the original array should be deduplicated
66+
var expectedArrayValues = new Object[][] {
67+
new Object[] { null, "a", "ab", "abc", "abcd", null, "abcde" },
68+
new Object[] { "12345" },
69+
new Object[] { "123", "1234", "12345" },
70+
new Object[] { null, null, null, "blabla" },
71+
new Object[] { "1", "2", "3", "blabla" } };
72+
73+
verifySyntheticArray(arrayValues, expectedArrayValues, mapping, "_id");
6474
}
6575

6676
public void testSynthesizeObjectArray() throws Exception {

0 commit comments

Comments
 (0)