[fix](ES Catalog)Do not extract doc_values of field with ignore_above setting (#40314) (#40464)

bp #40314
This commit is contained in:
qiye
2024-09-06 16:25:30 +08:00
committed by GitHub
parent cb0613e249
commit 8104b992d1
12 changed files with 321 additions and 121 deletions

View File

@ -38,7 +38,7 @@ import org.apache.logging.log4j.Logger;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collections;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@ -51,7 +51,10 @@ import java.util.Set;
@Getter
@Setter
public class EsTable extends Table {
public static final Set<String> DEFAULT_DOCVALUE_DISABLED_FIELDS = new HashSet<>(Collections.singletonList("text"));
// reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/doc-values.html
// https://www.elastic.co/guide/en/elasticsearch/reference/current/text.html
public static final Set<String> DEFAULT_DOCVALUE_DISABLED_FIELDS =
new HashSet<>(Arrays.asList("text", "annotated_text", "match_only_text"));
private static final Logger LOG = LogManager.getLogger(EsTable.class);
// Solr doc_values vs stored_fields performance-smackdown indicate:

View File

@ -130,6 +130,16 @@ public class MappingPhase implements SearchPhase {
if (docValue) {
docValueField = colName;
}
} else if (innerTypeObject.has("ignore_above")) {
// reference:
// https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#keyword-params
// > ignore_above
// > Do not index any string longer than this value. Defaults to 2147483647 so that all values
// > would be accepted. Please however note that default dynamic mapping rules create a sub
// > keyword field that overrides this default by setting ignore_above: 256.
// This field sets the `ignore_above` param: strings longer than that limit
// are not indexed or stored, so its doc_values may be missing values and
// cannot be relied on. This branch is intentionally left empty.
} else {
// a : {c : {}} -> a -> a.c
docValueField = colName + "." + fieldName;
@ -146,6 +156,17 @@ public class MappingPhase implements SearchPhase {
} else if (fieldType == null || "nested".equals(fieldType)) {
// An object field has no type, and nested fields do not support doc values.
return;
} else if (fieldObject.has("ignore_above")) {
// reference:
// https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#keyword-params
// > ignore_above
// > Do not index any string longer than this value. Defaults to 2147483647 so that all values
// > would be accepted. Please however note that default dynamic mapping rules create a sub
// > keyword field that overrides this default by setting ignore_above: 256.
// This field sets the `ignore_above` param: strings longer than that limit
// are not indexed or stored, so its doc_values may be missing values and
// cannot be relied on.
return;
}
docValueField = colName;
}