Skip to content

ESQL: Fix alias removal in regex extraction with JOIN (#127687) #128204

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/127687.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 127687
summary: "ESQL: Fix alias removal in regex extraction with JOIN"
area: ES|QL
type: bug
issues:
- 127467
Original file line number Diff line number Diff line change
Expand Up @@ -1567,3 +1567,72 @@ null | Milky Way | Marunouchi
null | null | null
null | null | null
;


joinMaskingRegex
// https://github.com/elastic/elasticsearch/issues/127467
// Reproducer query: DISSECT extracts a column named `type` from book_no early in the pipeline, and a later
// STATS groups by `type` right after the LOOKUP JOIN on message_types_lookup.
// NOTE(review): the final `type` is presumably the one supplied by message_types_lookup - confirm against the lookup index.
required_capability: union_types
required_capability: join_lookup_v12
required_capability: fix_join_masking_regex_extract
from books,message_*,ul*
| enrich languages_policy on status
| drop `language_name`, `bytes_out`, `id`, id
| dissect book_no "%{type}"
| dissect author.keyword "%{HZicfARaID}"
| mv_expand `status`
| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST
| enrich languages_policy on book_no
| grok message "%{WORD:DiLNyZKNDu}"
| limit 7972
| rename year as language_code
| lookup join languages_lookup on language_code
| limit 13966
| stats rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
| rename author.keyword as message
| lookup join message_types_lookup on message
| stats `ratings` = count(*) by type
| stats `type` = count(type), `ratings` = count(*)
| keep `ratings`, ratings
;

ratings:long
1
;

joinMaskingDissect
// https://github.com/elastic/elasticsearch/issues/127467
// DISSECT extracts `type` from message and DROP removes it again; the STATS ... BY type that follows the
// LOOKUP JOIN must still find a `type` column to group on.
// NOTE(review): that column is presumably provided by message_types_lookup - confirm against the lookup index.
required_capability: join_lookup_v12
required_capability: fix_join_masking_regex_extract
from sample_data
| dissect message "%{type}"
| drop type
| lookup join message_types_lookup on message
| stats count = count(*) by type
| keep count
| sort count
;
count:long
1
3
3
;


joinMaskingGrok
// https://github.com/elastic/elasticsearch/issues/127467
// Same shape as joinMaskingDissect, but the dropped `type` column is extracted with GROK instead of DISSECT,
// and the aggregation after the LOOKUP JOIN is max(event_duration) grouped by `type`.
// NOTE(review): the grouping column is presumably provided by message_types_lookup - confirm against the lookup index.
required_capability: join_lookup_v12
required_capability: fix_join_masking_regex_extract
from sample_data
| grok message "%{WORD:type}"
| drop type
| lookup join message_types_lookup on message
| stats max = max(event_duration) by type
| keep max
| sort max
;

max:long
1232382
3450233
8268153
;
Original file line number Diff line number Diff line change
Expand Up @@ -854,7 +854,13 @@ public enum Cap {
* Support for keeping `DROP` attributes when resolving field names.
* see <a href="https://github.com/elastic/elasticsearch/issues/126418"> ES|QL: no matches for pattern #126418 </a>
*/
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL;
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL,

/**
* During resolution (pre-analysis) we have to consider that joins can override regex extracted values
* see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467</a>
*/
FIX_JOIN_MASKING_REGEX_EXTRACT;

private final boolean enabled;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
import org.elasticsearch.xpack.esql.core.expression.Expressions;
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute;
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
import org.elasticsearch.xpack.esql.core.expression.UnresolvedAttribute;
import org.elasticsearch.xpack.esql.core.expression.UnresolvedStar;
import org.elasticsearch.xpack.esql.core.util.Holder;
Expand Down Expand Up @@ -570,11 +572,7 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy

parsed.forEachDown(p -> {// go over each plan top-down
if (p instanceof RegexExtract re) { // for Grok and Dissect
// remove other down-the-tree references to the extracted fields
for (Attribute extracted : re.extractedFields()) {
referencesBuilder.removeIf(attr -> matchByName(attr, extracted.name(), false));
}
// but keep the inputs needed by Grok/Dissect
// keep the inputs needed by Grok/Dissect
referencesBuilder.addAll(re.input().references());
} else if (p instanceof Enrich enrich) {
AttributeSet enrichFieldRefs = Expressions.references(enrich.enrichFields());
Expand Down Expand Up @@ -629,15 +627,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
// remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree
// for example "from test | eval x = salary | stats max = max(x) by gender"
// remove the UnresolvedAttribute "x", since that is an Alias defined in "eval"
// also remove other down-the-tree references to the extracted fields from "grok" and "dissect"
AttributeSet planRefs = p.references();
Set<String> fieldNames = planRefs.names();
p.forEachExpressionDown(Alias.class, alias -> {
p.forEachExpressionDown(NamedExpression.class, ne -> {
if ((ne instanceof Alias || ne instanceof ReferenceAttribute) == false) {
return;
}
// do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id"
// or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
if (fieldNames.contains(alias.name())) {
if (fieldNames.contains(ne.name())) {
return;
}
referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), shadowingRefsBuilder.contains(attr)));
referencesBuilder.removeIf(attr -> matchByName(attr, ne.name(), shadowingRefsBuilder.contains(attr)));
});
}
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1341,6 +1341,53 @@ public void testDissectOverwriteName() {
assertThat(fieldNames, equalTo(Set.of("emp_no", "emp_no.*", "first_name", "first_name.*")));
}

/**
 * Resolves field names for a query in which {@code grok} reads {@code type} after an {@code eval},
 * a {@code lookup join} and a {@code drop}, and checks that exactly {@code message}, {@code x} and
 * their {@code .*} variants are returned.
 * Skipped unless the {@code JOIN_LOOKUP_V12} capability is enabled.
 *
 * @see <a href="https://github.com/elastic/elasticsearch/issues/127467">ES|QL: pruning of JOINs leads to missing fields</a>
 */
public void testAvoidGrokAttributesRemoval() {
    assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());

    // Leading indentation inside the text block is incidental and stripped by the compiler.
    String query = """
        from message_types
        | eval type = 1
        | lookup join message_types_lookup on message
        | drop message
        | grok type "%{WORD:b}"
        | stats x = max(b)
        | keep x""";

    Set<String> resolved = fieldNames(query, Set.of());
    Set<String> expected = Set.of("message", "x", "x.*", "message.*");
    assertThat(resolved, equalTo(expected));
}

/**
 * Resolves field names for a query where {@code dissect} extracts {@code type}, {@code drop} removes it,
 * and a later {@code stats ... by type} follows a {@code lookup join}; the expected result still
 * contains {@code type} and {@code type.*}.
 * Skipped unless the {@code JOIN_LOOKUP_V12} capability is enabled.
 */
public void testAvoidGrokAttributesRemoval2() {
    assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());

    // Leading indentation inside the text block is incidental and stripped by the compiler.
    String query = """
        from sample_data
        | dissect message "%{type}"
        | drop type
        | lookup join message_types_lookup on message
        | stats count = count(*) by type
        | keep count
        | sort count""";

    Set<String> resolved = fieldNames(query, Set.of());
    Set<String> expected = Set.of("type", "message", "count", "message.*", "type.*", "count.*");
    assertThat(resolved, equalTo(expected));
}

/**
 * Resolves field names for a query where {@code grok} extracts {@code type}, {@code drop} removes it,
 * and a later {@code stats max(event_duration) by type} follows a {@code lookup join}; the expected
 * result still contains {@code type} and {@code type.*} alongside the other referenced fields.
 * Skipped unless the {@code JOIN_LOOKUP_V12} capability is enabled.
 */
public void testAvoidGrokAttributesRemoval3() {
    assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());

    // Leading indentation inside the text block is incidental and stripped by the compiler.
    String query = """
        from sample_data
        | grok message "%{WORD:type}"
        | drop type
        | lookup join message_types_lookup on message
        | stats max = max(event_duration) by type
        | keep max
        | sort max""";

    Set<String> resolved = fieldNames(query, Set.of());
    Set<String> expected = Set.of(
        "type",
        "event_duration",
        "message",
        "max",
        "event_duration.*",
        "message.*",
        "type.*",
        "max.*"
    );
    assertThat(resolved, equalTo(expected));
}

public void testEnrichOnDefaultField() {
Set<String> fieldNames = fieldNames("""
from employees
Expand Down