Skip to content

ESQL: Replace grouping by DateFormat with DateTrunc #129277

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/129277.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 129277
summary: "ESQL: Replace grouping by DateFormat with DateTrunc"
area: ES|QL
type: enhancement
issues:
- 114772
Original file line number Diff line number Diff line change
Expand Up @@ -1581,4 +1581,25 @@ x:date | y:date

;

optimizeDateGroupingWithFormatting
FROM employees
| STATS salary = SUM(salary) BY month = DATE_FORMAT("yyyy-MM", birth_date)
| sort month
| limit 10
;

salary:long | month:keyword
71165 | 1952-02
66174 | 1952-04
54518 | 1952-05
62405 | 1952-06
48233 | 1952-07
52121 | 1952-08
31897 | 1952-11
40031 | 1952-12
125761 | 1953-01
25945 | 1953-02

;


Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,58 @@
import org.elasticsearch.xpack.esql.core.expression.Alias;
import org.elasticsearch.xpack.esql.core.expression.Attribute;
import org.elasticsearch.xpack.esql.core.expression.Expression;
import org.elasticsearch.xpack.esql.core.expression.Literal;
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
import org.elasticsearch.xpack.esql.core.tree.Source;
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.core.util.Holder;
import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction;
import org.elasticsearch.xpack.esql.expression.function.grouping.GroupingFunction;
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat;
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc;
import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
import org.elasticsearch.xpack.esql.plan.logical.Eval;
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
import org.elasticsearch.xpack.esql.plan.logical.Project;

import java.time.Period;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoField;
import java.time.temporal.ChronoUnit;
import java.time.temporal.IsoFields;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* Replace nested expressions inside a {@link Aggregate} with synthetic eval.
* An optimizer rule that performs two main optimizations:
* 1. Replaces nested expressions inside a {@link Aggregate} with synthetic eval
* 2. Optimizes DATE_FORMAT function calls in GROUP BY clauses with more efficient DATE_TRUNC operations
* <p>
* For nested expressions in aggregates:
* {@code STATS SUM(a + 1) BY x % 2}
* becomes
* {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | STATS SUM(`a+1`_ref) BY `x % 2`_ref}
* and
* {@code INLINESTATS SUM(a + 1) BY x % 2}
* becomes
* {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | INLINESTATS SUM(`a+1`_ref) BY `x % 2`_ref}
* <p>
* For date formatting optimization:
* {@code STATS sum = SUM(value) BY month = DATE_FORMAT("yyyy-MM", timestamp) }
* can be optimized to
* {@code STATS sum = SUM(value) BY month1 = DATE_TRUNC(1 month, timestamp) | EVAL month = DATE_FORMAT("yyyy-MM", month1) | KEEP sum, month}
* which is more efficient for grouping operations.
* <p>
* The date formatting optimization analyzes the format pattern and maps it to the smallest possible time interval
* that preserves the grouping semantics. Supported intervals range from nanoseconds to years, including special
* cases like quarters and weeks.
* <p>
* This date optimization not only improves performance but also ensures correctness in time-based grouping:
* DATE_TRUNC properly handles timezone and daylight saving time (DST) transitions when using Period or Duration
* intervals, while DATE_FORMAT does not account for these timezone-related considerations.
*/
public final class ReplaceAggregateNestedExpressionWithEval extends OptimizerRules.OptimizerRule<Aggregate> {

Expand All @@ -41,8 +71,21 @@ protected LogicalPlan rule(Aggregate aggregate) {
Map<String, Attribute> evalNames = new HashMap<>();
Map<GroupingFunction, Attribute> groupingAttributes = new HashMap<>();
List<Expression> newGroupings = new ArrayList<>(aggregate.groupings());
List<NamedExpression> newProjections = new ArrayList<>();
Map<NamedExpression, Attribute> referenceAttributes = new HashMap<>();
boolean groupingChanged = false;

List<Alias> newEvals = new ArrayList<>();
int[] counter = new int[] { 0 };

// Count DateFormat occurrences to avoid incorrect grouping when replacing multiple DATE_FORMAT with DATE_TRUNC
int[] dateFormatCount = new int[] { 0 };
for (Expression g : newGroupings) {
if (g instanceof Alias as && as.child() instanceof DateFormat) {
dateFormatCount[0]++;
}
}

// start with the groupings since the aggs might reuse/reference them
for (int i = 0, s = newGroupings.size(); i < s; i++) {
Expression g = newGroupings.get(i);
Expand All @@ -60,6 +103,32 @@ protected LogicalPlan rule(Aggregate aggregate) {
// Move the alias into an eval and replace it with its attribute.
groupingChanged = true;
var attr = as.toAttribute();
if (asChild instanceof DateFormat df && dateFormatCount[0] == 1) {
// Extract the format pattern and field from DateFormat
Literal format = (Literal) df.children().getFirst();
Expression field = df.children().get(1);

// Try to convert the format pattern to a minimal time interval
// This optimization attempts to simplify date formatting to DATE_TRUNC operations
Literal interval = formatToMinimalInterval((String) format.value(), g.source());
// If we can optimize the format to use DATE_TRUNC
if (interval != null) {
// Create a new DateTrunc operation with the optimized interval
DateTrunc dateTrunc = new DateTrunc(df.source(), interval, field);
// Create a synthetic alias for the DateTrunc operation
var alias = new Alias(as.source(), syntheticName(dateTrunc, as, counter[0]++), dateTrunc, null, true);
attr = alias.toAttribute();
// Replace the original DateFormat children with the new format and attribute
Expression expression = df.replaceChildren(List.of(format, attr));
// Create a new eval alias for the optimized expression
Alias newEval = as.replaceChild(expression);
newEvals.add(newEval);
referenceAttributes.put(attr, newEval.toAttribute());
evalNames.put(as.name(), attr);
as = alias;
}
}

evals.add(as);
evalNames.put(as.name(), attr);
newGroupings.set(i, attr);
Expand All @@ -80,7 +149,6 @@ protected LogicalPlan rule(Aggregate aggregate) {
expToAttribute.put(a.child().canonical(), a.toAttribute());
}

int[] counter = new int[] { 0 };
// for the aggs make sure to unwrap the agg function and check the existing groupings
for (NamedExpression agg : aggs) {
NamedExpression a = (NamedExpression) agg.transformDown(Alias.class, as -> {
Expand Down Expand Up @@ -113,8 +181,17 @@ protected LogicalPlan rule(Aggregate aggregate) {

return as.replaceChild(replaced);
});

if (groupingChanged && agg instanceof ReferenceAttribute ra) {
Attribute ref = evalNames.get(ra.name());
if (ref != null) {
aggsChanged.set(true);
newAggs.add(ref);
newProjections.add(referenceAttributes.getOrDefault(ref, ref.toAttribute()));
continue;
}
}
newAggs.add(a);
newProjections.add(a.toAttribute());
}

if (evals.size() > 0) {
Expand All @@ -124,6 +201,10 @@ protected LogicalPlan rule(Aggregate aggregate) {
var newEval = new Eval(aggregate.source(), aggregate.child(), evals);
aggregate = aggregate.with(newEval, groupings, aggregates);
}
if (newEvals.size() > 0) {
Eval eval = new Eval(aggregate.source(), aggregate, newEvals);
return new Project(aggregate.source(), eval, newProjections);
}

return aggregate;
}
Expand Down Expand Up @@ -192,4 +273,43 @@ private static Expression transformAggregateFunction(
private static String syntheticName(Expression expression, Expression func, int counter) {
return TemporaryNameUtils.temporaryName(expression, func, counter);
}

private static Literal formatToMinimalInterval(String format, Source source) {
try {
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(format);
String formatterAsString = formatter.toString();
if (formatterAsString.contains(ChronoField.NANO_OF_SECOND.toString())
|| formatterAsString.contains(ChronoField.NANO_OF_DAY.toString())) {
return new Literal(source, ChronoUnit.NANOS.getDuration(), DataType.TIME_DURATION);
} else if (formatterAsString.contains(ChronoField.MILLI_OF_DAY.toString())) {
return new Literal(source, ChronoUnit.MILLIS.getDuration(), DataType.TIME_DURATION);
} else if (formatterAsString.contains(ChronoField.SECOND_OF_MINUTE.toString())) {
return new Literal(source, ChronoUnit.SECONDS.getDuration(), DataType.TIME_DURATION);
} else if (formatterAsString.contains(ChronoField.MINUTE_OF_HOUR.toString())) {
return new Literal(source, ChronoUnit.MINUTES.getDuration(), DataType.TIME_DURATION);
} else if (formatterAsString.contains(ChronoField.HOUR_OF_DAY.toString())
|| formatterAsString.contains(ChronoField.CLOCK_HOUR_OF_DAY.toString())
|| formatterAsString.contains(ChronoField.HOUR_OF_AMPM.toString())
|| formatterAsString.contains(ChronoField.CLOCK_HOUR_OF_AMPM.toString())) {
return new Literal(source, ChronoUnit.HOURS.getDuration(), DataType.TIME_DURATION);
} else if (formatterAsString.contains(ChronoField.AMPM_OF_DAY.toString())) {
return new Literal(source, ChronoUnit.HALF_DAYS, DataType.TIME_DURATION);
} else if (formatterAsString.contains(ChronoField.DAY_OF_WEEK.toString())
|| formatterAsString.contains(ChronoField.DAY_OF_MONTH.toString())
|| formatterAsString.contains(ChronoField.DAY_OF_YEAR.toString())) {
return new Literal(source, Period.ofDays(1), DataType.DATE_PERIOD);
} else if (formatterAsString.contains(ChronoField.ALIGNED_WEEK_OF_MONTH.toString())
|| formatterAsString.contains(ChronoField.ALIGNED_WEEK_OF_YEAR.toString())) {
return new Literal(source, Period.ofDays(7), DataType.DATE_PERIOD);
} else if (formatterAsString.contains(ChronoField.MONTH_OF_YEAR.toString())) {
return new Literal(source, Period.ofMonths(1), DataType.DATE_PERIOD);
} else if (formatterAsString.contains(IsoFields.QUARTER_OF_YEAR.toString())) {
return new Literal(source, Period.ofMonths(3), DataType.DATE_PERIOD);
} else if (formatterAsString.contains(ChronoField.YEAR_OF_ERA.toString())
|| formatterAsString.contains(ChronoField.YEAR.toString())) {
return new Literal(source, Period.ofYears(1), DataType.DATE_PERIOD);
}
} catch (IllegalArgumentException ignored) {}
return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToInteger;
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong;
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToString;
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat;
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc;
import org.elasticsearch.xpack.esql.expression.function.scalar.math.Round;
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvAvg;
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvCount;
Expand Down Expand Up @@ -7995,4 +7997,92 @@ public void testSampleNoPushDownChangePoint() {
var topN = as(changePoint.child(), TopN.class);
var source = as(topN.child(), EsRelation.class);
}

/**
* Project[[avg{r}#7, date{r}#4]]
* \_Eval[[$$SUM$avg$0{r$}#20 / $$COUNT$avg$1{r$}#21 AS avg#7]]
* \_Limit[1000[INTEGER],false]
* \_Aggregate[[date{r}#4],[SUM(salary{f}#14,true[BOOLEAN]) AS $$SUM$avg$0#20, COUNT(salary{f}#14,true[BOOLEAN]) AS $$COUNT$av
* g$1#21, date{r}#4]]
* \_Eval[[DATEFORMAT([79 79 79 79][KEYWORD],hire_date{f}#16) AS date#4]]
* \_EsRelation[test][_meta_field{f}#15, emp_no{f}#9, first_name{f}#10, g..]
*/
public void testReplaceGroupingByDateFormatWithDateTrunc() {

List<String> formats = List.of(
"yyyy",
"YYYY",
"MM/yyyy",
"yy-mm",
"yyyy-dd-MM",
"DD",
"yyyy-MM-dd HH:mm:ss"
);

for (var format : formats) {
var query = """
FROM test
| STATS avg = AVG(salary) BY date = DATE_FORMAT("%s", hire_date)
""";
String format1 = String.format(query, format);
var optimized = optimizedPlan(format1);

var project = as(optimized, Project.class);
var eval = as(project.child(), Eval.class);
assertThat(eval.fields(), hasSize(2));
var dateformat = as(eval.fields().get(1).child(), DateFormat.class);

var limit = as(eval.child(), Limit.class);
var agg = as(limit.child(), Aggregate.class);
var ref = as(agg.groupings().getFirst(), ReferenceAttribute.class);

var eval2 = as(agg.child(), Eval.class);
assertThat(eval2.fields(), hasSize(1));
var dateTrunc = as(eval2.fields().getFirst().child(), DateTrunc.class);
assertThat(eval2.fields().getFirst().toAttribute(), is(ref));

var source = as(eval2.child(), EsRelation.class);
}

}

/**
* Project[[avg{r}#10, date{r}#4, date2{r}#7]]
* \_Eval[[$$SUM$avg$0{r$}#24 / $$COUNT$avg$1{r$}#25 AS avg#10]]
* \_Limit[1000[INTEGER],false]
* \_Aggregate[[date{r}#4, date2{r}#7],[SUM(salary{f}#18,true[BOOLEAN]) AS $$SUM$avg$0#24, COUNT(salary{f}#18,true[BOOLEAN]) A
* S $$COUNT$avg$1#25, date{r}#4, date2{r}#7]]
* \_Eval[[DATEFORMAT([79 79 79 79 2d 64 64][KEYWORD],hire_date{f}#20) AS date#4, DATEFORMAT([64 64][KEYWORD],hire_date{
* f}#20) AS date2#7]]
* \_EsRelation[test][_meta_field{f}#19, emp_no{f}#13, first_name{f}#14, ..]
*/
public void testReplaceGroupingByDateFormatWithDateTrunc2() {
var query = """
FROM test
| STATS avg = AVG(salary) BY date = DATE_FORMAT("yyyy-dd", hire_date), date2 = DATE_FORMAT("dd", hire_date)
""";
var optimized = optimizedPlan(query);

var project = as(optimized, Project.class);
var eval = as(project.child(), Eval.class);
assertThat(eval.fields(), hasSize(1));
// var dateformat = as(eval.fields().get(1).child(), DateFormat.class);

var limit = as(eval.child(), Limit.class);
var agg = as(limit.child(), Aggregate.class);
assertThat(agg.groupings(), hasSize(2));
var grouping1 = as(agg.groupings().getFirst(), ReferenceAttribute.class);
var grouping2 = as(agg.groupings().get(1), ReferenceAttribute.class);

var eval2 = as(agg.child(), Eval.class);
assertThat(eval2.fields(), hasSize(2));
var dateFormat1 = as(eval2.fields().getFirst().child(), DateFormat.class);
var dateFormat2 = as(eval2.fields().getFirst().child(), DateFormat.class);

assertThat(eval2.fields().getFirst().toAttribute(), is(grouping1));
assertThat(eval2.fields().get(1).toAttribute(), is(grouping2));

var source = as(eval2.child(), EsRelation.class);
}

}
Loading