elastic · kanoshiou · Jun 9, 2025 · Jun 10, 2025 · Jun 11, 2025 · Jun 11, 2025
diff --git a/docs/changelog/129277.yaml b/docs/changelog/129277.yaml
@@ -0,0 +1,6 @@
+pr: 129277
+summary: "ESQL: Replace grouping by DateFormat with DateTrunc"
+area: ES|QL
+type: enhancement
+issues:
+  - 114772
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/date.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/date.csv-spec
@@ -1581,4 +1581,25 @@ x:date                      | y:date
 
 ;
 
+optimizeDateGroupingWithFormatting
+FROM employees 
+| STATS salary = SUM(salary) BY month = DATE_FORMAT("yyyy-MM", birth_date)
+| sort month 
+| limit 10
+;
+
+salary:long    | month:keyword     
+71165          | 1952-02        
+66174          | 1952-04        
+54518          | 1952-05        
+62405          | 1952-06        
+48233          | 1952-07        
+52121          | 1952-08        
+31897          | 1952-11        
+40031          | 1952-12        
+125761         | 1953-01        
+25945          | 1953-02        
+
+;
+
 
diff --git a/...icsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java b/...icsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
@@ -10,28 +10,58 @@
 import org.elasticsearch.xpack.esql.core.expression.Alias;
 import org.elasticsearch.xpack.esql.core.expression.Attribute;
 import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.Literal;
 import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
+import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
+import org.elasticsearch.xpack.esql.core.tree.Source;
+import org.elasticsearch.xpack.esql.core.type.DataType;
 import org.elasticsearch.xpack.esql.core.util.Holder;
 import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction;
 import org.elasticsearch.xpack.esql.expression.function.grouping.GroupingFunction;
+import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat;
+import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc;
 import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
 import org.elasticsearch.xpack.esql.plan.logical.Eval;
 import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
+import org.elasticsearch.xpack.esql.plan.logical.Project;
 
+import java.time.Period;
+import java.time.format.DateTimeFormatter;
+import java.time.temporal.ChronoField;
+import java.time.temporal.ChronoUnit;
+import java.time.temporal.IsoFields;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 /**
- * Replace nested expressions inside a {@link Aggregate} with synthetic eval.
+ * An optimizer rule that performs two main optimizations:
+ * 1. Replaces nested expressions inside a {@link Aggregate} with synthetic eval
+ * 2. Optimizes DATE_FORMAT function calls in GROUP BY clauses with more efficient DATE_TRUNC operations
+ * <p>
+ * For nested expressions in aggregates:
  * {@code STATS SUM(a + 1) BY x % 2}
  * becomes
  * {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | STATS SUM(`a+1`_ref) BY `x % 2`_ref}
  * and
  * {@code INLINESTATS SUM(a + 1) BY x % 2}
  * becomes
  * {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | INLINESTATS SUM(`a+1`_ref) BY `x % 2`_ref}
+ * <p>
+ * For date formatting optimization:
+ * {@code STATS sum = SUM(value) BY month = DATE_FORMAT("yyyy-MM", timestamp) }
+ * can be optimized to
+ * {@code STATS sum = SUM(value) BY month1 = DATE_TRUNC(1 month, timestamp) | EVAL month = DATE_FORMAT("yyyy-MM", month1) | KEEP sum, month}
+ * which is more efficient for grouping operations.
+ * <p>
+ * The date formatting optimization analyzes the format pattern and maps it to the smallest possible time interval
+ * that preserves the grouping semantics. Supported intervals range from nanoseconds to years, including special
+ * cases like quarters and weeks.
+ * <p>
+ * This date optimization not only improves performance but also ensures correctness in time-based grouping:
+ * DATE_TRUNC properly handles timezone and daylight saving time (DST) transitions when using Period or Duration
+ * intervals, while DATE_FORMAT does not account for these timezone-related considerations.
  */
 public final class ReplaceAggregateNestedExpressionWithEval extends OptimizerRules.OptimizerRule<Aggregate> {
 
@@ -41,8 +71,21 @@ protected LogicalPlan rule(Aggregate aggregate) {
         Map<String, Attribute> evalNames = new HashMap<>();
         Map<GroupingFunction, Attribute> groupingAttributes = new HashMap<>();
         List<Expression> newGroupings = new ArrayList<>(aggregate.groupings());
+        List<NamedExpression> newProjections = new ArrayList<>();
+        Map<NamedExpression, Attribute> referenceAttributes = new HashMap<>();
         boolean groupingChanged = false;
 
+        List<Alias> newEvals = new ArrayList<>();
+        int[] counter = new int[] { 0 };
+
+        // Count DateFormat occurrences to avoid incorrect grouping when replacing multiple DATE_FORMAT with DATE_TRUNC
+        int[] dateFormatCount = new int[] { 0 };
+        for (Expression g : newGroupings) {
+            if (g instanceof Alias as && as.child() instanceof DateFormat) {
+                dateFormatCount[0]++;
+            }
+        }
+
         // start with the groupings since the aggs might reuse/reference them
         for (int i = 0, s = newGroupings.size(); i < s; i++) {
             Expression g = newGroupings.get(i);
@@ -60,6 +103,32 @@ protected LogicalPlan rule(Aggregate aggregate) {
                     // Move the alias into an eval and replace it with its attribute.
                     groupingChanged = true;
                     var attr = as.toAttribute();
+                    if (asChild instanceof DateFormat df && dateFormatCount[0] == 1) {
+                        // Extract the format pattern and field from DateFormat
+                        Literal format = (Literal) df.children().getFirst();
+                        Expression field = df.children().get(1);
+
+                        // Try to convert the format pattern to a minimal time interval
+                        // This optimization attempts to simplify date formatting to DATE_TRUNC operations
+                        Literal interval = formatToMinimalInterval((String) format.value(), g.source());
+                        // If we can optimize the format to use DATE_TRUNC
+                        if (interval != null) {
+                            // Create a new DateTrunc operation with the optimized interval
+                            DateTrunc dateTrunc = new DateTrunc(df.source(), interval, field);
+                            // Create a synthetic alias for the DateTrunc operation
+                            var alias = new Alias(as.source(), syntheticName(dateTrunc, as, counter[0]++), dateTrunc, null, true);
+                            attr = alias.toAttribute();
+                            // Replace the original DateFormat children with the new format and attribute
+                            Expression expression = df.replaceChildren(List.of(format, attr));
+                            // Create a new eval alias for the optimized expression
+                            Alias newEval = as.replaceChild(expression);
+                            newEvals.add(newEval);
+                            referenceAttributes.put(attr, newEval.toAttribute());
+                            evalNames.put(as.name(), attr);
+                            as = alias;
+                        }
+                    }
+
                     evals.add(as);
                     evalNames.put(as.name(), attr);
                     newGroupings.set(i, attr);
@@ -80,7 +149,6 @@ protected LogicalPlan rule(Aggregate aggregate) {
             expToAttribute.put(a.child().canonical(), a.toAttribute());
         }
 
-        int[] counter = new int[] { 0 };
         // for the aggs make sure to unwrap the agg function and check the existing groupings
         for (NamedExpression agg : aggs) {
             NamedExpression a = (NamedExpression) agg.transformDown(Alias.class, as -> {
@@ -113,8 +181,17 @@ protected LogicalPlan rule(Aggregate aggregate) {
 
                 return as.replaceChild(replaced);
             });
-
+            if (groupingChanged && agg instanceof ReferenceAttribute ra) {
+                Attribute ref = evalNames.get(ra.name());
+                if (ref != null) {
+                    aggsChanged.set(true);
+                    newAggs.add(ref);
+                    newProjections.add(referenceAttributes.getOrDefault(ref, ref.toAttribute()));
+                    continue;
+                }
+            }
             newAggs.add(a);
+            newProjections.add(a.toAttribute());
         }
 
         if (evals.size() > 0) {
@@ -124,6 +201,10 @@ protected LogicalPlan rule(Aggregate aggregate) {
             var newEval = new Eval(aggregate.source(), aggregate.child(), evals);
             aggregate = aggregate.with(newEval, groupings, aggregates);
         }
+        if (newEvals.size() > 0) {
+            Eval eval = new Eval(aggregate.source(), aggregate, newEvals);
+            return new Project(aggregate.source(), eval, newProjections);
+        }
 
         return aggregate;
     }
@@ -192,4 +273,43 @@ private static Expression transformAggregateFunction(
     private static String syntheticName(Expression expression, Expression func, int counter) {
         return TemporaryNameUtils.temporaryName(expression, func, counter);
     }
+
+    private static Literal formatToMinimalInterval(String format, Source source) {
+        try {
+            DateTimeFormatter formatter = DateTimeFormatter.ofPattern(format);
+            String formatterAsString = formatter.toString();
+            if (formatterAsString.contains(ChronoField.NANO_OF_SECOND.toString())
+                || formatterAsString.contains(ChronoField.NANO_OF_DAY.toString())) {
+                return new Literal(source, ChronoUnit.NANOS.getDuration(), DataType.TIME_DURATION);
+            } else if (formatterAsString.contains(ChronoField.MILLI_OF_DAY.toString())) {
+                return new Literal(source, ChronoUnit.MILLIS.getDuration(), DataType.TIME_DURATION);
+            } else if (formatterAsString.contains(ChronoField.SECOND_OF_MINUTE.toString())) {
+                return new Literal(source, ChronoUnit.SECONDS.getDuration(), DataType.TIME_DURATION);
+            } else if (formatterAsString.contains(ChronoField.MINUTE_OF_HOUR.toString())) {
+                return new Literal(source, ChronoUnit.MINUTES.getDuration(), DataType.TIME_DURATION);
+            } else if (formatterAsString.contains(ChronoField.HOUR_OF_DAY.toString())
+                || formatterAsString.contains(ChronoField.CLOCK_HOUR_OF_DAY.toString())
+                || formatterAsString.contains(ChronoField.HOUR_OF_AMPM.toString())
+                || formatterAsString.contains(ChronoField.CLOCK_HOUR_OF_AMPM.toString())) {
+                    return new Literal(source, ChronoUnit.HOURS.getDuration(), DataType.TIME_DURATION);
+                } else if (formatterAsString.contains(ChronoField.AMPM_OF_DAY.toString())) {
+                    return new Literal(source, ChronoUnit.HALF_DAYS, DataType.TIME_DURATION);
+                } else if (formatterAsString.contains(ChronoField.DAY_OF_WEEK.toString())
+                    || formatterAsString.contains(ChronoField.DAY_OF_MONTH.toString())
+                    || formatterAsString.contains(ChronoField.DAY_OF_YEAR.toString())) {
+                        return new Literal(source, Period.ofDays(1), DataType.DATE_PERIOD);
+                    } else if (formatterAsString.contains(ChronoField.ALIGNED_WEEK_OF_MONTH.toString())
+                        || formatterAsString.contains(ChronoField.ALIGNED_WEEK_OF_YEAR.toString())) {
+                            return new Literal(source, Period.ofDays(7), DataType.DATE_PERIOD);
+                        } else if (formatterAsString.contains(ChronoField.MONTH_OF_YEAR.toString())) {
+                            return new Literal(source, Period.ofMonths(1), DataType.DATE_PERIOD);
+                        } else if (formatterAsString.contains(IsoFields.QUARTER_OF_YEAR.toString())) {
+                            return new Literal(source, Period.ofMonths(3), DataType.DATE_PERIOD);
+                        } else if (formatterAsString.contains(ChronoField.YEAR_OF_ERA.toString())
+                            || formatterAsString.contains(ChronoField.YEAR.toString())) {
+                                return new Literal(source, Period.ofYears(1), DataType.DATE_PERIOD);
+                            }
+        } catch (IllegalArgumentException ignored) {}
+        return null;
+    }
 }
diff --git a/.../esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java b/.../esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java
@@ -68,6 +68,8 @@
 import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToInteger;
 import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong;
 import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToString;
+import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat;
+import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc;
 import org.elasticsearch.xpack.esql.expression.function.scalar.math.Round;
 import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvAvg;
 import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvCount;
@@ -7995,4 +7997,92 @@ public void testSampleNoPushDownChangePoint() {
         var topN = as(changePoint.child(), TopN.class);
         var source = as(topN.child(), EsRelation.class);
     }
+
+    /**
+     * Project[[avg{r}#7, date{r}#4]]
+     * \_Eval[[$$SUM$avg$0{r$}#20 / $$COUNT$avg$1{r$}#21 AS avg#7]]
+     *   \_Limit[1000[INTEGER],false]
+     *     \_Aggregate[[date{r}#4],[SUM(salary{f}#14,true[BOOLEAN]) AS $$SUM$avg$0#20, COUNT(salary{f}#14,true[BOOLEAN]) AS $$COUNT$av
+     * g$1#21, date{r}#4]]
+     *       \_Eval[[DATEFORMAT([79 79 79 79][KEYWORD],hire_date{f}#16) AS date#4]]
+     *         \_EsRelation[test][_meta_field{f}#15, emp_no{f}#9, first_name{f}#10, g..]
+     */
+    public void testReplaceGroupingByDateFormatWithDateTrunc() {
+
+        List<String> formats = List.of(
+            "yyyy",
+            "YYYY",
+            "MM/yyyy",
+            "yy-mm",
+            "yyyy-dd-MM",
+            "DD",
+            "yyyy-MM-dd HH:mm:ss"
+        );
+
+        for (var format : formats) {
+            var query = """
+                FROM test
+                | STATS avg = AVG(salary) BY date = DATE_FORMAT("%s", hire_date)
+                """;
+            String format1 = String.format(query, format);
+            var optimized = optimizedPlan(format1);
+
+            var project = as(optimized, Project.class);
+            var eval = as(project.child(), Eval.class);
+            assertThat(eval.fields(), hasSize(2));
+            var dateformat = as(eval.fields().get(1).child(), DateFormat.class);
+
+            var limit = as(eval.child(), Limit.class);
+            var agg = as(limit.child(), Aggregate.class);
+            var ref = as(agg.groupings().getFirst(), ReferenceAttribute.class);
+
+            var eval2 = as(agg.child(), Eval.class);
+            assertThat(eval2.fields(), hasSize(1));
+            var dateTrunc = as(eval2.fields().getFirst().child(), DateTrunc.class);
+            assertThat(eval2.fields().getFirst().toAttribute(), is(ref));
+
+            var source = as(eval2.child(), EsRelation.class);
+        }
+
+    }
+
+    /**
+     * Project[[avg{r}#10, date{r}#4, date2{r}#7]]
+     * \_Eval[[$$SUM$avg$0{r$}#24 / $$COUNT$avg$1{r$}#25 AS avg#10]]
+     *   \_Limit[1000[INTEGER],false]
+     *     \_Aggregate[[date{r}#4, date2{r}#7],[SUM(salary{f}#18,true[BOOLEAN]) AS $$SUM$avg$0#24, COUNT(salary{f}#18,true[BOOLEAN]) A
+     * S $$COUNT$avg$1#25, date{r}#4, date2{r}#7]]
+     *       \_Eval[[DATEFORMAT([79 79 79 79 2d 64 64][KEYWORD],hire_date{f}#20) AS date#4, DATEFORMAT([64 64][KEYWORD],hire_date{
+     * f}#20) AS date2#7]]
+     *         \_EsRelation[test][_meta_field{f}#19, emp_no{f}#13, first_name{f}#14, ..]
+     */
+    public void testReplaceGroupingByDateFormatWithDateTrunc2() {
+        var query = """
+            FROM test
+            | STATS avg = AVG(salary) BY date = DATE_FORMAT("yyyy-dd", hire_date), date2 = DATE_FORMAT("dd", hire_date)
+            """;
+        var optimized = optimizedPlan(query);
+
+        var project = as(optimized, Project.class);
+        var eval = as(project.child(), Eval.class);
+        assertThat(eval.fields(), hasSize(1));
+        // var dateformat = as(eval.fields().get(1).child(), DateFormat.class);
+
+        var limit = as(eval.child(), Limit.class);
+        var agg = as(limit.child(), Aggregate.class);
+        assertThat(agg.groupings(), hasSize(2));
+        var grouping1 = as(agg.groupings().getFirst(), ReferenceAttribute.class);
+        var grouping2 = as(agg.groupings().get(1), ReferenceAttribute.class);
+
+        var eval2 = as(agg.child(), Eval.class);
+        assertThat(eval2.fields(), hasSize(2));
+        var dateFormat1 = as(eval2.fields().getFirst().child(), DateFormat.class);
+        var dateFormat2 = as(eval2.fields().getFirst().child(), DateFormat.class);
+
+        assertThat(eval2.fields().getFirst().toAttribute(), is(grouping1));
+        assertThat(eval2.fields().get(1).toAttribute(), is(grouping2));
+
+        var source = as(eval2.child(), EsRelation.class);
+    }
+
 }