Skip to content

Commit 0a80916

Browse files
authored
ESQL: Pushdown constructs doing case-insensitive regexes (#128393)
This introduces an optimization to push down to Lucene those language constructs that aim at case-insensitive regular expression matching, used with `LIKE` and `RLIKE` operators, such as: * `| WHERE TO_LOWER(field) LIKE "abc*"` * `| WHERE TO_UPPER(field) RLIKE "ABC.*"` These are now pushed down to Lucene as case-insensitive `wildcard` and `regexp` queries, respectively. Closes #127479
1 parent cc461af commit 0a80916

File tree

33 files changed

+756
-236
lines changed

33 files changed

+756
-236
lines changed

benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/EvalBenchmark.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@
4848
import org.elasticsearch.xpack.esql.expression.function.scalar.math.RoundTo;
4949
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
5050
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
51-
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
5251
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToLower;
5352
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToUpper;
53+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.RLike;
5454
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Add;
5555
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
5656
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.LessThan;

docs/changelog/128393.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 128393
2+
summary: Pushdown constructs doing case-insensitive regexes
3+
area: ES|QL
4+
type: enhancement
5+
issues:
6+
- 127479

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ static TransportVersion def(int id) {
272272
public static final TransportVersion ML_INFERENCE_VERTEXAI_CHATCOMPLETION_ADDED = def(9_083_0_00);
273273
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED = def(9_084_0_00);
274274
public static final TransportVersion ESQL_LIMIT_ROW_SIZE = def(9_085_0_00);
275+
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_086_0_00);
275276

276277
/*
277278
* STOP! READ THIS FIRST! No, really,

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/AbstractStringPattern.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ public abstract class AbstractStringPattern implements StringPattern {
1616

1717
private Automaton automaton;
1818

19-
public abstract Automaton createAutomaton();
19+
public abstract Automaton createAutomaton(boolean ignoreCase);
2020

2121
private Automaton automaton() {
2222
if (automaton == null) {
23-
automaton = createAutomaton();
23+
automaton = createAutomaton(false);
2424
}
2525
return automaton;
2626
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/RLike.java

Lines changed: 0 additions & 35 deletions
This file was deleted.

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/RLikePattern.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ public RLikePattern(String regexpPattern) {
2121
}
2222

2323
@Override
24-
public Automaton createAutomaton() {
24+
public Automaton createAutomaton(boolean ignoreCase) {
25+
int matchFlags = ignoreCase ? RegExp.CASE_INSENSITIVE : 0;
2526
return Operations.determinize(
26-
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT).toAutomaton(),
27+
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, matchFlags).toAutomaton(),
2728
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
2829
);
2930
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/WildcardLike.java

Lines changed: 0 additions & 35 deletions
This file was deleted.

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/WildcardPattern.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010
import org.apache.lucene.search.WildcardQuery;
1111
import org.apache.lucene.util.automaton.Automaton;
1212
import org.apache.lucene.util.automaton.Operations;
13+
import org.apache.lucene.util.automaton.RegExp;
1314
import org.elasticsearch.xpack.esql.core.util.StringUtils;
1415

1516
import java.util.Objects;
1617

18+
import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
19+
1720
/**
1821
* Similar to basic regex, supporting '?' wildcard for single character (same as regex ".")
1922
* and '*' wildcard for multiple characters (same as regex ".*")
@@ -37,8 +40,14 @@ public String pattern() {
3740
}
3841

3942
@Override
40-
public Automaton createAutomaton() {
41-
return WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
43+
public Automaton createAutomaton(boolean ignoreCase) {
44+
return ignoreCase
45+
? Operations.determinize(
46+
new RegExp(luceneWildcardToRegExp(wildcard), RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, RegExp.CASE_INSENSITIVE)
47+
.toAutomaton(),
48+
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
49+
)
50+
: WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
4251
}
4352

4453
@Override

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/util/StringUtils.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
package org.elasticsearch.xpack.esql.core.util;
88

99
import org.apache.lucene.document.InetAddressPoint;
10+
import org.apache.lucene.search.WildcardQuery;
1011
import org.apache.lucene.search.spell.LevenshteinDistance;
1112
import org.apache.lucene.util.BytesRef;
1213
import org.apache.lucene.util.CollectionUtil;
@@ -178,6 +179,44 @@ public static String wildcardToJavaPattern(String pattern, char escape) {
178179
return regex.toString();
179180
}
180181

182+
/**
183+
* Translates a Lucene wildcard pattern to a Lucene RegExp one.
184+
* @param wildcard Lucene wildcard pattern
185+
* @return Lucene RegExp pattern
186+
*/
187+
public static String luceneWildcardToRegExp(String wildcard) {
188+
StringBuilder regex = new StringBuilder();
189+
190+
for (int i = 0, wcLen = wildcard.length(); i < wcLen; i++) {
191+
char c = wildcard.charAt(i); // this will work chunking through Unicode as long as all values matched are ASCII
192+
switch (c) {
193+
case WildcardQuery.WILDCARD_STRING -> regex.append(".*");
194+
case WildcardQuery.WILDCARD_CHAR -> regex.append(".");
195+
case WildcardQuery.WILDCARD_ESCAPE -> {
196+
if (i + 1 < wcLen) {
197+
// consume the wildcard escaping, consider the next char
198+
char next = wildcard.charAt(i + 1);
199+
i++;
200+
switch (next) {
201+
case WildcardQuery.WILDCARD_STRING, WildcardQuery.WILDCARD_CHAR, WildcardQuery.WILDCARD_ESCAPE ->
202+
// escape `*`, `?`, `\`, since these are special chars in RegExp as well
203+
regex.append("\\");
204+
// default: unnecessary escaping -- just ignore the escaping
205+
}
206+
regex.append(next);
207+
} else {
208+
// "else fallthru, lenient parsing with a trailing \" -- according to WildcardQuery#toAutomaton
209+
regex.append("\\\\");
210+
}
211+
}
212+
case '$', '(', ')', '+', '.', '[', ']', '^', '{', '|', '}' -> regex.append("\\").append(c);
213+
default -> regex.append(c);
214+
}
215+
}
216+
217+
return regex.toString();
218+
}
219+
181220
/**
182221
* Translates a like pattern to a Lucene wildcard.
183222
* This method pays attention to the custom escape char which gets converted into \ (used by Lucene).

x-pack/plugin/esql-core/src/test/java/org/elasticsearch/xpack/esql/core/util/StringUtilsTests.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99

1010
import org.elasticsearch.test.ESTestCase;
1111

12+
import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
1213
import static org.elasticsearch.xpack.esql.core.util.StringUtils.wildcardToJavaPattern;
14+
import static org.hamcrest.Matchers.is;
1315

1416
public class StringUtilsTests extends ESTestCase {
1517

@@ -55,4 +57,21 @@ public void testWildcard() {
5557
public void testEscapedEscape() {
5658
assertEquals("^\\\\\\\\$", wildcardToJavaPattern("\\\\\\\\", '\\'));
5759
}
60+
61+
public void testLuceneWildcardToRegExp() {
62+
assertThat(luceneWildcardToRegExp(""), is(""));
63+
assertThat(luceneWildcardToRegExp("*"), is(".*"));
64+
assertThat(luceneWildcardToRegExp("?"), is("."));
65+
assertThat(luceneWildcardToRegExp("\\\\"), is("\\\\"));
66+
assertThat(luceneWildcardToRegExp("foo?bar"), is("foo.bar"));
67+
assertThat(luceneWildcardToRegExp("foo*bar"), is("foo.*bar"));
68+
assertThat(luceneWildcardToRegExp("foo\\\\bar"), is("foo\\\\bar"));
69+
assertThat(luceneWildcardToRegExp("foo*bar?baz"), is("foo.*bar.baz"));
70+
assertThat(luceneWildcardToRegExp("foo\\*bar"), is("foo\\*bar"));
71+
assertThat(luceneWildcardToRegExp("foo\\?bar\\?"), is("foo\\?bar\\?"));
72+
assertThat(luceneWildcardToRegExp("foo\\?bar\\"), is("foo\\?bar\\\\"));
73+
assertThat(luceneWildcardToRegExp("[](){}^$.|+"), is("\\[\\]\\(\\)\\{\\}\\^\\$\\.\\|\\+"));
74+
assertThat(luceneWildcardToRegExp("foo\\\uD83D\uDC14bar"), is("foo\uD83D\uDC14bar"));
75+
assertThat(luceneWildcardToRegExp("foo\uD83D\uDC14bar"), is("foo\uD83D\uDC14bar"));
76+
}
5877
}

x-pack/plugin/esql-core/src/test/java/org/elasticsearch/xpack/esql/core/util/TestUtils.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.elasticsearch.xpack.esql.core.type.DataType;
1414
import org.elasticsearch.xpack.esql.core.type.EsField;
1515

16+
import java.util.Locale;
1617
import java.util.regex.Pattern;
1718

1819
import static java.util.Collections.emptyMap;
@@ -61,4 +62,15 @@ public static FieldAttribute getFieldAttribute(String name, DataType dataType) {
6162
public static String stripThrough(String input) {
6263
return WS_PATTERN.matcher(input).replaceAll(StringUtils.EMPTY);
6364
}
65+
66+
/** Returns the input string, but with parts of it having the letter casing changed. */
67+
public static String randomCasing(String input) {
68+
StringBuilder sb = new StringBuilder(input.length());
69+
for (int i = 0, inputLen = input.length(), step = (int) Math.sqrt(inputLen), chunkEnd; i < inputLen; i += step) {
70+
chunkEnd = Math.min(i + step, inputLen);
71+
var chunk = input.substring(i, chunkEnd);
72+
sb.append(randomBoolean() ? chunk.toLowerCase(Locale.ROOT) : chunk.toUpperCase(Locale.ROOT));
73+
}
74+
return sb.toString();
75+
}
6476
}

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@
6363
import org.elasticsearch.xpack.esql.core.util.DateUtils;
6464
import org.elasticsearch.xpack.esql.core.util.StringUtils;
6565
import org.elasticsearch.xpack.esql.expression.function.EsqlFunctionRegistry;
66-
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
67-
import org.elasticsearch.xpack.esql.expression.function.scalar.string.WildcardLike;
66+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.RLike;
67+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.WildcardLike;
6868
import org.elasticsearch.xpack.esql.expression.predicate.Range;
6969
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
7070
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.GreaterThan;

x-pack/plugin/esql/qa/testFixtures/src/main/resources/where-like.csv-spec

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,3 +319,107 @@ warningRegex:java.lang.IllegalArgumentException: single-value function encounter
319319
emp_no:integer | job_positions:keyword
320320
10025 | Accountant
321321
;
322+
323+
likeWithUpperTurnedInsensitive
324+
FROM employees
325+
| KEEP emp_no, first_name
326+
| SORT emp_no
327+
| WHERE TO_UPPER(first_name) LIKE "GEOR*"
328+
;
329+
330+
emp_no:integer |first_name:keyword
331+
10001 |Georgi
332+
10055 |Georgy
333+
;
334+
335+
likeWithLowerTurnedInsensitive
336+
FROM employees
337+
| KEEP emp_no, first_name
338+
| SORT emp_no
339+
| WHERE TO_LOWER(TO_UPPER(first_name)) LIKE "geor*"
340+
;
341+
342+
emp_no:integer |first_name:keyword
343+
10001 |Georgi
344+
10055 |Georgy
345+
;
346+
347+
likeWithLowerConflictingFolded
348+
FROM employees
349+
| KEEP emp_no, first_name
350+
| SORT emp_no
351+
| WHERE TO_UPPER(first_name) LIKE "geor*"
352+
;
353+
354+
emp_no:integer |first_name:keyword
355+
;
356+
357+
likeWithLowerTurnedInsensitiveNotPushedDown
358+
FROM employees
359+
| KEEP emp_no, first_name
360+
| SORT emp_no
361+
| WHERE TO_LOWER(first_name) LIKE "geor*" OR emp_no + 1 IN (10002, 10056)
362+
;
363+
364+
emp_no:integer |first_name:keyword
365+
10001 |Georgi
366+
10055 |Georgy
367+
;
368+
369+
rlikeWithUpperTurnedInsensitive
370+
FROM employees
371+
| KEEP emp_no, first_name
372+
| SORT emp_no
373+
| WHERE TO_UPPER(first_name) RLIKE "GEOR.*"
374+
;
375+
376+
emp_no:integer |first_name:keyword
377+
10001 |Georgi
378+
10055 |Georgy
379+
;
380+
381+
rlikeWithLowerTurnedInsensitive
382+
FROM employees
383+
| KEEP emp_no, first_name
384+
| SORT emp_no
385+
| WHERE TO_LOWER(TO_UPPER(first_name)) RLIKE "geor.*"
386+
;
387+
388+
emp_no:integer |first_name:keyword
389+
10001 |Georgi
390+
10055 |Georgy
391+
;
392+
393+
rlikeWithLowerConflictingFolded
394+
FROM employees
395+
| KEEP emp_no, first_name
396+
| SORT emp_no
397+
| WHERE TO_UPPER(first_name) RLIKE "geor.*"
398+
;
399+
400+
emp_no:integer |first_name:keyword
401+
;
402+
403+
negatedRLikeWithLowerTurnedInsensitive
404+
FROM employees
405+
| KEEP emp_no, first_name
406+
| SORT emp_no
407+
| WHERE TO_LOWER(TO_UPPER(first_name)) NOT RLIKE "geor.*"
408+
| STATS c = COUNT()
409+
;
410+
411+
c:long
412+
88
413+
;
414+
415+
rlikeWithLowerTurnedInsensitiveNotPushedDown
416+
FROM employees
417+
| KEEP emp_no, first_name
418+
| SORT emp_no
419+
| WHERE TO_LOWER(first_name) RLIKE "geor.*" OR emp_no + 1 IN (10002, 10056)
420+
;
421+
422+
emp_no:integer |first_name:keyword
423+
10001 |Georgi
424+
10055 |Georgy
425+
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/ExpressionWritables.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,11 @@
6868
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ByteLength;
6969
import org.elasticsearch.xpack.esql.expression.function.scalar.string.LTrim;
7070
import org.elasticsearch.xpack.esql.expression.function.scalar.string.Length;
71-
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
7271
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RTrim;
7372
import org.elasticsearch.xpack.esql.expression.function.scalar.string.Space;
7473
import org.elasticsearch.xpack.esql.expression.function.scalar.string.Trim;
75-
import org.elasticsearch.xpack.esql.expression.function.scalar.string.WildcardLike;
74+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.RLike;
75+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.WildcardLike;
7676
import org.elasticsearch.xpack.esql.expression.function.scalar.util.Delay;
7777
import org.elasticsearch.xpack.esql.expression.predicate.logical.Not;
7878
import org.elasticsearch.xpack.esql.expression.predicate.nulls.IsNotNull;

0 commit comments

Comments
 (0)