Skip to content

Commit bae6401

Browse files
committed
Fix matching semantics for custom character classes
Custom character classes now honor the current matching semantics: they match either whole characters (grapheme clusters) or individual Unicode scalars, depending on the semantic level set in the current matching options.
1 parent 3146746 commit bae6401

File tree

2 files changed

+85
-54
lines changed

2 files changed

+85
-54
lines changed

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 59 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -60,24 +60,53 @@ extension DSLTree.Atom {
6060
_ opts: MatchingOptions
6161
) throws -> MEProgram<String>.ConsumeFunction? {
6262
let isCaseInsensitive = opts.isCaseInsensitive
63-
63+
let isCharacterSemantics = opts.semanticLevel == .graphemeCluster
64+
6465
switch self {
6566
case let .char(c):
66-
// TODO: Match level?
6767
return { input, bounds in
68-
let low = bounds.lowerBound
68+
let nextIndex = isCharacterSemantics
69+
? input.index(after: bounds.lowerBound)
70+
: input.unicodeScalars.index(after: bounds.lowerBound)
71+
72+
var curIdx = bounds.lowerBound
6973
if isCaseInsensitive && c.isCased {
70-
return input[low].lowercased() == c.lowercased()
71-
? input.index(after: low)
72-
: nil
74+
if isCharacterSemantics {
75+
return input[curIdx].lowercased() == c.lowercased()
76+
? nextIndex
77+
: nil
78+
} else {
79+
// FIXME: How do multi-scalar characters match in case-insensitive mode?
80+
return input.unicodeScalars[curIdx].properties.lowercaseMapping == c.lowercased()
81+
? nextIndex
82+
: nil
83+
}
7384
} else {
74-
return input[low] == c
75-
? input.index(after: low)
76-
: nil
85+
if isCharacterSemantics {
86+
return input[curIdx] == c
87+
? nextIndex
88+
: nil
89+
} else {
90+
// Try to match the sequence of unicodeScalars in `input` and `c`
91+
var patternIndex = c.unicodeScalars.startIndex
92+
while curIdx < input.endIndex, patternIndex < c.unicodeScalars.endIndex {
93+
if input.unicodeScalars[curIdx] != c.unicodeScalars[patternIndex] {
94+
return nil
95+
}
96+
input.unicodeScalars.formIndex(after: &curIdx)
97+
c.unicodeScalars.formIndex(after: &patternIndex)
98+
}
99+
100+
// Match succeeded if all scalars in `c.unicodeScalars` matched
101+
return patternIndex == c.unicodeScalars.endIndex
102+
? curIdx
103+
: nil
104+
}
77105
}
78106
}
79107
case let .scalar(s):
80-
return consumeScalar {
108+
let consume = consumeFunction(for: opts)
109+
return consume {
81110
isCaseInsensitive
82111
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
83112
: $0 == s
@@ -255,17 +284,24 @@ extension DSLTree.CustomCharacterClass.Member {
255284
throw Unsupported("\(high) in range")
256285
}
257286

287+
let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
288+
258289
if opts.isCaseInsensitive {
259290
let lhsLower = lhs.lowercased()
260291
let rhsLower = rhs.lowercased()
261292
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
262293
return { input, bounds in
263294
// TODO: check for out of bounds?
264295
let curIdx = bounds.lowerBound
265-
guard input[curIdx].hasExactlyOneScalar else { return nil }
266-
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
267-
// TODO: semantic level
268-
return input.index(after: curIdx)
296+
if isCharacterSemantic {
297+
guard input[curIdx].hasExactlyOneScalar else { return nil }
298+
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
299+
return input.index(after: curIdx)
300+
}
301+
} else {
302+
if (lhsLower...rhsLower).contains(input.unicodeScalars[curIdx].properties.lowercaseMapping) {
303+
return input.unicodeScalars.index(after: curIdx)
304+
}
269305
}
270306
return nil
271307
}
@@ -274,10 +310,15 @@ extension DSLTree.CustomCharacterClass.Member {
274310
return { input, bounds in
275311
// TODO: check for out of bounds?
276312
let curIdx = bounds.lowerBound
277-
guard input[curIdx].hasExactlyOneScalar else { return nil }
278-
if (lhs...rhs).contains(input[curIdx]) {
279-
// TODO: semantic level
280-
return input.index(after: curIdx)
313+
if isCharacterSemantic {
314+
guard input[curIdx].hasExactlyOneScalar else { return nil }
315+
if (lhs...rhs).contains(input[curIdx]) {
316+
return input.index(after: curIdx)
317+
}
318+
} else {
319+
if (lhs...rhs).contains(Character(input.unicodeScalars[curIdx])) {
320+
return input.unicodeScalars.index(after: curIdx)
321+
}
281322
}
282323
return nil
283324
}

Tests/RegexTests/MatchTests.swift

Lines changed: 26 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,11 @@ extension RegexTests {
673673
}
674674
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
675675

676+
firstMatchTest(#"[12]"#, input: "1️⃣", match: nil)
677+
firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil)
678+
firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣")
679+
firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil)
680+
676681
// Currently not supported in the matching engine.
677682
for c: UnicodeScalar in ["a", "b", "c"] {
678683
firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
@@ -1507,31 +1512,28 @@ extension RegexTests {
15071512
}
15081513

15091514
func testCanonicalEquivalenceCustomCharacterClass() throws {
1510-
// Expectation: Concatenations with custom character classes should be able
1511-
// to match within a grapheme cluster. That is, a regex should be able to
1512-
// match the scalar values that comprise a grapheme cluster in separate,
1513-
// or repeated, custom character classes.
1515+
// Expectation: Custom character class matches do not cross grapheme
1516+
// cluster boundaries by default. When matching with Unicode scalar
1517+
// semantics, grapheme cluster boundaries are ignored, so matching
1518+
// sequences of custom character classes can succeed.
15141519

15151520
matchTest(
15161521
#"[áéíóú]$"#,
15171522
(eComposed, true),
15181523
(eDecomposed, true))
15191524

1520-
// FIXME: Custom char classes don't use canonical equivalence with composed characters
1521-
firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed,
1522-
xfail: true)
1523-
firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
1524-
xfail: true)
1525-
firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
1526-
xfail: true)
1525+
// Unicode scalar semantics
1526+
firstMatchTest(#"(?u)e[\u{301}]$"#, input: eDecomposed, match: eDecomposed)
1527+
firstMatchTest(#"(?u)e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
1528+
firstMatchTest(#"(?u)[e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
1529+
firstMatchTest(#"(?u)[e-e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
1530+
firstMatchTest(#"(?u)[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
15271531

1528-
// FIXME: Custom char classes don't match decomposed characters
1529-
firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed,
1530-
xfail: true)
1531-
firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
1532-
xfail: true)
1533-
firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
1534-
xfail: true)
1532+
// Grapheme cluster semantics
1533+
firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: nil)
1534+
firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: nil)
1535+
firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: eComposed, match: nil)
1536+
firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: nil)
15351537

15361538
let flag = "🇰🇷"
15371539
firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
@@ -1540,27 +1542,15 @@ extension RegexTests {
15401542
firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag)
15411543

15421544
// First Unicode scalar followed by CCC of regional indicators
1543-
firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
1544-
xfail: true)
1545-
1546-
// FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
1545+
firstMatchTest(#"(?u)^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag)
1546+
// A CCC of regional indicators followed by the second Unicode scalar
1547+
firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag)
15471548
// A CCC of regional indicators x 2
1548-
firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag,
1549-
xfail: true)
1549+
firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag)
15501550

1551-
// FIXME: A single CCC of regional indicators matches the whole flag character
1552-
// A CCC of regional indicators followed by the second Unicode scalar
1553-
firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag,
1554-
xfail: true)
15551551
// A single CCC of regional indicators
1556-
firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil,
1557-
xfail: true)
1558-
1559-
// A single CCC of actual flag emojis / combined regional indicators
1560-
firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag)
1561-
// This succeeds (correctly) because \u{1F1F0} is lexicographically
1562-
// within the CCC range
1563-
firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}")
1552+
firstMatchTest(#"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
1553+
firstMatchTest(#"^(?u)[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
15641554
}
15651555

15661556
func testAnyChar() throws {

0 commit comments

Comments
 (0)