Skip to content

Commit a9f8c7a

Browse files
committed
Add regnode EXACTFU_ONLY8
This is a regnode that would otherwise be an EXACTFU, except that it contains a code point that requires UTF-8 to match, even when all the possible folds involving it are taken into account. Hence if the target string isn't UTF-8, we know it can't possibly match, without needing to try. For completeness, an EXACTFAA_ONLY8 and an EXACTFL_ONLY8 could also be created, but I think these are unlikely to actually appear in the wild, since /aa is mainly used with ASCII, and /l will mostly involve characters that don't require UTF-8.
1 parent f6b4b99 commit a9f8c7a

File tree

5 files changed

+177
-117
lines changed

5 files changed

+177
-117
lines changed

pod/perldebguts.pod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,8 @@ will be lost.
676676

677677
EXACT_ONLY8 str Like EXACT, but no strings that aren't in
678678
UTF-8 can match
679+
EXACTFU_ONLY8 str Like EXACTFU, but no strings that aren't in
680+
UTF-8 can match
679681

680682
# Do nothing types
681683

regcomp.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2741,7 +2741,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
27412741
if ( noper < tail
27422742
&& ( OP(noper) == flags
27432743
|| (flags == EXACT && OP(noper) == EXACT_ONLY8)
2744-
|| (flags == EXACTFU && OP(noper) == EXACTFU_SS)))
2744+
|| (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
2745+
|| OP(noper) == EXACTFU_SS))) )
27452746
{
27462747
uc= (U8*)STRING(noper);
27472748
e= uc + STR_LEN(noper);
@@ -2958,7 +2959,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
29582959
if ( noper < tail
29592960
&& ( OP(noper) == flags
29602961
|| (flags == EXACT && OP(noper) == EXACT_ONLY8)
2961-
|| (flags == EXACTFU && OP(noper) == EXACTFU_SS) ) )
2962+
|| (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
2963+
|| OP(noper) == EXACTFU_SS))) )
29622964
{
29632965
const U8 *uc= (U8*)STRING(noper);
29642966
const U8 *e= uc + STR_LEN(noper);
@@ -3182,7 +3184,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
31823184
if ( noper < tail
31833185
&& ( OP(noper) == flags
31843186
|| (flags == EXACT && OP(noper) == EXACT_ONLY8)
3185-
|| (flags == EXACTFU && OP(noper) == EXACTFU_SS) ) )
3187+
|| (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
3188+
|| OP(noper) == EXACTFU_SS))) )
31863189
{
31873190
const U8 *uc= (U8*)STRING(noper);
31883191
const U8 *e= uc + STR_LEN(noper);
@@ -4671,6 +4674,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
46714674
EXACT | EXACT
46724675
EXACT_ONLY8 | EXACT
46734676
EXACTFU | EXACTFU
4677+
EXACTFU_ONLY8 | EXACTFU
46744678
EXACTFU_SS | EXACTFU
46754679
EXACTFAA | EXACTFAA
46764680
EXACTL | EXACTL
@@ -4682,7 +4686,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
46824686
? NOTHING \
46834687
: ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
46844688
? EXACT \
4685-
: ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4689+
: ( EXACTFU == (X) \
4690+
|| EXACTFU_ONLY8 == (X) \
4691+
|| EXACTFU_SS == (X) ) \
46864692
? EXACTFU \
46874693
: ( EXACTFAA == (X) ) \
46884694
? EXACTFAA \
@@ -13759,6 +13765,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1375913765
* target string is (also) in UTF-8 */
1376013766
bool requires_utf8_target = FALSE;
1376113767

13768+
bool has_micro_sign = FALSE;
13769+
1376213770
/* Allocate an EXACT node. The node_type may change below to
1376313771
* another EXACTish node, but since the size of the node doesn't
1376413772
* change, it works */
@@ -14222,6 +14230,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1422214230

1422314231
if (ender > 255) {
1422414232
requires_utf8_target = TRUE;
14233+
if (UNLIKELY(ender == GREEK_SMALL_LETTER_MU)) {
14234+
has_micro_sign = TRUE;
14235+
}
1422514236
}
1422614237
}
1422714238
}
@@ -14264,6 +14275,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1426414275
}
1426514276
#endif
1426614277

14278+
else if (UNLIKELY(ender == MICRO_SIGN)) {
14279+
has_micro_sign = TRUE;
14280+
}
14281+
1426714282
/* Even when folding, we store just the input
1426814283
* character, as we have an array that finds its fold
1426914284
* quickly */
@@ -14481,6 +14496,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1448114496
else if (node_type == EXACTF) {
1448214497
RExC_seen_d_op = TRUE;
1448314498
}
14499+
14500+
/* The micro sign is the only character below 256 that
14501+
* folds to above 255 */
14502+
if ( OP(REGNODE_p(ret)) == EXACTFU
14503+
&& requires_utf8_target
14504+
&& LIKELY(! has_micro_sign))
14505+
{
14506+
OP(REGNODE_p(ret)) = EXACTFU_ONLY8;
14507+
}
14508+
1448414509
}
1448514510

1448614511
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len,
@@ -19252,6 +19277,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
1925219277
case EXACTFAA_NO_TRIE:
1925319278
case EXACTFAA:
1925419279
case EXACTFU:
19280+
case EXACTFU_ONLY8:
1925519281
case EXACTFLU8:
1925619282
case EXACTFU_SS:
1925719283
case EXACTFL:

regcomp.sym

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ EXACTFLU8 EXACT, str ; Rare circumstances: like EXACTFU, but is under /
112112
EXACTFAA_NO_TRIE EXACT, str ; Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len).
113113

114114
EXACT_ONLY8 EXACT, str ; Like EXACT, but no strings that aren't in UTF-8 can match
115+
EXACTFU_ONLY8 EXACT, str ; Like EXACTFU, but no strings that aren't in UTF-8 can match
116+
# One could add EXACTFAA8 and something that has the same effect for /l,
117+
# but these would be extremely uncommon
115118

116119
#*Do nothing types
117120

regexec.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2292,6 +2292,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
22922292
| FOLDEQ_S2_FOLDS_SANE;
22932293
goto do_exactf_utf8;
22942294

2295+
case EXACTFU_ONLY8:
2296+
if (! utf8_target) {
2297+
break;
2298+
}
2299+
assert(is_utf8_pat);
2300+
utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2301+
goto do_exactf_utf8;
2302+
22952303
case EXACTFU:
22962304
if (is_utf8_pat || utf8_target) {
22972305
utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -6361,6 +6369,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
63616369
fold_array = PL_fold_latin1;
63626370
goto do_exactf;
63636371

6372+
case EXACTFU_ONLY8: /* /abc/iu with something in /abc/ > 255 */
6373+
if (! utf8_target) {
6374+
sayNO;
6375+
}
6376+
assert(is_utf8_pat);
6377+
fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED;
6378+
goto do_exactf;
6379+
63646380
case EXACTFU_SS: /* /\x{df}/iu */
63656381
case EXACTFU: /* /abc/iu */
63666382
folder = foldEQ_latin1;
@@ -9288,6 +9304,14 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
92889304
| FOLDEQ_S2_FOLDS_SANE;
92899305
goto do_exactf;
92909306

9307+
case EXACTFU_ONLY8:
9308+
if (! utf8_target) {
9309+
break;
9310+
}
9311+
assert(reginfo->is_utf8_pat);
9312+
utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9313+
goto do_exactf;
9314+
92919315
case EXACTFU_SS:
92929316
case EXACTFU:
92939317
utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;

0 commit comments

Comments
 (0)