Skip to content

Commit f6b4b99

Browse files
committed
Add regnode EXACT_ONLY8
This regnode would otherwise be an EXACT, except that it contains at least one code point (above 255) that requires UTF-8 to represent. Hence, if the target string isn't in UTF-8, we know it can't possibly match, without needing to try.
1 parent 51fa1a7 commit f6b4b99

File tree

5 files changed

+192
-136
lines changed

5 files changed

+192
-136
lines changed

pod/perldebguts.pod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,9 @@ will be lost.
674674
not guaranteed to be folded) using /iaa
675675
rules (w/len).
676676

677+
EXACT_ONLY8 str Like EXACT, but no strings that aren't in
678+
UTF-8 can match
679+
677680
# Do nothing types
678681

679682
NOTHING no Match empty string.

regcomp.c

Lines changed: 50 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2649,7 +2649,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
26492649
#endif
26502650

26512651
switch (flags) {
2652-
case EXACT: case EXACTL: break;
2652+
case EXACT: case EXACT_ONLY8: case EXACTL: break;
26532653
case EXACTFAA:
26542654
case EXACTFU_SS:
26552655
case EXACTFU:
@@ -2664,7 +2664,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
26642664
trie->wordcount = word_count;
26652665
RExC_rxi->data->data[ data_slot ] = (void*)trie;
26662666
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2667-
if (flags == EXACT || flags == EXACTL)
2667+
if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
26682668
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
26692669
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
26702670
trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -2738,15 +2738,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
27382738
noper= noper_next;
27392739
}
27402740

2741-
if ( noper < tail &&
2742-
(
2743-
OP(noper) == flags ||
2744-
(
2745-
flags == EXACTFU &&
2746-
OP(noper) == EXACTFU_SS
2747-
)
2748-
)
2749-
) {
2741+
if ( noper < tail
2742+
&& ( OP(noper) == flags
2743+
|| (flags == EXACT && OP(noper) == EXACT_ONLY8)
2744+
|| (flags == EXACTFU && OP(noper) == EXACTFU_SS)))
2745+
{
27502746
uc= (U8*)STRING(noper);
27512747
e= uc + STR_LEN(noper);
27522748
} else {
@@ -2959,7 +2955,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
29592955
noper= noper_next;
29602956
}
29612957

2962-
if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2958+
if ( noper < tail
2959+
&& ( OP(noper) == flags
2960+
|| (flags == EXACT && OP(noper) == EXACT_ONLY8)
2961+
|| (flags == EXACTFU && OP(noper) == EXACTFU_SS) ) )
2962+
{
29632963
const U8 *uc= (U8*)STRING(noper);
29642964
const U8 *e= uc + STR_LEN(noper);
29652965

@@ -3179,7 +3179,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
31793179
noper= noper_next;
31803180
}
31813181

3182-
if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3182+
if ( noper < tail
3183+
&& ( OP(noper) == flags
3184+
|| (flags == EXACT && OP(noper) == EXACT_ONLY8)
3185+
|| (flags == EXACTFU && OP(noper) == EXACTFU_SS) ) )
3186+
{
31833187
const U8 *uc= (U8*)STRING(noper);
31843188
const U8 *e= uc + STR_LEN(noper);
31853189

@@ -4012,7 +4016,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
40124016
* this final joining, sequences could have been split over boundaries, and
40134017
* hence missed). The sequences only happen in folding, hence for any
40144018
* non-EXACT EXACTish node */
4015-
if (OP(scan) != EXACT && OP(scan) != EXACTL) {
4019+
if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
40164020
U8* s0 = (U8*) STRING(scan);
40174021
U8* s = s0;
40184022
U8* s_end = s0 + STR_LEN(scan);
@@ -4665,17 +4669,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
46654669
----------------+-----------
46664670
NOTHING | NOTHING
46674671
EXACT | EXACT
4672+
EXACT_ONLY8 | EXACT
46684673
EXACTFU | EXACTFU
46694674
EXACTFU_SS | EXACTFU
4670-
EXACTFAA | EXACTFAA
4675+
EXACTFAA | EXACTFAA
46714676
EXACTL | EXACTL
46724677
EXACTFLU8 | EXACTFLU8
46734678

46744679

46754680
*/
46764681
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
46774682
? NOTHING \
4678-
: ( EXACT == (X) ) \
4683+
: ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
46794684
? EXACT \
46804685
: ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
46814686
? EXACTFU \
@@ -4999,7 +5004,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
49995004
continue;
50005005
}
50015006
}
5002-
else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
5007+
else if ( OP(scan) == EXACT
5008+
|| OP(scan) == EXACT_ONLY8
5009+
|| OP(scan) == EXACTL)
5010+
{
50035011
SSize_t l = STR_LEN(scan);
50045012
UV uc;
50055013
assert(l);
@@ -5118,7 +5126,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
51185126
case PLUS:
51195127
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
51205128
next = NEXTOPER(scan);
5121-
if (OP(next) == EXACT
5129+
if ( OP(next) == EXACT
5130+
|| OP(next) == EXACT_ONLY8
51225131
|| OP(next) == EXACTL
51235132
|| (flags & SCF_DO_STCLASS))
51245133
{
@@ -7713,8 +7722,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
77137722
DEBUG_PEEP("first:", first, 0, 0);
77147723
/* Ignore EXACT as we deal with it later. */
77157724
if (PL_regkind[OP(first)] == EXACT) {
7716-
if (OP(first) == EXACT || OP(first) == EXACTL)
7725+
if ( OP(first) == EXACT
7726+
|| OP(first) == EXACT_ONLY8
7727+
|| OP(first) == EXACTL)
7728+
{
77177729
NOOP; /* Empty, get anchored substr later. */
7730+
}
77187731
else
77197732
RExC_rxi->regstclass = first;
77207733
}
@@ -8056,7 +8069,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
80568069
&& nop == END)
80578070
RExC_rx->extflags |= RXf_WHITE;
80588071
else if ( RExC_rx->extflags & RXf_SPLIT
8059-
&& (fop == EXACT || fop == EXACTL)
8072+
&& (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
80608073
&& STR_LEN(first) == 1
80618074
&& *(STRING(first)) == ' '
80628075
&& nop == END )
@@ -13742,6 +13755,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1374213755
* as the latter's folds aren't known until runtime. */
1374313756
bool maybe_exactfu = FOLD;
1374413757

13758+
/* Does this node contain something that can't match unless the
13759+
* target string is (also) in UTF-8 */
13760+
bool requires_utf8_target = FALSE;
13761+
1374513762
/* Allocate an EXACT node. The node_type may change below to
1374613763
* another EXACTish node, but since the size of the node doesn't
1374713764
* change, it works */
@@ -14123,6 +14140,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1412314140
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
1412414141
added_len = (char *) new_s - s;
1412514142
s = (char *) new_s;
14143+
14144+
if (ender > 255) {
14145+
requires_utf8_target = TRUE;
14146+
}
1412614147
}
1412714148
}
1412814149
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
@@ -14198,6 +14219,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1419814219
? FOLD_FLAGS_NOMIX_ASCII
1419914220
: 0));
1420014221
s += added_len;
14222+
14223+
if (ender > 255) {
14224+
requires_utf8_target = TRUE;
14225+
}
1420114226
}
1420214227
}
1420314228
else {
@@ -14431,11 +14456,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1443114456
OP(REGNODE_p(ret)) = node_type;
1443214457

1443314458
/* If the node type is EXACT here, check to see if it
14434-
* should be EXACTL. */
14459+
* should be EXACTL, or EXACT_ONLY8. */
1443514460
if (node_type == EXACT) {
1443614461
if (LOC) {
1443714462
OP(REGNODE_p(ret)) = EXACTL;
1443814463
}
14464+
else if (requires_utf8_target) {
14465+
OP(REGNODE_p(ret)) = EXACT_ONLY8;
14466+
}
1443914467
}
1444014468

1444114469
if (FOLD) {
@@ -19218,6 +19246,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
1921819246
if ( exact ) {
1921919247
switch (OP(REGNODE_p(scan))) {
1922019248
case EXACT:
19249+
case EXACT_ONLY8:
1922119250
case EXACTL:
1922219251
case EXACTF:
1922319252
case EXACTFAA_NO_TRIE:

regcomp.sym

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length i
111111
EXACTFLU8 EXACT, str ; Rare circumstances: like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 255.
112112
EXACTFAA_NO_TRIE EXACT, str ; Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len).
113113

114+
EXACT_ONLY8 EXACT, str ; Like EXACT, but no strings that aren't in UTF-8 can match
115+
114116
#*Do nothing types
115117

116118
NOTHING NOTHING, no ; Match empty string.

regexec.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4449,7 +4449,10 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
44494449
U8 *pat = (U8*)STRING(text_node);
44504450
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
44514451

4452-
if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4452+
if ( OP(text_node) == EXACT
4453+
|| OP(text_node) == EXACT_ONLY8
4454+
|| OP(text_node) == EXACTL)
4455+
{
44534456

44544457
/* In an exact node, only one thing can be matched, that first
44554458
* character. If both the pat and the target are UTF-8, we can just
@@ -6246,9 +6249,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
62466249
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
62476250
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
62486251
}
6252+
goto do_exact;
6253+
case EXACT_ONLY8:
6254+
if (! utf8_target) {
6255+
sayNO;
6256+
}
62496257
/* FALLTHROUGH */
62506258
case EXACT: { /* /abc/ */
6251-
char *s = STRING(scan);
6259+
char *s;
6260+
do_exact:
6261+
s = STRING(scan);
62526262
ln = STR_LEN(scan);
62536263
if (utf8_target != is_utf8_pat) {
62546264
/* The target and the pattern have differing utf8ness. */
@@ -9184,8 +9194,15 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
91849194
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
91859195
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
91869196
}
9197+
goto do_exact;
9198+
9199+
case EXACT_ONLY8:
9200+
if (! utf8_target) {
9201+
break;
9202+
}
91879203
/* FALLTHROUGH */
91889204
case EXACT:
9205+
do_exact:
91899206
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
91909207

91919208
c = (U8)*STRING(p);

0 commit comments

Comments
 (0)