@@ -2649,7 +2649,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2649
2649
#endif
2650
2650
2651
2651
switch (flags) {
2652
- case EXACT: case EXACTL: break;
2652
+ case EXACT: case EXACT_ONLY8: case EXACTL: break;
2653
2653
case EXACTFAA:
2654
2654
case EXACTFU_SS:
2655
2655
case EXACTFU:
@@ -2664,7 +2664,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2664
2664
trie->wordcount = word_count;
2665
2665
RExC_rxi->data->data[ data_slot ] = (void*)trie;
2666
2666
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2667
- if (flags == EXACT || flags == EXACTL)
2667
+ if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
2668
2668
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2669
2669
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2670
2670
trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -2738,15 +2738,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2738
2738
noper= noper_next;
2739
2739
}
2740
2740
2741
- if ( noper < tail &&
2742
- (
2743
- OP(noper) == flags ||
2744
- (
2745
- flags == EXACTFU &&
2746
- OP(noper) == EXACTFU_SS
2747
- )
2748
- )
2749
- ) {
2741
+ if ( noper < tail
2742
+ && ( OP(noper) == flags
2743
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2744
+ || (flags == EXACTFU && OP(noper) == EXACTFU_SS)))
2745
+ {
2750
2746
uc= (U8*)STRING(noper);
2751
2747
e= uc + STR_LEN(noper);
2752
2748
} else {
@@ -2959,7 +2955,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2959
2955
noper= noper_next;
2960
2956
}
2961
2957
2962
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2958
+ if ( noper < tail
2959
+ && ( OP(noper) == flags
2960
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2961
+ || (flags == EXACTFU && OP(noper) == EXACTFU_SS) ) )
2962
+ {
2963
2963
const U8 *uc= (U8*)STRING(noper);
2964
2964
const U8 *e= uc + STR_LEN(noper);
2965
2965
@@ -3179,7 +3179,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
3179
3179
noper= noper_next;
3180
3180
}
3181
3181
3182
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3182
+ if ( noper < tail
3183
+ && ( OP(noper) == flags
3184
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3185
+ || (flags == EXACTFU && OP(noper) == EXACTFU_SS) ) )
3186
+ {
3183
3187
const U8 *uc= (U8*)STRING(noper);
3184
3188
const U8 *e= uc + STR_LEN(noper);
3185
3189
@@ -4012,7 +4016,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4012
4016
* this final joining, sequences could have been split over boundaries, and
4013
4017
* hence missed). The sequences only happen in folding, hence for any
4014
4018
* non-EXACT EXACTish node */
4015
- if (OP(scan) != EXACT && OP(scan) != EXACTL) {
4019
+ if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
4016
4020
U8* s0 = (U8*) STRING(scan);
4017
4021
U8* s = s0;
4018
4022
U8* s_end = s0 + STR_LEN(scan);
@@ -4665,17 +4669,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4665
4669
----------------+-----------
4666
4670
NOTHING | NOTHING
4667
4671
EXACT | EXACT
4672
+ EXACT_ONLY8 | EXACT
4668
4673
EXACTFU | EXACTFU
4669
4674
EXACTFU_SS | EXACTFU
4670
- EXACTFAA | EXACTFAA
4675
+ EXACTFAA | EXACTFAA
4671
4676
EXACTL | EXACTL
4672
4677
EXACTFLU8 | EXACTFLU8
4673
4678
4674
4679
4675
4680
*/
4676
4681
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4677
4682
? NOTHING \
4678
- : ( EXACT == (X) ) \
4683
+ : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
4679
4684
? EXACT \
4680
4685
: ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4681
4686
? EXACTFU \
@@ -4999,7 +5004,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4999
5004
continue;
5000
5005
}
5001
5006
}
5002
- else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
5007
+ else if ( OP(scan) == EXACT
5008
+ || OP(scan) == EXACT_ONLY8
5009
+ || OP(scan) == EXACTL)
5010
+ {
5003
5011
SSize_t l = STR_LEN(scan);
5004
5012
UV uc;
5005
5013
assert(l);
@@ -5118,7 +5126,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
5118
5126
case PLUS:
5119
5127
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5120
5128
next = NEXTOPER(scan);
5121
- if (OP(next) == EXACT
5129
+ if ( OP(next) == EXACT
5130
+ || OP(next) == EXACT_ONLY8
5122
5131
|| OP(next) == EXACTL
5123
5132
|| (flags & SCF_DO_STCLASS))
5124
5133
{
@@ -7713,8 +7722,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7713
7722
DEBUG_PEEP("first:", first, 0, 0);
7714
7723
/* Ignore EXACT as we deal with it later. */
7715
7724
if (PL_regkind[OP(first)] == EXACT) {
7716
- if (OP(first) == EXACT || OP(first) == EXACTL)
7725
+ if ( OP(first) == EXACT
7726
+ || OP(first) == EXACT_ONLY8
7727
+ || OP(first) == EXACTL)
7728
+ {
7717
7729
NOOP; /* Empty, get anchored substr later. */
7730
+ }
7718
7731
else
7719
7732
RExC_rxi->regstclass = first;
7720
7733
}
@@ -8056,7 +8069,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
8056
8069
&& nop == END)
8057
8070
RExC_rx->extflags |= RXf_WHITE;
8058
8071
else if ( RExC_rx->extflags & RXf_SPLIT
8059
- && (fop == EXACT || fop == EXACTL)
8072
+ && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
8060
8073
&& STR_LEN(first) == 1
8061
8074
&& *(STRING(first)) == ' '
8062
8075
&& nop == END )
@@ -13742,6 +13755,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13742
13755
* as the latter's folds aren't known until runtime. */
13743
13756
bool maybe_exactfu = FOLD;
13744
13757
13758
+ /* Does this node contain something that can't match unless the
13759
+ * target string is (also) in UTF-8? */
13760
+ bool requires_utf8_target = FALSE;
13761
+
13745
13762
/* Allocate an EXACT node. The node_type may change below to
13746
13763
* another EXACTish node, but since the size of the node doesn't
13747
13764
* change, it works */
@@ -14123,6 +14140,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
14123
14140
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14124
14141
added_len = (char *) new_s - s;
14125
14142
s = (char *) new_s;
14143
+
14144
+ if (ender > 255) {
14145
+ requires_utf8_target = TRUE;
14146
+ }
14126
14147
}
14127
14148
}
14128
14149
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
@@ -14198,6 +14219,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
14198
14219
? FOLD_FLAGS_NOMIX_ASCII
14199
14220
: 0));
14200
14221
s += added_len;
14222
+
14223
+ if (ender > 255) {
14224
+ requires_utf8_target = TRUE;
14225
+ }
14201
14226
}
14202
14227
}
14203
14228
else {
@@ -14431,11 +14456,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
14431
14456
OP(REGNODE_p(ret)) = node_type;
14432
14457
14433
14458
/* If the node type is EXACT here, check to see if it
14434
- * should be EXACTL. */
14459
+ * should be EXACTL or EXACT_ONLY8. */
14435
14460
if (node_type == EXACT) {
14436
14461
if (LOC) {
14437
14462
OP(REGNODE_p(ret)) = EXACTL;
14438
14463
}
14464
+ else if (requires_utf8_target) {
14465
+ OP(REGNODE_p(ret)) = EXACT_ONLY8;
14466
+ }
14439
14467
}
14440
14468
14441
14469
if (FOLD) {
@@ -19218,6 +19246,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19218
19246
if ( exact ) {
19219
19247
switch (OP(REGNODE_p(scan))) {
19220
19248
case EXACT:
19249
+ case EXACT_ONLY8:
19221
19250
case EXACTL:
19222
19251
case EXACTF:
19223
19252
case EXACTFAA_NO_TRIE:
0 commit comments