Skip to content

Commit 1c2f3d7

Browse files
committed
PATCH GH #17025 \p{user-defined} overrides official Unicode
Prior to this patch, they only sometimes overrode.
1 parent c376875 commit 1c2f3d7

File tree

9 files changed

+104
-21
lines changed

9 files changed

+104
-21
lines changed

charclass_invlists.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395174,7 +395174,7 @@ static const U8 WB_table[23][23] = {
395174395174
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
395175395175
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
395176395176
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
395177-
* 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
395177+
* 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
395178395178
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
395179395179
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
395180395180
* e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl

lib/unicore/mktables

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17075,6 +17075,9 @@ other two shortcuts, and Unicode continues to define new properties that begin
1707517075
with C<"In">, so it's quite possible that a conflict will occur in the future.
1707617076
The compound form is guaranteed to not become obsolete, and its meaning is
1707717077
clearer anyway. See L<perlunicode/"Blocks"> for more information about this.
17078+
17079+
User-defined properties must begin with "In" or "Is". These override any
17080+
Unicode property of the same name.
1707817081
END
1707917082
}
1708017083
my $text = $Is_flags_text;

lib/unicore/uni_keywords.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1260,7 +1260,7 @@
12601260
# 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
12611261
# 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
12621262
# 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
1263-
# 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
1263+
# 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
12641264
# a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
12651265
# 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
12661266
# e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl

pod/perldelta.pod

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,20 @@ XXX For a release on a stable branch, this section aspires to be:
4545

4646
[ List each incompatible change as a =head2 entry ]
4747

48+
=head2 C<\p{I<user-defined>}> properties now always override official
49+
Unicode ones
50+
51+
Previously, if and only if a user-defined property was declared prior to
52+
the compilation of the regular expression pattern containing it, its
53+
definition was used instead of any official Unicode property with the
54+
same name. Now, it always overrides the official property. This
55+
change could break existing code that relied (likely unwittingly) on the
56+
previous behavior. Without this fix, if Unicode released a new version
57+
with a new property that happens to have the same name as the one you
58+
had long been using, your program would break when you upgraded to a
59+
perl that used that new Unicode version. See L<perlunicode/User-Defined
60+
Character Properties>. [GH #17205]
61+
4862
=head1 Deprecations
4963

5064
XXX Any deprecated features, syntax, modules etc. should be listed here.

pod/perlunicode.pod

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1066,7 +1066,9 @@ You can define your own binary character properties by defining subroutines
10661066
whose names begin with C<"In"> or C<"Is">. (The experimental feature
10671067
L<perlre/(?[ ])> provides an alternative which allows more complex
10681068
definitions.) The subroutines can be defined in any
1069-
package. The user-defined properties can be used in the regular expression
1069+
package. They override any Unicode properties that have the same
1070+
names. The user-defined properties can be used in the regular
1071+
expression
10701072
C<\p{}> and C<\P{}> constructs; if you are using a user-defined property from a
10711073
package other than the one you are in, you must specify its package in the
10721074
C<\p{}> or C<\P{}> construct.

regcharclass.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2245,7 +2245,7 @@
22452245
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
22462246
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
22472247
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
2248-
* 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
2248+
* 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
22492249
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
22502250
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
22512251
* f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl

regcomp.c

Lines changed: 78 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,14 @@ struct RExC_state_t {
426426
#define _invlist_intersection_complement_2nd(a, b, output) \
427427
_invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
428428

429+
/* We add a marker if we are deferring expansion of a potential user-defined
430+
* property until it is needed at runtime the first time it is encountered in a
431+
* pattern match. This marker, which shouldn't conflict with any character
432+
* that could be in a legal name, is appended to its name to indicate this.
433+
* There are both string and character forms of it. */
434+
#define DEFERRED_PROP_EXPANSION_MARKERs "~"
435+
#define DEFERRED_PROP_EXPANSION_MARKERc '~'
436+
429437
/* About scan_data_t.
430438

431439
During optimisation we recurse through the regexp program performing
@@ -19845,11 +19853,13 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
1984519853
continue;
1984619854
}
1984719855

19848-
/* Here, didn't find a legal hex number. Just add it from
19849-
* here to the next \n */
19856+
/* Here, didn't find a legal hex number. Just add the text
19857+
* from here up to the next \n, omitting any trailing
19858+
* markers. */
1985019859

1985119860
remaining -= len;
19852-
len = strcspn(si_string, "\n");
19861+
len = strcspn(si_string,
19862+
DEFERRED_PROP_EXPANSION_MARKERs "\n");
1985319863
remaining -= len;
1985419864
if (matches_string) {
1985519865
sv_catpvn(matches_string, si_string, len);
@@ -19860,6 +19870,13 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
1986019870
sv_catpvs(matches_string, " ");
1986119871

1986219872
si_string += len;
19873+
if ( remaining
19874+
&& UCHARAT(si_string)
19875+
== DEFERRED_PROP_EXPANSION_MARKERc)
19876+
{
19877+
si_string++;
19878+
remaining--;
19879+
}
1986319880
if (remaining && UCHARAT(si_string) == '\n') {
1986419881
si_string++;
1986519882
remaining--;
@@ -23099,7 +23116,7 @@ Perl_parse_uniprop_string(pTHX_
2309923116
* Other parameters will be set on return as described below */
2310023117

2310123118
const char * const name, /* The first non-blank in the \p{}, \P{} */
23102-
const Size_t name_len, /* Its length in bytes, not including any
23119+
Size_t name_len, /* Its length in bytes, not including any
2310323120
trailing space */
2310423121
const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
2310523122
const bool to_fold, /* ? Is this under /i */
@@ -23147,6 +23164,9 @@ Perl_parse_uniprop_string(pTHX_
2314723164
qualified name */
2314823165
bool invert_return = FALSE; /* ? Do we need to complement the result before
2314923166
returning it */
23167+
bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23168+
explicit utf8:: package that we strip
23169+
off */
2315023170

2315123171
PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
2315223172

@@ -23205,6 +23225,17 @@ Perl_parse_uniprop_string(pTHX_
2320523225
break;
2320623226
}
2320723227

23228+
/* If this looks like it is a marker we inserted at compile time,
23229+
* ignore it; otherwise keep it as it would have been user input. */
23230+
if ( UNLIKELY(cur == DEFERRED_PROP_EXPANSION_MARKERc)
23231+
&& ! deferrable
23232+
&& could_be_user_defined
23233+
&& i == name_len - 1)
23234+
{
23235+
name_len--;
23236+
continue;
23237+
}
23238+
2320823239
/* Otherwise, this character is part of the name. */
2320923240
lookup_name[j++] = cur;
2321023241

@@ -23238,6 +23269,7 @@ Perl_parse_uniprop_string(pTHX_
2323823269
lookup_name += STRLENs("utf8::");
2323923270
j -= STRLENs("utf8::");
2324023271
equals_pos -= STRLENs("utf8::");
23272+
stripped_utf8_pkg = TRUE;
2324123273
}
2324223274

2324323275
/* Here, we are either done with the whole property name, if it was simple;
@@ -23634,7 +23666,29 @@ Perl_parse_uniprop_string(pTHX_
2363423666
/* Here, the name could be for a user defined property, which are
2363523667
* implemented as subs. */
2363623668
user_sub = get_cvn_flags(name, name_len, 0);
23637-
if (user_sub) {
23669+
if (! user_sub) {
23670+
23671+
/* Here, the property name could be a user-defined one, but there
23672+
* is no subroutine to handle it (as of now). Defer handling it
23673+
* until runtime. Otherwise, a block defined by Unicode in a later
23674+
* release would get the synonym InFoo added for it, and existing
23675+
* code that used that name would suddenly break if it referred to
23676+
* the property before the sub was declared. See [perl #134146] */
23677+
if (deferrable) {
23678+
goto definition_deferred;
23679+
}
23680+
23681+
/* If we haven't already stripped the package name (if one), do so
23682+
* now so can look for an official property with the stripped name.
23683+
* */
23684+
if (! stripped_utf8_pkg) {
23685+
lookup_name += non_pkg_begin;
23686+
j -= non_pkg_begin;
23687+
}
23688+
23689+
/* Drop down to look up in the official properties */
23690+
}
23691+
else {
2363823692
const char insecure[] = "Insecure user-defined property";
2363923693

2364023694
/* Here, there is a sub by the correct name. Normally we call it
@@ -24270,18 +24324,34 @@ Perl_parse_uniprop_string(pTHX_
2427024324

2427124325
definition_deferred:
2427224326

24327+
{
24328+
bool is_qualified = non_pkg_begin != 0; /* If has "::" */
24329+
2427324330
/* Here it could yet to be defined, so defer evaluation of this
2427424331
* until its needed at runtime. We need the fully qualified property name
24275-
* to avoid ambiguity, and a trailing newline */
24332+
* to avoid ambiguity */
2427624333
if (! fq_name) {
2427724334
fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24278-
non_pkg_begin != 0 /* If has "::" */
24279-
);
24335+
is_qualified);
2428024336
}
24337+
24338+
/* If it didn't come with a package, or the package is utf8::, this
24339+
* actually could be an official Unicode property whose inclusion we
24340+
* are deferring until runtime to make sure that it isn't overridden by
24341+
* a user-defined property of the same name (which we haven't
24342+
* encountered yet). Add a marker to indicate this possibility, for
24343+
* use at such time when we first need the definition during pattern
24344+
* matching execution */
24345+
if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
24346+
sv_catpvs(fq_name, DEFERRED_PROP_EXPANSION_MARKERs);
24347+
}
24348+
24349+
/* We also need a trailing newline */
2428124350
sv_catpvs(fq_name, "\n");
2428224351

2428324352
*user_defined_ptr = TRUE;
2428424353
return fq_name;
24354+
}
2428524355
}
2428624356

2428724357
#endif

t/re/regexp_unicode_prop.t

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ BEGIN {
143143
Dash => ['-'],
144144
ASCII_Hex_Digit => ['!-', 'A'],
145145
IsAsciiHexAndDash => ['-', 'A'],
146+
InLatin1 => ['\x{0100}', '!\x{00FF}'],
146147
);
147148

148149
@USER_CASELESS_PROPERTIES = (
@@ -194,12 +195,6 @@ BEGIN {
194195
}
195196
}
196197

197-
# These override the official ones, so if found before defined, the official
198-
# ones prevail, so can't test deferred definition
199-
my @OVERRIDING_USER_DEFINED_PROPERTIES = (
200-
InLatin1 => ['\x{0100}', '!\x{00FF}'],
201-
);
202-
203198
#
204199
# From the short properties we populate POSIX-like classes.
205200
#
@@ -249,8 +244,7 @@ while (my ($class, $chars) = each %SHORT_PROPERTIES) {
249244

250245
push @CLASSES => "# Short properties" => %SHORT_PROPERTIES,
251246
"# POSIX like properties" => %d,
252-
"# User defined properties" => @USER_DEFINED_PROPERTIES,
253-
"# Overriding user defined properties" => @OVERRIDING_USER_DEFINED_PROPERTIES;
247+
"# User defined properties" => @USER_DEFINED_PROPERTIES;
254248

255249

256250
#

uni_keywords.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7283,7 +7283,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
72837283
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
72847284
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
72857285
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
7286-
* 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
7286+
* 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
72877287
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
72887288
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
72897289
* e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl

0 commit comments

Comments
 (0)