@@ -6926,6 +6926,106 @@ Perl_invmap_dump(pTHX_ SV* invlist, UV *map)
6926
6926
}
6927
6927
}
6928
6928
6929
+ STATIC bool
6930
+ S_expands(UV t_cp, UV t_cp_end, UV r_cp, UV r_cp_end)
6931
+ {
6932
+ /* Returns a boolean as to whether or not there is a code point in the r
6933
+ * range (r_cp..r_cp_end) whose UTF-8 representation is larger than its
6934
+ * corresponding code point in the t range.
6935
+ *
6936
+ * This must be run in the first pass, which makes this task trivial on
6937
+ * ASCII platforms due to the special partitioning in that pass, as
6938
+ * explained below. Any compiler should then inline this function, but
6939
+ * experience has shown that compilation is not a performance bottleneck,
6940
+ * so it isn't a problem even if it doesn't get inlined.
6941
+ *
6942
+ * During the first pass, the t_invlist has been partitioned so that all
6943
+ * elements in any single range have the same number of bytes in their
6944
+ * UTF-8 representations. And the r space is either a single byte, or a
6945
+ * range of strictly monotonically increasing code points. So on ASCII
6946
+ * platforms, the final element in the range will be represented by no
6947
+ * fewer bytes than the initial one. (See below for EBCDIC.) That means
6948
+ * that, on ASCII platforms, if the final code point in the t range has at
6949
+ * least as many bytes as the final code point in the r, then all code
6950
+ * points in the t range have at least as many bytes as their corresponding
6951
+ * r range element. But if the final code point has more bytes than the
6952
+ * corresponding t range one, at least that transliteration grows in
6953
+ * length. As an example, suppose we had
6954
+ * tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/
6955
+ * The UTF-8 for all but 10000 occupies 3 bytes on ASCII platforms. We
6956
+ * have deliberately set up the data structure so that any range in the lhs
6957
+ * gets split into chunks for processing, such that every code point in a
6958
+ * chunk has the same number of UTF-8 bytes. We only have to check the
6959
+ * final code point in the rhs against any code point in the lhs.
6960
+ *
6961
+ * On EBCDIC platforms, the above is true for any r range whose final code
6962
+ * point is above 255. But ranges below it could have a mixture of one and
6963
+ * two byte UTF-8 representations, so special code is needed for
6964
+ * determining that.
6965
+ */
6966
+
6967
+ #ifndef EBCDIC
6968
+
6969
+ /* On ASCII platforms, the lengths needed to represent code points in UTF-8
6970
+ * are monotonically increasing with code point. Thus if the final code
6971
+ * point in the t range is not greater than the corresponding final code
6972
+ * point in the r range, there is no growth */
6973
+ PERL_UNUSED_ARG(t_cp);
6974
+ PERL_UNUSED_ARG(r_cp);
6975
+
6976
+ return UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end);
6977
+
6978
+ #else
6979
+
6980
+ /* But on EBCDIC platforms, there is a mixture of 1 and 2 byte
6981
+ * representations for characters below 256. But above that, everything
6982
+ * behaves like the ASCII case */
6983
+ if (t_cp_end > 255 || r_cp_end > 255) {
6984
+ return UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end);
6985
+ }
6986
+
6987
+ /* Here, is in range 0-255: UTF-8 size is 1 or 2.
6988
+ *
6989
+ * Everything SPACE and below is 1 byte, so can't be larger than the lhs */
6990
+ if (r_cp_end <= ' ') {
6991
+ return FALSE;
6992
+ }
6993
+
6994
+ /* Handle the case of everything on the lhs mapping to the final mapping on
6995
+ * the rhs */
6996
+ if (r_cp == TR_SPECIAL_HANDLING) {
6997
+
6998
+ /* If the final mapping is size 1, then nothing will be less than it */
6999
+ if (UVCHR_IS_INVARIANT(r_cp_end)) {
7000
+ return FALSE;
7001
+ }
7002
+
7003
+ /* Otherwise it is size 2; if anything is size 1, that will grow */
7004
+ while (t_cp <= t_cp_end) {
7005
+ if (UVCHR_IS_INVARIANT(t_cp)) {
7006
+ return TRUE;
7007
+ }
7008
+ t_cp++;
7009
+ }
7010
+
7011
+ return FALSE;
7012
+ }
7013
+
7014
+ /* Handle the general case. If any character in the lhs is size one, and
7015
+ * it maps to a size two character, it grows */
7016
+ while (t_cp <= t_cp_end) {
7017
+ if (! UVCHR_IS_INVARIANT(t_cp) && UVCHR_IS_INVARIANT(r_cp)) {
7018
+ return TRUE;
7019
+ }
7020
+ t_cp++; r_cp++;
7021
+ }
7022
+
7023
+ return FALSE;
7024
+
7025
+ #endif
7026
+
7027
+ }
7028
+
6929
7029
/* Given an OP_TRANS / OP_TRANSR op o, plus OP_CONST ops expr and repl
6930
7030
* containing the search and replacement strings, assemble into
6931
7031
* a translation table attached as o->op_pv.
@@ -7065,13 +7165,17 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7065
7165
* done after this has been determined which merges things together to
7066
7166
* shrink the table for runtime. For ASCII platforms, the table is
7067
7167
* trivial, given below, and uses the fundamental characteristics of UTF-8
7068
- * to construct the values. For EBCDIC, it isn't so, and we rely on a
7069
- * table constructed by the perl script that generates these kinds of
7070
- * things */
7071
- #ifndef EBCDIC
7168
+ * to construct the values. For EBCDIC, the table is useless for code
7169
+ * points below 256, as they are intermixed in size between 1 and 2. But
7170
+ * it is the same as ASCII for higher code points, so this just makes the
7171
+ * lower 256 a single pool, and code is executed to tease things apart. */
7072
7172
UV PL_partition_by_byte_length[] = {
7073
7173
0,
7174
+ #ifdef EBCDIC
7175
+ 0x100, /* Below this is 1 and 2 byte representations */
7176
+ #else
7074
7177
0x80, /* Below this is 1 byte representations */
7178
+ #endif
7075
7179
(32 * (1UL << ( UTF_ACCUMULATION_SHIFT))), /* 2 bytes below this */
7076
7180
(16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))), /* 3 bytes below this */
7077
7181
( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))), /* 4 bytes below this */
@@ -7085,8 +7189,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7085
7189
7086
7190
};
7087
7191
7088
- #endif
7089
-
7090
7192
PERL_ARGS_ASSERT_PMTRANS;
7091
7193
7092
7194
PL_hints |= HINT_BLOCK_SCOPE;
@@ -7516,30 +7618,10 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7516
7618
* longer than it. If none, the transliteration may be done
7517
7619
* in-place, as it can't write over a so-far unread byte.
7518
7620
* Otherwise, a copy must first be made. This could be
7519
- * expensive for long inputs.
7520
- *
7521
- * In the first pass, the t_invlist has been partitioned so
7522
- * that all elements in any single range have the same number
7523
- * of bytes in their UTF-8 representations. And the r space is
7524
- * either a single byte, or a range of strictly monotonically
7525
- * increasing code points. So the final element in the range
7526
- * will be represented by no fewer bytes than the initial one.
7527
- * That means that if the final code point in the t range has
7528
- * at least as many bytes as the final code point in the r,
7529
- * then all code points in the t range have at least as many
7530
- * bytes as their corresponding r range element. But if that's
7531
- * not true, the transliteration of at least the final code
7532
- * point grows in length. As an example, suppose we had
7533
- * tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/
7534
- * The UTF-8 for all but 10000 occupies 3 bytes on ASCII
7535
- * platforms. We have deliberately set up the data structure
7536
- * so that any range in the lhs gets split into chunks for
7537
- * processing, such that every code point in a chunk has the
7538
- * same number of UTF-8 bytes. We only have to check the final
7539
- * code point in the rhs against any code point in the lhs. */
7621
+ * expensive for long inputs. */
7540
7622
if ( ! pass2
7541
7623
&& r_cp_end != TR_SPECIAL_HANDLING
7542
- && UVCHR_SKIP( t_cp_end) < UVCHR_SKIP( r_cp_end))
7624
+ && S_expands(t_cp, t_cp_end, r_cp, r_cp_end))
7543
7625
{
7544
7626
/* Here, we will need to make a copy of the input string
7545
7627
* before doing the transliteration. The worst possible
0 commit comments