Skip to content

Commit 92c6781

Browse files
committed
Fix tr/// determination of inplace editing for EBCDIC
I realized as a result of fixing GH #17654, that the code didn't properly decide if a tr/// can be done in-place on EBCDIC platforms. Our test suite passed, but if we had had valgrind, it would have shown failuress.
1 parent 0e6f299 commit 92c6781

File tree

3 files changed

+110
-188
lines changed

3 files changed

+110
-188
lines changed

ebcdic_tables.h

Lines changed: 0 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -413,60 +413,6 @@ SOFTWARE.
413413
};
414414
# endif
415415

416-
/* This table partitions all the code points of the platform into ranges which
417-
* have the property that all the code points in each range have the same
418-
* number of bytes in their UTF-EBCDIC representations, and the adjacent
419-
* ranges have a different number of bytes.
420-
*
421-
* Each number in the table begins such a range, which extends up to just
422-
* before the following table entry, except the final entry is understood to
423-
* extend to the platform's infinity
424-
*/
425-
# ifndef DOINIT
426-
EXTCONST UV PL_partition_by_byte_length[38];
427-
# else
428-
EXTCONST UV PL_partition_by_byte_length[38] = {
429-
0x00,
430-
0x41,
431-
0x4b,
432-
0x51,
433-
0x5a,
434-
0x62,
435-
0x6b,
436-
0x70,
437-
0x79,
438-
0x80,
439-
0x81,
440-
0x8a,
441-
0x91,
442-
0x9a,
443-
0xa1,
444-
0xaa,
445-
0xad,
446-
0xae,
447-
0xbd,
448-
0xbe,
449-
0xc0,
450-
0xca,
451-
0xd0,
452-
0xda,
453-
0xe0,
454-
0xe1,
455-
0xe2,
456-
0xea,
457-
0xf0,
458-
0xfa,
459-
0xff,
460-
0x100,
461-
0x400,
462-
0x4000,
463-
0x40000,
464-
0x400000,
465-
0x4000000,
466-
0x40000000
467-
};
468-
# endif
469-
470416
#endif /* EBCDIC 1047 */
471417

472418
#if 'A' == 193 /* EBCDIC 037 */ \
@@ -845,62 +791,6 @@ SOFTWARE.
845791
};
846792
# endif
847793

848-
/* This table partitions all the code points of the platform into ranges which
849-
* have the property that all the code points in each range have the same
850-
* number of bytes in their UTF-EBCDIC representations, and the adjacent
851-
* ranges have a different number of bytes.
852-
*
853-
* Each number in the table begins such a range, which extends up to just
854-
* before the following table entry, except the final entry is understood to
855-
* extend to the platform's infinity
856-
*/
857-
# ifndef DOINIT
858-
EXTCONST UV PL_partition_by_byte_length[40];
859-
# else
860-
EXTCONST UV PL_partition_by_byte_length[40] = {
861-
0x00,
862-
0x41,
863-
0x4b,
864-
0x51,
865-
0x5a,
866-
0x5f,
867-
0x60,
868-
0x62,
869-
0x6b,
870-
0x70,
871-
0x79,
872-
0x80,
873-
0x81,
874-
0x8a,
875-
0x91,
876-
0x9a,
877-
0xa1,
878-
0xaa,
879-
0xb0,
880-
0xb1,
881-
0xba,
882-
0xbc,
883-
0xc0,
884-
0xca,
885-
0xd0,
886-
0xda,
887-
0xe0,
888-
0xe1,
889-
0xe2,
890-
0xea,
891-
0xf0,
892-
0xfa,
893-
0xff,
894-
0x100,
895-
0x400,
896-
0x4000,
897-
0x40000,
898-
0x400000,
899-
0x4000000,
900-
0x40000000
901-
};
902-
# endif
903-
904794
#endif /* EBCDIC 037 */
905795

906796
#endif /* PERL_EBCDIC_TABLES_H_ */

op.c

Lines changed: 110 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6926,6 +6926,106 @@ Perl_invmap_dump(pTHX_ SV* invlist, UV *map)
69266926
}
69276927
}
69286928

6929+
STATIC bool
6930+
S_expands(UV t_cp, UV t_cp_end, UV r_cp, UV r_cp_end)
6931+
{
6932+
/* Returns a boolean as to whether or not there is a code point in the r
6933+
* range (r_cp..r_cp_end) whose UTF-8 representation is larger than its
6934+
* corresponding code point in the t range.
6935+
*
6936+
* This must be run in the first pass, which makes this task trivial on
6937+
* ASCII platforms due to the special partitioning in that pass, as
6938+
* explained below. Any compiler should then inline this function, but
6939+
* experience has shown that compilation is not a performance bottleneck,
6940+
* so it isn't a problem even if it doesn't get inlined.
6941+
*
6942+
* During the first pass, the t_invlist has been partitioned so that all
6943+
* elements in any single range have the same number of bytes in their
6944+
* UTF-8 representations. And the r space is either a single byte, or a
6945+
* range of strictly monotonically increasing code points. So on ASCII
6946+
* platforms, the final element in the range will be represented by no
6947+
* fewer bytes than the initial one. (See below for EBCDIC.) That means
6948+
* that, on ASCII platforms, if the final code point in the t range has at
6949+
* least as many bytes as the final code point in the r, then all code
6950+
* points in the t range have at least as many bytes as their corresponding
6951+
* r range element. But if the final code point has more bytes than the
6952+
* corresponding t range one, at least that transliteration grows in
6953+
* length. As an example, suppose we had
6954+
* tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/
6955+
* The UTF-8 for all but 10000 occupies 3 bytes on ASCII platforms. We
6956+
* have deliberately set up the data structure so that any range in the lhs
6957+
* gets split into chunks for processing, such that every code point in a
6958+
* chunk has the same number of UTF-8 bytes. We only have to check the
6959+
* final code point in the rhs against any code point in the lhs.
6960+
*
6961+
* On EBCDIC platforms, the above is true for any r range whose final code
6962+
* point is above 255. But ranges below it could have a mixture of one and
6963+
* two byte UTF-8 representations, so special code is needed for
6964+
* determining that.
6965+
*/
6966+
6967+
#ifndef EBCDIC
6968+
6969+
/* On ASCII platforms, the lengths needed to represent code points in UTF-8
6970+
* are monotonically increasing with code point. Thus if the final code
6971+
* point in the t range is not greater than the corresponding final code
6972+
* point in the r range, there is no growth */
6973+
PERL_UNUSED_ARG(t_cp);
6974+
PERL_UNUSED_ARG(r_cp);
6975+
6976+
return UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end);
6977+
6978+
#else
6979+
6980+
/* But on EBCDIC platforms, there is a mixture of 1 and 2 byte
6981+
* representations for characters below 256. But above that, everything
6982+
* behaves like the ASCII case */
6983+
if (t_cp_end > 255 || r_cp_end > 255) {
6984+
return UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end);
6985+
}
6986+
6987+
/* Here, is in range 0-255: UTF-8 size is 1 or 2.
6988+
*
6989+
* Everything SPACE and below is 1 byte, so can't be larger than the lhs */
6990+
if (r_cp_end <= ' ') {
6991+
return FALSE;
6992+
}
6993+
6994+
/* Handle the case of everything on the lhs mapping to the final mapping on
6995+
* the rhs */
6996+
if (r_cp == TR_SPECIAL_HANDLING) {
6997+
6998+
/* If the final mapping is size 1, then nothing will be less than it */
6999+
if (UVCHR_IS_INVARIANT(r_cp_end)) {
7000+
return FALSE;
7001+
}
7002+
7003+
/* Otherwise it is size 2; if anything is size 1, that will grow */
7004+
while (t_cp <= t_cp_end) {
7005+
if (UVCHR_IS_INVARIANT(t_cp)) {
7006+
return TRUE;
7007+
}
7008+
t_cp++;
7009+
}
7010+
7011+
return FALSE;
7012+
}
7013+
7014+
/* Handle the general case. If any character in the lhs is size one, and
7015+
* it maps to a size two character, it grows */
7016+
while (t_cp <= t_cp_end) {
7017+
if (! UVCHR_IS_INVARIANT(t_cp) && UVCHR_IS_INVARIANT(r_cp)) {
7018+
return TRUE;
7019+
}
7020+
t_cp++; r_cp++;
7021+
}
7022+
7023+
return FALSE;
7024+
7025+
#endif
7026+
7027+
}
7028+
69297029
/* Given an OP_TRANS / OP_TRANSR op o, plus OP_CONST ops expr and repl
69307030
* containing the search and replacement strings, assemble into
69317031
* a translation table attached as o->op_pv.
@@ -7065,13 +7165,17 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
70657165
* done after this has been determined which merges things together to
70667166
* shrink the table for runtime. For ASCII platforms, the table is
70677167
* trivial, given below, and uses the fundamental characteristics of UTF-8
7068-
* to construct the values. For EBCDIC, it isn't so, and we rely on a
7069-
* table constructed by the perl script that generates these kinds of
7070-
* things */
7071-
#ifndef EBCDIC
7168+
* to construct the values. For EBCDIC, the table is useless for code
7169+
* points below 256, as they are intermixed in size between 1 and 2. But
7170+
* it is the same as ASCII for higher code points, so this just makes the
7171+
* lower 256 a single pool, and code is executed to tease things apart. */
70727172
UV PL_partition_by_byte_length[] = {
70737173
0,
7174+
#ifdef EBCDIC
7175+
0x100, /* Below this is 1 and 2 byte representations */
7176+
#else
70747177
0x80, /* Below this is 1 byte representations */
7178+
#endif
70757179
(32 * (1UL << ( UTF_ACCUMULATION_SHIFT))), /* 2 bytes below this */
70767180
(16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))), /* 3 bytes below this */
70777181
( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))), /* 4 bytes below this */
@@ -7085,8 +7189,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
70857189

70867190
};
70877191

7088-
#endif
7089-
70907192
PERL_ARGS_ASSERT_PMTRANS;
70917193

70927194
PL_hints |= HINT_BLOCK_SCOPE;
@@ -7516,30 +7618,10 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
75167618
* longer than it. If none, the transliteration may be done
75177619
* in-place, as it can't write over a so-far unread byte.
75187620
* Otherwise, a copy must first be made. This could be
7519-
* expensive for long inputs.
7520-
*
7521-
* In the first pass, the t_invlist has been partitioned so
7522-
* that all elements in any single range have the same number
7523-
* of bytes in their UTF-8 representations. And the r space is
7524-
* either a single byte, or a range of strictly monotonically
7525-
* increasing code points. So the final element in the range
7526-
* will be represented by no fewer bytes than the initial one.
7527-
* That means that if the final code point in the t range has
7528-
* at least as many bytes as the final code point in the r,
7529-
* then all code points in the t range have at least as many
7530-
* bytes as their corresponding r range element. But if that's
7531-
* not true, the transliteration of at least the final code
7532-
* point grows in length. As an example, suppose we had
7533-
* tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/
7534-
* The UTF-8 for all but 10000 occupies 3 bytes on ASCII
7535-
* platforms. We have deliberately set up the data structure
7536-
* so that any range in the lhs gets split into chunks for
7537-
* processing, such that every code point in a chunk has the
7538-
* same number of UTF-8 bytes. We only have to check the final
7539-
* code point in the rhs against any code point in the lhs. */
7621+
* expensive for long inputs. */
75407622
if ( ! pass2
75417623
&& r_cp_end != TR_SPECIAL_HANDLING
7542-
&& UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end))
7624+
&& S_expands(t_cp, t_cp_end, r_cp, r_cp_end))
75437625
{
75447626
/* Here, we will need to make a copy of the input string
75457627
* before doing the transliteration. The worst possible

regen/ebcdic.pl

Lines changed: 0 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -779,56 +779,6 @@ END
779779
output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
780780
}
781781

782-
{
783-
print $out_fh <<EOF;
784-
/* This table partitions all the code points of the platform into ranges which
785-
* have the property that all the code points in each range have the same
786-
* number of bytes in their UTF-EBCDIC representations, and the adjacent
787-
* ranges have a different number of bytes.
788-
*
789-
* Each number in the table begins such a range, which extends up to just
790-
* before the following table entry, except the final entry is understood to
791-
* extend to the platform's infinity
792-
*/
793-
EOF
794-
# The lengths of the characters between 0 and 255 are either 1 or 2,
795-
# with those whose ASCII platform equivalents below 160 being 1, and
796-
# the rest being 2.
797-
my @list;
798-
push @list, 0;
799-
my $pushed_range_is_length_1 = 1;
800-
801-
for my $i (1 .. 0xFF) {
802-
my $this_code_point_is_length_1 = ($e2a[$i] < 160);
803-
if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
804-
push @list, $i;
805-
$pushed_range_is_length_1 = $this_code_point_is_length_1;
806-
}
807-
}
808-
809-
# Starting at 256, the length is 2.
810-
push @list, 0x100 if $pushed_range_is_length_1;
811-
812-
# These are based on the fundamental properties of UTF-EBCDIC. Each
813-
# continuation byte has 5 bits of information. Comments in utf8.h
814-
# explain the rest.
815-
my $UTF_ACCUMULATION_SHIFT = 5;
816-
push @list, (32 * (1 << ( $UTF_ACCUMULATION_SHIFT)));
817-
push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
818-
push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
819-
push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
820-
push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
821-
push @list, ( (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
822-
823-
output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list);
824-
print $out_fh "\t";
825-
826-
print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
827-
print $out_fh "\n";
828-
829-
output_table_end($out_fh);
830-
}
831-
832782
print $out_fh get_conditional_compile_line_end();
833783
}
834784

0 commit comments

Comments
 (0)