Fix tr/// determination of inplace editing for EBCDIC

khwilliamson · khwilliamson · commit 92c678141cc7 · 2020-04-01T08:22:25.000-06:00
I realized as a result of fixing GH #17654, that the code didn't properly decide if a tr/// can be done in-place on EBCDIC platforms. Our test suite passed, but if we had had valgrind, it would have shown failures.
diff --git a/ebcdic_tables.h b/ebcdic_tables.h
@@ -413,60 +413,6 @@ SOFTWARE.
 };
 #  endif
 
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-#  ifndef DOINIT
-    EXTCONST UV PL_partition_by_byte_length[38];
-#  else
-    EXTCONST UV PL_partition_by_byte_length[38] = {
-	0x00,
-	0x41,
-	0x4b,
-	0x51,
-	0x5a,
-	0x62,
-	0x6b,
-	0x70,
-	0x79,
-	0x80,
-	0x81,
-	0x8a,
-	0x91,
-	0x9a,
-	0xa1,
-	0xaa,
-	0xad,
-	0xae,
-	0xbd,
-	0xbe,
-	0xc0,
-	0xca,
-	0xd0,
-	0xda,
-	0xe0,
-	0xe1,
-	0xe2,
-	0xea,
-	0xf0,
-	0xfa,
-	0xff,
-	0x100,
-	0x400,
-	0x4000,
-	0x40000,
-	0x400000,
-	0x4000000,
-	0x40000000
-};
-#  endif
-
 #endif	/* EBCDIC 1047 */
 
 #if 'A' == 193 /* EBCDIC 037 */ \
@@ -845,62 +791,6 @@ SOFTWARE.
 };
 #  endif
 
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-#  ifndef DOINIT
-    EXTCONST UV PL_partition_by_byte_length[40];
-#  else
-    EXTCONST UV PL_partition_by_byte_length[40] = {
-	0x00,
-	0x41,
-	0x4b,
-	0x51,
-	0x5a,
-	0x5f,
-	0x60,
-	0x62,
-	0x6b,
-	0x70,
-	0x79,
-	0x80,
-	0x81,
-	0x8a,
-	0x91,
-	0x9a,
-	0xa1,
-	0xaa,
-	0xb0,
-	0xb1,
-	0xba,
-	0xbc,
-	0xc0,
-	0xca,
-	0xd0,
-	0xda,
-	0xe0,
-	0xe1,
-	0xe2,
-	0xea,
-	0xf0,
-	0xfa,
-	0xff,
-	0x100,
-	0x400,
-	0x4000,
-	0x40000,
-	0x400000,
-	0x4000000,
-	0x40000000
-};
-#  endif
-
 #endif	/* EBCDIC 037 */
 
 #endif /* PERL_EBCDIC_TABLES_H_ */
diff --git a/op.c b/op.c
@@ -6926,6 +6926,106 @@ Perl_invmap_dump(pTHX_ SV* invlist, UV *map)
     }
 }
 
+STATIC bool
+S_expands(UV t_cp, UV t_cp_end, UV r_cp, UV r_cp_end)
+{
+    /* Returns a boolean as to whether or not there is a code point in the r
+     * range (r_cp..r_cp_end) whose UTF-8 representation is larger than its
+     * corresponding code point in the t range (t_cp..t_cp_end).
+     *
+     * This must be run in the first pass, which makes this task trivial on
+     * ASCII platforms due to the special partitioning in that pass, as
+     * explained below.  Any compiler should then inline this function, but
+     * experience has shown that compilation is not a performance bottleneck,
+     * so it isn't a problem even if it doesn't get inlined.
+     *
+     * During the first pass, the t_invlist has been partitioned so that all
+     * elements in any single range have the same number of bytes in their
+     * UTF-8 representations.  And the r space is either a single byte, or a
+     * range of strictly monotonically increasing code points.  So on ASCII
+     * platforms, the final element in the range will be represented by no
+     * fewer bytes than the initial one.  (See below for EBCDIC.) That means
+     * that, on ASCII platforms, if the final code point in the t range has at
+     * least as many bytes as the final code point in the r, then all code
+     * points in the t range have at least as many bytes as their corresponding
+     * r range element.  But if the final r code point has more bytes than the
+     * corresponding t range one, at least that transliteration grows in
+     * length.  As an example, suppose we had
+     *      tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/
+     * The UTF-8 for all but 10000 occupies 3 bytes on ASCII platforms.  We
+     * have deliberately set up the data structure so that any range in the lhs
+     * gets split into chunks for processing, such that every code point in a
+     * chunk has the same number of UTF-8 bytes.  We only have to check the
+     * final code point in the rhs against any code point in the lhs.
+     *
+     * On EBCDIC platforms, the above is true when the final code point of the
+     * t range is above 255.  But ranges below that could have a mixture of one
+     * and two byte UTF-8 representations, so special code is needed for
+     * determining that.
+     */
+
+#ifndef EBCDIC
+
+    /* On ASCII platforms, the lengths needed to represent code points in UTF-8
+     * are monotonically increasing with code point.  Thus if the final code
+     * point in the t range occupies at least as many bytes as the final code
+     * point in the r range, there is no growth */
+    PERL_UNUSED_ARG(t_cp);
+    PERL_UNUSED_ARG(r_cp);
+
+    return UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end);
+
+#else
+
+    /* On EBCDIC, code points below 256 are a mix of 1 and 2 bytes.  A t range
+     * there mapping to an r ending above 255 is conservatively taken to grow
+     * (an invariant may map to 2+ bytes); else compare as on ASCII */
+    if (t_cp_end > 255 || r_cp_end > 255) {
+        return t_cp_end < 256 || UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end);
+    }
+
+    /* Here, both ranges end in 0-255: UTF-8 size is 1 or 2.
+     *
+     * Everything SPACE and below is 1 byte, so can't be larger than the lhs */
+    if (r_cp_end <= ' ') {
+        return FALSE;
+    }
+
+    /* Handle the case of everything on the lhs mapping to the final mapping on
+     * the rhs */
+    if (r_cp == TR_SPECIAL_HANDLING) {
+
+        /* If the final mapping is size 1, then nothing will be less than it */
+        if (UVCHR_IS_INVARIANT(r_cp_end)) {
+            return FALSE;
+        }
+
+        /* Otherwise it is size 2; if anything is size 1, that will grow */
+        while (t_cp <= t_cp_end) {
+            if (UVCHR_IS_INVARIANT(t_cp)) {
+                return TRUE;
+            }
+            t_cp++;
+        }
+
+        return FALSE;
+    }
+
+    /* Handle the general case.  If any character in the lhs is size one, and
+     * it maps to a size two character, it grows */
+    while (t_cp <= t_cp_end) {
+        if (UVCHR_IS_INVARIANT(t_cp) && ! UVCHR_IS_INVARIANT(r_cp)) {
+            return TRUE;
+        }
+        t_cp++; r_cp++;
+    }
+
+    return FALSE;
+
+#endif
+
+}
+
 /* Given an OP_TRANS / OP_TRANSR op o, plus OP_CONST ops expr and repl
  * containing the search and replacement strings, assemble into
  * a translation table attached as o->op_pv.
@@ -7065,13 +7165,17 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
      * done after this has been determined which merges things together to
      * shrink the table for runtime.  For ASCII platforms, the table is
      * trivial, given below, and uses the fundamental characteristics of UTF-8
-     * to construct the values.  For EBCDIC, it isn't so, and we rely on a
-     * table constructed by the perl script that generates these kinds of
-     * things */
-#ifndef EBCDIC
+     * to construct the values.  For EBCDIC, the table is useless for code
+     * points below 256, as they are intermixed in size between 1 and 2.  But
+     * it is the same as ASCII for higher code points, so this just makes the
+     * lower 256 a single pool, and code is executed to tease things apart. */
     UV PL_partition_by_byte_length[] = {
         0,
+#ifdef EBCDIC
+        0x100,  /* Below this is 1 and 2 byte representations */
+#else
         0x80,   /* Below this is 1 byte representations */
+#endif
         (32 * (1UL << (    UTF_ACCUMULATION_SHIFT))),   /* 2 bytes below this */
         (16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))),   /* 3 bytes below this */
         ( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))),   /* 4 bytes below this */
@@ -7085,8 +7189,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 
     };
 
-#endif
-
     PERL_ARGS_ASSERT_PMTRANS;
 
     PL_hints |= HINT_BLOCK_SCOPE;
@@ -7516,30 +7618,10 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                  * longer than it.  If none, the transliteration may be done
                  * in-place, as it can't write over a so-far unread byte.
                  * Otherwise, a copy must first be made.  This could be
-                 * expensive for long inputs.
-                 *
-                 * In the first pass, the t_invlist has been partitioned so
-                 * that all elements in any single range have the same number
-                 * of bytes in their UTF-8 representations.  And the r space is
-                 * either a single byte, or a range of strictly monotonically
-                 * increasing code points.  So the final element in the range
-                 * will be represented by no fewer bytes than the initial one.
-                 * That means that if the final code point in the t range has
-                 * at least as many bytes as the final code point in the r,
-                 * then all code points in the t range have at least as many
-                 * bytes as their corresponding r range element.  But if that's
-                 * not true, the transliteration of at least the final code
-                 * point grows in length.  As an example, suppose we had
-                 *      tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/
-                 * The UTF-8 for all but 10000 occupies 3 bytes on ASCII
-                 * platforms.  We have deliberately set up the data structure
-                 * so that any range in the lhs gets split into chunks for
-                 * processing, such that every code point in a chunk has the
-                 * same number of UTF-8 bytes.  We only have to check the final
-                 * code point in the rhs against any code point in the lhs. */
+                 * expensive for long inputs. */
                 if ( ! pass2
                     && r_cp_end != TR_SPECIAL_HANDLING
-                    && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end))
+                    && S_expands(t_cp, t_cp_end, r_cp, r_cp_end))
                 {
                     /* Here, we will need to make a copy of the input string
                      * before doing the transliteration.  The worst possible
diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl
@@ -779,56 +779,6 @@ END
         output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
     }
 
-    {
-        print $out_fh <<EOF;
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-EOF
-        # The lengths of the characters between 0 and 255 are either 1 or 2,
-        # with those whose ASCII platform equivalents below 160 being 1, and
-        # the rest being 2.
-        my @list;
-        push @list, 0;
-        my $pushed_range_is_length_1 = 1;
-
-        for my $i (1 .. 0xFF) {
-            my $this_code_point_is_length_1 = ($e2a[$i] < 160);
-            if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
-                push @list, $i;
-                $pushed_range_is_length_1 = $this_code_point_is_length_1;
-            }
-        }
-
-        # Starting at 256, the length is 2.
-        push @list, 0x100 if $pushed_range_is_length_1;
-
-        # These are based on the fundamental properties of UTF-EBCDIC.  Each
-        # continuation byte has 5 bits of information.  Comments in utf8.h
-        # explain the rest.
-        my $UTF_ACCUMULATION_SHIFT = 5;
-        push @list, (32 * (1 << (    $UTF_ACCUMULATION_SHIFT)));
-        push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, (     (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
-
-        output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list);
-        print $out_fh "\t";
-
-        print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
-        print $out_fh "\n";
-
-        output_table_end($out_fh);
-    }
-
     print $out_fh get_conditional_compile_line_end();
 }