Skip to content

Commit 5d4a52b

Browse files
committed
grok_atoUV: allow non-C strings and document
This changes the internal function grok_atoUV() so that it no longer requires its input to be NUL-terminated. That means the existing calls to it must be changed to set the ending position before calling it, as some already did. A couple of pods recommend using this function, but it wasn't documented in perlintern; this commit adds that documentation as well.
1 parent 6928bed commit 5d4a52b

File tree

10 files changed

+72
-39
lines changed

10 files changed

+72
-39
lines changed

ext/XS-APItest/APItest.pm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use strict;
55
use warnings;
66
use Carp;
77

8-
our $VERSION = '0.98';
8+
our $VERSION = '0.99';
99

1010
require XSLoader;
1111

ext/XS-APItest/numeric.xs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ grok_atoUV(number, endsv)
4040
const char *pv = SvPV(number, len);
4141
UV value = 0xdeadbeef;
4242
bool result;
43-
const char* endptr = NULL;
43+
const char* endptr = pv + len;
4444
PPCODE:
4545
EXTEND(SP,2);
4646
if (endsv == &PL_sv_undef) {

mg.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3170,7 +3170,7 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
31703170
{
31713171
const char *p = SvPV_const(sv, len);
31723172
Groups_t *gary = NULL;
3173-
const char* endptr;
3173+
const char* endptr = p + len;
31743174
UV uv;
31753175
#ifdef _SC_NGROUPS_MAX
31763176
int maxgrp = sysconf(_SC_NGROUPS_MAX);

numeric.c

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,31 +1049,39 @@ Perl_grok_number_flags(pTHX_ const char *pv, STRLEN len, UV *valuep, U32 flags)
10491049
}
10501050

10511051
/*
1052-
grok_atoUV
1052+
=for apidoc grok_atoUV
10531053
1054-
grok_atoUV parses a C-style zero-byte terminated string, looking for
1055-
a decimal unsigned integer.
1054+
Parse a string, looking for a decimal unsigned integer.
10561055
1057-
Returns the unsigned integer, if a valid value can be parsed
1058-
from the beginning of the string.
1056+
On entry, C<pv> points to the beginning of the string;
1057+
C<valptr> points to a UV that will receive the converted value, if found;
1058+
C<endptr> is either NULL or points to a variable that points to one byte
1059+
beyond the point in C<pv> that this routine should examine.
1060+
If C<endptr> is NULL, C<pv> is assumed to be NUL-terminated.
10591061
1060-
Accepts only the decimal digits '0'..'9'.
1062+
Returns FALSE if C<pv> doesn't represent a valid unsigned integer value (with
1063+
no leading zeros). Otherwise it returns TRUE, and sets C<*valptr> to that
1064+
value.
10611065
1062-
As opposed to atoi or strtol, grok_atoUV does NOT allow optional
1063-
leading whitespace, or negative inputs. If such features are
1064-
required, the calling code needs to explicitly implement those.
1066+
If you constrain the portion of C<pv> that is looked at by this function (by
1067+
passing a non-NULL C<endptr>), and if the initial bytes of that portion form a
1068+
valid value, it will return TRUE, setting C<*endptr> to the byte following the
1069+
final digit of the value.  But if there is no constraint on what's looked at,
1070+
all of C<pv> must be valid in order for TRUE to be returned.
10651071
1066-
Returns true if a valid value could be parsed. In that case, valptr
1067-
is set to the parsed value, and endptr (if provided) is set to point
1068-
to the character after the last digit.
1072+
The only characters this accepts are the decimal digits '0'..'9'.
10691073
1070-
Returns false otherwise. This can happen if a) there is a leading zero
1071-
followed by another digit; b) the digits would overflow a UV; or c)
1072-
there are trailing non-digits AND endptr is not provided.
1074+
As opposed to L<atoi(3)> or L<strtol(3)>, C<grok_atoUV> does NOT allow optional
1075+
leading whitespace, nor negative inputs. If such features are required, the
1076+
calling code needs to explicitly implement those.
10731077
1074-
Background: atoi has severe problems with illegal inputs, it cannot be
1078+
Note that this function returns FALSE for inputs that would overflow a UV,
1079+
or have leading zeros. Thus a single C<0> is accepted, but not C<00> nor
1080+
C<01>, C<002>, I<etc>.
1081+
1082+
Background: C<atoi> has severe problems with illegal inputs, it cannot be
10751083
used for incremental parsing, and therefore should be avoided.
1076-
atoi and strtol are also affected by locale settings, which can also be
1084+
C<atoi> and C<strtol> are also affected by locale settings, which can also be
10771085
seen as a bug (global state controlled by user environment).
10781086
10791087
*/
@@ -1088,15 +1096,27 @@ Perl_grok_atoUV(const char *pv, UV *valptr, const char** endptr)
10881096

10891097
PERL_ARGS_ASSERT_GROK_ATOUV;
10901098

1091-
eptr = endptr ? endptr : &end2;
1092-
if (isDIGIT(*s)) {
1099+
if (endptr) {
1100+
eptr = endptr;
1101+
}
1102+
else {
1103+
end2 = s + strlen(s);
1104+
eptr = &end2;
1105+
}
1106+
1107+
if ( *eptr <= s
1108+
|| ! isDIGIT(*s))
1109+
{
1110+
return FALSE;
1111+
}
1112+
10931113
/* Single-digit inputs are quite common. */
10941114
val = *s++ - '0';
1095-
if (isDIGIT(*s)) {
1115+
if (s < *eptr && isDIGIT(*s)) {
10961116
/* Fail on extra leading zeros. */
10971117
if (val == 0)
10981118
return FALSE;
1099-
while (isDIGIT(*s)) {
1119+
while (s < *eptr && isDIGIT(*s)) {
11001120
/* This could be unrolled like in grok_number(), but
11011121
* the expected uses of this are not speed-needy, and
11021122
* unlikely to need full 64-bitness. */
@@ -1109,12 +1129,14 @@ Perl_grok_atoUV(const char *pv, UV *valptr, const char** endptr)
11091129
}
11101130
}
11111131
}
1132+
if (endptr == NULL) {
1133+
if (*s) {
1134+
return FALSE; /* If endptr is NULL, no trailing non-digits allowed. */
1135+
}
1136+
}
1137+
else {
1138+
*endptr = s;
11121139
}
1113-
if (s == pv)
1114-
return FALSE;
1115-
if (endptr == NULL && *s)
1116-
return FALSE; /* If endptr is NULL, no trailing non-digits allowed. */
1117-
*eptr = s;
11181140
*valptr = val;
11191141
return TRUE;
11201142
}

perl.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3351,7 +3351,7 @@ Perl_get_debug_opts(pTHX_ const char **s, bool givehelp)
33513351
}
33523352
}
33533353
else if (isDIGIT(**s)) {
3354-
const char* e;
3354+
const char* e = *s + strlen(*s);
33553355
if (grok_atoUV(*s, &uv, &e))
33563356
*s = e;
33573357
for (; isWORDCHAR(**s); (*s)++) ;
@@ -3946,6 +3946,7 @@ S_open_script(pTHX_ const char *scriptname, bool dosearch, bool *suidscript)
39463946
UV uv;
39473947
/* if find_script() returns, it returns a malloc()-ed value */
39483948
scriptname = PL_origfilename = find_script(scriptname, dosearch, NULL, 1);
3949+
s = scriptname + strlen(scriptname);
39493950

39503951
if (strBEGINs(scriptname, "/dev/fd/")
39513952
&& isDIGIT(scriptname[8])

pod/perlclib.pod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,8 @@ C<toUPPER_uni>, as described in L<perlapi/Character case changing>.)
211211

212212
Typical use is to do range checks on C<uv> before casting:
213213

214-
int i; UV uv; char* end_ptr;
214+
int i; UV uv;
215+
char* end_ptr = input_end;
215216
if (grok_atoUV(input, &uv, &end_ptr)
216217
&& uv <= INT_MAX)
217218
i = (int)uv;

regcomp.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11247,6 +11247,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
1124711247
RExC_parse++;
1124811248
is_neg = TRUE;
1124911249
}
11250+
endptr = RExC_end;
1125011251
if (grok_atoUV(RExC_parse, &unum, &endptr)
1125111252
&& unum <= I32_MAX
1125211253
) {
@@ -11485,6 +11486,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
1148511486
}
1148611487
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
1148711488
UV uv;
11489+
endptr = RExC_end;
1148811490
if (grok_atoUV(RExC_parse, &uv, &endptr)
1148911491
&& uv <= I32_MAX
1149011492
) {
@@ -11520,6 +11522,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
1152011522
/* (?(1)...) */
1152111523
char c;
1152211524
UV uv;
11525+
endptr = RExC_end;
1152311526
if (grok_atoUV(RExC_parse, &uv, &endptr)
1152411527
&& uv <= I32_MAX
1152511528
) {
@@ -12029,6 +12032,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1202912032
maxpos = next;
1203012033
RExC_parse++;
1203112034
if (isDIGIT(*RExC_parse)) {
12035+
endptr = RExC_end;
1203212036
if (!grok_atoUV(RExC_parse, &uv, &endptr))
1203312037
vFAIL("Invalid quantifier in {,}");
1203412038
if (uv >= REG_INFTY)
@@ -12042,6 +12046,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1204212046
else
1204312047
maxpos = RExC_parse;
1204412048
if (isDIGIT(*maxpos)) {
12049+
endptr = RExC_end;
1204512050
if (!grok_atoUV(maxpos, &uv, &endptr))
1204612051
vFAIL("Invalid quantifier in {,}");
1204712052
if (uv >= REG_INFTY)
@@ -12799,9 +12804,9 @@ S_new_regcurly(const char *s, const char *e)
1279912804
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
1280012805

1280112806
static I32
12802-
S_backref_value(char *p)
12807+
S_backref_value(char *p, char *e)
1280312808
{
12804-
const char* endptr;
12809+
const char* endptr = e;
1280512810
UV val;
1280612811
if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
1280712812
return (I32)val;
@@ -13347,7 +13352,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1334713352
if (RExC_parse >= RExC_end) {
1334813353
goto unterminated_g;
1334913354
}
13350-
num = S_backref_value(RExC_parse);
13355+
num = S_backref_value(RExC_parse, RExC_end);
1335113356
if (num == 0)
1335213357
vFAIL("Reference to invalid group 0");
1335313358
else if (num == I32_MAX) {
@@ -13365,7 +13370,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1336513370
}
1336613371
}
1336713372
else {
13368-
num = S_backref_value(RExC_parse);
13373+
num = S_backref_value(RExC_parse, RExC_end);
1336913374
/* bare \NNN might be backref or octal - if it is larger
1337013375
* than or equal RExC_npar then it is assumed to be an
1337113376
* octal escape. Note RExC_npar is +1 from the actual
@@ -13742,7 +13747,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
1374213747
/* NOTE, RExC_npar is 1 more than the actual number of
1374313748
* parens we have seen so far, hence the < RExC_npar below. */
1374413749

13745-
if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
13750+
if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
1374613751
{ /* Not to be treated as an octal constant, go
1374713752
find backref */
1374813753
--p;

t/porting/known_pod_issues.dat

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Apache::SmallProf
2323
Archive::Extract
2424
Array::Base
2525
atan2(3)
26+
atoi(3)
2627
Attribute::Constant
2728
autobox
2829
B::Generate
@@ -283,6 +284,7 @@ strftime(3)
283284
strictures
284285
String::Base
285286
String::Scanf
287+
strtol(3)
286288
Switch
287289
tar(1)
288290
Template::Declare

utf8.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5201,6 +5201,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
52015201
/* Get the 0th element, which is needed to setup the inversion list
52025202
* */
52035203
while (isSPACE(*l)) l++;
5204+
after_atou = (char *) lend;
52045205
if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
52055206
Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
52065207
" inversion list");
@@ -5217,6 +5218,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
52175218
" elements than available", elements);
52185219
}
52195220
while (isSPACE(*l)) l++;
5221+
after_atou = (char *) lend;
52205222
if (!grok_atoUV((const char *)l, other_elements_ptr++,
52215223
&after_atou))
52225224
{

util.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4320,7 +4320,7 @@ Perl_parse_unicode_opts(pTHX_ const char **popt)
43204320

43214321
if (*p) {
43224322
if (isDIGIT(*p)) {
4323-
const char* endptr;
4323+
const char* endptr = p + strlen(p);
43244324
UV uv;
43254325
if (grok_atoUV(p, &uv, &endptr) && uv <= U32_MAX) {
43264326
opt = (U32)uv;
@@ -4707,7 +4707,7 @@ S_mem_log_common(enum mem_log_type mlt, const UV n,
47074707
* timeval. */
47084708
{
47094709
STRLEN len;
4710-
const char* endptr;
4710+
const char* endptr = pmlenv + stren(pmlenv);
47114711
int fd;
47124712
UV uv;
47134713
if (grok_atoUV(pmlenv, &uv, &endptr) /* Ignore endptr. */
@@ -5989,7 +5989,7 @@ static const char* atos_parse(const char* p,
59895989
* The matched regular expression is roughly "\(.*:\d+\)\s*$" */
59905990
const char* source_number_start;
59915991
const char* source_name_end;
5992-
const char* source_line_end;
5992+
const char* source_line_end = start;
59935993
const char* close_paren;
59945994
UV uv;
59955995

0 commit comments

Comments
 (0)