Skip to content

Commit e51cfe6

Browse files
committed
Allow integer suffixes starting with e.
Integers with arbitrary suffixes are allowed as inputs to proc macros. A number of real-world crates use this capability in interesting ways, as seen in #103872. For example: - Suffixes representing units, such as `8bits`, `100px`, `20ns`, `30GB` - CSS hex colours such as `#7CFC00` (LawnGreen) - UUIDs, e.g. `785ada2c-f2d0-11fd-3839-b3104db0cb68` The hex cases may be surprising. - `#7CFC00` is tokenized as a `#` followed by a `7` integer with a `CFC00` suffix. - `785ada2c` is tokenized as a `785` integer with an `ada2c` suffix. - `f2d0` is tokenized as an identifier. - `3839` is tokenized as an integer literal. A proc macro will immediately stringify such tokens and reparse them itself, and so won't care that the token types vary. All suffixes must be consumed by the proc macro, of course; the only suffixes allowed after macro expansion are the numeric ones like `u8`, `i32`, and `f64`. Currently there is an annoying inconsistency in how integer literal suffixes are handled, which is that no suffix starting with `e` is allowed, because that is interpreted as a float literal with an exponent. For example: - Units: `1eV` and `1em` - CSS colours: `#90EE90` (LightGreen) - UUIDs: `785ada2c-f2d0-11ed-3839-b3104db0cb68` In each case, a sequence of digits followed by an 'e' or 'E' followed by a letter results in an "expected at least one digit in exponent" error. This is an annoying inconsistency in general, and a problem in practice. It's likely that some users haven't realized this inconsistency because they've gotten lucky and never used a token with an 'e' that causes problems. Other users *have* noticed; it's causing problems when embedding DSLs into proc macros, as seen in #111615, where the CSS colours case is causing problems for two different UI frameworks (Slint and Makepad). We can do better. This commit changes the lexer so that, when it hits a possible exponent, it looks ahead and only produces an exponent if a valid one is present. 
Otherwise, it produces a non-exponent form, which may be a single token (e.g. `1eV`) or multiple tokens (e.g. `1e+a`). Consequences of this: - All the proc macro problem cases mentioned above are fixed. - The "expected at least one digit in exponent" error is no longer possible. A few tests that only worked in the presence of that error have been removed. - The lexer requires unbounded lookahead due to the presence of '_' chars in exponents. E.g. to distinguish `1e+_______3` (a float literal with exponent) from `1e+_______a` (previously invalid, but now tokenised as `1e`, `+`, `_______a`). This is a backwards compatible language change: all existing valid programs will be treated in the same way, and some previously invalid programs will become valid. The tokens chapter of the language reference (https://doc.rust-lang.org/reference/tokens.html) will need changing to account for this. In particular, the "Reserved forms similar to number literals" section will need updating, and grammar rules involving the SUFFIX_NO_E nonterminal will need adjusting. Fixes #111615.
1 parent ce5919f commit e51cfe6

24 files changed

+243
-336
lines changed

compiler/rustc_lexer/src/cursor.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ impl<'a> Cursor<'a> {
5555
iter.next().unwrap_or(EOF_CHAR)
5656
}
5757

58+
/// Allows peeking an unbounded number of symbols from the input stream
59+
/// without consuming them.
60+
pub(crate) fn all(&self) -> Chars<'a> {
61+
self.chars.clone()
62+
}
63+
5864
/// Checks if there is nothing more to consume.
5965
pub(crate) fn is_eof(&self) -> bool {
6066
self.chars.as_str().is_empty()

compiler/rustc_lexer/src/lib.rs

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ pub enum LiteralKind {
177177
/// "12_u8", "0o100", "0b120i99", "1f32".
178178
Int { base: Base, empty_int: bool },
179179
/// "12.34f32", "1e3", but not "1f32".
180-
Float { base: Base, empty_exponent: bool },
180+
Float { base: Base },
181181
/// "'a'", "'\\'", "'''", "';"
182182
Char { terminated: bool },
183183
/// "b'a'", "b'\\'", "b'''", "b';"
@@ -578,6 +578,28 @@ impl Cursor<'_> {
578578
}
579579

580580
fn number(&mut self, first_digit: char) -> LiteralKind {
581+
// Scan ahead to determine if this is a valid exponent.
582+
fn is_exponent(cursor: &mut Cursor<'_>) -> bool {
583+
let mut iter = cursor.all();
584+
let c = iter.next();
585+
debug_assert!(matches!(c, Some('e' | 'E')));
586+
587+
// Exponent examples: `e3`, `e+3`, `e_3`, `e-___3`,
588+
// Non-exponent examples: `ea`, `e+a`, `e_a`, `e-___a`, `e_`
589+
match iter.next() {
590+
Some('0'..='9') => return true,
591+
Some('+' | '-' | '_') => {}
592+
_ => return false,
593+
}
594+
loop {
595+
match iter.next() {
596+
Some('0'..='9') => return true,
597+
Some('_') => {}
598+
_ => return false,
599+
}
600+
}
601+
}
602+
581603
debug_assert!('0' <= self.prev() && self.prev() <= '9');
582604
let mut base = Base::Decimal;
583605
if first_digit == '0' {
@@ -628,23 +650,28 @@ impl Cursor<'_> {
628650
// might have stuff after the ., and if it does, it needs to start
629651
// with a number
630652
self.bump();
631-
let mut empty_exponent = false;
632653
if self.first().is_digit(10) {
633654
self.eat_decimal_digits();
634655
match self.first() {
635-
'e' | 'E' => {
656+
// Scan ahead to decide if this is an exponent. If not,
657+
// it'll just be handled (later) as a suffix.
658+
'e' | 'E' if is_exponent(self) => {
636659
self.bump();
637-
empty_exponent = !self.eat_float_exponent();
660+
let empty_exponent = !self.eat_float_exponent();
661+
debug_assert!(!empty_exponent);
638662
}
639663
_ => (),
640664
}
641665
}
642-
Float { base, empty_exponent }
666+
Float { base }
643667
}
644-
'e' | 'E' => {
668+
// Scan ahead to decide if this is an exponent. If not,
669+
// it'll just be handled (later) as a suffix.
670+
'e' | 'E' if is_exponent(self) => {
645671
self.bump();
646672
let empty_exponent = !self.eat_float_exponent();
647-
Float { base, empty_exponent }
673+
debug_assert!(!empty_exponent);
674+
Float { base }
648675
}
649676
_ => Int { base, empty_int: false },
650677
}

compiler/rustc_lexer/src/tests.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,9 +283,9 @@ br###"raw"###suffix
283283
Token { kind: Whitespace, len: 1 }
284284
Token { kind: Literal { kind: Int { base: Hexadecimal, empty_int: false }, suffix_start: 5 }, len: 5 }
285285
Token { kind: Whitespace, len: 1 }
286-
Token { kind: Literal { kind: Float { base: Decimal, empty_exponent: false }, suffix_start: 3 }, len: 3 }
286+
Token { kind: Literal { kind: Float { base: Decimal }, suffix_start: 3 }, len: 3 }
287287
Token { kind: Whitespace, len: 1 }
288-
Token { kind: Literal { kind: Float { base: Decimal, empty_exponent: false }, suffix_start: 6 }, len: 6 }
288+
Token { kind: Literal { kind: Float { base: Decimal }, suffix_start: 6 }, len: 6 }
289289
Token { kind: Whitespace, len: 1 }
290290
Token { kind: Literal { kind: Int { base: Decimal, empty_int: false }, suffix_start: 1 }, len: 3 }
291291
Token { kind: Whitespace, len: 1 }

compiler/rustc_parse/messages.ftl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -642,8 +642,6 @@ parse_no_digits_literal = no valid digits found for number
642642
643643
parse_invalid_digit_literal = invalid digit for a base {$base} literal
644644
645-
parse_empty_exponent_float = expected at least one digit in exponent
646-
647645
parse_float_literal_unsupported_base = {$base} float literal is not supported
648646
649647
parse_more_than_one_char = character literal may only contain one codepoint

compiler/rustc_parse/src/errors.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1762,13 +1762,6 @@ pub struct InvalidDigitLiteral {
17621762
pub base: u32,
17631763
}
17641764

1765-
#[derive(Diagnostic)]
1766-
#[diag(parse_empty_exponent_float)]
1767-
pub struct EmptyExponentFloat {
1768-
#[primary_span]
1769-
pub span: Span,
1770-
}
1771-
17721765
#[derive(Diagnostic)]
17731766
#[diag(parse_float_literal_unsupported_base)]
17741767
pub struct FloatLiteralUnsupportedBase {

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -479,11 +479,7 @@ impl<'a> StringReader<'a> {
479479
(token::Integer, self.symbol_from_to(start, end))
480480
}
481481
}
482-
rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
483-
if empty_exponent {
484-
let span = self.mk_sp(start, self.pos);
485-
self.sess.emit_err(errors::EmptyExponentFloat { span });
486-
}
482+
rustc_lexer::LiteralKind::Float { base } => {
487483
let base = match base {
488484
Base::Hexadecimal => Some("hexadecimal"),
489485
Base::Octal => Some("octal"),

tests/ui/consts/const-eval/issue-104390.rs

Lines changed: 0 additions & 10 deletions
This file was deleted.

tests/ui/consts/const-eval/issue-104390.stderr

Lines changed: 0 additions & 65 deletions
This file was deleted.

tests/ui/consts/invalid-const-in-body.rs

Lines changed: 0 additions & 6 deletions
This file was deleted.

tests/ui/consts/invalid-const-in-body.stderr

Lines changed: 0 additions & 8 deletions
This file was deleted.

0 commit comments

Comments
 (0)