Skip to content

printf: Error handling with unicode parsing #7681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 97 additions & 21 deletions src/uucore/src/lib/features/format/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

//! Parsing of escape sequences

use crate::format::FormatError;

#[derive(Debug)]
pub enum EscapedChar {
/// A single byte
Expand Down Expand Up @@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {

// spell-checker:disable-next
/// Parse `\uHHHH` and `\UHHHHHHHH`
// TODO: This should print warnings and possibly halt execution when it fails to parse
// TODO: If the character cannot be converted to u32, the input should be printed.
fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
let (c, rest) = input.split_first()?;
let mut ret = Base::Hex.convert_digit(*c)? as u32;
*input = rest;

for _ in 1..digits {
let (c, rest) = input.split_first()?;
let n = Base::Hex.convert_digit(*c)?;
ret = ret
.wrapping_mul(Base::Hex.as_base() as u32)
.wrapping_add(n as u32);
fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) {
*input = rest;
let ret = new_digits
.iter()
.map(|c| Base::Hex.convert_digit(*c))
.collect::<Option<Vec<u8>>>()
.ok_or(EscapeError::MissingHexadecimalNumber)?
.iter()
.map(|n| *n as u32)
.reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n))
.expect("must have multiple digits in unicode string");
char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec()))
} else {
Err(EscapeError::MissingHexadecimalNumber)
}

char::from_u32(ret)
}

/// Represents an invalid escape sequence.
#[derive(Debug)]
pub struct EscapeError {}
#[derive(Debug, PartialEq)]
pub enum EscapeError {
/// Every digit was valid hexadecimal, but the resulting value is not a valid
/// `char` (for example a surrogate code point such as `d800`).
/// Carries the raw digit bytes that were consumed so they can be reported.
InvalidCharacters(Vec<u8>),
/// Fewer bytes than the required number of digits were available, or at
/// least one of them was not a hexadecimal digit.
MissingHexadecimalNumber,
}

/// Parse an escape sequence, like `\n` or `\xff`, etc.
pub fn parse_escape_code(
rest: &mut &[u8],
zero_octal_parsing: OctalParsing,
) -> Result<EscapedChar, EscapeError> {
) -> Result<EscapedChar, FormatError> {
if let [c, new_rest @ ..] = rest {
// This is for the \NNN syntax for octal sequences.
// Note that '0' is intentionally omitted because that
Expand Down Expand Up @@ -145,17 +149,89 @@ pub fn parse_escape_code(
if let Some(c) = parse_code(rest, Base::Hex) {
Ok(EscapedChar::Byte(c))
} else {
Err(EscapeError {})
Err(FormatError::MissingHex)
}
}
b'0' => Ok(EscapedChar::Byte(
parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
)),
b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))),
b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))),
b'u' => match parse_unicode(rest, 4) {
Ok(c) => Ok(EscapedChar::Char(c)),
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
Err(EscapeError::InvalidCharacters(chars)) => {
Err(FormatError::InvalidCharacter('u', chars))
}
},
b'U' => match parse_unicode(rest, 8) {
Ok(c) => Ok(EscapedChar::Char(c)),
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
Err(EscapeError::InvalidCharacters(chars)) => {
Err(FormatError::InvalidCharacter('U', chars))
}
},
c => Ok(EscapedChar::Backslash(*c)),
}
} else {
Ok(EscapedChar::Byte(b'\\'))
}
}

#[cfg(test)]
mod tests {
    use super::*;

    mod parse_unicode {
        use super::*;

        /// Feeds `bytes` to `parse_unicode`, asking it to consume `digits` hex digits.
        fn parse(bytes: &[u8], digits: u8) -> Result<char, EscapeError> {
            let mut remaining = bytes;
            parse_unicode(&mut remaining, digits)
        }

        #[test]
        fn parse_ascii() {
            // Lower-case digits, short form.
            assert_eq!(parse(b"2a", 2), Ok('*'));

            // Upper-case digits with leading zeros.
            assert_eq!(parse(b"002A", 4), Ok('*'));
        }

        #[test]
        fn parse_emoji_codepoint() {
            assert_eq!(parse(b"0001F60A", 8), Ok('😊'));
        }

        #[test]
        fn no_characters() {
            // Nothing at all follows the escape.
            let outcome = parse(b"", 8);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        #[test]
        fn incomplete_hexadecimal_number() {
            // Three digits supplied where four are required.
            let outcome = parse(b"123", 4);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        #[test]
        fn invalid_hex() {
            // Correct length, but `u` and `k` are not hexadecimal digits.
            let outcome = parse(b"duck", 4);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        #[test]
        fn surrogate_code_point() {
            // Well-formed hex whose value is not a valid `char`; the digits are echoed back.
            let outcome = parse(b"d800", 4);
            assert_eq!(
                outcome,
                Err(EscapeError::InvalidCharacters(b"d800".to_vec()))
            );
        }
    }
}
16 changes: 10 additions & 6 deletions src/uucore/src/lib/features/format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ pub enum FormatError {
EndsWithPercent(Vec<u8>),
/// The escape sequence `\x` appears without a literal hexadecimal value.
MissingHex,
/// The hexadecimal digits encode a code point that is not a valid Unicode
/// character (e.g., a surrogate code point)
InvalidCharacter(char, Vec<u8>),
}

impl Error for FormatError {}
Expand Down Expand Up @@ -110,6 +113,12 @@ impl Display for FormatError {
Self::NoMoreArguments => write!(f, "no more arguments"),
Self::InvalidArgument(_) => write!(f, "invalid argument"),
Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
Self::InvalidCharacter(escape_char, digits) => write!(
f,
"invalid universal character name \\{}{}",
escape_char,
String::from_utf8_lossy(digits)
),
}
}
}
Expand Down Expand Up @@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
}
[b'\\', rest @ ..] => {
current = rest;
Some(
match parse_escape_code(&mut current, OctalParsing::default()) {
Ok(c) => Ok(FormatItem::Char(c)),
Err(_) => Err(FormatError::MissingHex),
},
)
Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
}
[c, rest @ ..] => {
current = rest;
Expand Down
8 changes: 7 additions & 1 deletion src/uucore/src/lib/features/format/spec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ struct Flags {
space: bool,
hash: bool,
zero: bool,
quote: bool,
}

impl Flags {
Expand All @@ -108,6 +109,11 @@ impl Flags {
b' ' => flags.space = true,
b'#' => flags.hash = true,
b'0' => flags.zero = true,
b'\'' => {
// the thousands separator is printed with numbers using the ' flag, but
// this is a no-op in the "C" locale. We only save this flag for reporting errors
flags.quote = true;
}
_ => break,
}
*index += 1;
Expand Down Expand Up @@ -181,7 +187,7 @@ impl Spec {
}
}
b's' => {
if flags.zero || flags.hash {
if flags.zero || flags.hash || flags.quote {
return Err(&start[..index]);
}
Self::String {
Expand Down
2 changes: 1 addition & 1 deletion src/uucore/src/lib/features/parser/num_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ fn parse(

let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);

// Return what has been parsed so far. It there are extra characters, mark the
// Return what has been parsed so far. If there are extra characters, mark the
// parsing as a partial match.
if let Some((first_unparsed, _)) = chars.next() {
Err(ExtendedParserError::PartialMatch(
Expand Down
30 changes: 30 additions & 0 deletions tests/by-util/test_printf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,26 @@ fn escaped_unicode_null_byte() {
.stdout_is_bytes([1u8, b'_']);
}

#[test]
fn escaped_unicode_incomplete() {
    // Each escape supplies fewer hex digits than `\u` (4) or `\U` (8) requires.
    let truncated_escapes = ["\\u", "\\U", "\\uabc", "\\Uabcd"];
    let expected_stderr = "printf: missing hexadecimal number in escape\n";

    for escape in truncated_escapes {
        new_ucmd!()
            .arg(escape)
            .fails_with_code(1)
            .stderr_only(expected_stderr);
    }
}

#[test]
fn escaped_unicode_invalid() {
    // Both escapes have the full digit count; the error message is expected
    // to echo the offending escape verbatim.
    let invalid_escapes = ["\\ud9d0", "\\U0000D8F9"];

    for escape in invalid_escapes {
        let expected_stderr = format!("printf: invalid universal character name {escape}\n");
        new_ucmd!()
            .arg(escape)
            .fails_with_code(1)
            .stderr_only(expected_stderr);
    }
}

#[test]
fn escaped_percent_sign() {
new_ucmd!()
Expand Down Expand Up @@ -317,6 +337,16 @@ fn sub_num_int_char_const_in() {
.stdout_only("emoji is 128579");
}

#[test]
fn sub_num_thousands() {
    // The `'` (thousands separator) flag has no visible effect in the "C"
    // locale, but it must still be accepted instead of being rejected.
    let cli_args = ["%'i", "123456"];

    new_ucmd!()
        .args(&cli_args)
        .succeeds()
        .stdout_only("123456");
}

#[test]
fn sub_num_uint() {
new_ucmd!()
Expand Down
Loading