diff --git a/src/uucore/src/lib/features/format/escape.rs b/src/uucore/src/lib/features/format/escape.rs
index 5db611d818a..da6e691eaaf 100644
--- a/src/uucore/src/lib/features/format/escape.rs
+++ b/src/uucore/src/lib/features/format/escape.rs
@@ -5,6 +5,8 @@
 
 //! Parsing of escape sequences
 
+use crate::format::FormatError;
+
 #[derive(Debug)]
 pub enum EscapedChar {
     /// A single byte
@@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
 
 // spell-checker:disable-next
 /// Parse `\uHHHH` and `\UHHHHHHHH`
-// TODO: This should print warnings and possibly halt execution when it fails to parse
-// TODO: If the character cannot be converted to u32, the input should be printed.
-fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
-    let (c, rest) = input.split_first()?;
-    let mut ret = Base::Hex.convert_digit(*c)? as u32;
-    *input = rest;
-
-    for _ in 1..digits {
-        let (c, rest) = input.split_first()?;
-        let n = Base::Hex.convert_digit(*c)?;
-        ret = ret
-            .wrapping_mul(Base::Hex.as_base() as u32)
-            .wrapping_add(n as u32);
+fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
+    if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) {
         *input = rest;
+        let ret = new_digits
+            .iter()
+            .map(|c| Base::Hex.convert_digit(*c))
+            .collect::<Option<Vec<u8>>>()
+            .ok_or(EscapeError::MissingHexadecimalNumber)?
+            .iter()
+            .map(|n| *n as u32)
+            .reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n))
+            .expect("must have multiple digits in unicode string");
+        char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec()))
+    } else {
+        Err(EscapeError::MissingHexadecimalNumber)
     }
-
-    char::from_u32(ret)
 }
 
 /// Represents an invalid escape sequence.
-#[derive(Debug)]
-pub struct EscapeError {}
+#[derive(Debug, PartialEq)]
+pub enum EscapeError {
+    InvalidCharacters(Vec<u8>),
+    MissingHexadecimalNumber,
+}
 
 /// Parse an escape sequence, like `\n` or `\xff`, etc.
 pub fn parse_escape_code(
     rest: &mut &[u8],
     zero_octal_parsing: OctalParsing,
-) -> Result<EscapedChar, EscapeError> {
+) -> Result<EscapedChar, FormatError> {
     if let [c, new_rest @ ..] = rest {
         // This is for the \NNN syntax for octal sequences.
         // Note that '0' is intentionally omitted because that
@@ -145,17 +149,89 @@ pub fn parse_escape_code(
                 if let Some(c) = parse_code(rest, Base::Hex) {
                     Ok(EscapedChar::Byte(c))
                 } else {
-                    Err(EscapeError {})
+                    Err(FormatError::MissingHex)
                 }
             }
             b'0' => Ok(EscapedChar::Byte(
                 parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
             )),
-            b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))),
-            b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))),
+            b'u' => match parse_unicode(rest, 4) {
+                Ok(c) => Ok(EscapedChar::Char(c)),
+                Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
+                Err(EscapeError::InvalidCharacters(chars)) => {
+                    Err(FormatError::InvalidCharacter('u', chars))
+                }
+            },
+            b'U' => match parse_unicode(rest, 8) {
+                Ok(c) => Ok(EscapedChar::Char(c)),
+                Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
+                Err(EscapeError::InvalidCharacters(chars)) => {
+                    Err(FormatError::InvalidCharacter('U', chars))
+                }
+            },
             c => Ok(EscapedChar::Backslash(*c)),
         }
     } else {
         Ok(EscapedChar::Byte(b'\\'))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    mod parse_unicode {
+        use super::*;
+
+        #[test]
+        fn parse_ascii() {
+            let input = b"2a";
+            assert_eq!(parse_unicode(&mut &input[..], 2), Ok('*'));
+
+            let input = b"002A";
+            assert_eq!(parse_unicode(&mut &input[..], 4), Ok('*'));
+        }
+
+        #[test]
+        fn parse_emoji_codepoint() {
+            let input = b"0001F60A";
+            assert_eq!(parse_unicode(&mut &input[..], 8), Ok('😊'));
+        }
+
+        #[test]
+        fn no_characters() {
+            let input = b"";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 8),
+                Err(EscapeError::MissingHexadecimalNumber)
+            );
+        }
+
+        #[test]
+        fn incomplete_hexadecimal_number() {
+            let input = b"123";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 4),
+                Err(EscapeError::MissingHexadecimalNumber)
+            );
+        }
+
+        #[test]
+        fn invalid_hex() {
+            let input = b"duck";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 4),
+                Err(EscapeError::MissingHexadecimalNumber)
+            );
+        }
+
+        #[test]
+        fn surrogate_code_point() {
+            let input = b"d800";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 4),
+                Err(EscapeError::InvalidCharacters(Vec::from(b"d800")))
+            );
+        }
+    }
+}
diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs
index 3387f15fe56..2b372d3e0df 100644
--- a/src/uucore/src/lib/features/format/mod.rs
+++ b/src/uucore/src/lib/features/format/mod.rs
@@ -71,6 +71,9 @@ pub enum FormatError {
     EndsWithPercent(Vec<u8>),
     /// The escape sequence `\x` appears without a literal hexadecimal value.
     MissingHex,
+    /// The hexadecimal characters represent a code point that cannot represent a
+    /// Unicode character (e.g., a surrogate code point)
+    InvalidCharacter(char, Vec<u8>),
 }
 
 impl Error for FormatError {}
@@ -110,6 +113,12 @@ impl Display for FormatError {
             Self::NoMoreArguments => write!(f, "no more arguments"),
             Self::InvalidArgument(_) => write!(f, "invalid argument"),
             Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
+            Self::InvalidCharacter(escape_char, digits) => write!(
+                f,
+                "invalid universal character name \\{}{}",
+                escape_char,
+                String::from_utf8_lossy(digits)
+            ),
         }
     }
 }
@@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
         }
         [b'\\', rest @ ..] => {
             current = rest;
-            Some(
-                match parse_escape_code(&mut current, OctalParsing::default()) {
-                    Ok(c) => Ok(FormatItem::Char(c)),
-                    Err(_) => Err(FormatError::MissingHex),
-                },
-            )
+            Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
         }
         [c, rest @ ..] => {
             current = rest;
diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs
index 458fbf82bf1..9bb1fb4ae00 100644
--- a/src/uucore/src/lib/features/format/spec.rs
+++ b/src/uucore/src/lib/features/format/spec.rs
@@ -95,6 +95,7 @@ struct Flags {
     space: bool,
     hash: bool,
     zero: bool,
+    quote: bool,
 }
 
 impl Flags {
@@ -108,6 +109,11 @@ impl Flags {
                 b' ' => flags.space = true,
                 b'#' => flags.hash = true,
                 b'0' => flags.zero = true,
+                b'\'' => {
+                    // the thousands separator is printed with numbers using the ' flag, but
+                    // this is a no-op in the "C" locale. We only save this flag for reporting errors
+                    flags.quote = true;
+                }
                 _ => break,
             }
             *index += 1;
@@ -181,7 +187,7 @@ impl Spec {
                 }
             }
             b's' => {
-                if flags.zero || flags.hash {
+                if flags.zero || flags.hash || flags.quote {
                     return Err(&start[..index]);
                 }
                 Self::String {
diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs
index 1366c32405e..f21aa011450 100644
--- a/src/uucore/src/lib/features/parser/num_parser.rs
+++ b/src/uucore/src/lib/features/parser/num_parser.rs
@@ -502,7 +502,7 @@ fn parse(
 
     let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);
 
-    // Return what has been parsed so far. It there are extra characters, mark the
+    // Return what has been parsed so far. If there are extra characters, mark the
     // parsing as a partial match.
     if let Some((first_unparsed, _)) = chars.next() {
         Err(ExtendedParserError::PartialMatch(
diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs
index 9bd762ee9fa..fb397b08d01 100644
--- a/tests/by-util/test_printf.rs
+++ b/tests/by-util/test_printf.rs
@@ -112,6 +112,26 @@ fn escaped_unicode_null_byte() {
         .stdout_is_bytes([1u8, b'_']);
 }
 
+#[test]
+fn escaped_unicode_incomplete() {
+    for arg in ["\\u", "\\U", "\\uabc", "\\Uabcd"] {
+        new_ucmd!()
+            .arg(arg)
+            .fails_with_code(1)
+            .stderr_only("printf: missing hexadecimal number in escape\n");
+    }
+}
+
+#[test]
+fn escaped_unicode_invalid() {
+    for arg in ["\\ud9d0", "\\U0000D8F9"] {
+        new_ucmd!().arg(arg).fails_with_code(1).stderr_only(format!(
+            "printf: invalid universal character name {}\n",
+            arg
+        ));
+    }
+}
+
 #[test]
 fn escaped_percent_sign() {
     new_ucmd!()
@@ -317,6 +337,16 @@ fn sub_num_int_char_const_in() {
         .stdout_only("emoji is 128579");
 }
 
+#[test]
+fn sub_num_thousands() {
+    // For "C" locale, the thousands separator is ignored but should
+    // not result in an error
+    new_ucmd!()
+        .args(&["%'i", "123456"])
+        .succeeds()
+        .stdout_only("123456");
+}
+
 #[test]
 fn sub_num_uint() {
     new_ucmd!()