uutils · jfinkels · Apr 12, 2025 · Apr 6, 2025 · Apr 7, 2025
diff --git a/src/uucore/src/lib/features/format/escape.rs b/src/uucore/src/lib/features/format/escape.rs
@@ -5,6 +5,8 @@
 
 //! Parsing of escape sequences
 
+use crate::format::FormatError;
+
 #[derive(Debug)]
 pub enum EscapedChar {
     /// A single byte
@@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
 
 // spell-checker:disable-next
 /// Parse `\uHHHH` and `\UHHHHHHHH`
-// TODO: This should print warnings and possibly halt execution when it fails to parse
-// TODO: If the character cannot be converted to u32, the input should be printed.
-fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
-    let (c, rest) = input.split_first()?;
-    let mut ret = Base::Hex.convert_digit(*c)? as u32;
-    *input = rest;
-
-    for _ in 1..digits {
-        let (c, rest) = input.split_first()?;
-        let n = Base::Hex.convert_digit(*c)?;
-        ret = ret
-            .wrapping_mul(Base::Hex.as_base() as u32)
-            .wrapping_add(n as u32);
+fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
+    if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) {
         *input = rest;
+        let ret = new_digits
+            .iter()
+            .map(|c| Base::Hex.convert_digit(*c))
+            .collect::<Option<Vec<u8>>>()
+            .ok_or(EscapeError::MissingHexadecimalNumber)?
+            .iter()
+            .map(|n| *n as u32)
+            .reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n))
+            .expect("must have multiple digits in unicode string");
+        char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec()))
+    } else {
+        Err(EscapeError::MissingHexadecimalNumber)
     }
-
-    char::from_u32(ret)
 }
 
 /// Represents an invalid escape sequence.
-#[derive(Debug)]
-pub struct EscapeError {}
+#[derive(Debug, PartialEq)]
+pub enum EscapeError {
+    InvalidCharacters(Vec<u8>),
+    MissingHexadecimalNumber,
+}
 
 /// Parse an escape sequence, like `\n` or `\xff`, etc.
 pub fn parse_escape_code(
     rest: &mut &[u8],
     zero_octal_parsing: OctalParsing,
-) -> Result<EscapedChar, EscapeError> {
+) -> Result<EscapedChar, FormatError> {
     if let [c, new_rest @ ..] = rest {
         // This is for the \NNN syntax for octal sequences.
         // Note that '0' is intentionally omitted because that
@@ -145,17 +149,89 @@ pub fn parse_escape_code(
                 if let Some(c) = parse_code(rest, Base::Hex) {
                     Ok(EscapedChar::Byte(c))
                 } else {
-                    Err(EscapeError {})
+                    Err(FormatError::MissingHex)
                 }
             }
             b'0' => Ok(EscapedChar::Byte(
                 parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
             )),
-            b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))),
-            b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))),
+            b'u' => match parse_unicode(rest, 4) {
+                Ok(c) => Ok(EscapedChar::Char(c)),
+                Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
+                Err(EscapeError::InvalidCharacters(chars)) => {
+                    Err(FormatError::InvalidCharacter('u', chars))
+                }
+            },
+            b'U' => match parse_unicode(rest, 8) {
+                Ok(c) => Ok(EscapedChar::Char(c)),
+                Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
+                Err(EscapeError::InvalidCharacters(chars)) => {
+                    Err(FormatError::InvalidCharacter('U', chars))
+                }
+            },
             c => Ok(EscapedChar::Backslash(*c)),
         }
     } else {
         Ok(EscapedChar::Byte(b'\\'))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    mod parse_unicode {
+        use super::*;
+
+        #[test]
+        fn parse_ascii() {
+            let input = b"2a";
+            assert_eq!(parse_unicode(&mut &input[..], 2), Ok('*'));
+
+            let input = b"002A";
+            assert_eq!(parse_unicode(&mut &input[..], 4), Ok('*'));
+        }
+
+        #[test]
+        fn parse_emoji_codepoint() {
+            let input = b"0001F60A";
+            assert_eq!(parse_unicode(&mut &input[..], 8), Ok('😊'));
+        }
+
+        #[test]
+        fn no_characters() {
+            let input = b"";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 8),
+                Err(EscapeError::MissingHexadecimalNumber)
+            );
+        }
+
+        #[test]
+        fn incomplete_hexadecimal_number() {
+            let input = b"123";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 4),
+                Err(EscapeError::MissingHexadecimalNumber)
+            );
+        }
+
+        #[test]
+        fn invalid_hex() {
+            let input = b"duck";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 4),
+                Err(EscapeError::MissingHexadecimalNumber)
+            );
+        }
+
+        #[test]
+        fn surrogate_code_point() {
+            let input = b"d800";
+            assert_eq!(
+                parse_unicode(&mut &input[..], 4),
+                Err(EscapeError::InvalidCharacters(Vec::from(b"d800")))
+            );
+        }
+    }
+}
diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs
@@ -71,6 +71,9 @@ pub enum FormatError {
     EndsWithPercent(Vec<u8>),
     /// The escape sequence `\x` appears without a literal hexadecimal value.
     MissingHex,
+    /// The hexadecimal digits of a `\u` or `\U` escape encode a value that is not
+    /// a valid Unicode scalar value (e.g., a surrogate code point)
+    InvalidCharacter(char, Vec<u8>),
 }
 
 impl Error for FormatError {}
@@ -110,6 +113,12 @@ impl Display for FormatError {
             Self::NoMoreArguments => write!(f, "no more arguments"),
             Self::InvalidArgument(_) => write!(f, "invalid argument"),
             Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
+            Self::InvalidCharacter(escape_char, digits) => write!(
+                f,
+                "invalid universal character name \\{}{}",
+                escape_char,
+                String::from_utf8_lossy(digits)
+            ),
         }
     }
 }
@@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
         }
         [b'\\', rest @ ..] => {
             current = rest;
-            Some(
-                match parse_escape_code(&mut current, OctalParsing::default()) {
-                    Ok(c) => Ok(FormatItem::Char(c)),
-                    Err(_) => Err(FormatError::MissingHex),
-                },
-            )
+            Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
         }
         [c, rest @ ..] => {
             current = rest;

diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs
@@ -95,6 +95,7 @@ struct Flags {
     space: bool,
     hash: bool,
     zero: bool,
+    quote: bool,
 }
 
 impl Flags {
@@ -108,6 +109,11 @@ impl Flags {
                 b' ' => flags.space = true,
                 b'#' => flags.hash = true,
                 b'0' => flags.zero = true,
+                b'\'' => {
+                    // The ' flag requests a thousands separator in numbers, which is a no-op in
+                    // the "C" locale. The flag is stored only so invalid uses can be reported.
+                    flags.quote = true;
+                }
                 _ => break,
             }
             *index += 1;
@@ -181,7 +187,7 @@ impl Spec {
                 }
             }
             b's' => {
-                if flags.zero || flags.hash {
+                if flags.zero || flags.hash || flags.quote {
                     return Err(&start[..index]);
                 }
                 Self::String {

diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs
@@ -502,7 +502,7 @@ fn parse(
 
     let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);
 
-    // Return what has been parsed so far. It there are extra characters, mark the
+    // Return what has been parsed so far. If there are extra characters, mark the
     // parsing as a partial match.
     if let Some((first_unparsed, _)) = chars.next() {
         Err(ExtendedParserError::PartialMatch(

diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs
@@ -112,6 +112,26 @@ fn escaped_unicode_null_byte() {
         .stdout_is_bytes([1u8, b'_']);
 }
 
+#[test]
+fn escaped_unicode_incomplete() {
+    for arg in ["\\u", "\\U", "\\uabc", "\\Uabcd"] {
+        new_ucmd!()
+            .arg(arg)
+            .fails_with_code(1)
+            .stderr_only("printf: missing hexadecimal number in escape\n");
+    }
+}
+
+#[test]
+fn escaped_unicode_invalid() {
+    for arg in ["\\ud9d0", "\\U0000D8F9"] {
+        new_ucmd!().arg(arg).fails_with_code(1).stderr_only(format!(
+            "printf: invalid universal character name {}\n",
+            arg
+        ));
+    }
+}
+
 #[test]
 fn escaped_percent_sign() {
     new_ucmd!()
@@ -317,6 +337,16 @@ fn sub_num_int_char_const_in() {
         .stdout_only("emoji is 128579");
 }
 
+#[test]
+fn sub_num_thousands() {
+    // For "C" locale, the thousands separator is ignored but should
+    // not result in an error
+    new_ucmd!()
+        .args(&["%'i", "123456"])
+        .succeeds()
+        .stdout_only("123456");
+}
+
 #[test]
 fn sub_num_uint() {
     new_ucmd!()