Commit 557fde7

style: simplify string formatting for readability (#1632)
1 parent 3d51a16 commit 557fde7
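
Every one of the 28 changed lines applies the same mechanical refactor: a `format!` or `println!` call that passed a variable as a trailing positional argument now captures it by name inside the braces, using Rust's inline format arguments (stabilized in Rust 1.58). The sketch below illustrates the before/after pattern and shows that format specifiers such as `:?` and `:02X` carry over unchanged; the variable names are illustrative, not taken verbatim from the diff.

// Minimal sketch of the pattern applied in this commit (illustrative names).
fn main() {
    let word = "hello";
    let byte: u8 = 0x2A;
    let holes = vec![3usize, 7];

    // Before: positional arguments after the format string.
    let old = format!("▁{}", word);
    let old_code = format!("<0x{:02X}>", byte);
    println!("{:?}", holes);

    // After: captured identifiers; specs like `:02X` and `:?` still apply.
    let new = format!("▁{word}");
    let new_code = format!("<0x{byte:02X}>");
    println!("{holes:?}");

    // The rendered strings are identical, so the refactor is purely stylistic.
    assert_eq!(old, new);
    assert_eq!(old_code, new_code);
}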

File tree

15 files changed: +28 −28 lines


tokenizers/benches/unigram_benchmark.rs

Lines changed: 2 additions & 2 deletions

@@ -21,7 +21,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });

@@ -49,7 +49,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });

tokenizers/examples/serialization.rs

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ fn main() {
     // Mix special and not special
     // You can make sure ids are in order, and special status is correct.
     let tokens: Vec<_> = (0..120_000)
-        .map(|i| AddedToken::from(format!("[SPECIAL_{}]", i), i % 2 == 0))
+        .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
         .collect();
     tokenizer.add_tokens(&tokens);
     tokenizer.save("_tok.json", true).unwrap();

tokenizers/src/decoders/wordpiece.rs

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ impl Decoder for WordPiece {
                     if token.starts_with(&self.prefix) {
                         *token = token.replacen(&self.prefix, "", 1);
                     } else {
-                        *token = format!(" {}", token);
+                        *token = format!(" {token}");
                     }
                 }
                 if self.cleanup {

tokenizers/src/models/bpe/model.rs

Lines changed: 5 additions & 5 deletions

@@ -385,13 +385,13 @@ impl BPE {
             // Add the `continuing_subword_prefix` if relevant
             if !is_first {
                 if let Some(ref prefix) = self.continuing_subword_prefix {
-                    s = format!("{}{}", prefix, s).into()
+                    s = format!("{prefix}{s}").into()
                 }
             }
             // Add the `end_of_word_suffix` if relevant
             if is_last {
                 if let Some(ref suffix) = self.end_of_word_suffix {
-                    s = format!("{}{}", s, suffix).into()
+                    s = format!("{s}{suffix}").into()
                 }
             }

@@ -406,7 +406,7 @@ impl BPE {
             let tokens: Option<Vec<_>> = s
                 .bytes()
                 .map(|b| -> Option<&u32> {
-                    let code = format!("<{:#04X}>", b);
+                    let code = format!("<{b:#04X}>");

                     self.vocab.get(&code)
                 })

@@ -515,7 +515,7 @@ impl Model for BPE {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };

@@ -530,7 +530,7 @@ impl Model for BPE {

         // Write merges.txt
         let merges_file_name = match name {
-            Some(name) => format!("{}-merges.txt", name),
+            Some(name) => format!("{name}-merges.txt"),
             None => "merges.txt".to_string(),
         };

tokenizers/src/models/bpe/trainer.rs

Lines changed: 3 additions & 3 deletions

@@ -342,13 +342,13 @@ impl BpeTrainer {
             // Add the `continuing_subword_prefix` if relevant
             if !is_first {
                 if let Some(prefix) = &self.continuing_subword_prefix {
-                    s = format!("{}{}", prefix, s);
+                    s = format!("{prefix}{s}");
                 }
             }
             // Add the `end_of_word_suffix` if relevant
             if is_last {
                 if let Some(suffix) = &self.end_of_word_suffix {
-                    s = format!("{}{}", s, suffix);
+                    s = format!("{s}{suffix}");
                 }
             }

@@ -513,7 +513,7 @@ impl BpeTrainer {
                         part_b = part_b[prefix_byte_len..].to_string();
                     }
                 }
-                let new_token = format!("{}{}", part_a, part_b);
+                let new_token = format!("{part_a}{part_b}");
                 // implement sentencepiece-like merge.
                 // if this code were to be merged, integrate a way in the python bindings to communicate this variable
                 // default should be 0/None to maintain previous behavior. 16 is the spm default.

tokenizers/src/models/mod.rs

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ impl<'a> Serialize for OrderedVocabIter<'a> {

         if !holes.is_empty() {
             warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
-            println!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
+            println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !");
         }
         result
     }

tokenizers/src/models/unigram/model.rs

Lines changed: 3 additions & 3 deletions

@@ -425,7 +425,7 @@ impl Model for Unigram {
                 let byte_tokens: Option<Vec<_>> = string
                     .bytes()
                     .map(|byte| -> Option<Token> {
-                        let byte_string = format!("<0x{:02X}>", byte);
+                        let byte_string = format!("<0x{byte:02X}>");
                         let id = self.token_to_ids.get(&byte_string);
                         id.map(|id| Token::new(*id, byte_string, (offset, offset + len)))
                     })

@@ -457,7 +457,7 @@ impl Model for Unigram {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let name = match name {
-            Some(name) => format!("{}-unigram.json", name),
+            Some(name) => format!("{name}-unigram.json"),
             None => "unigram.json".to_string(),
         };
         let mut fullpath = PathBuf::new();

@@ -568,7 +568,7 @@ mod tests {

         for is_optimized in &[true, false] {
             model.set_optimized(*is_optimized);
-            println!("IsOptimized {:?}", is_optimized);
+            println!("IsOptimized {is_optimized:?}");
             assert_eq!(model.encode("abc").unwrap(), vec!["abc"]);
             assert_eq!(model.encode("AB").unwrap(), vec!["AB"]);

tokenizers/src/models/unigram/serialization.rs

Lines changed: 1 addition & 1 deletion

@@ -70,7 +70,7 @@ impl<'de> Visitor<'de> for UnigramVisitor {
         }
         match (vocab, unk_id, byte_fallback) {
             (Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)
-                .map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?),
+                .map_err(|err| Error::custom(format!("Unable to load vocab {err:?}")))?),
             (None, _, _) => Err(Error::custom("Missing vocab")),
         }
     }

tokenizers/src/models/wordlevel/mod.rs

Lines changed: 1 addition & 1 deletion

@@ -194,7 +194,7 @@ impl Model for WordLevel {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };

tokenizers/src/models/wordpiece/mod.rs

Lines changed: 2 additions & 2 deletions

@@ -271,7 +271,7 @@ impl Model for WordPiece {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.txt", name),
+            Some(name) => format!("{name}-vocab.txt"),
             None => "vocab.txt".to_string(),
         };

@@ -285,7 +285,7 @@ impl Model for WordPiece {
         vocab_file.write_all(
             &vocab
                 .into_iter()
-                .flat_map(|(token, _)| format!("{}\n", token).as_bytes().to_owned())
+                .flat_map(|(token, _)| format!("{token}\n").as_bytes().to_owned())
                 .collect::<Vec<_>>()[..],
         )?;

tokenizers/src/processors/template.rs

Lines changed: 1 addition & 1 deletion

@@ -150,7 +150,7 @@ impl TryFrom<String> for Piece {
     fn try_from(s: String) -> StdResult<Self, Self::Error> {
         let parts = s.split(':').collect::<Vec<_>>();

-        let err = || format!("Cannot build Piece from string \"{}\"", s);
+        let err = || format!("Cannot build Piece from string \"{s}\"");
         match parts.as_slice() {
             [id, type_id] => {
                 let type_id: u32 = type_id.parse().map_err(|_| err())?;

tokenizers/src/tokenizer/normalizer.rs

Lines changed: 1 addition & 1 deletion

@@ -351,7 +351,7 @@ impl NormalizedString {
                 match changes {
                     0 => "Replacing".into(),
                     ch if ch > 0 => "Adding".into(),
-                    ch if ch < 0 => format!("Replacing + removing {} following chars", ch),
+                    ch if ch < 0 => format!("Replacing + removing {ch} following chars"),
                     _ => "Undefined".into(),
                 },
                 offset

tokenizers/src/tokenizer/serialization.rs

Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ where
                 "version" => {
                     let v: String = map.next_value()?;
                     if &v != "1.0" {
-                        return Err(Error::custom(format!("Unknown tokenizer version '{}'", v)));
+                        return Err(Error::custom(format!("Unknown tokenizer version '{v}'")));
                     }
                 }
                 "truncation" => {

tokenizers/tests/documentation.rs

Lines changed: 4 additions & 4 deletions

@@ -199,7 +199,7 @@ fn quicktour() -> tokenizers::Result<()> {
     // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     // END quicktour_encode_batch
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![

@@ -209,7 +209,7 @@ fn quicktour() -> tokenizers::Result<()> {
         true,
     )?;
     // END quicktour_encode_batch_pair
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_enable_padding
     use tokenizers::PaddingParams;

@@ -350,7 +350,7 @@ fn pipeline() -> tokenizers::Result<()> {
         &[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2],
         true,
     )?;
-    println!("{}", decoded);
+    println!("{decoded}");
     // "Hello , y ' all ! How are you ?"
     // END pipeline_test_decoding

@@ -436,7 +436,7 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]

     let decoded = bert_tokenizer.decode(output.get_ids(), true)?;
-    println!("{}", decoded);
+    println!("{decoded}");
     // "welcome to the tok ##eni ##zer ##s library ."
     // END bert_test_decoding
     assert_eq!(

tokenizers/tests/unigram.rs

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ fn test_train_unigram_from_file() {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });
