Skip to content

Commit adc82cb

Browse files
ArthurZucker and Narsil
authored
Add-legacy-tests (#1597)
* add tests * decoder as well * check error * propagate * lint * refine the test * lint * revert decoder changes * one more? * fmt * Update tokenizers/src/pre_tokenizers/mod.rs Co-authored-by: Nicolas Patry <[email protected]> * fix commit * simplify err * fmt --------- Co-authored-by: Nicolas Patry <[email protected]>
1 parent 99a48dc commit adc82cb

File tree

2 files changed

+77
-0
lines changed

2 files changed

+77
-0
lines changed

tokenizers/src/pre_tokenizers/mod.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,43 @@ mod tests {
144144
PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit {})
145145
);
146146
}
147+
148+
/// Exercises `PreTokenizerWrapper` deserialization on payloads with and
/// without a `type` tag, pinning the exact error text for rejected inputs.
#[test]
fn pre_tokenizer_deserialization_no_type() {
    // Error text asserted for every payload that must be rejected.
    const NO_VARIANT: &str =
        "data did not match any variant of untagged enum PreTokenizerWrapper";

    // Payload with `replacement`, `add_prefix_space` and `prepend_scheme` but
    // no `type` tag. The fixture is well-formed JSON (balanced braces) so the
    // missing tag is the only thing that can cause the rejection.
    let json = r#"{"replacement":"▁","add_prefix_space":true, "prepend_scheme":"always"}"#;
    let err = serde_json::from_str::<PreTokenizerWrapper>(json)
        .expect_err("Expected an error here");
    assert_eq!(err.to_string(), NO_VARIANT);

    // Tagged as `Metaspace` with only `replacement` given: must equal the
    // default `Metaspace`.
    let json = r#"{"type":"Metaspace", "replacement":"▁" }"#;
    let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json);
    assert_eq!(
        reconstructed.unwrap(),
        PreTokenizerWrapper::Metaspace(Metaspace::default())
    );

    // Tagged as `Metaspace` but carrying only `add_prefix_space`: must be rejected.
    let json = r#"{"type":"Metaspace", "add_prefix_space":true }"#;
    let err = serde_json::from_str::<PreTokenizerWrapper>(json)
        .expect_err("Expected an error here");
    assert_eq!(err.to_string(), NO_VARIANT);

    // Payload with only a `behavior` key and no `type` tag: must be rejected.
    let json = r#"{"behavior":"default_split"}"#;
    let err = serde_json::from_str::<PreTokenizerWrapper>(json)
        .expect_err("Expected an error here");
    assert_eq!(err.to_string(), NO_VARIANT);
}
147186
}

tokenizers/src/processors/mod.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,42 @@ mod tests {
8787
PostProcessorWrapper::Bert(bert)
8888
);
8989
}
90+
91+
/// Exercises `PostProcessorWrapper` deserialization on payloads with and
/// without a `type` tag, checking both accepted variants and rejected inputs.
#[test]
fn post_processor_deserialization_no_type() {
    // Error text expected whenever the payload is rejected.
    let no_variant_msg = "data did not match any variant of untagged enum PostProcessorWrapper";

    // Untagged payload carrying only `add_prefix_space`, `trim_offsets` and
    // `use_regex`: must be rejected.
    let payload = r#"{"add_prefix_space": true, "trim_offsets": false, "use_regex": false}"#;
    if let Err(err) = serde_json::from_str::<PostProcessorWrapper>(payload) {
        assert_eq!(err.to_string(), no_variant_msg);
    } else {
        panic!("Expected an error here");
    }

    // Untagged payload with just `sep` and `cls`: must come back as `Bert`.
    let payload = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}"#;
    let parsed = serde_json::from_str::<PostProcessorWrapper>(payload).unwrap();
    assert!(matches!(parsed, PostProcessorWrapper::Bert(_)));

    // Untagged payload with `sep`, `cls`, `trim_offsets` and
    // `add_prefix_space`: must come back as `Roberta`.
    let payload =
        r#"{"sep":["</s>",2], "cls":["<s>",0], "trim_offsets":true, "add_prefix_space":true}"#;
    let parsed = serde_json::from_str::<PostProcessorWrapper>(payload).unwrap();
    assert!(matches!(parsed, PostProcessorWrapper::Roberta(_)));

    // Tagged as `RobertaProcessing` but carrying only `sep`: must be rejected.
    let payload = r#"{"type":"RobertaProcessing", "sep":["</s>",2] }"#;
    if let Err(err) = serde_json::from_str::<PostProcessorWrapper>(payload) {
        assert_eq!(err.to_string(), no_variant_msg);
    } else {
        panic!("Expected an error here");
    }
}
90128
}

0 commit comments

Comments
 (0)