Skip to content

Commit d3d3609

Browse files
authored
❇️ Introducing fallback match for ASCII and UTF-8 (last resort only) (#64)
1 parent d8f0bf4 commit d3d3609

File tree

2 files changed

+43
-0
lines changed

2 files changed

+43
-0
lines changed

charset_normalizer/api.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ def from_bytes(
116116
tested_but_hard_failure = [] # type: List[str]
117117
tested_but_soft_failure = [] # type: List[str]
118118

119+
fallback_ascii = None # type: Optional[CharsetMatch]
120+
fallback_u8 = None # type: Optional[CharsetMatch]
121+
119122
single_byte_hard_failure_count = 0 # type: int
120123
single_byte_soft_failure_count = 0 # type: int
121124

@@ -251,6 +254,20 @@ def from_bytes(
251254
encoding_iana,
252255
early_stop_count,
253256
round(mean_mess_ratio * 100, ndigits=3))
257+
# Preparing those fallbacks in case we got nothing.
258+
if encoding_iana in ["ascii", "utf_8"]:
259+
fallback_entry = CharsetMatch(
260+
sequences,
261+
encoding_iana,
262+
threshold,
263+
False,
264+
[],
265+
decoded_payload
266+
)
267+
if encoding_iana == "ascii":
268+
fallback_ascii = fallback_entry
269+
else:
270+
fallback_u8 = fallback_entry
254271
continue
255272

256273
logger.info(
@@ -314,6 +331,17 @@ def from_bytes(
314331
results[-1]._languages
315332
)
316333

334+
# Last-resort selection: only reached when the detection process produced no
# match at all. Falls back to whichever of the utf_8 / ascii entries was
# recorded earlier in this function.
if len(results) == 0:
    if fallback_u8 or fallback_ascii:
        logger.warning("Nothing got out of the detection process. Using ASCII/UTF-8 fallback.")

    # utf_8 is chosen when it is the only fallback recorded, or when its
    # fingerprint differs from the ascii one. `fallback_ascii` may still be
    # None here (only the utf_8 fallback was set), so it must be checked
    # before its fingerprint is read; otherwise an AttributeError is raised
    # instead of returning the utf_8 fallback.
    if fallback_u8 and (
        fallback_ascii is None
        or fallback_u8.fingerprint != fallback_ascii.fingerprint
    ):
        logger.warning("utf_8 will be used as a fallback match")
        results.append(fallback_u8)
    elif fallback_ascii:
        # Either no utf_8 fallback exists, or both fallbacks share the same
        # fingerprint; ascii is used in both cases.
        logger.warning("ascii will be used as a fallback match")
        results.append(fallback_ascii)
317345
return results
318346

319347

tests/test_on_byte.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,21 @@ def test_empty_bytes(self):
2828
len(r.alphabets)
2929
)
3030

31+
def test_ensure_ascii_fallback(self):
    """Check that from_bytes(...).best() yields an ascii match for this payload.

    The test name and failure messages indicate this payload is meant to
    exercise the last-resort ASCII fallback of from_bytes.
    """
    payload = b"AbAdZ pOoooOlDl mmlDoDkA lldDkeEkddA mpAlkDF"
    r = from_bytes(payload).best()

    # A best match must exist at all before its encoding can be inspected.
    self.assertIsNotNone(
        r,
        msg="Fallback ASCII detection has failed. You clearly have tampered with it. Testing with {}".format(payload)
    )

    self.assertEqual(
        r.encoding,
        "ascii",
        msg="Fallback ASCII misdetection. You clearly have tampered with it. Testing with {}".format(payload)
    )
45+
3146
def test_ensure_ascii(self):
3247

3348
for payload in [

0 commit comments

Comments
 (0)