Skip to content

Commit b92df5e

Browse files
AfryMask and AfryMask authored
whisper : fix the bug related to word splitting errors in the "tokenize" function. (ggml-org#760)
Co-authored-by: AfryMask <[email protected]>
1 parent fec219a commit b92df5e

File tree

1 file changed

+6
-11
lines changed

1 file changed

+6
-11
lines changed

whisper.cpp

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2449,25 +2449,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
24492449
int n = word.size();
24502450
while (i < n) {
24512451
int j = n;
2452+
bool found = false;
24522453
while (j > i) {
2453-
auto it = vocab.token_to_id.find(word.substr(i, j-i));
2454+
auto sub = word.substr(i, j-i);
2455+
auto it = vocab.token_to_id.find(sub);
24542456
if (it != vocab.token_to_id.end()) {
24552457
tokens.push_back(it->second);
24562458
i = j;
2459+
found = true;
24572460
break;
24582461
}
24592462
--j;
24602463
}
2461-
if (i == n) {
2462-
break;
2463-
}
2464-
if (j == i) {
2465-
auto sub = word.substr(i, 1);
2466-
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
2467-
tokens.push_back(vocab.token_to_id.at(sub));
2468-
} else {
2469-
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
2470-
}
2464+
if (!found) {
2465+
fprintf(stderr, "unknown token \n");
24712466
++i;
24722467
}
24732468
}

0 commit comments

Comments
 (0)