@@ -46,49 +46,6 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }
 
-struct naive_trie {
-    naive_trie() : has_value(false), value(0) {
-    }
-    void insert(const char * key, size_t len, int32_t value = 0) {
-        if (len == 0) {
-            this->has_value = true;
-            this->value = value;
-            return;
-        }
-        char c = key[0];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            res->second.insert(key + 1, len - 1, value);
-        } else {
-            auto res = children.insert(std::make_pair(c, naive_trie()));
-            res.first->second.insert(key + 1, len - 1, value);
-        }
-    }
-    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
-        if (len == 0 || offset == len) {
-            return std::make_pair(key, offset);
-        }
-        char c = key[offset];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return res->second.get_longest_prefix(key, len, offset + 1);
-        } else {
-            return std::make_pair(key, offset);
-        }
-    }
-    struct naive_trie * traverse(const char c) {
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return &res->second;
-        } else {
-            return NULL;
-        }
-    }
-    std::map<char, struct naive_trie> children;
-    bool has_value;
-    llama_token value;
-};
-
 //
 // impl
 //
@@ -779,27 +736,6 @@ struct llm_tokenizer_ugm {
             prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
             prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
         }
-
-        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
-            const auto &token_data = vocab.id_to_token[id];
-
-            if (llama_is_normal_token(vocab, id)) {
-                min_score = std::min<float>(min_score, token_data.score);
-                max_score = std::max<float>(max_score, token_data.score);
-            }
-
-            if (llama_is_normal_token(vocab, id) ||
-                llama_is_user_defined_token(vocab, id) ||
-                llama_is_unused_token(vocab, id)) {
-                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
-            }
-
-            if (llama_is_user_defined_token(vocab, id)) {
-                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
-            }
-        }
-
-        unknown_token_score = min_score - unknown_token_score_penalty;
     }
 
     /* This implementation is based on SentencePiece optimized Viterbi algorithm for
@@ -840,7 +776,7 @@ struct llm_tokenizer_ugm {
             // traverse the token matcher trie to find a matching token
             bool single_codepoint_token_found = false;
             const struct best_tokenization & current_best = tokenization_results[input_offset];
-            struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
+            const struct naive_trie * node = vocab.token_matcher.traverse(normalized[prefix_offset++]);
 
             while (prefix_offset <= input_len && node != NULL) {
                 // check if we found valid token in prefix
@@ -1003,7 +939,7 @@ struct llm_tokenizer_ugm {
         }
 
         // if input prefix matches some user-defined token return this token as normalization result
-        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+        auto user_defined_token_match = vocab.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
         if (user_defined_token_match.second > 0) {
             return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
         }
@@ -1076,22 +1012,14 @@ struct llm_tokenizer_ugm {
     const uint32_t * xcda_array = NULL;
     size_t xcda_array_size = 0;
 
-    struct naive_trie user_defined_token_matcher;
-
     // this structure stores the best tokenization so far at input_offset
     struct best_tokenization {
         llama_token token_id;
         size_t input_offset;
         float score_sum;
     };
 
-    float min_score = FLT_MAX;
-    float max_score = -FLT_MAX;
-
-    float unknown_token_score_penalty = 10.0;
     float unknown_token_score;
-
-    struct naive_trie token_matcher;
 };
 
 //
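
For reference, a minimal standalone sketch of how the `naive_trie` moved by this diff behaves: `insert()` maps a byte string to a token id, `traverse()` advances one character at a time (as the Viterbi loop in `llm_tokenizer_ugm` does against `vocab.token_matcher`), and `get_longest_prefix()` returns the longest path matched in the trie (used for user-defined tokens in `normalize_prefix`). The trie body is lifted from the diff, lightly adapted (lookup methods const-qualified, matching the new usage on a const vocab); the `llama_token` typedef and the `main()` driver with its sample keys are illustrative assumptions, not part of the patch.

// standalone sketch; naive_trie body taken from the diff above,
// with const-qualified lookups; main() is illustrative only
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <map>
#include <utility>

typedef int32_t llama_token; // assumption: stands in for llama.cpp's typedef

struct naive_trie {
    naive_trie() : has_value(false), value(0) {}
    void insert(const char * key, size_t len, int32_t value = 0) {
        if (len == 0) {
            this->has_value = true;
            this->value = value;
            return;
        }
        char c = key[0];
        auto res = children.find(c);
        if (res != children.end()) {
            res->second.insert(key + 1, len - 1, value);
        } else {
            auto ins = children.insert(std::make_pair(c, naive_trie()));
            ins.first->second.insert(key + 1, len - 1, value);
        }
    }
    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
        char c = key[offset];
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
        }
        return std::make_pair(key, offset);
    }
    const naive_trie * traverse(const char c) const {
        auto res = children.find(c);
        return res != children.end() ? &res->second : NULL;
    }
    std::map<char, struct naive_trie> children;
    bool has_value;
    llama_token value;
};

int main() {
    naive_trie t;
    t.insert("he",    2, 10); // token id 10
    t.insert("hello", 5, 11); // token id 11

    // get_longest_prefix() walks as deep as the trie allows: "hello", 5 chars
    const char * s = "hellos";
    auto match = t.get_longest_prefix(s, strlen(s));
    printf("matched %zu chars\n", match.second); // prints 5

    // traverse() steps one char at a time, checking has_value at each node;
    // this is how the tokenizer enumerates all tokens sharing a prefix
    const naive_trie * node = t.traverse('h');
    node = node ? node->traverse('e') : NULL;
    if (node && node->has_value) {
        printf("token id after \"he\": %d\n", node->value); // prints 10
    }
    return 0;
}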