Skip to content

Commit 5dad774

Browse files
committed
precompute token maps, very slightly hacky
1 parent 1c7996a commit 5dad774

File tree

3 files changed

+115
-74
lines changed

3 files changed

+115
-74
lines changed

src/llama-vocab.cpp

Lines changed: 2 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -46,49 +46,6 @@ static std::string format(const char * fmt, ...) {
4646
return std::string(buf.data(), size);
4747
}
4848

49-
struct naive_trie {
50-
naive_trie() : has_value(false), value(0) {
51-
}
52-
void insert(const char * key, size_t len, int32_t value = 0) {
53-
if (len == 0) {
54-
this->has_value = true;
55-
this->value = value;
56-
return;
57-
}
58-
char c = key[0];
59-
auto res = children.find(c);
60-
if (res != children.end()) {
61-
res->second.insert(key + 1, len - 1, value);
62-
} else {
63-
auto res = children.insert(std::make_pair(c, naive_trie()));
64-
res.first->second.insert(key + 1, len - 1, value);
65-
}
66-
}
67-
std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
68-
if (len == 0 || offset == len) {
69-
return std::make_pair(key, offset);
70-
}
71-
char c = key[offset];
72-
auto res = children.find(c);
73-
if (res != children.end()) {
74-
return res->second.get_longest_prefix(key, len, offset + 1);
75-
} else {
76-
return std::make_pair(key, offset);
77-
}
78-
}
79-
struct naive_trie * traverse(const char c) {
80-
auto res = children.find(c);
81-
if (res != children.end()) {
82-
return &res->second;
83-
} else {
84-
return NULL;
85-
}
86-
}
87-
std::map<char, struct naive_trie> children;
88-
bool has_value;
89-
llama_token value;
90-
};
91-
9249
//
9350
// impl
9451
//
@@ -779,27 +736,6 @@ struct llm_tokenizer_ugm {
779736
prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
780737
prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
781738
}
782-
783-
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
784-
const auto &token_data = vocab.id_to_token[id];
785-
786-
if (llama_is_normal_token(vocab, id)) {
787-
min_score = std::min<float>(min_score, token_data.score);
788-
max_score = std::max<float>(max_score, token_data.score);
789-
}
790-
791-
if (llama_is_normal_token(vocab, id) ||
792-
llama_is_user_defined_token(vocab, id) ||
793-
llama_is_unused_token(vocab, id)) {
794-
token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
795-
}
796-
797-
if (llama_is_user_defined_token(vocab, id)) {
798-
user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
799-
}
800-
}
801-
802-
unknown_token_score = min_score - unknown_token_score_penalty;
803739
}
804740

805741
/* This implementation is based on SentencePiece optimized Viterbi algorithm for
@@ -840,7 +776,7 @@ struct llm_tokenizer_ugm {
840776
// traverse the token matcher trie to find a matching token
841777
bool single_codepoint_token_found = false;
842778
const struct best_tokenization & current_best = tokenization_results[input_offset];
843-
struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
779+
const struct naive_trie * node = vocab.token_matcher.traverse(normalized[prefix_offset++]);
844780

845781
while (prefix_offset <= input_len && node != NULL) {
846782
// check if we found valid token in prefix
@@ -1003,7 +939,7 @@ struct llm_tokenizer_ugm {
1003939
}
1004940

1005941
// if input prefix matches some user-defined token return this token as normalization result
1006-
auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
942+
auto user_defined_token_match = vocab.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
1007943
if (user_defined_token_match.second > 0) {
1008944
return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
1009945
}
@@ -1076,22 +1012,14 @@ struct llm_tokenizer_ugm {
10761012
const uint32_t * xcda_array = NULL;
10771013
size_t xcda_array_size = 0;
10781014

1079-
struct naive_trie user_defined_token_matcher;
1080-
10811015
// this structure stores the best tokenization so far at input_offset
10821016
struct best_tokenization {
10831017
llama_token token_id;
10841018
size_t input_offset;
10851019
float score_sum;
10861020
};
10871021

1088-
float min_score = FLT_MAX;
1089-
float max_score = -FLT_MAX;
1090-
1091-
float unknown_token_score_penalty = 10.0;
10921022
float unknown_token_score;
1093-
1094-
struct naive_trie token_matcher;
10951023
};
10961024

10971025
//

src/llama-vocab.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,58 @@
77
#include <unordered_map>
88
#include <map>
99

10+
11+
//
12+
// naive_trie
13+
//
14+
15+
struct naive_trie {
16+
naive_trie() : has_value(false), value(0) {
17+
}
18+
void insert(const char * key, size_t len, int32_t value = 0) {
19+
if (len == 0) {
20+
this->has_value = true;
21+
this->value = value;
22+
return;
23+
}
24+
char c = key[0];
25+
auto res = children.find(c);
26+
if (res != children.end()) {
27+
res->second.insert(key + 1, len - 1, value);
28+
} else {
29+
auto res = children.insert(std::make_pair(c, naive_trie()));
30+
res.first->second.insert(key + 1, len - 1, value);
31+
}
32+
}
33+
std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
34+
if (len == 0 || offset == len) {
35+
return std::make_pair(key, offset);
36+
}
37+
char c = key[offset];
38+
auto res = children.find(c);
39+
if (res != children.end()) {
40+
return res->second.get_longest_prefix(key, len, offset + 1);
41+
} else {
42+
return std::make_pair(key, offset);
43+
}
44+
}
45+
const struct naive_trie * traverse(const char c) const {
46+
auto res = children.find(c);
47+
if (res != children.end()) {
48+
return &res->second;
49+
} else {
50+
return NULL;
51+
}
52+
}
53+
std::map<char, struct naive_trie> children;
54+
bool has_value;
55+
llama_token value;
56+
};
57+
58+
//
59+
// llama_vocab
60+
//
61+
1062
struct llama_vocab {
1163
using id = llama_token;
1264
using token = std::string;
@@ -57,6 +109,9 @@ struct llama_vocab {
57109
bool tokenizer_treat_whitespace_as_suffix = false;
58110

59111
std::vector<char> precompiled_charsmap;
112+
struct naive_trie user_defined_token_matcher;
113+
struct naive_trie token_matcher;
114+
float unknown_token_score = 0.0f;
60115

61116
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
62117
};

src/llama.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5217,6 +5217,36 @@ static void llm_load_hparams(
52175217
hparams.rope_type = llama_rope_type(&model);
52185218
}
52195219

5220+
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
5221+
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
5222+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
5223+
}
5224+
5225+
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
5226+
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
5227+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
5228+
}
5229+
5230+
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
5231+
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
5232+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
5233+
}
5234+
5235+
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
5236+
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
5237+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
5238+
}
5239+
5240+
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
5241+
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
5242+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
5243+
}
5244+
5245+
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
5246+
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
5247+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
5248+
}
5249+
52205250
static void llm_load_vocab(
52215251
llama_model_loader & ml,
52225252
llama_model & model) {
@@ -5598,6 +5628,34 @@ static void llm_load_vocab(
55985628
}
55995629
}
56005630

5631+
// build UGM token-matcher tries and compute unknown_token_score (NOTE: this block does not parse the precompiled charsmap — the old comment was misleading)
5632+
if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
5633+
float min_score = -FLT_MIN;
5634+
float max_score = FLT_MAX;
5635+
5636+
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
5637+
const auto &token_data = vocab.id_to_token[id];
5638+
5639+
if (llama_is_normal_token(vocab, id)) {
5640+
min_score = std::min<float>(min_score, token_data.score);
5641+
max_score = std::max<float>(max_score, token_data.score);
5642+
}
5643+
5644+
if (llama_is_normal_token(vocab, id) ||
5645+
llama_is_user_defined_token(vocab, id) ||
5646+
llama_is_unused_token(vocab, id)) {
5647+
vocab.token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
5648+
}
5649+
5650+
if (llama_is_user_defined_token(vocab, id)) {
5651+
vocab.user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
5652+
}
5653+
}
5654+
5655+
float unknown_token_score_penalty = 10.0;
5656+
vocab.unknown_token_score = min_score - unknown_token_score_penalty;
5657+
}
5658+
56015659
// Handle add_bos_token and add_eos_token
56025660
{
56035661
bool temp = true;

0 commit comments

Comments
 (0)