Commit 69cd94e

update embedding/retrieval/gritlm examples for pooling changes
1 parent f8c5fcb commit 69cd94e

File tree

3 files changed: +59 -21 lines

examples/embedding/embedding.cpp
examples/gritlm/gritlm.cpp
examples/retrieval/retrieval.cpp

examples/embedding/embedding.cpp

Lines changed: 27 additions & 11 deletions
@@ -17,9 +17,25 @@ static std::vector<std::string> split_lines(const std::string & s) {
     return lines;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tokens) {
+    switch (pooling_type) {
+        case LLAMA_POOLING_TYPE_MEAN:
+        case LLAMA_POOLING_TYPE_NONE:
+            return true;
+        case LLAMA_POOLING_TYPE_CLS:
+            return pos == 0;
+        case LLAMA_POOLING_TYPE_LAST:
+            return pos == n_tokens - 1;
+        default:
+            GGML_ASSERT(false && "unsupported pooling type");
+    }
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {
+    int n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        bool logit = needs_logit(pooling_type, i, n_tokens);
+        llama_batch_add(batch, tokens[i], i, { seq_id }, logit);
     }
 }
 
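For reference, a minimal standalone sketch (not part of the commit) of what the new needs_logit() helper decides: which token positions must request an output for a given pooling type. MEAN and NONE need every position, CLS only position 0, LAST only the final one. The enum and names below are simplified stand-ins, not the llama.cpp API.

// needs_logit_demo.cpp -- illustrative only; simplified stand-in types.
#include <cstdio>

enum pooling_type { POOLING_NONE, POOLING_MEAN, POOLING_CLS, POOLING_LAST };

static bool needs_logit(pooling_type pt, int pos, int n_tokens) {
    switch (pt) {
        case POOLING_MEAN:
        case POOLING_NONE: return true;                 // every position contributes
        case POOLING_CLS:  return pos == 0;             // only the first (CLS) token
        case POOLING_LAST: return pos == n_tokens - 1;  // only the final token
    }
    return false;
}

int main() {
    const int n_tokens = 5;
    const pooling_type types[] = { POOLING_MEAN, POOLING_CLS, POOLING_LAST };
    const char * names[]       = { "MEAN", "CLS", "LAST" };
    for (int t = 0; t < 3; t++) {
        std::printf("%-4s:", names[t]);
        for (int pos = 0; pos < n_tokens; pos++) {
            std::printf(" %d", needs_logit(types[t], pos, n_tokens) ? 1 : 0);
        }
        std::printf("\n");  // MEAN: 1 1 1 1 1   CLS: 1 0 0 0 0   LAST: 0 0 0 0 1
    }
    return 0;
}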

@@ -40,13 +56,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 
         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
 
         float * out = output + batch.seq_id[i][0] * n_embd;
         //TODO: I would also add a parameter here to enable normalization or not.
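The TODO above concerns making normalization optional. A minimal sketch (not from the commit; the helper name and signature are assumptions) of the L2 normalization a pooled embedding typically gets before cosine comparisons:

// embd_normalize_sketch.cpp -- illustrative only; not a llama.cpp function.
#include <cmath>

static void embd_normalize(const float * src, float * dst, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += (double) src[i] * src[i];
    }
    const float scale = sum > 0.0 ? (float) (1.0 / std::sqrt(sum)) : 0.0f;
    for (int i = 0; i < n; i++) {
        dst[i] = src[i] * scale;  // unit-length copy into the output slot
    }
}

With unit-length embeddings, the cosine similarity used downstream reduces to a plain dot product.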
@@ -99,6 +109,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
@@ -178,7 +194,7 @@ int main(int argc, char ** argv) {
        }
 
        // add to batch
-        batch_add_seq(batch, inp, s);
+        batch_add_seq(batch, inp, s, pooling_type);
         s += 1;
     }
 

examples/gritlm/gritlm.cpp

Lines changed: 11 additions & 6 deletions
@@ -164,9 +164,13 @@ int main(int argc, char * argv[]) {
 
     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
 
-    // create new context - set to embedding mode
+    // create generation context
+    llama_context * ctx_gen = llama_new_context_with_model(mdl, cparams);
+
+    // create embedding context
     cparams.embeddings = true;
-    llama_context * ctx = llama_new_context_with_model(mdl, cparams);
+    cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+    llama_context * ctx_emb = llama_new_context_with_model(mdl, cparams);
 
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic
@@ -184,8 +188,8 @@ int main(int argc, char * argv[]) {
     };
 
     // No need to add instruction for retrieval documents
-    const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
-    const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
+    const std::vector<std::vector<float>> d_rep = encode(ctx_emb, documents, gritlm_instruction(""));
+    const std::vector<std::vector<float>> q_rep = encode(ctx_emb, queries, gritlm_instruction(instruction));
 
     const int n_embd = llama_n_embd(mdl);
 

@@ -204,10 +208,11 @@ int main(int argc, char * argv[]) {
     // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
     {
         const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
-        std::string response = generate(ctx, prompt, true);
+        std::string response = generate(ctx_gen, prompt, true);
     }
 
-    llama_free(ctx);
+    llama_free(ctx_gen);
+    llama_free(ctx_emb);
     llama_free_model(mdl);
     llama_backend_free();
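Condensed, a sketch of the two-context pattern these hunks introduce in gritlm.cpp (the model path is a placeholder; error handling and the full example's backend init/teardown are omitted): one plain context for generation, and a second one with embeddings enabled and pooling set to NONE so the example can pool token embeddings itself.

// two_contexts_sketch.cpp -- illustrative only; "model.gguf" is a placeholder.
#include "llama.h"

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    llama_model * mdl = llama_load_model_from_file("model.gguf", mparams);

    // generation context: default params, embeddings disabled
    llama_context * ctx_gen = llama_new_context_with_model(mdl, cparams);

    // embedding context: embeddings on, token-level outputs (pooling NONE)
    cparams.embeddings   = true;
    cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
    llama_context * ctx_emb = llama_new_context_with_model(mdl, cparams);

    // ... encode() with ctx_emb, generate() with ctx_gen ...

    llama_free(ctx_gen);
    llama_free(ctx_emb);
    llama_free_model(mdl);
    return 0;
}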

examples/retrieval/retrieval.cpp

Lines changed: 21 additions & 4 deletions
@@ -133,9 +133,25 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     return chunks;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tokens) {
+    switch (pooling_type) {
+        case LLAMA_POOLING_TYPE_MEAN:
+        case LLAMA_POOLING_TYPE_NONE:
+            return true;
+        case LLAMA_POOLING_TYPE_CLS:
+            return pos == 0;
+        case LLAMA_POOLING_TYPE_LAST:
+            return pos == n_tokens - 1;
+        default:
+            GGML_ASSERT(false && "unsupported pooling type");
+    }
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {
+    int n_tokens = tokens.size();
     for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+        bool logit = needs_logit(pooling_type, i, n_tokens);
+        llama_batch_add(batch, tokens[i], i, { seq_id }, logit);
     }
 }

@@ -217,6 +233,7 @@ int main(int argc, char ** argv) {
 
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
 
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
@@ -288,7 +305,7 @@ int main(int argc, char ** argv) {
        }
 
        // add to batch
-        batch_add_seq(batch, inp, s);
+        batch_add_seq(batch, inp, s, pooling_type);
         s += 1;
     }
 

@@ -311,7 +328,7 @@ int main(int argc, char ** argv) {
         std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
 
         struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
-        batch_add_seq(query_batch, query_tokens, 0);
+        batch_add_seq(query_batch, query_tokens, 0, pooling_type);
 
         std::vector<float> query_emb(n_embd, 0);
         batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
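After the query batch is decoded, retrieval.cpp scores each chunk against query_emb. A standalone sketch (not from the commit, plain C++ only) of the cosine similarity such a ranking relies on when the embeddings are not already unit-normalized:

// cosine_similarity_sketch.cpp -- illustrative only.
#include <cmath>
#include <vector>

static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    const size_t n = a.size() < b.size() ? a.size() : b.size();
    for (size_t i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) {
        return 0.0f;  // guard against all-zero vectors
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb)));
}

Ranking then amounts to computing this score for the query against every chunk embedding and sorting in descending order.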
