@@ -16,6 +16,7 @@ struct mtmd_context {
16
16
struct clip_ctx * ctx_clip;
17
17
const struct llama_model * text_model;
18
18
std::vector<float > image_embd_v; // image embedding vector
19
+
19
20
bool print_timings;
20
21
int n_threads;
21
22
std::string image_marker;
@@ -24,7 +25,11 @@ struct mtmd_context {
24
25
25
26
mtmd_context (const char * mmproj_fname,
26
27
const llama_model * text_model,
27
- const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
28
+ const mtmd_context_params & ctx_params) :
29
+ print_timings (ctx_params.print_timings),
30
+ n_threads (ctx_params.n_threads),
31
+ image_marker (ctx_params.image_marker)
32
+ {
28
33
clip_context_params ctx_clip_params;
29
34
ctx_clip_params.use_gpu = ctx_params.use_gpu ;
30
35
ctx_clip_params.verbosity = ctx_params.verbosity ;
@@ -49,6 +54,7 @@ struct mtmd_image_tokens {
49
54
uint32_t ny; // number of tokens in y direction
50
55
uint32_t n_tokens () const { return nx * ny; }
51
56
clip_image_f32_batch batch_f32; // preprocessed image patches
57
+ std::string id; // optional user-defined ID, useful for KV cache tracking
52
58
};
53
59
54
60
mtmd_context * mtmd_init_from_file (const char * mmproj_fname,
@@ -88,10 +94,10 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
88
94
return result;
89
95
}
90
96
91
- mtmd_input_chunks * mtmd_tokenize (mtmd_context * ctx,
92
- const mtmd_input_text & text ,
93
- const std::vector<mtmd_bitmap> & bitmaps) {
94
- mtmd_input_chunks * output = new mtmd_input_chunks;
97
+ int32_t mtmd_tokenize (mtmd_context * ctx,
98
+ std::vector<mtmd_input_chunk> & output ,
99
+ const mtmd_input_text & text,
100
+ const std::vector<mtmd_bitmap> & bitmaps) {
95
101
auto vocab = llama_model_get_vocab (ctx->text_model );
96
102
97
103
std::string prompt_modified (text.text );
@@ -105,9 +111,9 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
105
111
string_replace_all (prompt_modified, ctx->image_marker , marker_modified);
106
112
}
107
113
108
- std::vector<std::string> parts = string_split_str (text. text , ctx->image_marker );
109
- output-> clear ();
110
- output-> reserve (parts.size ());
114
+ std::vector<std::string> parts = string_split_str (prompt_modified , ctx->image_marker );
115
+ output. clear ();
116
+ output. reserve (parts.size ());
111
117
112
118
size_t i_img = 0 ;
113
119
@@ -123,14 +129,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
123
129
std::move (tokens),
124
130
{},
125
131
};
126
- output-> emplace_back (std::move (chunk));
132
+ output. emplace_back (std::move (chunk));
127
133
128
134
if (&parts.back () != &part) {
129
135
// add image token to middle of 2 parts
130
136
131
137
if (i_img >= bitmaps.size ()) {
132
138
LOG_ERR (" %s: error: not enough images for %d parts\n " , __func__, (int )parts.size ());
133
- return nullptr ;
139
+ return 1 ;
134
140
}
135
141
136
142
// shim layer
@@ -145,34 +151,48 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
145
151
bool ok = clip_image_preprocess (ctx->ctx_clip , img_u8.get (), &batch_f32);
146
152
if (!ok) {
147
153
LOG_ERR (" Unable to preprocess image\n " );
148
- return nullptr ;
154
+ return 2 ;
149
155
}
150
156
151
- mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
157
+ mtmd_image_tokens_ptr image_tokens ( new mtmd_image_tokens) ;
152
158
image_tokens->nx = clip_n_patches (ctx->ctx_clip ); // TODO @ngxson : use clip_n_patches_by_image
153
159
image_tokens->ny = 1 ; // TODO
154
160
image_tokens->batch_f32 = std::move (batch_f32);
161
+ image_tokens->id = bitmaps[i_img].id ; // optional
155
162
156
163
mtmd_input_chunk chunk{
157
164
MTMD_INPUT_CHUNK_TYPE_IMAGE,
158
165
{},
159
- image_tokens,
166
+ std::move ( image_tokens) ,
160
167
};
161
- output-> emplace_back (std::move (chunk));
168
+ output. emplace_back (std::move (chunk));
162
169
i_img++;
163
170
}
164
171
}
165
172
166
- return output ;
173
+ return 0 ;
167
174
}
168
175
169
- void mtmd_input_chunks_free (mtmd_input_chunks * chunks) {
170
- for (auto & chunk : *chunks) {
171
- if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image ) {
172
- delete chunk.tokens_image ;
173
- }
176
+ void mtmd_image_tokens_free (mtmd_image_tokens * image_tokens) {
177
+ if (image_tokens) {
178
+ delete image_tokens;
174
179
}
175
- delete chunks;
180
+ }
181
+
182
+ size_t mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens) {
183
+ return image_tokens->n_tokens ();
184
+ }
185
+
186
+ size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens) {
187
+ return image_tokens->nx ;
188
+ }
189
+
190
+ size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens) {
191
+ return image_tokens->ny ;
192
+ }
193
+
194
+ std::string mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens) {
195
+ return image_tokens->id ;
176
196
}
177
197
178
198
int32_t mtmd_encode (mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
@@ -190,9 +210,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
190
210
return ctx->image_embd_v .data ();
191
211
}
192
212
193
- size_t mtmd_helper_get_n_tokens (mtmd_input_chunks * chunks) {
213
+ size_t mtmd_helper_get_n_tokens (mtmd_input_chunks & chunks) {
194
214
size_t n_tokens = 0 ;
195
- for (auto & chunk : * chunks) {
215
+ for (auto & chunk : chunks) {
196
216
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
197
217
n_tokens += chunk.tokens_text .size ();
198
218
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
@@ -241,16 +261,16 @@ struct decode_embd_batch {
241
261
242
262
int32_t mtmd_helper_eval (mtmd_context * ctx,
243
263
llama_context * lctx,
244
- mtmd_input_chunks * chunks,
264
+ mtmd_input_chunks & chunks,
245
265
llama_pos pos0,
246
266
llama_seq_id seq_id,
247
267
int32_t n_batch) {
248
268
int32_t ret;
249
269
llama_pos n_past = pos0;
250
270
llama_batch text_batch = llama_batch_init (n_batch, 0 , 1 );
251
271
252
- for (auto & chunk : * chunks) {
253
- bool is_last = &chunk == &chunks-> back ();
272
+ for (auto & chunk : chunks) {
273
+ bool is_last = &chunk == &chunks. back ();
254
274
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
255
275
// TODO @ngxson : may need to split into smaller batches
256
276
text_batch.n_tokens = chunk.tokens_text .size ();
@@ -279,7 +299,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
279
299
if (ctx->print_timings ) {
280
300
LOG_INF (" encoding image...\n " );
281
301
}
282
- ret = mtmd_encode (ctx, chunk.tokens_image );
302
+ ret = mtmd_encode (ctx, chunk.tokens_image . get () );
283
303
if (ret != 0 ) {
284
304
LOG_ERR (" failed to encode image\n " );
285
305
llama_batch_free (text_batch);
@@ -289,7 +309,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
289
309
LOG_INF (" image encoded in %" PRId64 " ms\n " , ggml_time_ms () - t0);
290
310
}
291
311
292
- int32_t n_tokens = chunk.tokens_image -> n_tokens ( );
312
+ int32_t n_tokens = mtmd_image_tokens_get_n_tokens ( chunk.tokens_image . get () );
293
313
float * embd = mtmd_get_output_embd (ctx);
294
314
decode_embd_batch batch_img (embd, n_tokens, n_past, 0 );
295
315
int64_t t1 = ggml_time_ms ();
@@ -339,3 +359,15 @@ int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & outp
339
359
std::memcpy (output.data .data (), data, output.nx * output.ny * 3 );
340
360
return 0 ;
341
361
}
362
+
363
+ bool mtmd_decode_use_non_causal (mtmd_context * ctx) {
364
+ projector_type proj_type = clip_get_projector_type (ctx->ctx_clip );
365
+ if (proj_type == PROJECTOR_TYPE_GEMMA3) {
366
+ return true ;
367
+ }
368
+ return false ;
369
+ }
370
+
371
+ void mtmd_image_tokens_deleter::operator ()(mtmd_image_tokens * val) {
372
+ mtmd_image_tokens_free (val);
373
+ }
0 commit comments