@@ -78,6 +78,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_VIS_ENC       "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ    "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
+#define KEY_MINICPMV_VERSION  "clip.minicpmv_version"
 #define KEY_USE_GELU          "clip.use_gelu"
 #define KEY_N_EMBD            "clip.%s.embedding_length"
 #define KEY_N_FF              "clip.%s.feed_forward_length"
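The new KEY_MINICPMV_VERSION string is the GGUF metadata key that converters are expected to emit alongside the existing has_minicpmv_projector flag. A minimal writer-side sketch using the public gguf C API (the helper name is hypothetical; the real converters live in the Python tooling):

```cpp
#include "ggml.h" // declares the gguf_* writer API

// Hypothetical helper: stamp the version into a GGUF being written, so the
// reader code added to clip_model_load() below can distinguish model families.
static void write_minicpmv_version(struct gguf_context * gctx, int32_t version) {
    gguf_set_val_i32(gctx, "clip.minicpmv_version", version); // 2 or 3
}
```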
@@ -526,6 +527,7 @@ struct clip_ctx {
     bool has_vision_encoder    = false;
     bool has_llava_projector   = false;
     bool has_minicpmv_projector = false;
+    int minicpmv_version = 2;

     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
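Defaulting minicpmv_version to 2 keeps older GGUF files, written before the key existed, behaving exactly as before. The same version-to-width mapping then recurs in several hunks below; a sketch of how it could be centralized (not part of the patch):

```cpp
// Sketch only: one place for the per-version resampler width. Version 2
// corresponds to MiniCPM-V 2.5 (Llama-3 8B, hidden size 4096) and version 3
// to MiniCPM-V 2.6 (Qwen2-7B, hidden size 3584).
static int minicpmv_embed_dim(const struct clip_ctx * ctx) {
    switch (ctx->minicpmv_version) {
        case 3:  return 3584;
        case 2:  // fall through to the default width
        default: return 4096;
    }
}
```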
@@ -616,7 +618,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }
     struct ggml_tensor * embeddings = inp;
-    struct ggml_tensor * pos_embed;
+    struct ggml_tensor * pos_embed = nullptr;

     if (ctx->has_llava_projector) {
         // concat class_embeddings and patch_embeddings
@@ -638,10 +640,15 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

-    if (ctx->has_minicpmv_projector){
+    if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        if (ctx->minicpmv_version == 2) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 3) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
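Initializing pos_embed to nullptr (previous hunk) matters here: the tensor is only created for the two known versions, so on the LLaVA path, or for an unrecognized minicpmv_version, the pointer now stays null instead of dangling. A defensive variant of this branch could assert before naming the tensor (an assumption, not in the patch):

```cpp
// Hypothetical guard: fail loudly instead of passing a null tensor to
// ggml_set_name() when minicpmv_version is neither 2 nor 3.
GGML_ASSERT(pos_embed != nullptr && "unsupported minicpmv_version");
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
```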
@@ -931,8 +938,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
     }
     // minicpmv projector
-    else if (ctx->has_minicpmv_projector)
-    {
+    else if (ctx->has_minicpmv_projector) {
         if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
             struct ggml_tensor * q = model.mm_model_query;
             { // layernorm
@@ -950,11 +956,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             }

             { // attention
-                const int hidden_size = 4096;
+                int hidden_size = 4096;
                 const int d_head = 128;
-                const int n_head = hidden_size/d_head;
-                const int num_query = 96;
-
+                int n_head = hidden_size/d_head;
+                int num_query = 96;
+                if (ctx->minicpmv_version == 2) {
+                    hidden_size = 4096;
+                    n_head = hidden_size/d_head;
+                    num_query = 96;
+                }
+                else if (ctx->minicpmv_version == 3) {
+                    hidden_size = 3584;
+                    n_head = hidden_size/d_head;
+                    num_query = 64;
+                }
                 struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
                 Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
                 struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
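The attention constants lose their const so the resampler can be shaped per version; d_head stays fixed at 128, so the head count follows from the hidden size. The numbers work out as below (a standalone restatement of the constants chosen above; the helper itself is a sketch, not in the patch):

```cpp
// Worked dimensions for the two supported versions (d_head = 128 throughout):
//   version 2: hidden_size 4096 -> n_head = 4096/128 = 32, num_query = 96
//   version 3: hidden_size 3584 -> n_head = 3584/128 = 28, num_query = 64
struct resampler_dims { int hidden_size, n_head, num_query; };

static resampler_dims minicpmv_resampler_dims(int version) {
    const int d_head = 128;
    if (version == 3) { return { 3584, 3584 / d_head, 64 }; }
    return { 4096, 4096 / d_head, 96 }; // version 2 and the fallback default
}
```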
@@ -1145,6 +1160,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
     }

+    idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
+    if (idx != -1) {
+        new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
+    }
+
     // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

     GGML_ASSERT(new_clip->has_vision_encoder);
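The lookup is deliberately optional: GGUF files converted before this change carry no clip.minicpmv_version entry, in which case gguf_find_key() returns -1 and the default of 2 from clip_ctx stands. Condensed, the read amounts to:

```cpp
// Equivalent reading logic, spelled out (same gguf calls as the hunk above).
int idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
int32_t version = (idx != -1) ? gguf_get_val_i32(ctx, idx) : 2; // 2 = legacy default
```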
@@ -1908,7 +1928,14 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip){
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {

     if (clip_is_minicpmv(ctx)) {
-        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
+        int max_slice_nums = 9;
+        if (ctx->minicpmv_version == 2) {
+            max_slice_nums = 9;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            max_slice_nums = 9;
+        }
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
         res_imgs->size = 0;
         for (size_t i = 0; i < imgs.size(); ++i) {
             res_imgs->size += imgs[i].size();
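Both version branches currently settle on the same limit of nine slices; the branch exists so the cap can diverge per version later, and uhd_slice_image() now takes it as a parameter instead of hard-coding it internally. As written today, the block reduces to:

```cpp
// Equivalent today, since versions 2 and 3 pick the same cap; the patch keeps
// the if/else so a future version can slice differently.
const int max_slice_nums = 9;
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
```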
@@ -2143,7 +2170,12 @@ int clip_n_patches(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
     } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        n_patches = 96;
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
     }

     return n_patches;
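For the resampler projector, "patches" means resampler query tokens: 96 for version 2 and 64 for version 3, matching num_query in the attention hunk above. Callers size their embedding buffers from this, along the lines of the sketch below (an assumption modeled on the malloc-based allocation in the llava example code, not shown in this diff):

```cpp
// One image slice yields clip_n_patches() embeddings of clip_n_mmproj_embd()
// floats each: 96 x 4096 for version 2, 64 x 3584 for version 3.
float * image_embd = (float *)malloc(
    (size_t)clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float));
```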
@@ -2279,6 +2311,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size    = hparams.patch_size;
     const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    if (ctx->load_image_size == nullptr) {
+        ctx->load_image_size = clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width  / patch_size;
+    const int pos_h = ctx->load_image_size->height / patch_size;

     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -2313,8 +2350,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
             struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
             int * positions_data = (int *)malloc(ggml_nbytes(positions));
-            for (int i = 0; i < num_positions; i++) {
-                positions_data[i] = std::floor(70.0*i/num_positions);
+            int bucket_coords_h[70];
+            int bucket_coords_w[70];
+            for (int i = 0; i < pos_h; i++) {
+                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+            }
+            for (int i = 0; i < pos_w; i++) {
+                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+            }
+            for (int i = 0, id = 0; i < pos_h; i++) {
+                for (int j = 0; j < pos_w; j++) {
+                    positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                }
             }
             ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
             free(positions_data);
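This replaces the old one-dimensional interpolation with 2-D bucketing: each axis of the pos_h x pos_w patch grid is quantized into 70 buckets, and the pair indexes a 70x70 (4900-entry) learned position table via h*70 + w. Note the fixed bucket_coords_h[70]/bucket_coords_w[70] arrays assume pos_h and pos_w never exceed 70. A standalone check of the arithmetic (the grid dimensions below are illustrative, not from the patch):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // e.g. a 336x448 crop with patch_size 14 -> 24x32 patch grid (assumed sizes)
    const int pos_h = 24, pos_w = 32;
    int last = 0;
    for (int i = 0; i < pos_h; i++) {
        for (int j = 0; j < pos_w; j++) {
            last = (int)std::floor(70.0 * i / pos_h) * 70
                 + (int)std::floor(70.0 * j / pos_w);
        }
    }
    // floor(70*23/24) = 67 and floor(70*31/32) = 67, so the largest index is
    // 67*70 + 67 = 4757, safely inside the 4900-entry table.
    printf("largest position id: %d\n", last);
}
```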
@@ -2325,12 +2372,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
             // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
             struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-            if (ctx->load_image_size == nullptr) {
-                ctx->load_image_size = clip_image_size_init();
-            }
-            int pos_w = ctx->load_image_size->width/patch_size;
-            int pos_h = ctx->load_image_size->height/patch_size;
             int embed_dim = 4096;
+            if (ctx->minicpmv_version == 2) {
+                embed_dim = 4096;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                embed_dim = 3584;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
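The lines removed here are the ones hoisted to the top of clip_image_batch_encode() in an earlier hunk, so the position ids and the sincos table now share one pos_w/pos_h definition; the remaining change matches embed_dim to the per-version resampler width. The table then flows into the graph input roughly as follows (a sketch assuming get_2d_sincos_pos_embed() returns one embed_dim-wide row per grid position, as the lines after this hunk consume it):

```cpp
// Copy the CPU-side sincos table into the "pos_embed" graph input.
for (int i = 0; i < pos_w * pos_h; ++i) {
    for (int j = 0; j < embed_dim; ++j) {
        pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
    }
}
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
```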
@@ -2546,13 +2594,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        return 4096;
+        if (ctx->minicpmv_version == 2) {
+            return 4096;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            return 3584;
+        }
     }

     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }

-bool clip_is_minicpmv(const struct clip_ctx * ctx) {
-    return ctx->has_minicpmv_projector;
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->has_minicpmv_projector) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
 }
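Widening clip_is_minicpmv() from bool to int is source-compatible for callers that only test it in a condition, while also exposing the version; 0 still means "not a MiniCPM-V model". (The declaration in clip.h presumably changes to match, though that file is not part of this diff.) Caller-side, the new shape reads:

```cpp
// The truthiness test still works; the value now carries the version too.
if (int v = clip_is_minicpmv(ctx)) {
    // v == 2: MiniCPM-V 2.5-style resampler; v == 3: MiniCPM-V 2.6-style
}
```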