Commit 947f64f

finetune : zero the loraB initial vectors (#4082)
* finetune : zero the loraB initial vectors

  Without this, the first iteration starts out far from the base model instead of exactly on it. Zeroing loraB is what the paper recommends. loralib also zeroes at least one of the init vector pairs (though it departs from the paper in using a different distribution for the other vector, in some cases).

* tabs to spaces

* Use ggml_set_zero instead of adding a new function
1 parent b83e149 commit 947f64f
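The reasoning in the commit message can be sanity-checked with a small standalone sketch (plain C++, not the ggml API touched by this diff; the matrix shapes and names are made up for illustration): with loraB zeroed, the low-rank delta B*A is exactly zero, so the effective weight at the first iteration equals the base weight.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Toy check of the idea behind the change (illustrative only; this is not the
// ggml code from the diff). LoRA adds a low-rank update B*A to a base weight W.
// If A is randomly initialized but B is zeroed, the update B*A is exactly zero,
// so training starts exactly on the base model.
int main() {
    const int m = 4, n = 4, r = 2;                 // weight is m x n, rank-r adapter

    std::mt19937 gen(42);
    std::normal_distribution<float> dist(0.0f, 0.02f);

    std::vector<float> A(r * n);                   // "lora_a": random normal init
    std::vector<float> B(m * r, 0.0f);             // "lora_b": zero init (the fix)
    for (float & a : A) a = dist(gen);

    float max_delta = 0.0f;                        // largest entry of |B*A|
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            float delta = 0.0f;
            for (int k = 0; k < r; ++k) {
                delta += B[i * r + k] * A[k * n + j];
            }
            max_delta = std::max(max_delta, std::fabs(delta));
        }
    }
    std::printf("max |B*A| at init = %f\n", max_delta);   // prints 0.000000
    return 0;
}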

File tree

1 file changed: +12, -12 lines changed

examples/finetune/finetune.cpp

Lines changed: 12 additions & 12 deletions
@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
     struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
 
     randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    randomize_tensor_normal(lora->tok_embeddings_b, rnd);
+    ggml_set_zero(lora->tok_embeddings_b);
     randomize_tensor_normal(lora->norm_a, rnd);
-    randomize_tensor_normal(lora->norm_b, rnd);
+    ggml_set_zero(lora->norm_b);
     randomize_tensor_normal(lora->output_a, rnd);
-    randomize_tensor_normal(lora->output_b, rnd);
+    ggml_set_zero(lora->output_b);
 
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = lora->layers[i];
         randomize_tensor_normal(layer.attention_norm_a, rnd);
-        randomize_tensor_normal(layer.attention_norm_b, rnd);
+        ggml_set_zero(layer.attention_norm_b);
 
         randomize_tensor_normal(layer.wq_a, rnd);
-        randomize_tensor_normal(layer.wq_b, rnd);
+        ggml_set_zero(layer.wq_b);
         randomize_tensor_normal(layer.wk_a, rnd);
-        randomize_tensor_normal(layer.wk_b, rnd);
+        ggml_set_zero(layer.wk_b);
         randomize_tensor_normal(layer.wv_a, rnd);
-        randomize_tensor_normal(layer.wv_b, rnd);
+        ggml_set_zero(layer.wv_b);
         randomize_tensor_normal(layer.wo_a, rnd);
-        randomize_tensor_normal(layer.wo_b, rnd);
+        ggml_set_zero(layer.wo_b);
 
         randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        randomize_tensor_normal(layer.ffn_norm_b, rnd);
+        ggml_set_zero(layer.ffn_norm_b);
 
         randomize_tensor_normal(layer.w1_a, rnd);
-        randomize_tensor_normal(layer.w1_b, rnd);
+        ggml_set_zero(layer.w1_b);
         randomize_tensor_normal(layer.w2_a, rnd);
-        randomize_tensor_normal(layer.w2_b, rnd);
+        ggml_set_zero(layer.w2_b);
         randomize_tensor_normal(layer.w3_a, rnd);
-        randomize_tensor_normal(layer.w3_b, rnd);
+        ggml_set_zero(layer.w3_b);
     }
 
     free_random_normal_distribution(rnd);
