@@ -47,6 +47,16 @@ int main(int argc, char ** argv) {
     params.model = params.model_draft;
     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
 
+    {
+        LOG("warming up the models with an empty run\n");
+
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx_tgt), };
+        llama_eval(ctx_tgt, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_eval(ctx_dft, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(ctx_tgt);
+        llama_reset_timings(ctx_dft);
+    }
+
     // tokenize the prompt
     std::vector<llama_token> inp;
     inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
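The new warm-up block evaluates a single BOS token through each context and then resets the timing counters, so one-time startup costs (weight paging, scratch-buffer allocation) stay out of the encode/decode numbers reported later. As a rough sketch only, the same pattern could be factored into a helper; llama_warmup is a hypothetical name, and the calls assume the llama_eval / llama_token_bos(ctx) / llama_reset_timings API used in this diff:

    #include <vector>

    #include "llama.h"

    // hypothetical helper, not part of this change: warm up one context
    static void llama_warmup(llama_context * ctx, int n_threads) {
        // evaluate a single BOS token so weights and buffers are touched once
        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
        llama_eval(ctx, tmp.data(), (int) tmp.size(), 0, n_threads);

        // drop the warm-up cost from the per-context timing counters
        llama_reset_timings(ctx);
    }

With such a helper, the block above would reduce to llama_warmup(ctx_tgt, params.n_threads) followed by llama_warmup(ctx_dft, params.n_threads).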
@@ -67,11 +77,17 @@ int main(int argc, char ** argv) {
 
     fflush(stderr);
 
+    const int n_input = inp.size();
+
+    const auto t_enc_start = ggml_time_us();
+
     // eval the prompt with both models
     llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
     llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
     llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
 
+    const auto t_enc_end = ggml_time_us();
+
     // the 2 models should have the same vocab
     const int n_ctx   = llama_n_ctx(ctx_tgt);
     const int n_vocab = llama_n_vocab(ctx_tgt);
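The added lines bracket exactly the three llama_eval calls that ingest the prompt: n_input records the prompt length, and t_enc_start/t_enc_end delimit the "encode" phase reported at the end. A minimal, self-contained sketch of this bracketing pattern, assuming only ggml_time_init()/ggml_time_us() from ggml.h (the comment stands in for the measured work):

    #include <cstdint>
    #include <cstdio>

    #include "ggml.h"

    int main() {
        ggml_time_init(); // needed once; the llama.cpp backend init normally does this

        const int64_t t_start = ggml_time_us();

        // ... measured work goes here, e.g. the prompt evaluation above ...

        const int64_t t_end = ggml_time_us(); // microseconds, like t_start

        printf("elapsed: %.3f s\n", (t_end - t_start) / 1e6);
        return 0;
    }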
@@ -103,7 +119,7 @@ int main(int argc, char ** argv) {
     // used to determine end of generation
     bool has_eos = false;
 
-    const auto t_gen_start = ggml_time_us();
+    const auto t_dec_start = ggml_time_us();
 
     while (true) {
         LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
@@ -193,11 +209,12 @@ int main(int argc, char ** argv) {
         drafted.erase(drafted.begin());
     }
 
-    auto t_gen_end = ggml_time_us();
+    auto t_dec_end = ggml_time_us();
 
     LOG_TEE("\n\n");
 
-    LOG_TEE("generated %d tokens in %.3f seconds, speed: %.3f t/s\n", n_predict, (t_gen_end - t_gen_start) / 1e6f, n_predict / ((t_gen_end - t_gen_start) / 1e6f));
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
 
     // TODO: make sure these numbers are computed correctly
     LOG_TEE("\n");
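The single "generated" line becomes separate "encoded" and "decoded" reports, each computed as tokens divided by elapsed seconds; ggml_time_us() returns microseconds, hence the division by 1e6f. (In the encoded line, n_input and inp.size() are the same value.) A self-contained sketch of the arithmetic with hypothetical numbers:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // hypothetical timestamps in microseconds, 2.5 seconds apart
        const int64_t t_dec_start = 0;
        const int64_t t_dec_end   = 2500000;
        const int     n_predict   = 128; // tokens decoded in that window

        const float secs = (t_dec_end - t_dec_start) / 1e6f;

        printf("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n",
               n_predict, secs, n_predict / secs); // -> 51.200 t/s
        return 0;
    }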