diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index b74dc5e2114d8..f07d2553669af 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -35,7 +35,6 @@ jobs: include: - build_type: Release sanitizer: "" - disabled_on_pr: true fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken container: diff --git a/common/common.cpp b/common/common.cpp index 69c2d5bf79f91..fb80d4bf72283 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -39,6 +39,9 @@ #endif #if defined(LLAMA_USE_CURL) #include +#include +#include +#include #endif #if defined(_MSC_VER) @@ -61,7 +64,7 @@ #else #include #endif -#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX +#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 #define LLAMA_CURL_MAX_HEADER_LENGTH 256 #endif // LLAMA_USE_CURL @@ -1702,27 +1705,13 @@ void llama_batch_add( #ifdef LLAMA_USE_CURL -struct llama_model * llama_load_model_from_url( - const char * model_url, - const char * path_model, - const struct llama_model_params & params) { - // Basic validation of the model_url - if (!model_url || strlen(model_url) == 0) { - fprintf(stderr, "%s: invalid model_url\n", __func__); - return NULL; - } - - // Initialize libcurl globally - auto curl = curl_easy_init(); - - if (!curl) { - fprintf(stderr, "%s: error initializing libcurl\n", __func__); - return NULL; - } +static bool llama_download_file(CURL * curl, const char * url, const char * path) { + bool force_download = false; // Set the URL, allow to follow http redirection - curl_easy_setopt(curl, CURLOPT_URL, model_url); + curl_easy_setopt(curl, CURLOPT_URL, url); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + #if defined(_WIN32) // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of // operating system. Currently implemented under MS-Windows. @@ -1731,16 +1720,16 @@ struct llama_model * llama_load_model_from_url( // Check if the file already exists locally struct stat model_file_info; - auto file_exists = (stat(path_model, &model_file_info) == 0); + auto file_exists = (stat(path, &model_file_info) == 0); // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model); + char etag_path[PATH_MAX] = {0}; + snprintf(etag_path, sizeof(etag_path), "%s.etag", path); char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model); + char last_modified_path[PATH_MAX] = {0}; + snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path); if (file_exists) { auto * f_etag = fopen(etag_path, "r"); @@ -1748,7 +1737,7 @@ struct llama_model * llama_load_model_from_url( if (!fgets(etag, sizeof(etag), f_etag)) { fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); } else { - fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag); + fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag); } fclose(f_etag); } @@ -1758,7 +1747,7 @@ struct llama_model * llama_load_model_from_url( if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); } else { - fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path, + fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path, last_modified); } fclose(f_last_modified); @@ -1776,6 +1765,11 @@ struct llama_model * llama_load_model_from_url( auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; + // Convert header field name to lowercase + for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) { + buffer[i] = tolower(buffer[i]); + } + const char * etag_prefix = "etag: "; if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF @@ -1798,7 +1792,7 @@ struct llama_model * llama_load_model_from_url( if (res != CURLE_OK) { curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return NULL; + return false; } long http_code = 0; @@ -1806,30 +1800,34 @@ struct llama_model * llama_load_model_from_url( if (http_code != 200) { // HEAD not supported, we don't know if the file has changed // force trigger downloading - file_exists = false; + force_download = true; fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } } // If the ETag or the Last-Modified headers are different: trigger a new download - if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { - char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model); + bool should_download = !file_exists + || force_download + || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0) + || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0); + if (should_download) { + char path_temporary[PATH_MAX] = {0}; + snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path); if (file_exists) { - fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model); - if (remove(path_model) != 0) { + fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path); + if (remove(path) != 0) { curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model); - return NULL; + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path); + return false; } } // Set the output file - auto * outfile = fopen(path_model_temporary, "wb"); + auto * outfile = fopen(path_temporary, "wb"); if (!outfile) { curl_easy_cleanup(curl); - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); - return NULL; + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path); + return false; } typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); @@ -1843,15 +1841,30 @@ struct llama_model * llama_load_model_from_url( // display download progress curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + // helper function to hide password in URL + auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + std::size_t protocol_pos = url.find("://"); + if (protocol_pos == std::string::npos) { + return url; // Malformed URL + } + + std::size_t at_pos = url.find('@', protocol_pos + 3); + if (at_pos == std::string::npos) { + return url; // No password in URL + } + + return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + }; + // start the download - fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - model_url, path_model, headers.etag, headers.last_modified); + fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified); auto res = curl_easy_perform(curl); if (res != CURLE_OK) { fclose(outfile); curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return NULL; + return false; } long http_code = 0; @@ -1860,7 +1873,7 @@ struct llama_model * llama_load_model_from_url( fclose(outfile); curl_easy_cleanup(curl); fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); - return NULL; + return false; } // Clean up @@ -1872,7 +1885,7 @@ struct llama_model * llama_load_model_from_url( if (etag_file) { fputs(headers.etag, etag_file); fclose(etag_file); - fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag); + fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag); } } @@ -1882,20 +1895,118 @@ struct llama_model * llama_load_model_from_url( if (last_modified_file) { fputs(headers.last_modified, last_modified_file); fclose(last_modified_file); - fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path, + fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path, headers.last_modified); } } - if (rename(path_model_temporary, path_model) != 0) { + if (rename(path_temporary, path) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path); + return false; + } + } + + return true; +} + +struct llama_model * llama_load_model_from_url( + const char * model_url, + const char * path_model, + const struct llama_model_params & params) { + // Basic validation of the model_url + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); + return NULL; + } + + // Initialize libcurl + auto * curl = curl_easy_init(); + + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return NULL; + } + + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return NULL; + } + + if (!llama_download_file(curl, model_url, path_model)) { + return NULL; + } + + // check for additional GGUFs split to download + int n_split = 0; + { + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, + }; + auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model); return NULL; } + + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split >= 0) { + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + } + + gguf_free(ctx_gguf); } curl_easy_cleanup(curl); + if (n_split > 1) { + char split_prefix[PATH_MAX] = {0}; + char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + + // Verify the first split file format + // and extract split URL and PATH prefixes + { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model file name: %s" + " n_split=%d\n", __func__, path_model, n_split); + return NULL; + } + + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model url: %s" + " n_split=%d\n", __func__, model_url, n_split); + return NULL; + } + } + + // Prepare download in parallel + std::vector> futures_download; + for (int idx = 1; idx < n_split; idx++) { + futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); + + char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); + + auto * curl = curl_easy_init(); + bool res = llama_download_file(curl, split_url, split_path); + curl_easy_cleanup(curl); + + return res; + }, idx)); + } + + // Wait for all downloads to complete + for (auto & f : futures_download) { + if (!f.get()) { + return NULL; + } + } + } + return llama_load_model_from_file(path_model, params); } diff --git a/common/common.h b/common/common.h index afa4cf6d7d528..a223eceaad87e 100644 --- a/common/common.h +++ b/common/common.h @@ -306,3 +306,10 @@ struct llama_control_vector_load_info { // Load control vectors, scale each by strength, and add them together. // On error, returns {-1, empty} llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); + +// +// Split utils +// +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index f703588e164f6..b1af599923809 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -26,10 +26,6 @@ enum split_operation : uint8_t { SPLIT_OP_MERGE, }; -static const char * const LLM_KV_SPLIT_NO = "split.no"; -static const char * const LLM_KV_SPLIT_COUNT = "split.count"; -static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; - struct split_params { split_operation operation = SPLIT_OP_SPLIT; int n_split_tensors = 128; diff --git a/examples/server/README.md b/examples/server/README.md index 355601ff4dec2..256a53b7a673d 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -20,7 +20,9 @@ The project is under active development, and we are [looking for feedback and co - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused). +- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused). +- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 27bd2dd70c5f7..b02c2546eb4c6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2208,7 +2208,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: %s)\n", params.model_url.c_str()); + printf(" model download url (default: unused)\n"); + printf(" -hfr REPO, --hf-repo REPO\n"); + printf(" Hugging Face model repository (default: unused)\n"); + printf(" -hff FILE, --hf-file FILE\n"); + printf(" Hugging Face model file (default: unused)\n"); printf(" -a ALIAS, --alias ALIAS\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); @@ -2337,6 +2341,18 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.model_url = argv[i]; + } else if (arg == "-hfr" || arg == "--hf-repo") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.hf_repo = argv[i]; + } else if (arg == "-hff" || arg == "--hf-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.hf_file = argv[i]; } else if (arg == "-a" || arg == "--alias") { if (++i >= argc) { invalid_param = true; diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index a66fed626619d..6cd306a2bcf7c 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -4,7 +4,8 @@ Feature: Parallel Background: Server startup Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models + And a model file test-model-00001-of-00003.gguf And 42 as server seed And 128 as batch size And 256 KV cache size diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index a2e0e5b3532b5..646a4e49d0d56 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -4,8 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf - And a model file stories260K.gguf + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model file test-model.gguf And a model alias tinyllama-2 And 42 as server seed # KV Cache corresponds to the total amount of tokens diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 03f55f65910de..86c3339dc7183 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -16,7 +16,6 @@ import openai from behave import step from behave.api.async_step import async_run_until_complete -from huggingface_hub import hf_hub_download from prometheus_client import parser @@ -39,6 +38,8 @@ def step_server_config(context, server_fqdn, server_port): context.model_alias = None context.model_file = None + context.model_hf_repo = None + context.model_hf_file = None context.model_url = None context.n_batch = None context.n_ubatch = None @@ -68,9 +69,9 @@ def step_server_config(context, server_fqdn, server_port): @step('a model file {hf_file} from HF repo {hf_repo}') def step_download_hf_model(context, hf_file, hf_repo): - context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file) - if context.debug: - print(f"model file: {context.model_file}") + context.model_hf_repo = hf_repo + context.model_hf_file = hf_file + context.model_file = os.path.basename(hf_file) @step('a model file {model_file}') @@ -1079,6 +1080,10 @@ def start_server_background(context): server_args.extend(['--model', context.model_file]) if context.model_url: server_args.extend(['--model-url', context.model_url]) + if context.model_hf_repo: + server_args.extend(['--hf-repo', context.model_hf_repo]) + if context.model_hf_file: + server_args.extend(['--hf-file', context.model_hf_file]) if context.n_batch: server_args.extend(['--batch-size', context.n_batch]) if context.n_ubatch: diff --git a/llama.cpp b/llama.cpp index eedca802b86a7..a1850b44bcf3a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2934,7 +2934,7 @@ struct llama_model_loader { } } - LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); + LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } n_kv = gguf_get_n_kv(meta); @@ -14841,7 +14841,7 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int // check if dest ends with postfix int size_prefix = str_split_path.size() - str_postfix.size(); if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { - snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path); + snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); return size_prefix; }