From d8be8af24a514227ea9f116c98ec2e47cd2299d3 Mon Sep 17 00:00:00 2001
From: Dorin Geman
Date: Wed, 3 Dec 2025 12:33:38 +0200
Subject: [PATCH] llamacpp: bump llama.cpp (b7245)

See https://github.com/ggml-org/llama.cpp/tree/b7245.

The vendored httplib.h is dropped in favor of linking the cpp-httplib
package, and the copied server sources are split into separate modules
(server-common, server-context, server-http, server-models, server-queue,
server-task); server.patch is replaced by server-http.patch.

Signed-off-by: Dorin Geman
---
 llamacpp/native/src/server/CMakeLists.txt     |     8 +-
 llamacpp/native/src/server/Makefile           |    18 +-
 llamacpp/native/src/server/httplib.h          | 10506 ----------------
 .../server/{utils.hpp => server-common.cpp}   |  2043 +--
 llamacpp/native/src/server/server-common.h    |   359 +
 llamacpp/native/src/server/server-context.cpp |  3637 ++++++
 llamacpp/native/src/server/server-context.h   |    83 +
 llamacpp/native/src/server/server-http.cpp    |   380 +
 llamacpp/native/src/server/server-http.h      |    78 +
 llamacpp/native/src/server/server-http.patch  |    61 +
 llamacpp/native/src/server/server-models.cpp  |   975 ++
 llamacpp/native/src/server/server-models.h    |   174 +
 llamacpp/native/src/server/server-queue.cpp   |   351 +
 llamacpp/native/src/server/server-queue.h     |   146 +
 llamacpp/native/src/server/server-task.cpp    |  1471 +++
 llamacpp/native/src/server/server-task.h      |   460 +
 llamacpp/native/src/server/server.cpp         |  5948 +--------
 llamacpp/native/src/server/server.patch       |    20 -
 llamacpp/native/vendor/llama.cpp              |     2 +-
 19 files changed, 9503 insertions(+), 17217 deletions(-)
 delete mode 100644 llamacpp/native/src/server/httplib.h
 rename llamacpp/native/src/server/{utils.hpp => server-common.cpp} (59%)
 create mode 100644 llamacpp/native/src/server/server-common.h
 create mode 100644 llamacpp/native/src/server/server-context.cpp
 create mode 100644 llamacpp/native/src/server/server-context.h
 create mode 100644 llamacpp/native/src/server/server-http.cpp
 create mode 100644 llamacpp/native/src/server/server-http.h
 create mode 100644 llamacpp/native/src/server/server-http.patch
 create mode 100644 llamacpp/native/src/server/server-models.cpp
 create mode 100644 llamacpp/native/src/server/server-models.h
 create mode 100644 llamacpp/native/src/server/server-queue.cpp
 create mode 100644 llamacpp/native/src/server/server-queue.h
 create mode 100644 llamacpp/native/src/server/server-task.cpp
 create mode 100644 llamacpp/native/src/server/server-task.h
 delete mode 100644 llamacpp/native/src/server/server.patch

diff --git a/llamacpp/native/src/server/CMakeLists.txt b/llamacpp/native/src/server/CMakeLists.txt
index 8d995f069..95d89b5cb 100644
--- a/llamacpp/native/src/server/CMakeLists.txt
+++ b/llamacpp/native/src/server/CMakeLists.txt
@@ -15,18 +15,14 @@ if (MINGW)
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 
-set(TARGET_SRCS
-    server.cpp
-    utils.hpp
-    httplib.h
-)
+file(GLOB TARGET_SRCS "*.cpp")
 
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 
 target_include_directories(${TARGET} PRIVATE ../../vendor/llama.cpp/tools/mtmd)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT} cpp-httplib)
 
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
diff --git a/llamacpp/native/src/server/Makefile b/llamacpp/native/src/server/Makefile
index cd1b03ef3..aad5b71b0 100644
--- a/llamacpp/native/src/server/Makefile
+++ b/llamacpp/native/src/server/Makefile
@@ -1,16 +1,18 @@
 LLAMA_SERVER_DIR = ../../vendor/llama.cpp/tools/server/
+SERVER_FILES = server-common server-context server-http server-models server-queue server-task server
+HEADERS = $(addsuffix .h, $(filter-out server, $(SERVER_FILES)))
+SOURCES = $(addsuffix .cpp, $(SERVER_FILES))
$(addsuffix .cpp, $(SERVER_FILES)) .PHONY: clean all -all: utils.hpp server.cpp +all: $(HEADERS) $(SOURCES) -utils.hpp: $(LLAMA_SERVER_DIR)/utils.hpp - cp $(LLAMA_SERVER_DIR)/utils.hpp . +%.h: $(LLAMA_SERVER_DIR)/%.h + cp $< $@ -server.cpp: $(LLAMA_SERVER_DIR)/server.cpp - cp $(LLAMA_SERVER_DIR)/server.cpp . - patch server.cpp < server.patch +%.cpp: $(LLAMA_SERVER_DIR)/%.cpp + cp $< $@ + @if [ "$@" = "server-http.cpp" ]; then patch $@ < server-http.patch; fi clean: - rm *.cpp - rm *.hpp + rm -f $(HEADERS) $(SOURCES) diff --git a/llamacpp/native/src/server/httplib.h b/llamacpp/native/src/server/httplib.h deleted file mode 100644 index 0f981dc89..000000000 --- a/llamacpp/native/src/server/httplib.h +++ /dev/null @@ -1,10506 +0,0 @@ -// -// httplib.h -// -// Copyright (c) 2025 Yuji Hirose. All rights reserved. -// MIT License -// - -#ifndef CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_HTTPLIB_H - -#define CPPHTTPLIB_VERSION "0.20.0" - -/* - * Configuration - */ - -#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND -#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND -#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND 10000 -#endif - -#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT -#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 100 -#endif - -#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND -#define CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND 300 -#endif - -#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND -#define CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND -#define CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND -#define CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND -#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND -#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND -#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND 300 -#endif - -#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND -#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND -#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND -#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND -#define CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND 0 -#endif - -#ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND -#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0 -#endif - -#ifndef CPPHTTPLIB_IDLE_INTERVAL_USECOND -#ifdef _WIN32 -#define CPPHTTPLIB_IDLE_INTERVAL_USECOND 10000 -#else -#define CPPHTTPLIB_IDLE_INTERVAL_USECOND 0 -#endif -#endif - -#ifndef CPPHTTPLIB_REQUEST_URI_MAX_LENGTH -#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 8192 -#endif - -#ifndef CPPHTTPLIB_HEADER_MAX_LENGTH -#define CPPHTTPLIB_HEADER_MAX_LENGTH 8192 -#endif - -#ifndef CPPHTTPLIB_REDIRECT_MAX_COUNT -#define CPPHTTPLIB_REDIRECT_MAX_COUNT 20 -#endif - -#ifndef CPPHTTPLIB_MULTIPART_FORM_DATA_FILE_MAX_COUNT -#define CPPHTTPLIB_MULTIPART_FORM_DATA_FILE_MAX_COUNT 1024 -#endif - -#ifndef CPPHTTPLIB_PAYLOAD_MAX_LENGTH -#define CPPHTTPLIB_PAYLOAD_MAX_LENGTH ((std::numeric_limits::max)()) -#endif - -#ifndef CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH -#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192 -#endif - -#ifndef CPPHTTPLIB_RANGE_MAX_COUNT -#define CPPHTTPLIB_RANGE_MAX_COUNT 1024 -#endif - -#ifndef CPPHTTPLIB_TCP_NODELAY -#define 
CPPHTTPLIB_TCP_NODELAY false -#endif - -#ifndef CPPHTTPLIB_IPV6_V6ONLY -#define CPPHTTPLIB_IPV6_V6ONLY false -#endif - -#ifndef CPPHTTPLIB_RECV_BUFSIZ -#define CPPHTTPLIB_RECV_BUFSIZ size_t(16384u) -#endif - -#ifndef CPPHTTPLIB_COMPRESSION_BUFSIZ -#define CPPHTTPLIB_COMPRESSION_BUFSIZ size_t(16384u) -#endif - -#ifndef CPPHTTPLIB_THREAD_POOL_COUNT -#define CPPHTTPLIB_THREAD_POOL_COUNT \ - ((std::max)(8u, std::thread::hardware_concurrency() > 0 \ - ? std::thread::hardware_concurrency() - 1 \ - : 0)) -#endif - -#ifndef CPPHTTPLIB_RECV_FLAGS -#define CPPHTTPLIB_RECV_FLAGS 0 -#endif - -#ifndef CPPHTTPLIB_SEND_FLAGS -#define CPPHTTPLIB_SEND_FLAGS 0 -#endif - -#ifndef CPPHTTPLIB_LISTEN_BACKLOG -#define CPPHTTPLIB_LISTEN_BACKLOG 5 -#endif - -/* - * Headers - */ - -#ifdef _WIN32 -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif //_CRT_SECURE_NO_WARNINGS - -#ifndef _CRT_NONSTDC_NO_DEPRECATE -#define _CRT_NONSTDC_NO_DEPRECATE -#endif //_CRT_NONSTDC_NO_DEPRECATE - -#if defined(_MSC_VER) -#if _MSC_VER < 1900 -#error Sorry, Visual Studio versions prior to 2015 are not supported -#endif - -#pragma comment(lib, "ws2_32.lib") - -#ifdef _WIN64 -using ssize_t = __int64; -#else -using ssize_t = long; -#endif -#endif // _MSC_VER - -#ifndef S_ISREG -#define S_ISREG(m) (((m) & S_IFREG) == S_IFREG) -#endif // S_ISREG - -#ifndef S_ISDIR -#define S_ISDIR(m) (((m) & S_IFDIR) == S_IFDIR) -#endif // S_ISDIR - -#ifndef NOMINMAX -#define NOMINMAX -#endif // NOMINMAX - -#include -#include -#include - -// afunix.h uses types declared in winsock2.h, so has to be included after it. -#include - -#ifndef WSA_FLAG_NO_HANDLE_INHERIT -#define WSA_FLAG_NO_HANDLE_INHERIT 0x80 -#endif - -using nfds_t = unsigned long; -using socket_t = SOCKET; -using socklen_t = int; - -#else // not _WIN32 - -#include -#if !defined(_AIX) && !defined(__MVS__) -#include -#endif -#ifdef __MVS__ -#include -#ifndef NI_MAXHOST -#define NI_MAXHOST 1025 -#endif -#endif -#include -#include -#include -#ifdef __linux__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -using socket_t = int; -#ifndef INVALID_SOCKET -#define INVALID_SOCKET (-1) -#endif -#endif //_WIN32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -#ifdef _WIN32 -#include - -// these are defined in wincrypt.h and it breaks compilation if BoringSSL is -// used -#undef X509_NAME -#undef X509_CERT_PAIR -#undef X509_EXTENSIONS -#undef PKCS7_SIGNER_INFO - -#ifdef _MSC_VER -#pragma comment(lib, "crypt32.lib") -#endif -#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__) -#include -#if TARGET_OS_OSX -#include -#include -#endif // TARGET_OS_OSX -#endif // _WIN32 - -#include -#include -#include -#include - -#if defined(_WIN32) && defined(OPENSSL_USE_APPLINK) -#include -#endif - -#include -#include - -#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER) -#if OPENSSL_VERSION_NUMBER < 0x1010107f -#error Please use OpenSSL or a current version of BoringSSL -#endif -#define SSL_get1_peer_certificate SSL_get_peer_certificate -#elif OPENSSL_VERSION_NUMBER < 0x30000000L -#error Sorry, OpenSSL versions prior to 3.0.0 are not supported -#endif - -#endif - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT -#include -#endif - -#ifdef 
CPPHTTPLIB_BROTLI_SUPPORT -#include -#include -#endif - -#ifdef CPPHTTPLIB_ZSTD_SUPPORT -#include -#endif - -/* - * Declaration - */ -namespace httplib { - -namespace detail { - -/* - * Backport std::make_unique from C++14. - * - * NOTE: This code came up with the following stackoverflow post: - * https://stackoverflow.com/questions/10149840/c-arrays-and-make-unique - * - */ - -template -typename std::enable_if::value, std::unique_ptr>::type -make_unique(Args &&...args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - -template -typename std::enable_if::value, std::unique_ptr>::type -make_unique(std::size_t n) { - typedef typename std::remove_extent::type RT; - return std::unique_ptr(new RT[n]); -} - -namespace case_ignore { - -inline unsigned char to_lower(int c) { - const static unsigned char table[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, - 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, - 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, - 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, - 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 224, 225, 226, - 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, - 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, 224, - 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, - 255, - }; - return table[(unsigned char)(char)c]; -} - -inline bool equal(const std::string &a, const std::string &b) { - return a.size() == b.size() && - std::equal(a.begin(), a.end(), b.begin(), [](char ca, char cb) { - return to_lower(ca) == to_lower(cb); - }); -} - -struct equal_to { - bool operator()(const std::string &a, const std::string &b) const { - return equal(a, b); - } -}; - -struct hash { - size_t operator()(const std::string &key) const { - return hash_core(key.data(), key.size(), 0); - } - - size_t hash_core(const char *s, size_t l, size_t h) const { - return (l == 0) ? h - : hash_core(s + 1, l - 1, - // Unsets the 6 high bits of h, therefore no - // overflow happens - (((std::numeric_limits::max)() >> 6) & - h * 33) ^ - static_cast(to_lower(*s))); - } -}; - -} // namespace case_ignore - -// This is based on -// "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189". 
- -struct scope_exit { - explicit scope_exit(std::function &&f) - : exit_function(std::move(f)), execute_on_destruction{true} {} - - scope_exit(scope_exit &&rhs) noexcept - : exit_function(std::move(rhs.exit_function)), - execute_on_destruction{rhs.execute_on_destruction} { - rhs.release(); - } - - ~scope_exit() { - if (execute_on_destruction) { this->exit_function(); } - } - - void release() { this->execute_on_destruction = false; } - -private: - scope_exit(const scope_exit &) = delete; - void operator=(const scope_exit &) = delete; - scope_exit &operator=(scope_exit &&) = delete; - - std::function exit_function; - bool execute_on_destruction; -}; - -} // namespace detail - -enum SSLVerifierResponse { - // no decision has been made, use the built-in certificate verifier - NoDecisionMade, - // connection certificate is verified and accepted - CertificateAccepted, - // connection certificate was processed but is rejected - CertificateRejected -}; - -enum StatusCode { - // Information responses - Continue_100 = 100, - SwitchingProtocol_101 = 101, - Processing_102 = 102, - EarlyHints_103 = 103, - - // Successful responses - OK_200 = 200, - Created_201 = 201, - Accepted_202 = 202, - NonAuthoritativeInformation_203 = 203, - NoContent_204 = 204, - ResetContent_205 = 205, - PartialContent_206 = 206, - MultiStatus_207 = 207, - AlreadyReported_208 = 208, - IMUsed_226 = 226, - - // Redirection messages - MultipleChoices_300 = 300, - MovedPermanently_301 = 301, - Found_302 = 302, - SeeOther_303 = 303, - NotModified_304 = 304, - UseProxy_305 = 305, - unused_306 = 306, - TemporaryRedirect_307 = 307, - PermanentRedirect_308 = 308, - - // Client error responses - BadRequest_400 = 400, - Unauthorized_401 = 401, - PaymentRequired_402 = 402, - Forbidden_403 = 403, - NotFound_404 = 404, - MethodNotAllowed_405 = 405, - NotAcceptable_406 = 406, - ProxyAuthenticationRequired_407 = 407, - RequestTimeout_408 = 408, - Conflict_409 = 409, - Gone_410 = 410, - LengthRequired_411 = 411, - PreconditionFailed_412 = 412, - PayloadTooLarge_413 = 413, - UriTooLong_414 = 414, - UnsupportedMediaType_415 = 415, - RangeNotSatisfiable_416 = 416, - ExpectationFailed_417 = 417, - ImATeapot_418 = 418, - MisdirectedRequest_421 = 421, - UnprocessableContent_422 = 422, - Locked_423 = 423, - FailedDependency_424 = 424, - TooEarly_425 = 425, - UpgradeRequired_426 = 426, - PreconditionRequired_428 = 428, - TooManyRequests_429 = 429, - RequestHeaderFieldsTooLarge_431 = 431, - UnavailableForLegalReasons_451 = 451, - - // Server error responses - InternalServerError_500 = 500, - NotImplemented_501 = 501, - BadGateway_502 = 502, - ServiceUnavailable_503 = 503, - GatewayTimeout_504 = 504, - HttpVersionNotSupported_505 = 505, - VariantAlsoNegotiates_506 = 506, - InsufficientStorage_507 = 507, - LoopDetected_508 = 508, - NotExtended_510 = 510, - NetworkAuthenticationRequired_511 = 511, -}; - -using Headers = - std::unordered_multimap; - -using Params = std::multimap; -using Match = std::smatch; - -using Progress = std::function; - -struct Response; -using ResponseHandler = std::function; - -struct MultipartFormData { - std::string name; - std::string content; - std::string filename; - std::string content_type; -}; -using MultipartFormDataItems = std::vector; -using MultipartFormDataMap = std::multimap; - -class DataSink { -public: - DataSink() : os(&sb_), sb_(*this) {} - - DataSink(const DataSink &) = delete; - DataSink &operator=(const DataSink &) = delete; - DataSink(DataSink &&) = delete; - DataSink &operator=(DataSink &&) = delete; - - 
std::function write; - std::function is_writable; - std::function done; - std::function done_with_trailer; - std::ostream os; - -private: - class data_sink_streambuf final : public std::streambuf { - public: - explicit data_sink_streambuf(DataSink &sink) : sink_(sink) {} - - protected: - std::streamsize xsputn(const char *s, std::streamsize n) override { - sink_.write(s, static_cast(n)); - return n; - } - - private: - DataSink &sink_; - }; - - data_sink_streambuf sb_; -}; - -using ContentProvider = - std::function; - -using ContentProviderWithoutLength = - std::function; - -using ContentProviderResourceReleaser = std::function; - -struct MultipartFormDataProvider { - std::string name; - ContentProviderWithoutLength provider; - std::string filename; - std::string content_type; -}; -using MultipartFormDataProviderItems = std::vector; - -using ContentReceiverWithProgress = - std::function; - -using ContentReceiver = - std::function; - -using MultipartContentHeader = - std::function; - -class ContentReader { -public: - using Reader = std::function; - using MultipartReader = std::function; - - ContentReader(Reader reader, MultipartReader multipart_reader) - : reader_(std::move(reader)), - multipart_reader_(std::move(multipart_reader)) {} - - bool operator()(MultipartContentHeader header, - ContentReceiver receiver) const { - return multipart_reader_(std::move(header), std::move(receiver)); - } - - bool operator()(ContentReceiver receiver) const { - return reader_(std::move(receiver)); - } - - Reader reader_; - MultipartReader multipart_reader_; -}; - -using Range = std::pair; -using Ranges = std::vector; - -struct Request { - std::string method; - std::string path; - Params params; - Headers headers; - std::string body; - - std::string remote_addr; - int remote_port = -1; - std::string local_addr; - int local_port = -1; - - // for server - std::string version; - std::string target; - MultipartFormDataMap files; - Ranges ranges; - Match matches; - std::unordered_map path_params; - std::function is_connection_closed = []() { return true; }; - - // for client - ResponseHandler response_handler; - ContentReceiverWithProgress content_receiver; - Progress progress; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - const SSL *ssl = nullptr; -#endif - - bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, const char *def = "", - size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, - size_t id = 0) const; - size_t get_header_value_count(const std::string &key) const; - void set_header(const std::string &key, const std::string &val); - - bool has_param(const std::string &key) const; - std::string get_param_value(const std::string &key, size_t id = 0) const; - size_t get_param_value_count(const std::string &key) const; - - bool is_multipart_form_data() const; - - bool has_file(const std::string &key) const; - MultipartFormData get_file_value(const std::string &key) const; - std::vector get_file_values(const std::string &key) const; - - // private members... 
- size_t redirect_count_ = CPPHTTPLIB_REDIRECT_MAX_COUNT; - size_t content_length_ = 0; - ContentProvider content_provider_; - bool is_chunked_content_provider_ = false; - size_t authorization_count_ = 0; - std::chrono::time_point start_time_ = - (std::chrono::steady_clock::time_point::min)(); -}; - -struct Response { - std::string version; - int status = -1; - std::string reason; - Headers headers; - std::string body; - std::string location; // Redirect location - - bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, const char *def = "", - size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, - size_t id = 0) const; - size_t get_header_value_count(const std::string &key) const; - void set_header(const std::string &key, const std::string &val); - - void set_redirect(const std::string &url, int status = StatusCode::Found_302); - void set_content(const char *s, size_t n, const std::string &content_type); - void set_content(const std::string &s, const std::string &content_type); - void set_content(std::string &&s, const std::string &content_type); - - void set_content_provider( - size_t length, const std::string &content_type, ContentProvider provider, - ContentProviderResourceReleaser resource_releaser = nullptr); - - void set_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser = nullptr); - - void set_chunked_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser = nullptr); - - void set_file_content(const std::string &path, - const std::string &content_type); - void set_file_content(const std::string &path); - - Response() = default; - Response(const Response &) = default; - Response &operator=(const Response &) = default; - Response(Response &&) = default; - Response &operator=(Response &&) = default; - ~Response() { - if (content_provider_resource_releaser_) { - content_provider_resource_releaser_(content_provider_success_); - } - } - - // private members... 
- size_t content_length_ = 0; - ContentProvider content_provider_; - ContentProviderResourceReleaser content_provider_resource_releaser_; - bool is_chunked_content_provider_ = false; - bool content_provider_success_ = false; - std::string file_content_path_; - std::string file_content_content_type_; -}; - -class Stream { -public: - virtual ~Stream() = default; - - virtual bool is_readable() const = 0; - virtual bool wait_readable() const = 0; - virtual bool wait_writable() const = 0; - - virtual ssize_t read(char *ptr, size_t size) = 0; - virtual ssize_t write(const char *ptr, size_t size) = 0; - virtual void get_remote_ip_and_port(std::string &ip, int &port) const = 0; - virtual void get_local_ip_and_port(std::string &ip, int &port) const = 0; - virtual socket_t socket() const = 0; - - virtual time_t duration() const = 0; - - ssize_t write(const char *ptr); - ssize_t write(const std::string &s); -}; - -class TaskQueue { -public: - TaskQueue() = default; - virtual ~TaskQueue() = default; - - virtual bool enqueue(std::function fn) = 0; - virtual void shutdown() = 0; - - virtual void on_idle() {} -}; - -class ThreadPool final : public TaskQueue { -public: - explicit ThreadPool(size_t n, size_t mqr = 0) - : shutdown_(false), max_queued_requests_(mqr) { - while (n) { - threads_.emplace_back(worker(*this)); - n--; - } - } - - ThreadPool(const ThreadPool &) = delete; - ~ThreadPool() override = default; - - bool enqueue(std::function fn) override { - { - std::unique_lock lock(mutex_); - if (max_queued_requests_ > 0 && jobs_.size() >= max_queued_requests_) { - return false; - } - jobs_.push_back(std::move(fn)); - } - - cond_.notify_one(); - return true; - } - - void shutdown() override { - // Stop all worker threads... - { - std::unique_lock lock(mutex_); - shutdown_ = true; - } - - cond_.notify_all(); - - // Join... 
- for (auto &t : threads_) { - t.join(); - } - } - -private: - struct worker { - explicit worker(ThreadPool &pool) : pool_(pool) {} - - void operator()() { - for (;;) { - std::function fn; - { - std::unique_lock lock(pool_.mutex_); - - pool_.cond_.wait( - lock, [&] { return !pool_.jobs_.empty() || pool_.shutdown_; }); - - if (pool_.shutdown_ && pool_.jobs_.empty()) { break; } - - fn = pool_.jobs_.front(); - pool_.jobs_.pop_front(); - } - - assert(true == static_cast(fn)); - fn(); - } - -#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) && \ - !defined(LIBRESSL_VERSION_NUMBER) - OPENSSL_thread_stop(); -#endif - } - - ThreadPool &pool_; - }; - friend struct worker; - - std::vector threads_; - std::list> jobs_; - - bool shutdown_; - size_t max_queued_requests_ = 0; - - std::condition_variable cond_; - std::mutex mutex_; -}; - -using Logger = std::function; - -using SocketOptions = std::function; - -namespace detail { - -bool set_socket_opt_impl(socket_t sock, int level, int optname, - const void *optval, socklen_t optlen); -bool set_socket_opt(socket_t sock, int level, int optname, int opt); -bool set_socket_opt_time(socket_t sock, int level, int optname, time_t sec, - time_t usec); - -} // namespace detail - -void default_socket_options(socket_t sock); - -const char *status_message(int status); - -std::string get_bearer_token_auth(const Request &req); - -namespace detail { - -class MatcherBase { -public: - virtual ~MatcherBase() = default; - - // Match request path and populate its matches and - virtual bool match(Request &request) const = 0; -}; - -/** - * Captures parameters in request path and stores them in Request::path_params - * - * Capture name is a substring of a pattern from : to /. - * The rest of the pattern is matched against the request path directly - * Parameters are captured starting from the next character after - * the end of the last matched static pattern fragment until the next /. - * - * Example pattern: - * "/path/fragments/:capture/more/fragments/:second_capture" - * Static fragments: - * "/path/fragments/", "more/fragments/" - * - * Given the following request path: - * "/path/fragments/:1/more/fragments/:2" - * the resulting capture will be - * {{"capture", "1"}, {"second_capture", "2"}} - */ -class PathParamsMatcher final : public MatcherBase { -public: - PathParamsMatcher(const std::string &pattern); - - bool match(Request &request) const override; - -private: - // Treat segment separators as the end of path parameter capture - // Does not need to handle query parameters as they are parsed before path - // matching - static constexpr char separator = '/'; - - // Contains static path fragments to match against, excluding the '/' after - // path params - // Fragments are separated by path params - std::vector static_fragments_; - // Stores the names of the path parameters to be used as keys in the - // Request::path_params map - std::vector param_names_; -}; - -/** - * Performs std::regex_match on request path - * and stores the result in Request::matches - * - * Note that regex match is performed directly on the whole request. - * This means that wildcard patterns may match multiple path segments with /: - * "/begin/(.*)/end" will match both "/begin/middle/end" and "/begin/1/2/end". 
- */ -class RegexMatcher final : public MatcherBase { -public: - RegexMatcher(const std::string &pattern) : regex_(pattern) {} - - bool match(Request &request) const override; - -private: - std::regex regex_; -}; - -ssize_t write_headers(Stream &strm, const Headers &headers); - -} // namespace detail - -class Server { -public: - using Handler = std::function; - - using ExceptionHandler = - std::function; - - enum class HandlerResponse { - Handled, - Unhandled, - }; - using HandlerWithResponse = - std::function; - - using HandlerWithContentReader = std::function; - - using Expect100ContinueHandler = - std::function; - - Server(); - - virtual ~Server(); - - virtual bool is_valid() const; - - Server &Get(const std::string &pattern, Handler handler); - Server &Post(const std::string &pattern, Handler handler); - Server &Post(const std::string &pattern, HandlerWithContentReader handler); - Server &Put(const std::string &pattern, Handler handler); - Server &Put(const std::string &pattern, HandlerWithContentReader handler); - Server &Patch(const std::string &pattern, Handler handler); - Server &Patch(const std::string &pattern, HandlerWithContentReader handler); - Server &Delete(const std::string &pattern, Handler handler); - Server &Delete(const std::string &pattern, HandlerWithContentReader handler); - Server &Options(const std::string &pattern, Handler handler); - - bool set_base_dir(const std::string &dir, - const std::string &mount_point = std::string()); - bool set_mount_point(const std::string &mount_point, const std::string &dir, - Headers headers = Headers()); - bool remove_mount_point(const std::string &mount_point); - Server &set_file_extension_and_mimetype_mapping(const std::string &ext, - const std::string &mime); - Server &set_default_file_mimetype(const std::string &mime); - Server &set_file_request_handler(Handler handler); - - template - Server &set_error_handler(ErrorHandlerFunc &&handler) { - return set_error_handler_core( - std::forward(handler), - std::is_convertible{}); - } - - Server &set_exception_handler(ExceptionHandler handler); - Server &set_pre_routing_handler(HandlerWithResponse handler); - Server &set_post_routing_handler(Handler handler); - - Server &set_expect_100_continue_handler(Expect100ContinueHandler handler); - Server &set_logger(Logger logger); - - Server &set_address_family(int family); - Server &set_tcp_nodelay(bool on); - Server &set_ipv6_v6only(bool on); - Server &set_socket_options(SocketOptions socket_options); - - Server &set_default_headers(Headers headers); - Server & - set_header_writer(std::function const &writer); - - Server &set_keep_alive_max_count(size_t count); - Server &set_keep_alive_timeout(time_t sec); - - Server &set_read_timeout(time_t sec, time_t usec = 0); - template - Server &set_read_timeout(const std::chrono::duration &duration); - - Server &set_write_timeout(time_t sec, time_t usec = 0); - template - Server &set_write_timeout(const std::chrono::duration &duration); - - Server &set_idle_interval(time_t sec, time_t usec = 0); - template - Server &set_idle_interval(const std::chrono::duration &duration); - - Server &set_payload_max_length(size_t length); - - bool bind_to_port(const std::string &host, int port, int socket_flags = 0); - int bind_to_any_port(const std::string &host, int socket_flags = 0); - bool listen_after_bind(); - - bool listen(const std::string &host, int port, int socket_flags = 0); - - bool is_running() const; - void wait_until_ready() const; - void stop(); - void decommission(); - - std::function 
new_task_queue; - -protected: - bool process_request(Stream &strm, const std::string &remote_addr, - int remote_port, const std::string &local_addr, - int local_port, bool close_connection, - bool &connection_closed, - const std::function &setup_request); - - std::atomic svr_sock_{INVALID_SOCKET}; - size_t keep_alive_max_count_ = CPPHTTPLIB_KEEPALIVE_MAX_COUNT; - time_t keep_alive_timeout_sec_ = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND; - time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND; - time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND; - size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH; - -private: - using Handlers = - std::vector, Handler>>; - using HandlersForContentReader = - std::vector, - HandlerWithContentReader>>; - - static std::unique_ptr - make_matcher(const std::string &pattern); - - Server &set_error_handler_core(HandlerWithResponse handler, std::true_type); - Server &set_error_handler_core(Handler handler, std::false_type); - - socket_t create_server_socket(const std::string &host, int port, - int socket_flags, - SocketOptions socket_options) const; - int bind_internal(const std::string &host, int port, int socket_flags); - bool listen_internal(); - - bool routing(Request &req, Response &res, Stream &strm); - bool handle_file_request(const Request &req, Response &res, - bool head = false); - bool dispatch_request(Request &req, Response &res, - const Handlers &handlers) const; - bool dispatch_request_for_content_reader( - Request &req, Response &res, ContentReader content_reader, - const HandlersForContentReader &handlers) const; - - bool parse_request_line(const char *s, Request &req) const; - void apply_ranges(const Request &req, Response &res, - std::string &content_type, std::string &boundary) const; - bool write_response(Stream &strm, bool close_connection, Request &req, - Response &res); - bool write_response_with_content(Stream &strm, bool close_connection, - const Request &req, Response &res); - bool write_response_core(Stream &strm, bool close_connection, - const Request &req, Response &res, - bool need_apply_ranges); - bool write_content_with_provider(Stream &strm, const Request &req, - Response &res, const std::string &boundary, - const std::string &content_type); - bool read_content(Stream &strm, Request &req, Response &res); - bool - read_content_with_content_receiver(Stream &strm, Request &req, Response &res, - ContentReceiver receiver, - MultipartContentHeader multipart_header, - ContentReceiver multipart_receiver); - bool read_content_core(Stream &strm, Request &req, Response &res, - ContentReceiver receiver, - MultipartContentHeader multipart_header, - ContentReceiver multipart_receiver) const; - - virtual bool process_and_close_socket(socket_t sock); - - std::atomic is_running_{false}; - std::atomic is_decommissioned{false}; - - struct MountPointEntry { - std::string mount_point; - std::string base_dir; - Headers headers; - }; - std::vector base_dirs_; - std::map file_extension_and_mimetype_map_; - std::string default_file_mimetype_ = "application/octet-stream"; - Handler file_request_handler_; - - Handlers get_handlers_; - Handlers post_handlers_; - HandlersForContentReader post_handlers_for_content_reader_; - Handlers put_handlers_; - 
HandlersForContentReader put_handlers_for_content_reader_; - Handlers patch_handlers_; - HandlersForContentReader patch_handlers_for_content_reader_; - Handlers delete_handlers_; - HandlersForContentReader delete_handlers_for_content_reader_; - Handlers options_handlers_; - - HandlerWithResponse error_handler_; - ExceptionHandler exception_handler_; - HandlerWithResponse pre_routing_handler_; - Handler post_routing_handler_; - Expect100ContinueHandler expect_100_continue_handler_; - - Logger logger_; - - int address_family_ = AF_UNSPEC; - bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; - bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; - SocketOptions socket_options_ = default_socket_options; - - Headers default_headers_; - std::function header_writer_ = - detail::write_headers; -}; - -enum class Error { - Success = 0, - Unknown, - Connection, - BindIPAddress, - Read, - Write, - ExceedRedirectCount, - Canceled, - SSLConnection, - SSLLoadingCerts, - SSLServerVerification, - SSLServerHostnameVerification, - UnsupportedMultipartBoundaryChars, - Compression, - ConnectionTimeout, - ProxyConnection, - - // For internal use only - SSLPeerCouldBeClosed_, -}; - -std::string to_string(Error error); - -std::ostream &operator<<(std::ostream &os, const Error &obj); - -class Result { -public: - Result() = default; - Result(std::unique_ptr &&res, Error err, - Headers &&request_headers = Headers{}) - : res_(std::move(res)), err_(err), - request_headers_(std::move(request_headers)) {} - // Response - operator bool() const { return res_ != nullptr; } - bool operator==(std::nullptr_t) const { return res_ == nullptr; } - bool operator!=(std::nullptr_t) const { return res_ != nullptr; } - const Response &value() const { return *res_; } - Response &value() { return *res_; } - const Response &operator*() const { return *res_; } - Response &operator*() { return *res_; } - const Response *operator->() const { return res_.get(); } - Response *operator->() { return res_.get(); } - - // Error - Error error() const { return err_; } - - // Request Headers - bool has_request_header(const std::string &key) const; - std::string get_request_header_value(const std::string &key, - const char *def = "", - size_t id = 0) const; - uint64_t get_request_header_value_u64(const std::string &key, - uint64_t def = 0, size_t id = 0) const; - size_t get_request_header_value_count(const std::string &key) const; - -private: - std::unique_ptr res_; - Error err_ = Error::Unknown; - Headers request_headers_; -}; - -class ClientImpl { -public: - explicit ClientImpl(const std::string &host); - - explicit ClientImpl(const std::string &host, int port); - - explicit ClientImpl(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path); - - virtual ~ClientImpl(); - - virtual bool is_valid() const; - - Result Get(const std::string &path); - Result Get(const std::string &path, const Headers &headers); - Result Get(const std::string &path, Progress progress); - Result Get(const std::string &path, const Headers &headers, - Progress progress); - Result Get(const std::string &path, ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver); - Result Get(const std::string &path, ContentReceiver content_receiver, - Progress progress); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, Progress progress); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver 
content_receiver); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, ContentReceiver content_receiver, - Progress progress); - - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ContentReceiver content_receiver, - Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress = nullptr); - - Result Head(const std::string &path); - Result Head(const std::string &path, const Headers &headers); - - Result Post(const std::string &path); - Result Post(const std::string &path, const Headers &headers); - Result Post(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); - Result Post(const std::string &path, const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, const std::string &boundary); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items); - - Result Put(const std::string &path); - Result Put(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const 
std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Put(const std::string &path, size_t content_length, - ContentProvider content_provider, const std::string &content_type); - Result Put(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Put(const std::string &path, const Params ¶ms); - Result Put(const std::string &path, const Headers &headers, - const Params ¶ms); - Result Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); - Result Put(const std::string &path, const MultipartFormDataItems &items); - Result Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items); - Result Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, const std::string &boundary); - Result Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items); - - Result Patch(const std::string &path); - Result Patch(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Patch(const std::string &path, const char *body, size_t content_length, - const std::string &content_type, Progress progress); - Result Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); - Result Patch(const std::string &path, const std::string &body, - const std::string &content_type); - Result Patch(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Patch(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Patch(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type); - Result Patch(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - ContentProviderWithoutLength 
content_provider, - const std::string &content_type); - - Result Delete(const std::string &path); - Result Delete(const std::string &path, const Headers &headers); - Result Delete(const std::string &path, const char *body, - size_t content_length, const std::string &content_type); - Result Delete(const std::string &path, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); - Result Delete(const std::string &path, const std::string &body, - const std::string &content_type); - Result Delete(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Delete(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - - Result Options(const std::string &path); - Result Options(const std::string &path, const Headers &headers); - - bool send(Request &req, Response &res, Error &error); - Result send(const Request &req); - - void stop(); - - std::string host() const; - int port() const; - - size_t is_socket_open() const; - socket_t socket() const; - - void set_hostname_addr_map(std::map addr_map); - - void set_default_headers(Headers headers); - - void - set_header_writer(std::function const &writer); - - void set_address_family(int family); - void set_tcp_nodelay(bool on); - void set_ipv6_v6only(bool on); - void set_socket_options(SocketOptions socket_options); - - void set_connection_timeout(time_t sec, time_t usec = 0); - template - void - set_connection_timeout(const std::chrono::duration &duration); - - void set_read_timeout(time_t sec, time_t usec = 0); - template - void set_read_timeout(const std::chrono::duration &duration); - - void set_write_timeout(time_t sec, time_t usec = 0); - template - void set_write_timeout(const std::chrono::duration &duration); - - void set_max_timeout(time_t msec); - template - void set_max_timeout(const std::chrono::duration &duration); - - void set_basic_auth(const std::string &username, const std::string &password); - void set_bearer_token_auth(const std::string &token); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - void set_digest_auth(const std::string &username, - const std::string &password); -#endif - - void set_keep_alive(bool on); - void set_follow_location(bool on); - - void set_url_encode(bool on); - - void set_compress(bool on); - - void set_decompress(bool on); - - void set_interface(const std::string &intf); - - void set_proxy(const std::string &host, int port); - void set_proxy_basic_auth(const std::string &username, - const std::string &password); - void set_proxy_bearer_token_auth(const std::string &token); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - void set_proxy_digest_auth(const std::string &username, - const std::string &password); -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - void set_ca_cert_path(const std::string &ca_cert_file_path, - const std::string &ca_cert_dir_path = std::string()); - void set_ca_cert_store(X509_STORE *ca_cert_store); - X509_STORE *create_ca_cert_store(const char *ca_cert, std::size_t size) const; -#endif - -#ifdef 
CPPHTTPLIB_OPENSSL_SUPPORT - void enable_server_certificate_verification(bool enabled); - void enable_server_hostname_verification(bool enabled); - void set_server_certificate_verifier( - std::function verifier); -#endif - - void set_logger(Logger logger); - -protected: - struct Socket { - socket_t sock = INVALID_SOCKET; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - SSL *ssl = nullptr; -#endif - - bool is_open() const { return sock != INVALID_SOCKET; } - }; - - virtual bool create_and_connect_socket(Socket &socket, Error &error); - - // All of: - // shutdown_ssl - // shutdown_socket - // close_socket - // should ONLY be called when socket_mutex_ is locked. - // Also, shutdown_ssl and close_socket should also NOT be called concurrently - // with a DIFFERENT thread sending requests using that socket. - virtual void shutdown_ssl(Socket &socket, bool shutdown_gracefully); - void shutdown_socket(Socket &socket) const; - void close_socket(Socket &socket); - - bool process_request(Stream &strm, Request &req, Response &res, - bool close_connection, Error &error); - - bool write_content_with_provider(Stream &strm, const Request &req, - Error &error) const; - - void copy_settings(const ClientImpl &rhs); - - // Socket endpoint information - const std::string host_; - const int port_; - const std::string host_and_port_; - - // Current open socket - Socket socket_; - mutable std::mutex socket_mutex_; - std::recursive_mutex request_mutex_; - - // These are all protected under socket_mutex - size_t socket_requests_in_flight_ = 0; - std::thread::id socket_requests_are_from_thread_ = std::thread::id(); - bool socket_should_be_closed_when_request_is_done_ = false; - - // Hostname-IP map - std::map addr_map_; - - // Default headers - Headers default_headers_; - - // Header writer - std::function header_writer_ = - detail::write_headers; - - // Settings - std::string client_cert_path_; - std::string client_key_path_; - - time_t connection_timeout_sec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND; - time_t connection_timeout_usec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND; - time_t max_timeout_msec_ = CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND; - - std::string basic_auth_username_; - std::string basic_auth_password_; - std::string bearer_token_auth_token_; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string digest_auth_username_; - std::string digest_auth_password_; -#endif - - bool keep_alive_ = false; - bool follow_location_ = false; - - bool url_encode_ = true; - - int address_family_ = AF_UNSPEC; - bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; - bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; - SocketOptions socket_options_ = nullptr; - - bool compress_ = false; - bool decompress_ = true; - - std::string interface_; - - std::string proxy_host_; - int proxy_port_ = -1; - - std::string proxy_basic_auth_username_; - std::string proxy_basic_auth_password_; - std::string proxy_bearer_token_auth_token_; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string proxy_digest_auth_username_; - std::string proxy_digest_auth_password_; -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string ca_cert_file_path_; - std::string ca_cert_dir_path_; - - X509_STORE *ca_cert_store_ = nullptr; -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - bool server_certificate_verification_ = true; - 
bool server_hostname_verification_ = true; - std::function server_certificate_verifier_; -#endif - - Logger logger_; - -private: - bool send_(Request &req, Response &res, Error &error); - Result send_(Request &&req); - - socket_t create_client_socket(Error &error) const; - bool read_response_line(Stream &strm, const Request &req, - Response &res) const; - bool write_request(Stream &strm, Request &req, bool close_connection, - Error &error); - bool redirect(Request &req, Response &res, Error &error); - bool handle_request(Stream &strm, Request &req, Response &res, - bool close_connection, Error &error); - std::unique_ptr send_with_content_provider( - Request &req, const char *body, size_t content_length, - ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Error &error); - Result send_with_content_provider( - const std::string &method, const std::string &path, - const Headers &headers, const char *body, size_t content_length, - ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Progress progress); - ContentProviderWithoutLength get_multipart_content_provider( - const std::string &boundary, const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) const; - - std::string adjust_host_string(const std::string &host) const; - - virtual bool - process_socket(const Socket &socket, - std::chrono::time_point start_time, - std::function callback); - virtual bool is_ssl() const; -}; - -class Client { -public: - // Universal interface - explicit Client(const std::string &scheme_host_port); - - explicit Client(const std::string &scheme_host_port, - const std::string &client_cert_path, - const std::string &client_key_path); - - // HTTP only interface - explicit Client(const std::string &host, int port); - - explicit Client(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path); - - Client(Client &&) = default; - Client &operator=(Client &&) = default; - - ~Client(); - - bool is_valid() const; - - Result Get(const std::string &path); - Result Get(const std::string &path, const Headers &headers); - Result Get(const std::string &path, Progress progress); - Result Get(const std::string &path, const Headers &headers, - Progress progress); - Result Get(const std::string &path, ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver); - Result Get(const std::string &path, ContentReceiver content_receiver, - Progress progress); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, Progress progress); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, ContentReceiver content_receiver, - Progress progress); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress); - - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ContentReceiver content_receiver, - 
Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress = nullptr); - - Result Head(const std::string &path); - Result Head(const std::string &path, const Headers &headers); - - Result Post(const std::string &path); - Result Post(const std::string &path, const Headers &headers); - Result Post(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); - Result Post(const std::string &path, const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, const std::string &boundary); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items); - - Result Put(const std::string &path); - Result Put(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - 
-  Result Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Put(const std::string &path, ContentProviderWithoutLength content_provider, const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type);
-  Result Put(const std::string &path, const Params &params);
-  Result Put(const std::string &path, const Headers &headers, const Params &params);
-  Result Put(const std::string &path, const Headers &headers, const Params &params, Progress progress);
-  Result Put(const std::string &path, const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items, const std::string &boundary);
-  Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items, const MultipartFormDataProviderItems &provider_items);
-
-  Result Patch(const std::string &path);
-  Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type);
-  Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, const std::string &body, const std::string &content_type);
-  Result Patch(const std::string &path, const std::string &body, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Patch(const std::string &path, ContentProviderWithoutLength content_provider, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type);
-
-  Result Delete(const std::string &path);
-  Result Delete(const std::string &path, const Headers &headers);
-  Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type);
-  Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type, Progress progress);
-  Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const
-                std::string &content_type, Progress progress);
-  Result Delete(const std::string &path, const std::string &body, const std::string &content_type);
-  Result Delete(const std::string &path, const std::string &body, const std::string &content_type, Progress progress);
-  Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type, Progress progress);
-
-  Result Options(const std::string &path);
-  Result Options(const std::string &path, const Headers &headers);
-
-  bool send(Request &req, Response &res, Error &error);
-  Result send(const Request &req);
-
-  void stop();
-
-  std::string host() const;
-  int port() const;
-
-  size_t is_socket_open() const;
-  socket_t socket() const;
-
-  void set_hostname_addr_map(std::map<std::string, std::string> addr_map);
-
-  void set_default_headers(Headers headers);
-
-  void set_header_writer(std::function<ssize_t(Stream &, Headers &)> const &writer);
-
-  void set_address_family(int family);
-  void set_tcp_nodelay(bool on);
-  void set_socket_options(SocketOptions socket_options);
-
-  void set_connection_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_connection_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_read_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_read_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_write_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_write_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_max_timeout(time_t msec);
-  template <class Rep, class Period>
-  void set_max_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_basic_auth(const std::string &username, const std::string &password);
-  void set_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_digest_auth(const std::string &username, const std::string &password);
-#endif
-
-  void set_keep_alive(bool on);
-  void set_follow_location(bool on);
-
-  void set_url_encode(bool on);
-
-  void set_compress(bool on);
-
-  void set_decompress(bool on);
-
-  void set_interface(const std::string &intf);
-
-  void set_proxy(const std::string &host, int port);
-  void set_proxy_basic_auth(const std::string &username, const std::string &password);
-  void set_proxy_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_proxy_digest_auth(const std::string &username, const std::string &password);
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void enable_server_certificate_verification(bool enabled);
-  void enable_server_hostname_verification(bool enabled);
-  void set_server_certificate_verifier(std::function<SSLVerifierResponse(SSL *ssl)> verifier);
-#endif
-
-  void set_logger(Logger logger);
-
-  // SSL
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_ca_cert_path(const std::string &ca_cert_file_path, const std::string &ca_cert_dir_path = std::string());
-
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-  void load_ca_cert_store(const char *ca_cert, std::size_t size);
-
-  long get_openssl_verify_result() const;
-
-  SSL_CTX *ssl_context() const;
-#endif
-
-private:
-  std::unique_ptr<ClientImpl> cli_;
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  bool is_ssl_ = false;
-#endif
-};
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLServer : public Server {
-public:
-  SSLServer(const char *cert_path, const char *private_key_path, const char *client_ca_cert_file_path = nullptr, const char *client_ca_cert_dir_path = nullptr, const char *private_key_password = nullptr);
-
-  SSLServer(X509 *cert, EVP_PKEY *private_key, X509_STORE *client_ca_cert_store = nullptr);
-
-  SSLServer(const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback);
-
-  ~SSLServer() override;
-
-  bool is_valid() const override;
-
-  SSL_CTX *ssl_context() const;
-
-  void update_certs(X509 *cert, EVP_PKEY *private_key, X509_STORE *client_ca_cert_store = nullptr);
-
-private:
-  bool process_and_close_socket(socket_t sock) override;
-
-  SSL_CTX *ctx_;
-  std::mutex ctx_mutex_;
-};
-
-class SSLClient final : public ClientImpl {
-public:
-  explicit SSLClient(const std::string &host);
-
-  explicit SSLClient(const std::string &host, int port);
-
-  explicit SSLClient(const std::string &host, int port, const std::string &client_cert_path, const std::string &client_key_path, const std::string &private_key_password = std::string());
-
-  explicit SSLClient(const std::string &host, int port, X509 *client_cert, EVP_PKEY *client_key, const std::string &private_key_password = std::string());
-
-  ~SSLClient() override;
-
-  bool is_valid() const override;
-
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-  void load_ca_cert_store(const char *ca_cert, std::size_t size);
-
-  long get_openssl_verify_result() const;
-
-  SSL_CTX *ssl_context() const;
-
-private:
-  bool create_and_connect_socket(Socket &socket, Error &error) override;
-  void shutdown_ssl(Socket &socket, bool shutdown_gracefully) override;
-  void shutdown_ssl_impl(Socket &socket, bool shutdown_gracefully);
-
-  bool process_socket(const Socket &socket, std::chrono::time_point<std::chrono::steady_clock> start_time, std::function<bool(Stream &strm)> callback) override;
-  bool is_ssl() const override;
-
-  bool connect_with_proxy(Socket &sock, std::chrono::time_point<std::chrono::steady_clock> start_time, Response &res, bool &success, Error &error);
-  bool initialize_ssl(Socket &socket, Error &error);
-
-  bool load_certs();
-
-  bool verify_host(X509 *server_cert) const;
-  bool verify_host_with_subject_alt_name(X509 *server_cert) const;
-  bool verify_host_with_common_name(X509 *server_cert) const;
-  bool check_host_name(const char *pattern, size_t pattern_len) const;
-
-  SSL_CTX *ctx_;
-  std::mutex ctx_mutex_;
-  std::once_flag initialize_cert_;
-
-  std::vector<std::string> host_components_;
-
-  long verify_result_ = 0;
-
-  friend class ClientImpl;
-};
-#endif
-
-/*
- * Implementation of template methods.
- */
-
-namespace detail {
-
-template <typename T, typename U>
-inline void duration_to_sec_and_usec(const T &duration, U callback) {
-  auto sec = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
-  auto usec = std::chrono::duration_cast<std::chrono::microseconds>(duration - std::chrono::seconds(sec)).count();
-  callback(static_cast<time_t>(sec), static_cast<time_t>(usec));
-}
-
-template <size_t N> inline constexpr size_t str_len(const char (&)[N]) {
-  return N - 1;
-}
-
-inline bool is_numeric(const std::string &str) {
-  return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit);
-}
-
-inline uint64_t get_header_value_u64(const Headers &headers, const std::string &key, uint64_t def, size_t id, bool &is_invalid_value) {
-  is_invalid_value = false;
-  auto rng = headers.equal_range(key);
-  auto it = rng.first;
-  std::advance(it, static_cast<ssize_t>(id));
-  if (it != rng.second) {
-    if (is_numeric(it->second)) {
-      return std::strtoull(it->second.data(), nullptr, 10);
-    } else {
-      is_invalid_value = true;
-    }
-  }
-  return def;
-}
-
-inline uint64_t get_header_value_u64(const Headers &headers, const std::string &key, uint64_t def, size_t id) {
-  bool dummy = false;
-  return get_header_value_u64(headers, key, def, id, dummy);
-}
-
-} // namespace detail
-
-inline uint64_t Request::get_header_value_u64(const std::string &key, uint64_t def, size_t id) const {
-  return detail::get_header_value_u64(headers, key, def, id);
-}
-
-inline uint64_t Response::get_header_value_u64(const std::string &key, uint64_t def, size_t id) const {
-  return detail::get_header_value_u64(headers, key, def, id);
-}
-
-namespace detail {
-
-inline bool set_socket_opt_impl(socket_t sock, int level, int optname, const void *optval, socklen_t optlen) {
-  return setsockopt(sock, level, optname,
-#ifdef _WIN32
-                    reinterpret_cast<const char *>(optval),
-#else
-                    optval,
-#endif
-                    optlen) == 0;
-}
-
-inline bool set_socket_opt(socket_t sock, int level, int optname, int optval) {
-  return set_socket_opt_impl(sock, level, optname, &optval, sizeof(optval));
-}
-
-inline bool set_socket_opt_time(socket_t sock, int level, int optname, time_t sec, time_t usec) {
-#ifdef _WIN32
-  auto timeout = static_cast<uint32_t>(sec * 1000 + usec / 1000);
-#else
-  timeval timeout;
-  timeout.tv_sec = static_cast<long>(sec);
-  timeout.tv_usec = static_cast<decltype(timeout.tv_usec)>(usec);
-#endif
-  return set_socket_opt_impl(sock, level, optname, &timeout, sizeof(timeout));
-}
-
-} // namespace detail
-
-inline void default_socket_options(socket_t sock) {
-  detail::set_socket_opt(sock, SOL_SOCKET,
-#ifdef SO_REUSEPORT
-                         SO_REUSEPORT,
-#else
-                         SO_REUSEADDR,
-#endif
-                         1);
-}
-
-inline const char *status_message(int status) {
-  switch (status) {
-  case StatusCode::Continue_100: return "Continue";
-  case StatusCode::SwitchingProtocol_101: return "Switching Protocol";
-  case StatusCode::Processing_102: return "Processing";
-  case StatusCode::EarlyHints_103: return "Early Hints";
-  case StatusCode::OK_200: return "OK";
-  case StatusCode::Created_201: return "Created";
-  case StatusCode::Accepted_202: return "Accepted";
-  case StatusCode::NonAuthoritativeInformation_203: return "Non-Authoritative Information";
-  case StatusCode::NoContent_204: return "No Content";
-  case StatusCode::ResetContent_205: return "Reset Content";
-  case StatusCode::PartialContent_206: return "Partial Content";
-  case StatusCode::MultiStatus_207: return "Multi-Status";
-  case StatusCode::AlreadyReported_208: return "Already Reported";
-  case StatusCode::IMUsed_226: return "IM Used";
-  case StatusCode::MultipleChoices_300: return "Multiple Choices";
-  case StatusCode::MovedPermanently_301:
return "Moved Permanently"; - case StatusCode::Found_302: return "Found"; - case StatusCode::SeeOther_303: return "See Other"; - case StatusCode::NotModified_304: return "Not Modified"; - case StatusCode::UseProxy_305: return "Use Proxy"; - case StatusCode::unused_306: return "unused"; - case StatusCode::TemporaryRedirect_307: return "Temporary Redirect"; - case StatusCode::PermanentRedirect_308: return "Permanent Redirect"; - case StatusCode::BadRequest_400: return "Bad Request"; - case StatusCode::Unauthorized_401: return "Unauthorized"; - case StatusCode::PaymentRequired_402: return "Payment Required"; - case StatusCode::Forbidden_403: return "Forbidden"; - case StatusCode::NotFound_404: return "Not Found"; - case StatusCode::MethodNotAllowed_405: return "Method Not Allowed"; - case StatusCode::NotAcceptable_406: return "Not Acceptable"; - case StatusCode::ProxyAuthenticationRequired_407: - return "Proxy Authentication Required"; - case StatusCode::RequestTimeout_408: return "Request Timeout"; - case StatusCode::Conflict_409: return "Conflict"; - case StatusCode::Gone_410: return "Gone"; - case StatusCode::LengthRequired_411: return "Length Required"; - case StatusCode::PreconditionFailed_412: return "Precondition Failed"; - case StatusCode::PayloadTooLarge_413: return "Payload Too Large"; - case StatusCode::UriTooLong_414: return "URI Too Long"; - case StatusCode::UnsupportedMediaType_415: return "Unsupported Media Type"; - case StatusCode::RangeNotSatisfiable_416: return "Range Not Satisfiable"; - case StatusCode::ExpectationFailed_417: return "Expectation Failed"; - case StatusCode::ImATeapot_418: return "I'm a teapot"; - case StatusCode::MisdirectedRequest_421: return "Misdirected Request"; - case StatusCode::UnprocessableContent_422: return "Unprocessable Content"; - case StatusCode::Locked_423: return "Locked"; - case StatusCode::FailedDependency_424: return "Failed Dependency"; - case StatusCode::TooEarly_425: return "Too Early"; - case StatusCode::UpgradeRequired_426: return "Upgrade Required"; - case StatusCode::PreconditionRequired_428: return "Precondition Required"; - case StatusCode::TooManyRequests_429: return "Too Many Requests"; - case StatusCode::RequestHeaderFieldsTooLarge_431: - return "Request Header Fields Too Large"; - case StatusCode::UnavailableForLegalReasons_451: - return "Unavailable For Legal Reasons"; - case StatusCode::NotImplemented_501: return "Not Implemented"; - case StatusCode::BadGateway_502: return "Bad Gateway"; - case StatusCode::ServiceUnavailable_503: return "Service Unavailable"; - case StatusCode::GatewayTimeout_504: return "Gateway Timeout"; - case StatusCode::HttpVersionNotSupported_505: - return "HTTP Version Not Supported"; - case StatusCode::VariantAlsoNegotiates_506: return "Variant Also Negotiates"; - case StatusCode::InsufficientStorage_507: return "Insufficient Storage"; - case StatusCode::LoopDetected_508: return "Loop Detected"; - case StatusCode::NotExtended_510: return "Not Extended"; - case StatusCode::NetworkAuthenticationRequired_511: - return "Network Authentication Required"; - - default: - case StatusCode::InternalServerError_500: return "Internal Server Error"; - } -} - -inline std::string get_bearer_token_auth(const Request &req) { - if (req.has_header("Authorization")) { - constexpr auto bearer_header_prefix_len = detail::str_len("Bearer "); - return req.get_header_value("Authorization") - .substr(bearer_header_prefix_len); - } - return ""; -} - -template -inline Server & -Server::set_read_timeout(const 
-    std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_read_timeout(sec, usec); });
-  return *this;
-}
-
-template <class Rep, class Period>
-inline Server &
-Server::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_write_timeout(sec, usec); });
-  return *this;
-}
-
-template <class Rep, class Period>
-inline Server &
-Server::set_idle_interval(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_idle_interval(sec, usec); });
-  return *this;
-}
-
-inline std::string to_string(const Error error) {
-  switch (error) {
-  case Error::Success: return "Success (no error)";
-  case Error::Connection: return "Could not establish connection";
-  case Error::BindIPAddress: return "Failed to bind IP address";
-  case Error::Read: return "Failed to read connection";
-  case Error::Write: return "Failed to write connection";
-  case Error::ExceedRedirectCount: return "Maximum redirect count exceeded";
-  case Error::Canceled: return "Connection handling canceled";
-  case Error::SSLConnection: return "SSL connection failed";
-  case Error::SSLLoadingCerts: return "SSL certificate loading failed";
-  case Error::SSLServerVerification: return "SSL server verification failed";
-  case Error::SSLServerHostnameVerification: return "SSL server hostname verification failed";
-  case Error::UnsupportedMultipartBoundaryChars: return "Unsupported HTTP multipart boundary characters";
-  case Error::Compression: return "Compression failed";
-  case Error::ConnectionTimeout: return "Connection timed out";
-  case Error::ProxyConnection: return "Proxy connection failed";
-  case Error::Unknown: return "Unknown";
-  default: break;
-  }
-
-  return "Invalid";
-}
-
-inline std::ostream &operator<<(std::ostream &os, const Error &obj) {
-  os << to_string(obj);
-  os << " (" << static_cast<std::underlying_type<Error>::type>(obj) << ')';
-  return os;
-}
-
-inline uint64_t Result::get_request_header_value_u64(const std::string &key, uint64_t def, size_t id) const {
-  return detail::get_header_value_u64(request_headers_, key, def, id);
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_connection_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_connection_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_read_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_write_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_max_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
-  set_max_timeout(msec);
-}
-
-template <class Rep, class Period>
-inline void Client::set_connection_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_connection_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void Client::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_read_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void Client::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_write_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void Client::set_max_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_max_timeout(duration);
-}
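// Editor's note: illustrative sketch only, not part of the deleted header or
// of this patch. It shows how the chrono-based timeout setters above forward
// through detail::duration_to_sec_and_usec(), which splits a duration into
// whole seconds plus leftover microseconds. Assumes the cpp-httplib 0.20.0
// API as vendored here; host, port, and path are placeholders.
#include <chrono>
#include "httplib.h"

int main() {
  httplib::Client cli("localhost", 8080);
  cli.set_connection_timeout(std::chrono::seconds(2));      // 2 s + 0 us
  cli.set_read_timeout(std::chrono::milliseconds(500));     // 0 s + 500000 us
  auto res = cli.Get("/health");
  return res && res->status == httplib::StatusCode::OK_200 ? 0 : 1;
}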
-
-/*
- * Forward declarations and types that will be part of the .h file if split into
- * .h + .cc.
- */
-
-std::string hosted_at(const std::string &hostname);
-
-void hosted_at(const std::string &hostname, std::vector<std::string> &addrs);
-
-std::string append_query_params(const std::string &path, const Params &params);
-
-std::pair<std::string, std::string> make_range_header(const Ranges &ranges);
-
-std::pair<std::string, std::string> make_basic_authentication_header(const std::string &username, const std::string &password, bool is_proxy = false);
-
-namespace detail {
-
-#if defined(_WIN32)
-inline std::wstring u8string_to_wstring(const char *s) {
-  std::wstring ws;
-  auto len = static_cast<int>(strlen(s));
-  auto wlen = ::MultiByteToWideChar(CP_UTF8, 0, s, len, nullptr, 0);
-  if (wlen > 0) {
-    ws.resize(wlen);
-    wlen = ::MultiByteToWideChar(CP_UTF8, 0, s, len, const_cast<LPWSTR>(reinterpret_cast<LPCWSTR>(ws.data())), wlen);
-    if (wlen != static_cast<int>(ws.size())) { ws.clear(); }
-  }
-  return ws;
-}
-#endif
-
-struct FileStat {
-  FileStat(const std::string &path);
-  bool is_file() const;
-  bool is_dir() const;
-
-private:
-#if defined(_WIN32)
-  struct _stat st_;
-#else
-  struct stat st_;
-#endif
-  int ret_ = -1;
-};
-
-std::string encode_query_param(const std::string &value);
-
-std::string decode_url(const std::string &s, bool convert_plus_to_space);
-
-std::string trim_copy(const std::string &s);
-
-void divide(const char *data, std::size_t size, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn);
-
-void divide(const std::string &str, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn);
-
-void split(const char *b, const char *e, char d, std::function<void(const char *, const char *)> fn);
-
-void split(const char *b, const char *e, char d, size_t m, std::function<void(const char *, const char *)> fn);
-
-bool process_client_socket(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec, std::chrono::time_point<std::chrono::steady_clock> start_time, std::function<bool(Stream &strm)> callback);
-
-socket_t create_client_socket(const std::string &host, const std::string &ip, int port, int address_family, bool tcp_nodelay, bool ipv6_v6only, SocketOptions socket_options, time_t connection_timeout_sec, time_t connection_timeout_usec, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, const std::string &intf, Error &error);
-
-const char *get_header_value(const Headers &headers, const std::string &key, const char *def, size_t id);
-
-std::string params_to_query_str(const Params &params);
-
-void parse_query_text(const char *data, std::size_t size, Params &params);
-
-void parse_query_text(const std::string &s, Params &params);
-
-bool parse_multipart_boundary(const std::string &content_type, std::string &boundary);
-
-bool parse_range_header(const std::string &s, Ranges &ranges);
-
-int close_socket(socket_t sock);
-
-ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags);
-
-ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags);
-
-enum class EncodingType { None = 0, Gzip, Brotli, Zstd };
-
-EncodingType encoding_type(const Request &req, const Response &res);
-
-class BufferStream final : public Stream {
-public:
-  BufferStream() = default;
-  ~BufferStream() override = default;
-
-  bool is_readable() const override;
-  bool wait_readable() const override;
-  bool wait_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int
-      &port) const override;
-  socket_t socket() const override;
-  time_t duration() const override;
-
-  const std::string &get_buffer() const;
-
-private:
-  std::string buffer;
-  size_t position = 0;
-};
-
-class compressor {
-public:
-  virtual ~compressor() = default;
-
-  typedef std::function<bool(const char *data, size_t data_len)> Callback;
-  virtual bool compress(const char *data, size_t data_length, bool last, Callback callback) = 0;
-};
-
-class decompressor {
-public:
-  virtual ~decompressor() = default;
-
-  virtual bool is_valid() const = 0;
-
-  typedef std::function<bool(const char *data, size_t data_len)> Callback;
-  virtual bool decompress(const char *data, size_t data_length, Callback callback) = 0;
-};
-
-class nocompressor final : public compressor {
-public:
-  ~nocompressor() override = default;
-
-  bool compress(const char *data, size_t data_length, bool /*last*/, Callback callback) override;
-};
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-class gzip_compressor final : public compressor {
-public:
-  gzip_compressor();
-  ~gzip_compressor() override;
-
-  bool compress(const char *data, size_t data_length, bool last, Callback callback) override;
-
-private:
-  bool is_valid_ = false;
-  z_stream strm_;
-};
-
-class gzip_decompressor final : public decompressor {
-public:
-  gzip_decompressor();
-  ~gzip_decompressor() override;
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length, Callback callback) override;
-
-private:
-  bool is_valid_ = false;
-  z_stream strm_;
-};
-#endif
-
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-class brotli_compressor final : public compressor {
-public:
-  brotli_compressor();
-  ~brotli_compressor();
-
-  bool compress(const char *data, size_t data_length, bool last, Callback callback) override;
-
-private:
-  BrotliEncoderState *state_ = nullptr;
-};
-
-class brotli_decompressor final : public decompressor {
-public:
-  brotli_decompressor();
-  ~brotli_decompressor();
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length, Callback callback) override;
-
-private:
-  BrotliDecoderResult decoder_r;
-  BrotliDecoderState *decoder_s = nullptr;
-};
-#endif
-
-#ifdef CPPHTTPLIB_ZSTD_SUPPORT
-class zstd_compressor : public compressor {
-public:
-  zstd_compressor();
-  ~zstd_compressor();
-
-  bool compress(const char *data, size_t data_length, bool last, Callback callback) override;
-
-private:
-  ZSTD_CCtx *ctx_ = nullptr;
-};
-
-class zstd_decompressor : public decompressor {
-public:
-  zstd_decompressor();
-  ~zstd_decompressor();
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length, Callback callback) override;
-
-private:
-  ZSTD_DCtx *ctx_ = nullptr;
-};
-#endif
-
-// NOTE: until the read size reaches `fixed_buffer_size`, use `fixed_buffer`
-// to store data. The call can set memory on stack for performance.
-class stream_line_reader {
-public:
-  stream_line_reader(Stream &strm, char *fixed_buffer, size_t fixed_buffer_size);
-  const char *ptr() const;
-  size_t size() const;
-  bool end_with_crlf() const;
-  bool getline();
-
-private:
-  void append(char c);
-
-  Stream &strm_;
-  char *fixed_buffer_;
-  const size_t fixed_buffer_size_;
-  size_t fixed_buffer_used_size_ = 0;
-  std::string growable_buffer_;
-};
-
-class mmap {
-public:
-  mmap(const char *path);
-  ~mmap();
-
-  bool open(const char *path);
-  void close();
-
-  bool is_open() const;
-  size_t size() const;
-  const char *data() const;
-
-private:
-#if defined(_WIN32)
-  HANDLE hFile_ = NULL;
-  HANDLE hMapping_ = NULL;
-#else
-  int fd_ = -1;
-#endif
-  size_t size_ = 0;
-  void *addr_ = nullptr;
-  bool is_open_empty_file = false;
-};
-
-// NOTE: https://www.rfc-editor.org/rfc/rfc9110#section-5
-namespace fields {
-
-inline bool is_token_char(char c) {
-  return std::isalnum(c) || c == '!' || c == '#' || c == '$' || c == '%' || c == '&' || c == '\'' || c == '*' || c == '+' || c == '-' || c == '.' || c == '^' || c == '_' || c == '`' || c == '|' || c == '~';
-}
-
-inline bool is_token(const std::string &s) {
-  if (s.empty()) { return false; }
-  for (auto c : s) {
-    if (!is_token_char(c)) { return false; }
-  }
-  return true;
-}
-
-inline bool is_field_name(const std::string &s) { return is_token(s); }
-
-inline bool is_vchar(char c) { return c >= 33 && c <= 126; }
-
-inline bool is_obs_text(char c) { return 128 <= static_cast<unsigned char>(c); }
-
-inline bool is_field_vchar(char c) { return is_vchar(c) || is_obs_text(c); }
-
-inline bool is_field_content(const std::string &s) {
-  if (s.empty()) { return true; }
-
-  if (s.size() == 1) {
-    return is_field_vchar(s[0]);
-  } else if (s.size() == 2) {
-    return is_field_vchar(s[0]) && is_field_vchar(s[1]);
-  } else {
-    size_t i = 0;
-
-    if (!is_field_vchar(s[i])) { return false; }
-    i++;
-
-    while (i < s.size() - 1) {
-      auto c = s[i++];
-      if (c == ' ' || c == '\t' || is_field_vchar(c)) {
-      } else {
-        return false;
-      }
-    }
-
-    return is_field_vchar(s[i]);
-  }
-}
-
-inline bool is_field_value(const std::string &s) { return is_field_content(s); }
-
-} // namespace fields
-
-} // namespace detail
-
-// ----------------------------------------------------------------------------
-
-/*
- * Implementation that will be part of the .cc file if split into .h + .cc.
- */
-
-namespace detail {
-
-inline bool is_hex(char c, int &v) {
-  if (0x20 <= c && isdigit(c)) {
-    v = c - '0';
-    return true;
-  } else if ('A' <= c && c <= 'F') {
-    v = c - 'A' + 10;
-    return true;
-  } else if ('a' <= c && c <= 'f') {
-    v = c - 'a' + 10;
-    return true;
-  }
-  return false;
-}
-
-inline bool from_hex_to_i(const std::string &s, size_t i, size_t cnt, int &val) {
-  if (i >= s.size()) { return false; }
-
-  val = 0;
-  for (; cnt; i++, cnt--) {
-    if (!s[i]) { return false; }
-    auto v = 0;
-    if (is_hex(s[i], v)) {
-      val = val * 16 + v;
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-inline std::string from_i_to_hex(size_t n) {
-  static const auto charset = "0123456789abcdef";
-  std::string ret;
-  do {
-    ret = charset[n & 15] + ret;
-    n >>= 4;
-  } while (n > 0);
-  return ret;
-}
-
-inline size_t to_utf8(int code, char *buff) {
-  if (code < 0x0080) {
-    buff[0] = static_cast<char>(code & 0x7F);
-    return 1;
-  } else if (code < 0x0800) {
-    buff[0] = static_cast<char>(0xC0 | ((code >> 6) & 0x1F));
-    buff[1] = static_cast<char>(0x80 | (code & 0x3F));
-    return 2;
-  } else if (code < 0xD800) {
-    buff[0] = static_cast<char>(0xE0 | ((code >> 12) & 0xF));
-    buff[1] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | (code & 0x3F));
-    return 3;
-  } else if (code < 0xE000) { // D800 - DFFF is invalid...
-    return 0;
-  } else if (code < 0x10000) {
-    buff[0] = static_cast<char>(0xE0 | ((code >> 12) & 0xF));
-    buff[1] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | (code & 0x3F));
-    return 3;
-  } else if (code < 0x110000) {
-    buff[0] = static_cast<char>(0xF0 | ((code >> 18) & 0x7));
-    buff[1] = static_cast<char>(0x80 | ((code >> 12) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[3] = static_cast<char>(0x80 | (code & 0x3F));
-    return 4;
-  }
-
-  // NOTREACHED
-  return 0;
-}
-
-// NOTE: This code came up with the following stackoverflow post:
-// https://stackoverflow.com/questions/180947/base64-decode-snippet-in-c
-inline std::string base64_encode(const std::string &in) {
-  static const auto lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-  std::string out;
-  out.reserve(in.size());
-
-  auto val = 0;
-  auto valb = -6;
-
-  for (auto c : in) {
-    val = (val << 8) + static_cast<uint8_t>(c);
-    valb += 8;
-    while (valb >= 0) {
-      out.push_back(lookup[(val >> valb) & 0x3F]);
-      valb -= 6;
-    }
-  }
-
-  if (valb > -6) { out.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]); }
-
-  while (out.size() % 4) {
-    out.push_back('=');
-  }
-
-  return out;
-}
-
-inline bool is_valid_path(const std::string &path) {
-  size_t level = 0;
-  size_t i = 0;
-
-  // Skip slash
-  while (i < path.size() && path[i] == '/') {
-    i++;
-  }
-
-  while (i < path.size()) {
-    // Read component
-    auto beg = i;
-    while (i < path.size() && path[i] != '/') {
-      if (path[i] == '\0') {
-        return false;
-      } else if (path[i] == '\\') {
-        return false;
-      }
-      i++;
-    }
-
-    auto len = i - beg;
-    assert(len > 0);
-
-    if (!path.compare(beg, len, ".")) {
-      ;
-    } else if (!path.compare(beg, len, "..")) {
-      if (level == 0) { return false; }
-      level--;
-    } else {
-      level++;
-    }
-
-    // Skip slash
-    while (i < path.size() && path[i] == '/') {
-      i++;
-    }
-  }
-
-  return true;
-}
-
-inline FileStat::FileStat(const std::string &path) {
-#if defined(_WIN32)
-  auto wpath = u8string_to_wstring(path.c_str());
-  ret_ = _wstat(wpath.c_str(), &st_);
-#else
-  ret_ = stat(path.c_str(), &st_);
-#endif
-}
-inline bool FileStat::is_file() const {
-  return ret_ >= 0 && S_ISREG(st_.st_mode);
-}
-inline bool
-FileStat::is_dir() const {
-  return ret_ >= 0 && S_ISDIR(st_.st_mode);
-}
-
-inline std::string encode_query_param(const std::string &value) {
-  std::ostringstream escaped;
-  escaped.fill('0');
-  escaped << std::hex;
-
-  for (auto c : value) {
-    if (std::isalnum(static_cast<uint8_t>(c)) || c == '-' || c == '_' || c == '.' || c == '!' || c == '~' || c == '*' || c == '\'' || c == '(' || c == ')') {
-      escaped << c;
-    } else {
-      escaped << std::uppercase;
-      escaped << '%' << std::setw(2) << static_cast<int>(static_cast<unsigned char>(c));
-      escaped << std::nouppercase;
-    }
-  }
-
-  return escaped.str();
-}
-
-inline std::string encode_url(const std::string &s) {
-  std::string result;
-  result.reserve(s.size());
-
-  for (size_t i = 0; s[i]; i++) {
-    switch (s[i]) {
-    case ' ': result += "%20"; break;
-    case '+': result += "%2B"; break;
-    case '\r': result += "%0D"; break;
-    case '\n': result += "%0A"; break;
-    case '\'': result += "%27"; break;
-    case ',': result += "%2C"; break;
-    // case ':': result += "%3A"; break; // ok? probably...
-    case ';': result += "%3B"; break;
-    default:
-      auto c = static_cast<uint8_t>(s[i]);
-      if (c >= 0x80) {
-        result += '%';
-        char hex[4];
-        auto len = snprintf(hex, sizeof(hex) - 1, "%02X", c);
-        assert(len == 2);
-        result.append(hex, static_cast<size_t>(len));
-      } else {
-        result += s[i];
-      }
-      break;
-    }
-  }
-
-  return result;
-}
-
-inline std::string decode_url(const std::string &s, bool convert_plus_to_space) {
-  std::string result;
-
-  for (size_t i = 0; i < s.size(); i++) {
-    if (s[i] == '%' && i + 1 < s.size()) {
-      if (s[i + 1] == 'u') {
-        auto val = 0;
-        if (from_hex_to_i(s, i + 2, 4, val)) {
-          // 4 digits Unicode codes
-          char buff[4];
-          size_t len = to_utf8(val, buff);
-          if (len > 0) { result.append(buff, len); }
-          i += 5; // 'u0000'
-        } else {
-          result += s[i];
-        }
-      } else {
-        auto val = 0;
-        if (from_hex_to_i(s, i + 1, 2, val)) {
-          // 2 digits hex codes
-          result += static_cast<char>(val);
-          i += 2; // '00'
-        } else {
-          result += s[i];
-        }
-      }
-    } else if (convert_plus_to_space && s[i] == '+') {
-      result += ' ';
-    } else {
-      result += s[i];
-    }
-  }
-
-  return result;
-}
-
-inline std::string file_extension(const std::string &path) {
-  std::smatch m;
-  thread_local auto re = std::regex("\\.([a-zA-Z0-9]+)$");
-  if (std::regex_search(path, m, re)) { return m[1].str(); }
-  return std::string();
-}
-
-inline bool is_space_or_tab(char c) { return c == ' ' || c == '\t'; }
-
-inline std::pair<size_t, size_t> trim(const char *b, const char *e, size_t left, size_t right) {
-  while (b + left < e && is_space_or_tab(b[left])) {
-    left++;
-  }
-  while (right > 0 && is_space_or_tab(b[right - 1])) {
-    right--;
-  }
-  return std::make_pair(left, right);
-}
-
-inline std::string trim_copy(const std::string &s) {
-  auto r = trim(s.data(), s.data() + s.size(), 0, s.size());
-  return s.substr(r.first, r.second - r.first);
-}
-
-inline std::string trim_double_quotes_copy(const std::string &s) {
-  if (s.length() >= 2 && s.front() == '"' && s.back() == '"') {
-    return s.substr(1, s.size() - 2);
-  }
-  return s;
-}
-
-inline void divide(const char *data, std::size_t size, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn) {
-  const auto it = std::find(data, data + size, d);
-  const auto found = static_cast<std::size_t>(it != data + size);
-  const auto lhs_data = data;
-  const auto lhs_size = static_cast<std::size_t>(it - data);
-  const auto rhs_data = it + found;
-  const auto rhs_size = size - lhs_size - found;
-
-  fn(lhs_data, lhs_size, rhs_data, rhs_size);
-}
-
-inline void divide(const std::string &str, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn) {
-  divide(str.data(), str.size(), d,
-         std::move(fn));
-}
-
-inline void split(const char *b, const char *e, char d, std::function<void(const char *, const char *)> fn) {
-  return split(b, e, d, (std::numeric_limits<size_t>::max)(), std::move(fn));
-}
-
-inline void split(const char *b, const char *e, char d, size_t m, std::function<void(const char *, const char *)> fn) {
-  size_t i = 0;
-  size_t beg = 0;
-  size_t count = 1;
-
-  while (e ? (b + i < e) : (b[i] != '\0')) {
-    if (b[i] == d && count < m) {
-      auto r = trim(b, e, beg, i);
-      if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
-      beg = i + 1;
-      count++;
-    }
-    i++;
-  }
-
-  if (i) {
-    auto r = trim(b, e, beg, i);
-    if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
-  }
-}
-
-inline stream_line_reader::stream_line_reader(Stream &strm, char *fixed_buffer, size_t fixed_buffer_size)
-    : strm_(strm), fixed_buffer_(fixed_buffer), fixed_buffer_size_(fixed_buffer_size) {}
-
-inline const char *stream_line_reader::ptr() const {
-  if (growable_buffer_.empty()) {
-    return fixed_buffer_;
-  } else {
-    return growable_buffer_.data();
-  }
-}
-
-inline size_t stream_line_reader::size() const {
-  if (growable_buffer_.empty()) {
-    return fixed_buffer_used_size_;
-  } else {
-    return growable_buffer_.size();
-  }
-}
-
-inline bool stream_line_reader::end_with_crlf() const {
-  auto end = ptr() + size();
-  return size() >= 2 && end[-2] == '\r' && end[-1] == '\n';
-}
-
-inline bool stream_line_reader::getline() {
-  fixed_buffer_used_size_ = 0;
-  growable_buffer_.clear();
-
-#ifndef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
-  char prev_byte = 0;
-#endif
-
-  for (size_t i = 0;; i++) {
-    char byte;
-    auto n = strm_.read(&byte, 1);
-
-    if (n < 0) {
-      return false;
-    } else if (n == 0) {
-      if (i == 0) {
-        return false;
-      } else {
-        break;
-      }
-    }
-
-    append(byte);
-
-#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
-    if (byte == '\n') { break; }
-#else
-    if (prev_byte == '\r' && byte == '\n') { break; }
-    prev_byte = byte;
-#endif
-  }
-
-  return true;
-}
-
-inline void stream_line_reader::append(char c) {
-  if (fixed_buffer_used_size_ < fixed_buffer_size_ - 1) {
-    fixed_buffer_[fixed_buffer_used_size_++] = c;
-    fixed_buffer_[fixed_buffer_used_size_] = '\0';
-  } else {
-    if (growable_buffer_.empty()) {
-      assert(fixed_buffer_[fixed_buffer_used_size_] == '\0');
-      growable_buffer_.assign(fixed_buffer_, fixed_buffer_used_size_);
-    }
-    growable_buffer_ += c;
-  }
-}
-
-inline mmap::mmap(const char *path) { open(path); }
-
-inline mmap::~mmap() { close(); }
-
-inline bool mmap::open(const char *path) {
-  close();
-
-#if defined(_WIN32)
-  auto wpath = u8string_to_wstring(path);
-  if (wpath.empty()) { return false; }
-
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-  hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL);
-#else
-  hFile_ = ::CreateFileW(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
-#endif
-
-  if (hFile_ == INVALID_HANDLE_VALUE) { return false; }
-
-  LARGE_INTEGER size{};
-  if (!::GetFileSizeEx(hFile_, &size)) { return false; }
-  // If the following line doesn't compile due to QuadPart, update Windows SDK.
-  // See:
-  // https://github.com/yhirose/cpp-httplib/issues/1903#issuecomment-2316520721
-  if (static_cast<ULONGLONG>(size.QuadPart) > (std::numeric_limits<size_t>::max)()) {
-    // `size_t` might be 32-bits, on 32-bits Windows.
-    return false;
-  }
-  size_ = static_cast<size_t>(size.QuadPart);
-
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-  hMapping_ = ::CreateFileMappingFromApp(hFile_, NULL, PAGE_READONLY, size_, NULL);
-#else
-  hMapping_ = ::CreateFileMappingW(hFile_, NULL, PAGE_READONLY, 0, 0, NULL);
-#endif
-
-  // Special treatment for an empty file...
-  if (hMapping_ == NULL && size_ == 0) {
-    close();
-    is_open_empty_file = true;
-    return true;
-  }
-
-  if (hMapping_ == NULL) {
-    close();
-    return false;
-  }
-
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-  addr_ = ::MapViewOfFileFromApp(hMapping_, FILE_MAP_READ, 0, 0);
-#else
-  addr_ = ::MapViewOfFile(hMapping_, FILE_MAP_READ, 0, 0, 0);
-#endif
-
-  if (addr_ == nullptr) {
-    close();
-    return false;
-  }
-#else
-  fd_ = ::open(path, O_RDONLY);
-  if (fd_ == -1) { return false; }
-
-  struct stat sb;
-  if (fstat(fd_, &sb) == -1) {
-    close();
-    return false;
-  }
-  size_ = static_cast<size_t>(sb.st_size);
-
-  addr_ = ::mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0);
-
-  // Special treatment for an empty file...
-  if (addr_ == MAP_FAILED && size_ == 0) {
-    close();
-    is_open_empty_file = true;
-    return false;
-  }
-#endif
-
-  return true;
-}
-
-inline bool mmap::is_open() const {
-  return is_open_empty_file ? true : addr_ != nullptr;
-}
-
-inline size_t mmap::size() const { return size_; }
-
-inline const char *mmap::data() const {
-  return is_open_empty_file ? "" : static_cast<const char *>(addr_);
-}
-
-inline void mmap::close() {
-#if defined(_WIN32)
-  if (addr_) {
-    ::UnmapViewOfFile(addr_);
-    addr_ = nullptr;
-  }
-
-  if (hMapping_) {
-    ::CloseHandle(hMapping_);
-    hMapping_ = NULL;
-  }
-
-  if (hFile_ != INVALID_HANDLE_VALUE) {
-    ::CloseHandle(hFile_);
-    hFile_ = INVALID_HANDLE_VALUE;
-  }
-
-  is_open_empty_file = false;
-#else
-  if (addr_ != nullptr) {
-    munmap(addr_, size_);
-    addr_ = nullptr;
-  }
-
-  if (fd_ != -1) {
-    ::close(fd_);
-    fd_ = -1;
-  }
-#endif
-  size_ = 0;
-}
-inline int close_socket(socket_t sock) {
-#ifdef _WIN32
-  return closesocket(sock);
-#else
-  return close(sock);
-#endif
-}
-
-template <typename T> inline ssize_t handle_EINTR(T fn) {
-  ssize_t res = 0;
-  while (true) {
-    res = fn();
-    if (res < 0 && errno == EINTR) {
-      std::this_thread::sleep_for(std::chrono::microseconds{1});
-      continue;
-    }
-    break;
-  }
-  return res;
-}
-
-inline ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags) {
-  return handle_EINTR([&]() {
-    return recv(sock,
-#ifdef _WIN32
-                static_cast<char *>(ptr), static_cast<int>(size),
-#else
-                ptr, size,
-#endif
-                flags);
-  });
-}
-
-inline ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags) {
-  return handle_EINTR([&]() {
-    return send(sock,
-#ifdef _WIN32
-                static_cast<const char *>(ptr), static_cast<int>(size),
-#else
-                ptr, size,
-#endif
-                flags);
-  });
-}
-
-inline int poll_wrapper(struct pollfd *fds, nfds_t nfds, int timeout) {
-#ifdef _WIN32
-  return ::WSAPoll(fds, nfds, timeout);
-#else
-  return ::poll(fds, nfds, timeout);
-#endif
-}
-
-template <bool Read>
-inline ssize_t select_impl(socket_t sock, time_t sec, time_t usec) {
-  struct pollfd pfd;
-  pfd.fd = sock;
-  pfd.events = (Read ?
-                POLLIN : POLLOUT);
-
-  auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
-
-  return handle_EINTR([&]() { return poll_wrapper(&pfd, 1, timeout); });
-}
-
-inline ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
-  return select_impl<true>(sock, sec, usec);
-}
-
-inline ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
-  return select_impl<false>(sock, sec, usec);
-}
-
-inline Error wait_until_socket_is_ready(socket_t sock, time_t sec, time_t usec) {
-  struct pollfd pfd_read;
-  pfd_read.fd = sock;
-  pfd_read.events = POLLIN | POLLOUT;
-
-  auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
-
-  auto poll_res = handle_EINTR([&]() { return poll_wrapper(&pfd_read, 1, timeout); });
-
-  if (poll_res == 0) { return Error::ConnectionTimeout; }
-
-  if (poll_res > 0 && pfd_read.revents & (POLLIN | POLLOUT)) {
-    auto error = 0;
-    socklen_t len = sizeof(error);
-    auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&error), &len);
-    auto successful = res >= 0 && !error;
-    return successful ? Error::Success : Error::Connection;
-  }
-
-  return Error::Connection;
-}
-
-inline bool is_socket_alive(socket_t sock) {
-  const auto val = detail::select_read(sock, 0, 0);
-  if (val == 0) {
-    return true;
-  } else if (val < 0 && errno == EBADF) {
-    return false;
-  }
-  char buf[1];
-  return detail::read_socket(sock, &buf[0], sizeof(buf), MSG_PEEK) > 0;
-}
-
-class SocketStream final : public Stream {
-public:
-  SocketStream(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec = 0, std::chrono::time_point<std::chrono::steady_clock> start_time = (std::chrono::steady_clock::time_point::min)());
-  ~SocketStream() override;
-
-  bool is_readable() const override;
-  bool wait_readable() const override;
-  bool wait_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-  time_t duration() const override;
-
-private:
-  socket_t sock_;
-  time_t read_timeout_sec_;
-  time_t read_timeout_usec_;
-  time_t write_timeout_sec_;
-  time_t write_timeout_usec_;
-  time_t max_timeout_msec_;
-  const std::chrono::time_point<std::chrono::steady_clock> start_time_;
-
-  std::vector<char> read_buff_;
-  size_t read_buff_off_ = 0;
-  size_t read_buff_content_size_ = 0;
-
-  static const size_t read_buff_size_ = 1024l * 4;
-};
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLSocketStream final : public Stream {
-public:
-  SSLSocketStream(socket_t sock, SSL *ssl, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec = 0, std::chrono::time_point<std::chrono::steady_clock> start_time = (std::chrono::steady_clock::time_point::min)());
-  ~SSLSocketStream() override;
-
-  bool is_readable() const override;
-  bool wait_readable() const override;
-  bool wait_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-  time_t duration() const override;
-
-private:
-  socket_t sock_;
-  SSL *ssl_;
-  time_t read_timeout_sec_;
-  time_t read_timeout_usec_;
-  time_t write_timeout_sec_;
-  time_t write_timeout_usec_;
-  time_t
-      max_timeout_msec_;
-  const std::chrono::time_point<std::chrono::steady_clock> start_time_;
-};
-#endif
-
-inline bool keep_alive(const std::atomic<socket_t> &svr_sock, socket_t sock, time_t keep_alive_timeout_sec) {
-  using namespace std::chrono;
-
-  const auto interval_usec = CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND;
-
-  // Avoid expensive `steady_clock::now()` call for the first time
-  if (select_read(sock, 0, interval_usec) > 0) { return true; }
-
-  const auto start = steady_clock::now() - microseconds{interval_usec};
-  const auto timeout = seconds{keep_alive_timeout_sec};
-
-  while (true) {
-    if (svr_sock == INVALID_SOCKET) {
-      break; // Server socket is closed
-    }
-
-    auto val = select_read(sock, 0, interval_usec);
-    if (val < 0) {
-      break; // Ssocket error
-    } else if (val == 0) {
-      if (steady_clock::now() - start > timeout) {
-        break; // Timeout
-      }
-    } else {
-      return true; // Ready for read
-    }
-  }
-
-  return false;
-}
-
-template <typename T>
-inline bool process_server_socket_core(const std::atomic<socket_t> &svr_sock, socket_t sock, size_t keep_alive_max_count, time_t keep_alive_timeout_sec, T callback) {
-  assert(keep_alive_max_count > 0);
-  auto ret = false;
-  auto count = keep_alive_max_count;
-  while (count > 0 && keep_alive(svr_sock, sock, keep_alive_timeout_sec)) {
-    auto close_connection = count == 1;
-    auto connection_closed = false;
-    ret = callback(close_connection, connection_closed);
-    if (!ret || connection_closed) { break; }
-    count--;
-  }
-  return ret;
-}
-
-template <typename T>
-inline bool process_server_socket(const std::atomic<socket_t> &svr_sock, socket_t sock, size_t keep_alive_max_count, time_t keep_alive_timeout_sec, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, T callback) {
-  return process_server_socket_core(
-      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
-      [&](bool close_connection, bool &connection_closed) {
-        SocketStream strm(sock, read_timeout_sec, read_timeout_usec, write_timeout_sec, write_timeout_usec);
-        return callback(strm, close_connection, connection_closed);
-      });
-}
-
-inline bool process_client_socket(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec, std::chrono::time_point<std::chrono::steady_clock> start_time, std::function<bool(Stream &strm)> callback) {
-  SocketStream strm(sock, read_timeout_sec, read_timeout_usec, write_timeout_sec, write_timeout_usec, max_timeout_msec, start_time);
-  return callback(strm);
-}
-
-inline int shutdown_socket(socket_t sock) {
-#ifdef _WIN32
-  return shutdown(sock, SD_BOTH);
-#else
-  return shutdown(sock, SHUT_RDWR);
-#endif
-}
-
-inline std::string escape_abstract_namespace_unix_domain(const std::string &s) {
-  if (s.size() > 1 && s[0] == '\0') {
-    auto ret = s;
-    ret[0] = '@';
-    return ret;
-  }
-  return s;
-}
-
-inline std::string unescape_abstract_namespace_unix_domain(const std::string &s) {
-  if (s.size() > 1 && s[0] == '@') {
-    auto ret = s;
-    ret[0] = '\0';
-    return ret;
-  }
-  return s;
-}
-
-template <typename BindOrConnect>
-socket_t create_socket(const std::string &host, const std::string &ip, int port, int address_family, int socket_flags, bool tcp_nodelay, bool ipv6_v6only, SocketOptions socket_options, BindOrConnect bind_or_connect) {
-  // Get address info
-  const char *node = nullptr;
-  struct addrinfo hints;
-  struct addrinfo *result;
-
-  memset(&hints, 0, sizeof(struct addrinfo));
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = IPPROTO_IP;
-
-  if (!ip.empty()) {
-    node = ip.c_str();
-    // Ask getaddrinfo to
-    // convert IP in c-string to address
-    hints.ai_family = AF_UNSPEC;
-    hints.ai_flags = AI_NUMERICHOST;
-  } else {
-    if (!host.empty()) { node = host.c_str(); }
-    hints.ai_family = address_family;
-    hints.ai_flags = socket_flags;
-  }
-
-  if (hints.ai_family == AF_UNIX) {
-    const auto addrlen = host.length();
-    if (addrlen > sizeof(sockaddr_un::sun_path)) { return INVALID_SOCKET; }
-
-#ifdef SOCK_CLOEXEC
-    auto sock = socket(hints.ai_family, hints.ai_socktype | SOCK_CLOEXEC, hints.ai_protocol);
-#else
-    auto sock = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol);
-#endif
-
-    if (sock != INVALID_SOCKET) {
-      sockaddr_un addr{};
-      addr.sun_family = AF_UNIX;
-
-      auto unescaped_host = unescape_abstract_namespace_unix_domain(host);
-      std::copy(unescaped_host.begin(), unescaped_host.end(), addr.sun_path);
-
-      hints.ai_addr = reinterpret_cast<sockaddr *>(&addr);
-      hints.ai_addrlen = static_cast<socklen_t>(sizeof(addr) - sizeof(addr.sun_path) + addrlen);
-
-#ifndef SOCK_CLOEXEC
-#ifndef _WIN32
-      fcntl(sock, F_SETFD, FD_CLOEXEC);
-#endif
-#endif
-
-      if (socket_options) { socket_options(sock); }
-
-#ifdef _WIN32
-      // Setting SO_REUSEADDR seems not to work well with AF_UNIX on windows, so
-      // remove the option.
-      detail::set_socket_opt(sock, SOL_SOCKET, SO_REUSEADDR, 0);
-#endif
-
-      bool dummy;
-      if (!bind_or_connect(sock, hints, dummy)) {
-        close_socket(sock);
-        sock = INVALID_SOCKET;
-      }
-    }
-    return sock;
-  }
-
-  auto service = std::to_string(port);
-
-  if (getaddrinfo(node, service.c_str(), &hints, &result)) {
-#if defined __linux__ && !defined __ANDROID__
-    res_init();
-#endif
-    return INVALID_SOCKET;
-  }
-  auto se = detail::scope_exit([&] { freeaddrinfo(result); });
-
-  for (auto rp = result; rp; rp = rp->ai_next) {
-    // Create a socket
-#ifdef _WIN32
-    auto sock = WSASocketW(rp->ai_family, rp->ai_socktype, rp->ai_protocol, nullptr, 0, WSA_FLAG_NO_HANDLE_INHERIT | WSA_FLAG_OVERLAPPED);
-    /**
-     * Since the WSA_FLAG_NO_HANDLE_INHERIT is only supported on Windows 7 SP1
-     * and above the socket creation fails on older Windows Systems.
-     *
-     * Let's try to create a socket the old way in this case.
-     *
-     * Reference:
-     * https://docs.microsoft.com/en-us/windows/win32/api/winsock2/nf-winsock2-wsasocketa
-     *
-     * WSA_FLAG_NO_HANDLE_INHERIT:
-     * This flag is supported on Windows 7 with SP1, Windows Server 2008 R2 with
-     * SP1, and later
-     *
-     */
-    if (sock == INVALID_SOCKET) {
-      sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
-    }
-#else
-
-#ifdef SOCK_CLOEXEC
-    auto sock = socket(rp->ai_family, rp->ai_socktype | SOCK_CLOEXEC, rp->ai_protocol);
-#else
-    auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
-#endif
-
-#endif
-    if (sock == INVALID_SOCKET) { continue; }
-
-#if !defined _WIN32 && !defined SOCK_CLOEXEC
-    if (fcntl(sock, F_SETFD, FD_CLOEXEC) == -1) {
-      close_socket(sock);
-      continue;
-    }
-#endif
-
-    if (tcp_nodelay) { set_socket_opt(sock, IPPROTO_TCP, TCP_NODELAY, 1); }
-
-    if (rp->ai_family == AF_INET6) {
-      set_socket_opt(sock, IPPROTO_IPV6, IPV6_V6ONLY, ipv6_v6only ? 1 : 0);
-    }
-
-    if (socket_options) { socket_options(sock); }
-
-    // bind or connect
-    auto quit = false;
-    if (bind_or_connect(sock, *rp, quit)) { return sock; }
-
-    close_socket(sock);
-
-    if (quit) { break; }
-  }
-
-  return INVALID_SOCKET;
-}
-
-inline void set_nonblocking(socket_t sock, bool nonblocking) {
-#ifdef _WIN32
-  auto flags = nonblocking ? 1UL : 0UL;
-  ioctlsocket(sock, FIONBIO, &flags);
-#else
-  auto flags = fcntl(sock, F_GETFL, 0);
-  fcntl(sock, F_SETFL,
-        nonblocking ?
-            (flags | O_NONBLOCK) : (flags & (~O_NONBLOCK)));
-#endif
-}
-
-inline bool is_connection_error() {
-#ifdef _WIN32
-  return WSAGetLastError() != WSAEWOULDBLOCK;
-#else
-  return errno != EINPROGRESS;
-#endif
-}
-
-inline bool bind_ip_address(socket_t sock, const std::string &host) {
-  struct addrinfo hints;
-  struct addrinfo *result;
-
-  memset(&hints, 0, sizeof(struct addrinfo));
-  hints.ai_family = AF_UNSPEC;
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = 0;
-
-  if (getaddrinfo(host.c_str(), "0", &hints, &result)) { return false; }
-  auto se = detail::scope_exit([&] { freeaddrinfo(result); });
-
-  auto ret = false;
-  for (auto rp = result; rp; rp = rp->ai_next) {
-    const auto &ai = *rp;
-    if (!::bind(sock, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen))) {
-      ret = true;
-      break;
-    }
-  }
-
-  return ret;
-}
-
-#if !defined _WIN32 && !defined ANDROID && !defined _AIX && !defined __MVS__
-#define USE_IF2IP
-#endif
-
-#ifdef USE_IF2IP
-inline std::string if2ip(int address_family, const std::string &ifn) {
-  struct ifaddrs *ifap;
-  getifaddrs(&ifap);
-  auto se = detail::scope_exit([&] { freeifaddrs(ifap); });
-
-  std::string addr_candidate;
-  for (auto ifa = ifap; ifa; ifa = ifa->ifa_next) {
-    if (ifa->ifa_addr && ifn == ifa->ifa_name &&
-        (AF_UNSPEC == address_family || ifa->ifa_addr->sa_family == address_family)) {
-      if (ifa->ifa_addr->sa_family == AF_INET) {
-        auto sa = reinterpret_cast<struct sockaddr_in *>(ifa->ifa_addr);
-        char buf[INET_ADDRSTRLEN];
-        if (inet_ntop(AF_INET, &sa->sin_addr, buf, INET_ADDRSTRLEN)) {
-          return std::string(buf, INET_ADDRSTRLEN);
-        }
-      } else if (ifa->ifa_addr->sa_family == AF_INET6) {
-        auto sa = reinterpret_cast<struct sockaddr_in6 *>(ifa->ifa_addr);
-        if (!IN6_IS_ADDR_LINKLOCAL(&sa->sin6_addr)) {
-          char buf[INET6_ADDRSTRLEN] = {};
-          if (inet_ntop(AF_INET6, &sa->sin6_addr, buf, INET6_ADDRSTRLEN)) {
-            // equivalent to mac's IN6_IS_ADDR_UNIQUE_LOCAL
-            auto s6_addr_head = sa->sin6_addr.s6_addr[0];
-            if (s6_addr_head == 0xfc || s6_addr_head == 0xfd) {
-              addr_candidate = std::string(buf, INET6_ADDRSTRLEN);
-            } else {
-              return std::string(buf, INET6_ADDRSTRLEN);
-            }
-          }
-        }
-      }
-    }
-  }
-  return addr_candidate;
-}
-#endif
-
-inline socket_t create_client_socket(const std::string &host, const std::string &ip, int port, int address_family, bool tcp_nodelay, bool ipv6_v6only, SocketOptions socket_options, time_t connection_timeout_sec, time_t connection_timeout_usec, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, const std::string &intf, Error &error) {
-  auto sock = create_socket(
-      host, ip, port, address_family, 0, tcp_nodelay, ipv6_v6only, std::move(socket_options),
-      [&](socket_t sock2, struct addrinfo &ai, bool &quit) -> bool {
-        if (!intf.empty()) {
-#ifdef USE_IF2IP
-          auto ip_from_if = if2ip(address_family, intf);
-          if (ip_from_if.empty()) { ip_from_if = intf; }
-          if (!bind_ip_address(sock2, ip_from_if)) {
-            error = Error::BindIPAddress;
-            return false;
-          }
-#endif
-        }
-
-        set_nonblocking(sock2, true);
-
-        auto ret = ::connect(sock2, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen));
-
-        if (ret < 0) {
-          if (is_connection_error()) {
-            error = Error::Connection;
-            return false;
-          }
-          error = wait_until_socket_is_ready(sock2, connection_timeout_sec, connection_timeout_usec);
-          if (error != Error::Success) {
-            if (error == Error::ConnectionTimeout) { quit = true; }
-            return false;
-          }
-        }
-
-        set_nonblocking(sock2, false);
-        set_socket_opt_time(sock2, SOL_SOCKET, SO_RCVTIMEO, read_timeout_sec, read_timeout_usec);
-        set_socket_opt_time(sock2, SOL_SOCKET, SO_SNDTIMEO, write_timeout_sec, write_timeout_usec);
-
-        error = Error::Success;
-        return true;
-      });
-
-  if (sock != INVALID_SOCKET) {
-    error = Error::Success;
-  } else {
-    if (error == Error::Success) { error = Error::Connection; }
-  }
-
-  return sock;
-}
-
-inline bool get_ip_and_port(const struct sockaddr_storage &addr, socklen_t addr_len, std::string &ip, int &port) {
-  if (addr.ss_family == AF_INET) {
-    port = ntohs(reinterpret_cast<const struct sockaddr_in *>(&addr)->sin_port);
-  } else if (addr.ss_family == AF_INET6) {
-    port = ntohs(reinterpret_cast<const struct sockaddr_in6 *>(&addr)->sin6_port);
-  } else {
-    return false;
-  }
-
-  std::array<char, NI_MAXHOST> ipstr{};
-  if (getnameinfo(reinterpret_cast<const struct sockaddr *>(&addr), addr_len, ipstr.data(), static_cast<socklen_t>(ipstr.size()), nullptr, 0, NI_NUMERICHOST)) {
-    return false;
-  }
-
-  ip = ipstr.data();
-  return true;
-}
-
-inline void get_local_ip_and_port(socket_t sock, std::string &ip, int &port) {
-  struct sockaddr_storage addr;
-  socklen_t addr_len = sizeof(addr);
-  if (!getsockname(sock, reinterpret_cast<struct sockaddr *>(&addr), &addr_len)) {
-    get_ip_and_port(addr, addr_len, ip, port);
-  }
-}
-
-inline void get_remote_ip_and_port(socket_t sock, std::string &ip, int &port) {
-  struct sockaddr_storage addr;
-  socklen_t addr_len = sizeof(addr);
-
-  if (!getpeername(sock, reinterpret_cast<struct sockaddr *>(&addr), &addr_len)) {
-#ifndef _WIN32
-    if (addr.ss_family == AF_UNIX) {
-#if defined(__linux__)
-      struct ucred ucred;
-      socklen_t len = sizeof(ucred);
-      if (getsockopt(sock, SOL_SOCKET, SO_PEERCRED, &ucred, &len) == 0) {
-        port = ucred.pid;
-      }
-#elif defined(SOL_LOCAL) && defined(SO_PEERPID) // __APPLE__
-      pid_t pid;
-      socklen_t len = sizeof(pid);
-      if (getsockopt(sock, SOL_LOCAL, SO_PEERPID, &pid, &len) == 0) {
-        port = pid;
-      }
-#endif
-      return;
-    }
-#endif
-    get_ip_and_port(addr, addr_len, ip, port);
-  }
-}
-
-inline constexpr unsigned int str2tag_core(const char *s, size_t l, unsigned int h) {
-  return (l == 0)
-             ?
h - : str2tag_core( - s + 1, l - 1, - // Unsets the 6 high bits of h, therefore no overflow happens - (((std::numeric_limits::max)() >> 6) & - h * 33) ^ - static_cast(*s)); -} - -inline unsigned int str2tag(const std::string &s) { - return str2tag_core(s.data(), s.size(), 0); -} - -namespace udl { - -inline constexpr unsigned int operator""_t(const char *s, size_t l) { - return str2tag_core(s, l, 0); -} - -} // namespace udl - -inline std::string -find_content_type(const std::string &path, - const std::map &user_data, - const std::string &default_content_type) { - auto ext = file_extension(path); - - auto it = user_data.find(ext); - if (it != user_data.end()) { return it->second; } - - using udl::operator""_t; - - switch (str2tag(ext)) { - default: return default_content_type; - - case "css"_t: return "text/css"; - case "csv"_t: return "text/csv"; - case "htm"_t: - case "html"_t: return "text/html"; - case "js"_t: - case "mjs"_t: return "text/javascript"; - case "txt"_t: return "text/plain"; - case "vtt"_t: return "text/vtt"; - - case "apng"_t: return "image/apng"; - case "avif"_t: return "image/avif"; - case "bmp"_t: return "image/bmp"; - case "gif"_t: return "image/gif"; - case "png"_t: return "image/png"; - case "svg"_t: return "image/svg+xml"; - case "webp"_t: return "image/webp"; - case "ico"_t: return "image/x-icon"; - case "tif"_t: return "image/tiff"; - case "tiff"_t: return "image/tiff"; - case "jpg"_t: - case "jpeg"_t: return "image/jpeg"; - - case "mp4"_t: return "video/mp4"; - case "mpeg"_t: return "video/mpeg"; - case "webm"_t: return "video/webm"; - - case "mp3"_t: return "audio/mp3"; - case "mpga"_t: return "audio/mpeg"; - case "weba"_t: return "audio/webm"; - case "wav"_t: return "audio/wave"; - - case "otf"_t: return "font/otf"; - case "ttf"_t: return "font/ttf"; - case "woff"_t: return "font/woff"; - case "woff2"_t: return "font/woff2"; - - case "7z"_t: return "application/x-7z-compressed"; - case "atom"_t: return "application/atom+xml"; - case "pdf"_t: return "application/pdf"; - case "json"_t: return "application/json"; - case "rss"_t: return "application/rss+xml"; - case "tar"_t: return "application/x-tar"; - case "xht"_t: - case "xhtml"_t: return "application/xhtml+xml"; - case "xslt"_t: return "application/xslt+xml"; - case "xml"_t: return "application/xml"; - case "gz"_t: return "application/gzip"; - case "zip"_t: return "application/zip"; - case "wasm"_t: return "application/wasm"; - } -} - -inline bool can_compress_content_type(const std::string &content_type) { - using udl::operator""_t; - - auto tag = str2tag(content_type); - - switch (tag) { - case "image/svg+xml"_t: - case "application/javascript"_t: - case "application/json"_t: - case "application/xml"_t: - case "application/protobuf"_t: - case "application/xhtml+xml"_t: return true; - - case "text/event-stream"_t: return false; - - default: return !content_type.rfind("text/", 0); - } -} - -inline EncodingType encoding_type(const Request &req, const Response &res) { - auto ret = - detail::can_compress_content_type(res.get_header_value("Content-Type")); - if (!ret) { return EncodingType::None; } - - const auto &s = req.get_header_value("Accept-Encoding"); - (void)(s); - -#ifdef CPPHTTPLIB_BROTLI_SUPPORT - // TODO: 'Accept-Encoding' has br, not br;q=0 - ret = s.find("br") != std::string::npos; - if (ret) { return EncodingType::Brotli; } -#endif - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - // TODO: 'Accept-Encoding' has gzip, not gzip;q=0 - ret = s.find("gzip") != std::string::npos; - if (ret) { return 
EncodingType::Gzip; } -#endif - -#ifdef CPPHTTPLIB_ZSTD_SUPPORT - // TODO: 'Accept-Encoding' has zstd, not zstd;q=0 - ret = s.find("zstd") != std::string::npos; - if (ret) { return EncodingType::Zstd; } -#endif - - return EncodingType::None; -} - -inline bool nocompressor::compress(const char *data, size_t data_length, - bool /*last*/, Callback callback) { - if (!data_length) { return true; } - return callback(data, data_length); -} - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT -inline gzip_compressor::gzip_compressor() { - std::memset(&strm_, 0, sizeof(strm_)); - strm_.zalloc = Z_NULL; - strm_.zfree = Z_NULL; - strm_.opaque = Z_NULL; - - is_valid_ = deflateInit2(&strm_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, - Z_DEFAULT_STRATEGY) == Z_OK; -} - -inline gzip_compressor::~gzip_compressor() { deflateEnd(&strm_); } - -inline bool gzip_compressor::compress(const char *data, size_t data_length, - bool last, Callback callback) { - assert(is_valid_); - - do { - constexpr size_t max_avail_in = - (std::numeric_limits::max)(); - - strm_.avail_in = static_cast( - (std::min)(data_length, max_avail_in)); - strm_.next_in = const_cast(reinterpret_cast(data)); - - data_length -= strm_.avail_in; - data += strm_.avail_in; - - auto flush = (last && data_length == 0) ? Z_FINISH : Z_NO_FLUSH; - auto ret = Z_OK; - - std::array buff{}; - do { - strm_.avail_out = static_cast(buff.size()); - strm_.next_out = reinterpret_cast(buff.data()); - - ret = deflate(&strm_, flush); - if (ret == Z_STREAM_ERROR) { return false; } - - if (!callback(buff.data(), buff.size() - strm_.avail_out)) { - return false; - } - } while (strm_.avail_out == 0); - - assert((flush == Z_FINISH && ret == Z_STREAM_END) || - (flush == Z_NO_FLUSH && ret == Z_OK)); - assert(strm_.avail_in == 0); - } while (data_length > 0); - - return true; -} - -inline gzip_decompressor::gzip_decompressor() { - std::memset(&strm_, 0, sizeof(strm_)); - strm_.zalloc = Z_NULL; - strm_.zfree = Z_NULL; - strm_.opaque = Z_NULL; - - // 15 is the value of wbits, which should be at the maximum possible value - // to ensure that any gzip stream can be decoded. The offset of 32 specifies - // that the stream type should be automatically detected either gzip or - // deflate. 
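// [editor's note] Per the zlib manual: windowBits 8..15 sets the inflate
// window size, adding 32 enables automatic zlib/gzip header detection, and
// adding 16 would accept gzip only, hence the 32 + 15 argument below.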
- is_valid_ = inflateInit2(&strm_, 32 + 15) == Z_OK; -} - -inline gzip_decompressor::~gzip_decompressor() { inflateEnd(&strm_); } - -inline bool gzip_decompressor::is_valid() const { return is_valid_; } - -inline bool gzip_decompressor::decompress(const char *data, size_t data_length, - Callback callback) { - assert(is_valid_); - - auto ret = Z_OK; - - do { - constexpr size_t max_avail_in = - (std::numeric_limits::max)(); - - strm_.avail_in = static_cast( - (std::min)(data_length, max_avail_in)); - strm_.next_in = const_cast(reinterpret_cast(data)); - - data_length -= strm_.avail_in; - data += strm_.avail_in; - - std::array buff{}; - while (strm_.avail_in > 0 && ret == Z_OK) { - strm_.avail_out = static_cast(buff.size()); - strm_.next_out = reinterpret_cast(buff.data()); - - ret = inflate(&strm_, Z_NO_FLUSH); - - assert(ret != Z_STREAM_ERROR); - switch (ret) { - case Z_NEED_DICT: - case Z_DATA_ERROR: - case Z_MEM_ERROR: inflateEnd(&strm_); return false; - } - - if (!callback(buff.data(), buff.size() - strm_.avail_out)) { - return false; - } - } - - if (ret != Z_OK && ret != Z_STREAM_END) { return false; } - - } while (data_length > 0); - - return true; -} -#endif - -#ifdef CPPHTTPLIB_BROTLI_SUPPORT -inline brotli_compressor::brotli_compressor() { - state_ = BrotliEncoderCreateInstance(nullptr, nullptr, nullptr); -} - -inline brotli_compressor::~brotli_compressor() { - BrotliEncoderDestroyInstance(state_); -} - -inline bool brotli_compressor::compress(const char *data, size_t data_length, - bool last, Callback callback) { - std::array buff{}; - - auto operation = last ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS; - auto available_in = data_length; - auto next_in = reinterpret_cast(data); - - for (;;) { - if (last) { - if (BrotliEncoderIsFinished(state_)) { break; } - } else { - if (!available_in) { break; } - } - - auto available_out = buff.size(); - auto next_out = buff.data(); - - if (!BrotliEncoderCompressStream(state_, operation, &available_in, &next_in, - &available_out, &next_out, nullptr)) { - return false; - } - - auto output_bytes = buff.size() - available_out; - if (output_bytes) { - callback(reinterpret_cast(buff.data()), output_bytes); - } - } - - return true; -} - -inline brotli_decompressor::brotli_decompressor() { - decoder_s = BrotliDecoderCreateInstance(0, 0, 0); - decoder_r = decoder_s ? 
BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT - : BROTLI_DECODER_RESULT_ERROR; -} - -inline brotli_decompressor::~brotli_decompressor() { - if (decoder_s) { BrotliDecoderDestroyInstance(decoder_s); } -} - -inline bool brotli_decompressor::is_valid() const { return decoder_s; } - -inline bool brotli_decompressor::decompress(const char *data, - size_t data_length, - Callback callback) { - if (decoder_r == BROTLI_DECODER_RESULT_SUCCESS || - decoder_r == BROTLI_DECODER_RESULT_ERROR) { - return 0; - } - - auto next_in = reinterpret_cast(data); - size_t avail_in = data_length; - size_t total_out; - - decoder_r = BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT; - - std::array buff{}; - while (decoder_r == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) { - char *next_out = buff.data(); - size_t avail_out = buff.size(); - - decoder_r = BrotliDecoderDecompressStream( - decoder_s, &avail_in, &next_in, &avail_out, - reinterpret_cast(&next_out), &total_out); - - if (decoder_r == BROTLI_DECODER_RESULT_ERROR) { return false; } - - if (!callback(buff.data(), buff.size() - avail_out)) { return false; } - } - - return decoder_r == BROTLI_DECODER_RESULT_SUCCESS || - decoder_r == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT; -} -#endif - -#ifdef CPPHTTPLIB_ZSTD_SUPPORT -inline zstd_compressor::zstd_compressor() { - ctx_ = ZSTD_createCCtx(); - ZSTD_CCtx_setParameter(ctx_, ZSTD_c_compressionLevel, ZSTD_fast); -} - -inline zstd_compressor::~zstd_compressor() { ZSTD_freeCCtx(ctx_); } - -inline bool zstd_compressor::compress(const char *data, size_t data_length, - bool last, Callback callback) { - std::array buff{}; - - ZSTD_EndDirective mode = last ? ZSTD_e_end : ZSTD_e_continue; - ZSTD_inBuffer input = {data, data_length, 0}; - - bool finished; - do { - ZSTD_outBuffer output = {buff.data(), CPPHTTPLIB_COMPRESSION_BUFSIZ, 0}; - size_t const remaining = ZSTD_compressStream2(ctx_, &output, &input, mode); - - if (ZSTD_isError(remaining)) { return false; } - - if (!callback(buff.data(), output.pos)) { return false; } - - finished = last ? (remaining == 0) : (input.pos == input.size); - - } while (!finished); - - return true; -} - -inline zstd_decompressor::zstd_decompressor() { ctx_ = ZSTD_createDCtx(); } - -inline zstd_decompressor::~zstd_decompressor() { ZSTD_freeDCtx(ctx_); } - -inline bool zstd_decompressor::is_valid() const { return ctx_ != nullptr; } - -inline bool zstd_decompressor::decompress(const char *data, size_t data_length, - Callback callback) { - std::array buff{}; - ZSTD_inBuffer input = {data, data_length, 0}; - - while (input.pos < input.size) { - ZSTD_outBuffer output = {buff.data(), CPPHTTPLIB_COMPRESSION_BUFSIZ, 0}; - size_t const remaining = ZSTD_decompressStream(ctx_, &output, &input); - - if (ZSTD_isError(remaining)) { return false; } - - if (!callback(buff.data(), output.pos)) { return false; } - } - - return true; -} -#endif - -inline bool has_header(const Headers &headers, const std::string &key) { - return headers.find(key) != headers.end(); -} - -inline const char *get_header_value(const Headers &headers, - const std::string &key, const char *def, - size_t id) { - auto rng = headers.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second.c_str(); } - return def; -} - -template -inline bool parse_header(const char *beg, const char *end, T fn) { - // Skip trailing spaces and tabs. 
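// [editor's note] A field line has the shape 'field-name ":" OWS field-value
// OWS' (RFC 9112, Section 5). The loop below strips the trailing OWS; the
// parser then splits at the first ':' and skips the OWS that follows it.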
- while (beg < end && is_space_or_tab(end[-1])) { - end--; - } - - auto p = beg; - while (p < end && *p != ':') { - p++; - } - - auto name = std::string(beg, p); - if (!detail::fields::is_field_name(name)) { return false; } - - if (p == end) { return false; } - - auto key_end = p; - - if (*p++ != ':') { return false; } - - while (p < end && is_space_or_tab(*p)) { - p++; - } - - if (p <= end) { - auto key_len = key_end - beg; - if (!key_len) { return false; } - - auto key = std::string(beg, key_end); - auto val = std::string(p, end); - - if (!detail::fields::is_field_value(val)) { return false; } - - if (case_ignore::equal(key, "Location") || - case_ignore::equal(key, "Referer")) { - fn(key, val); - } else { - fn(key, decode_url(val, false)); - } - - return true; - } - - return false; -} - -inline bool read_headers(Stream &strm, Headers &headers) { - const auto bufsiz = 2048; - char buf[bufsiz]; - stream_line_reader line_reader(strm, buf, bufsiz); - - for (;;) { - if (!line_reader.getline()) { return false; } - - // Check if the line ends with CRLF. - auto line_terminator_len = 2; - if (line_reader.end_with_crlf()) { - // Blank line indicates end of headers. - if (line_reader.size() == 2) { break; } - } else { -#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR - // Blank line indicates end of headers. - if (line_reader.size() == 1) { break; } - line_terminator_len = 1; -#else - continue; // Skip invalid line. -#endif - } - - if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } - - // Exclude line terminator - auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; - - if (!parse_header(line_reader.ptr(), end, - [&](const std::string &key, const std::string &val) { - headers.emplace(key, val); - })) { - return false; - } - } - - return true; -} - -inline bool read_content_with_length(Stream &strm, uint64_t len, - Progress progress, - ContentReceiverWithProgress out) { - char buf[CPPHTTPLIB_RECV_BUFSIZ]; - - uint64_t r = 0; - while (r < len) { - auto read_len = static_cast<size_t>(len - r); - auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ)); - if (n <= 0) { return false; } - - if (!out(buf, static_cast<size_t>(n), r, len)) { return false; } - r += static_cast<uint64_t>(n); - - if (progress) { - if (!progress(r, len)) { return false; } - } - } - - return true; -} - -inline void skip_content_with_length(Stream &strm, uint64_t len) { - char buf[CPPHTTPLIB_RECV_BUFSIZ]; - uint64_t r = 0; - while (r < len) { - auto read_len = static_cast<size_t>(len - r); - auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ)); - if (n <= 0) { return; } - r += static_cast<uint64_t>(n); - } -} - -inline bool read_content_without_length(Stream &strm, - ContentReceiverWithProgress out) { - char buf[CPPHTTPLIB_RECV_BUFSIZ]; - uint64_t r = 0; - for (;;) { - auto n = strm.read(buf, CPPHTTPLIB_RECV_BUFSIZ); - if (n == 0) { return true; } - if (n < 0) { return false; } - - if (!out(buf, static_cast<size_t>(n), r, 0)) { return false; } - r += static_cast<uint64_t>(n); - } - - return true; -} - -template <typename T> -inline bool read_content_chunked(Stream &strm, T &x, - ContentReceiverWithProgress out) { - const auto bufsiz = 16; - char buf[bufsiz]; - - stream_line_reader line_reader(strm, buf, bufsiz); - - if (!line_reader.getline()) { return false; } - - unsigned long chunk_len; - while (true) { - char *end_ptr; - - chunk_len = std::strtoul(line_reader.ptr(), &end_ptr, 16); - - if (end_ptr == line_reader.ptr()) { return false; } - if (chunk_len == ULONG_MAX) { return false; } - - if (chunk_len == 0) { break; } - - if 
(!read_content_with_length(strm, chunk_len, nullptr, out)) { - return false; - } - - if (!line_reader.getline()) { return false; } - - if (strcmp(line_reader.ptr(), "\r\n") != 0) { return false; } - - if (!line_reader.getline()) { return false; } - } - - assert(chunk_len == 0); - - // NOTE: In RFC 9112, '7.1 Chunked Transfer Coding' mentions "The chunked - // transfer coding is complete when a chunk with a chunk-size of zero is - // received, possibly followed by a trailer section, and finally terminated by - // an empty line". https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1 - // - // In '7.1.3. Decoding Chunked', however, the pseudo-code in the section - // does't care for the existence of the final CRLF. In other words, it seems - // to be ok whether the final CRLF exists or not in the chunked data. - // https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1.3 - // - // According to the reference code in RFC 9112, cpp-httplib now allows - // chunked transfer coding data without the final CRLF. - if (!line_reader.getline()) { return true; } - - while (strcmp(line_reader.ptr(), "\r\n") != 0) { - if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } - - // Exclude line terminator - constexpr auto line_terminator_len = 2; - auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; - - parse_header(line_reader.ptr(), end, - [&](const std::string &key, const std::string &val) { - x.headers.emplace(key, val); - }); - - if (!line_reader.getline()) { return false; } - } - - return true; -} - -inline bool is_chunked_transfer_encoding(const Headers &headers) { - return case_ignore::equal( - get_header_value(headers, "Transfer-Encoding", "", 0), "chunked"); -} - -template -bool prepare_content_receiver(T &x, int &status, - ContentReceiverWithProgress receiver, - bool decompress, U callback) { - if (decompress) { - std::string encoding = x.get_header_value("Content-Encoding"); - std::unique_ptr decompressor; - - if (encoding == "gzip" || encoding == "deflate") { -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - decompressor = detail::make_unique(); -#else - status = StatusCode::UnsupportedMediaType_415; - return false; -#endif - } else if (encoding.find("br") != std::string::npos) { -#ifdef CPPHTTPLIB_BROTLI_SUPPORT - decompressor = detail::make_unique(); -#else - status = StatusCode::UnsupportedMediaType_415; - return false; -#endif - } else if (encoding == "zstd") { -#ifdef CPPHTTPLIB_ZSTD_SUPPORT - decompressor = detail::make_unique(); -#else - status = StatusCode::UnsupportedMediaType_415; - return false; -#endif - } - - if (decompressor) { - if (decompressor->is_valid()) { - ContentReceiverWithProgress out = [&](const char *buf, size_t n, - uint64_t off, uint64_t len) { - return decompressor->decompress(buf, n, - [&](const char *buf2, size_t n2) { - return receiver(buf2, n2, off, len); - }); - }; - return callback(std::move(out)); - } else { - status = StatusCode::InternalServerError_500; - return false; - } - } - } - - ContentReceiverWithProgress out = [&](const char *buf, size_t n, uint64_t off, - uint64_t len) { - return receiver(buf, n, off, len); - }; - return callback(std::move(out)); -} - -template -bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, - Progress progress, ContentReceiverWithProgress receiver, - bool decompress) { - return prepare_content_receiver( - x, status, std::move(receiver), decompress, - [&](const ContentReceiverWithProgress &out) { - auto ret = true; - auto exceed_payload_max_length = false; - - if 
(is_chunked_transfer_encoding(x.headers)) { - ret = read_content_chunked(strm, x, out); - } else if (!has_header(x.headers, "Content-Length")) { - ret = read_content_without_length(strm, out); - } else { - auto is_invalid_value = false; - auto len = get_header_value_u64( - x.headers, "Content-Length", - (std::numeric_limits::max)(), 0, is_invalid_value); - - if (is_invalid_value) { - ret = false; - } else if (len > payload_max_length) { - exceed_payload_max_length = true; - skip_content_with_length(strm, len); - ret = false; - } else if (len > 0) { - ret = read_content_with_length(strm, len, std::move(progress), out); - } - } - - if (!ret) { - status = exceed_payload_max_length ? StatusCode::PayloadTooLarge_413 - : StatusCode::BadRequest_400; - } - return ret; - }); -} - -inline ssize_t write_request_line(Stream &strm, const std::string &method, - const std::string &path) { - std::string s = method; - s += " "; - s += path; - s += " HTTP/1.1\r\n"; - return strm.write(s.data(), s.size()); -} - -inline ssize_t write_response_line(Stream &strm, int status) { - std::string s = "HTTP/1.1 "; - s += std::to_string(status); - s += " "; - s += httplib::status_message(status); - s += "\r\n"; - return strm.write(s.data(), s.size()); -} - -inline ssize_t write_headers(Stream &strm, const Headers &headers) { - ssize_t write_len = 0; - for (const auto &x : headers) { - std::string s; - s = x.first; - s += ": "; - s += x.second; - s += "\r\n"; - - auto len = strm.write(s.data(), s.size()); - if (len < 0) { return len; } - write_len += len; - } - auto len = strm.write("\r\n"); - if (len < 0) { return len; } - write_len += len; - return write_len; -} - -inline bool write_data(Stream &strm, const char *d, size_t l) { - size_t offset = 0; - while (offset < l) { - auto length = strm.write(d + offset, l - offset); - if (length < 0) { return false; } - offset += static_cast(length); - } - return true; -} - -template -inline bool write_content(Stream &strm, const ContentProvider &content_provider, - size_t offset, size_t length, T is_shutting_down, - Error &error) { - size_t end_offset = offset + length; - auto ok = true; - DataSink data_sink; - - data_sink.write = [&](const char *d, size_t l) -> bool { - if (ok) { - if (write_data(strm, d, l)) { - offset += l; - } else { - ok = false; - } - } - return ok; - }; - - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; - - while (offset < end_offset && !is_shutting_down()) { - if (!strm.wait_writable()) { - error = Error::Write; - return false; - } else if (!content_provider(offset, end_offset - offset, data_sink)) { - error = Error::Canceled; - return false; - } else if (!ok) { - error = Error::Write; - return false; - } - } - - error = Error::Success; - return true; -} - -template -inline bool write_content(Stream &strm, const ContentProvider &content_provider, - size_t offset, size_t length, - const T &is_shutting_down) { - auto error = Error::Success; - return write_content(strm, content_provider, offset, length, is_shutting_down, - error); -} - -template -inline bool -write_content_without_length(Stream &strm, - const ContentProvider &content_provider, - const T &is_shutting_down) { - size_t offset = 0; - auto data_available = true; - auto ok = true; - DataSink data_sink; - - data_sink.write = [&](const char *d, size_t l) -> bool { - if (ok) { - offset += l; - if (!write_data(strm, d, l)) { ok = false; } - } - return ok; - }; - - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; - - data_sink.done = [&](void) { 
data_available = false; }; - - while (data_available && !is_shutting_down()) { - if (!strm.wait_writable()) { - return false; - } else if (!content_provider(offset, 0, data_sink)) { - return false; - } else if (!ok) { - return false; - } - } - return true; -} - -template -inline bool -write_content_chunked(Stream &strm, const ContentProvider &content_provider, - const T &is_shutting_down, U &compressor, Error &error) { - size_t offset = 0; - auto data_available = true; - auto ok = true; - DataSink data_sink; - - data_sink.write = [&](const char *d, size_t l) -> bool { - if (ok) { - data_available = l > 0; - offset += l; - - std::string payload; - if (compressor.compress(d, l, false, - [&](const char *data, size_t data_len) { - payload.append(data, data_len); - return true; - })) { - if (!payload.empty()) { - // Emit chunked response header and footer for each chunk - auto chunk = - from_i_to_hex(payload.size()) + "\r\n" + payload + "\r\n"; - if (!write_data(strm, chunk.data(), chunk.size())) { ok = false; } - } - } else { - ok = false; - } - } - return ok; - }; - - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; - - auto done_with_trailer = [&](const Headers *trailer) { - if (!ok) { return; } - - data_available = false; - - std::string payload; - if (!compressor.compress(nullptr, 0, true, - [&](const char *data, size_t data_len) { - payload.append(data, data_len); - return true; - })) { - ok = false; - return; - } - - if (!payload.empty()) { - // Emit chunked response header and footer for each chunk - auto chunk = from_i_to_hex(payload.size()) + "\r\n" + payload + "\r\n"; - if (!write_data(strm, chunk.data(), chunk.size())) { - ok = false; - return; - } - } - - constexpr const char done_marker[] = "0\r\n"; - if (!write_data(strm, done_marker, str_len(done_marker))) { ok = false; } - - // Trailer - if (trailer) { - for (const auto &kv : *trailer) { - std::string field_line = kv.first + ": " + kv.second + "\r\n"; - if (!write_data(strm, field_line.data(), field_line.size())) { - ok = false; - } - } - } - - constexpr const char crlf[] = "\r\n"; - if (!write_data(strm, crlf, str_len(crlf))) { ok = false; } - }; - - data_sink.done = [&](void) { done_with_trailer(nullptr); }; - - data_sink.done_with_trailer = [&](const Headers &trailer) { - done_with_trailer(&trailer); - }; - - while (data_available && !is_shutting_down()) { - if (!strm.wait_writable()) { - error = Error::Write; - return false; - } else if (!content_provider(offset, 0, data_sink)) { - error = Error::Canceled; - return false; - } else if (!ok) { - error = Error::Write; - return false; - } - } - - error = Error::Success; - return true; -} - -template -inline bool write_content_chunked(Stream &strm, - const ContentProvider &content_provider, - const T &is_shutting_down, U &compressor) { - auto error = Error::Success; - return write_content_chunked(strm, content_provider, is_shutting_down, - compressor, error); -} - -template -inline bool redirect(T &cli, Request &req, Response &res, - const std::string &path, const std::string &location, - Error &error) { - Request new_req = req; - new_req.path = path; - new_req.redirect_count_ -= 1; - - if (res.status == StatusCode::SeeOther_303 && - (req.method != "GET" && req.method != "HEAD")) { - new_req.method = "GET"; - new_req.body.clear(); - new_req.headers.clear(); - } - - Response new_res; - - auto ret = cli.send(new_req, new_res, error); - if (ret) { - req = new_req; - res = new_res; - - if (res.location.empty()) { res.location = location; } - } - return ret; -} 
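// [editor's note] A minimal usage sketch of the redirect path above. The host
// and route are illustrative only; assume the server answers POST /submit
// with 303 See Other:
//
//   httplib::Client cli("http://example.com");
//   cli.set_follow_location(true);             // opt in to redirect()
//   auto res = cli.Post("/submit", "payload", "text/plain");
//   // The 303 retry is issued as a GET with an empty body, per the branch
//   // above, and each hop consumes one unit of redirect_count_.
//   if (res) { printf("%d\n", res->status); }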
- -inline std::string params_to_query_str(const Params &params) { - std::string query; - - for (auto it = params.begin(); it != params.end(); ++it) { - if (it != params.begin()) { query += "&"; } - query += it->first; - query += "="; - query += encode_query_param(it->second); - } - return query; -} - -inline void parse_query_text(const char *data, std::size_t size, - Params &params) { - std::set<std::string> cache; - split(data, data + size, '&', [&](const char *b, const char *e) { - std::string kv(b, e); - if (cache.find(kv) != cache.end()) { return; } - cache.insert(std::move(kv)); - - std::string key; - std::string val; - divide(b, static_cast<std::size_t>(e - b), '=', - [&](const char *lhs_data, std::size_t lhs_size, const char *rhs_data, - std::size_t rhs_size) { - key.assign(lhs_data, lhs_size); - val.assign(rhs_data, rhs_size); - }); - - if (!key.empty()) { - params.emplace(decode_url(key, true), decode_url(val, true)); - } - }); -} - -inline void parse_query_text(const std::string &s, Params &params) { - parse_query_text(s.data(), s.size(), params); -} - -inline bool parse_multipart_boundary(const std::string &content_type, - std::string &boundary) { - auto boundary_keyword = "boundary="; - auto pos = content_type.find(boundary_keyword); - if (pos == std::string::npos) { return false; } - auto end = content_type.find(';', pos); - auto beg = pos + strlen(boundary_keyword); - boundary = trim_double_quotes_copy(content_type.substr(beg, end - beg)); - return !boundary.empty(); -} - -inline void parse_disposition_params(const std::string &s, Params &params) { - std::set<std::string> cache; - split(s.data(), s.data() + s.size(), ';', [&](const char *b, const char *e) { - std::string kv(b, e); - if (cache.find(kv) != cache.end()) { return; } - cache.insert(kv); - - std::string key; - std::string val; - split(b, e, '=', [&](const char *b2, const char *e2) { - if (key.empty()) { - key.assign(b2, e2); - } else { - val.assign(b2, e2); - } - }); - - if (!key.empty()) { - params.emplace(trim_double_quotes_copy((key)), - trim_double_quotes_copy((val))); - } - }); -} - -#ifdef CPPHTTPLIB_NO_EXCEPTIONS -inline bool parse_range_header(const std::string &s, Ranges &ranges) { -#else -inline bool parse_range_header(const std::string &s, Ranges &ranges) try { -#endif - auto is_valid = [](const std::string &str) { - return std::all_of(str.cbegin(), str.cend(), - [](unsigned char c) { return std::isdigit(c); }); - }; - - if (s.size() > 7 && s.compare(0, 6, "bytes=") == 0) { - const auto pos = static_cast<size_t>(6); - const auto len = static_cast<size_t>(s.size() - 6); - auto all_valid_ranges = true; - split(&s[pos], &s[pos + len], ',', [&](const char *b, const char *e) { - if (!all_valid_ranges) { return; } - - const auto it = std::find(b, e, '-'); - if (it == e) { - all_valid_ranges = false; - return; - } - - const auto lhs = std::string(b, it); - const auto rhs = std::string(it + 1, e); - if (!is_valid(lhs) || !is_valid(rhs)) { - all_valid_ranges = false; - return; - } - - const auto first = - static_cast<ssize_t>(lhs.empty() ? -1 : std::stoll(lhs)); - const auto last = - static_cast<ssize_t>(rhs.empty() ? -1 : std::stoll(rhs)); - if ((first == -1 && last == -1) || - (first != -1 && last != -1 && first > last)) { - all_valid_ranges = false; - return; - } - - ranges.emplace_back(first, last); - }); - return all_valid_ranges && !ranges.empty(); - } - return false; -#ifdef CPPHTTPLIB_NO_EXCEPTIONS -} -#else -} catch (...) 
{ return false; } -#endif - -class MultipartFormDataParser { -public: - MultipartFormDataParser() = default; - - void set_boundary(std::string &&boundary) { - boundary_ = boundary; - dash_boundary_crlf_ = dash_ + boundary_ + crlf_; - crlf_dash_boundary_ = crlf_ + dash_ + boundary_; - } - - bool is_valid() const { return is_valid_; } - - bool parse(const char *buf, size_t n, const ContentReceiver &content_callback, - const MultipartContentHeader &header_callback) { - - buf_append(buf, n); - - while (buf_size() > 0) { - switch (state_) { - case 0: { // Initial boundary - buf_erase(buf_find(dash_boundary_crlf_)); - if (dash_boundary_crlf_.size() > buf_size()) { return true; } - if (!buf_start_with(dash_boundary_crlf_)) { return false; } - buf_erase(dash_boundary_crlf_.size()); - state_ = 1; - break; - } - case 1: { // New entry - clear_file_info(); - state_ = 2; - break; - } - case 2: { // Headers - auto pos = buf_find(crlf_); - if (pos > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } - while (pos < buf_size()) { - // Empty line - if (pos == 0) { - if (!header_callback(file_)) { - is_valid_ = false; - return false; - } - buf_erase(crlf_.size()); - state_ = 3; - break; - } - - const auto header = buf_head(pos); - - if (!parse_header(header.data(), header.data() + header.size(), - [&](const std::string &, const std::string &) {})) { - is_valid_ = false; - return false; - } - - constexpr const char header_content_type[] = "Content-Type:"; - - if (start_with_case_ignore(header, header_content_type)) { - file_.content_type = - trim_copy(header.substr(str_len(header_content_type))); - } else { - thread_local const std::regex re_content_disposition( - R"~(^Content-Disposition:\s*form-data;\s*(.*)$)~", - std::regex_constants::icase); - - std::smatch m; - if (std::regex_match(header, m, re_content_disposition)) { - Params params; - parse_disposition_params(m[1], params); - - auto it = params.find("name"); - if (it != params.end()) { - file_.name = it->second; - } else { - is_valid_ = false; - return false; - } - - it = params.find("filename"); - if (it != params.end()) { file_.filename = it->second; } - - it = params.find("filename*"); - if (it != params.end()) { - // Only allow UTF-8 encoding... - thread_local const std::regex re_rfc5987_encoding( - R"~(^UTF-8''(.+?)$)~", std::regex_constants::icase); - - std::smatch m2; - if (std::regex_match(it->second, m2, re_rfc5987_encoding)) { - file_.filename = decode_url(m2[1], false); // override... 
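// [editor's note] An RFC 5987 ext-value has the shape
// charset'language'percent-encoded-value, e.g. filename*=UTF-8''na%C3%AFve.txt;
// only the UTF-8 charset is accepted here, and the decoded name replaces any
// plain 'filename' parameter captured above.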
- } else { - is_valid_ = false; - return false; - } - } - } - } - buf_erase(pos + crlf_.size()); - pos = buf_find(crlf_); - } - if (state_ != 3) { return true; } - break; - } - case 3: { // Body - if (crlf_dash_boundary_.size() > buf_size()) { return true; } - auto pos = buf_find(crlf_dash_boundary_); - if (pos < buf_size()) { - if (!content_callback(buf_data(), pos)) { - is_valid_ = false; - return false; - } - buf_erase(pos + crlf_dash_boundary_.size()); - state_ = 4; - } else { - auto len = buf_size() - crlf_dash_boundary_.size(); - if (len > 0) { - if (!content_callback(buf_data(), len)) { - is_valid_ = false; - return false; - } - buf_erase(len); - } - return true; - } - break; - } - case 4: { // Boundary - if (crlf_.size() > buf_size()) { return true; } - if (buf_start_with(crlf_)) { - buf_erase(crlf_.size()); - state_ = 1; - } else { - if (dash_.size() > buf_size()) { return true; } - if (buf_start_with(dash_)) { - buf_erase(dash_.size()); - is_valid_ = true; - buf_erase(buf_size()); // Remove epilogue - } else { - return true; - } - } - break; - } - } - } - - return true; - } - -private: - void clear_file_info() { - file_.name.clear(); - file_.filename.clear(); - file_.content_type.clear(); - } - - bool start_with_case_ignore(const std::string &a, const char *b) const { - const auto b_len = strlen(b); - if (a.size() < b_len) { return false; } - for (size_t i = 0; i < b_len; i++) { - if (case_ignore::to_lower(a[i]) != case_ignore::to_lower(b[i])) { - return false; - } - } - return true; - } - - const std::string dash_ = "--"; - const std::string crlf_ = "\r\n"; - std::string boundary_; - std::string dash_boundary_crlf_; - std::string crlf_dash_boundary_; - - size_t state_ = 0; - bool is_valid_ = false; - MultipartFormData file_; - - // Buffer - bool start_with(const std::string &a, size_t spos, size_t epos, - const std::string &b) const { - if (epos - spos < b.size()) { return false; } - for (size_t i = 0; i < b.size(); i++) { - if (a[i + spos] != b[i]) { return false; } - } - return true; - } - - size_t buf_size() const { return buf_epos_ - buf_spos_; } - - const char *buf_data() const { return &buf_[buf_spos_]; } - - std::string buf_head(size_t l) const { return buf_.substr(buf_spos_, l); } - - bool buf_start_with(const std::string &s) const { - return start_with(buf_, buf_spos_, buf_epos_, s); - } - - size_t buf_find(const std::string &s) const { - auto c = s.front(); - - size_t off = buf_spos_; - while (off < buf_epos_) { - auto pos = off; - while (true) { - if (pos == buf_epos_) { return buf_size(); } - if (buf_[pos] == c) { break; } - pos++; - } - - auto remaining_size = buf_epos_ - pos; - if (s.size() > remaining_size) { return buf_size(); } - - if (start_with(buf_, pos, buf_epos_, s)) { return pos - buf_spos_; } - - off = pos + 1; - } - - return buf_size(); - } - - void buf_append(const char *data, size_t n) { - auto remaining_size = buf_size(); - if (remaining_size > 0 && buf_spos_ > 0) { - for (size_t i = 0; i < remaining_size; i++) { - buf_[i] = buf_[buf_spos_ + i]; - } - } - buf_spos_ = 0; - buf_epos_ = remaining_size; - - if (remaining_size + n > buf_.size()) { buf_.resize(remaining_size + n); } - - for (size_t i = 0; i < n; i++) { - buf_[buf_epos_ + i] = data[i]; - } - buf_epos_ += n; - } - - void buf_erase(size_t size) { buf_spos_ += size; } - - std::string buf_; - size_t buf_spos_ = 0; - size_t buf_epos_ = 0; -}; - -inline std::string random_string(size_t length) { - constexpr const char data[] = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - - 
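// [editor's note] 62 alphanumeric characters above; 'engine() % 62' below
// carries a slight modulo bias, harmless for multipart boundaries but not
// something to reuse for key material.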
thread_local auto engine([]() { - // std::random_device might actually be deterministic on some - // platforms, but due to lack of support in the c++ standard library, - // doing better requires either some ugly hacks or breaking portability. - std::random_device seed_gen; - // Request 128 bits of entropy for initialization - std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(), seed_gen()}; - return std::mt19937(seed_sequence); - }()); - - std::string result; - for (size_t i = 0; i < length; i++) { - result += data[engine() % (sizeof(data) - 1)]; - } - return result; -} - -inline std::string make_multipart_data_boundary() { - return "--cpp-httplib-multipart-data-" + detail::random_string(16); -} - -inline bool is_multipart_boundary_chars_valid(const std::string &boundary) { - auto valid = true; - for (size_t i = 0; i < boundary.size(); i++) { - auto c = boundary[i]; - if (!std::isalnum(c) && c != '-' && c != '_') { - valid = false; - break; - } - } - return valid; -} - -template -inline std::string -serialize_multipart_formdata_item_begin(const T &item, - const std::string &boundary) { - std::string body = "--" + boundary + "\r\n"; - body += "Content-Disposition: form-data; name=\"" + item.name + "\""; - if (!item.filename.empty()) { - body += "; filename=\"" + item.filename + "\""; - } - body += "\r\n"; - if (!item.content_type.empty()) { - body += "Content-Type: " + item.content_type + "\r\n"; - } - body += "\r\n"; - - return body; -} - -inline std::string serialize_multipart_formdata_item_end() { return "\r\n"; } - -inline std::string -serialize_multipart_formdata_finish(const std::string &boundary) { - return "--" + boundary + "--\r\n"; -} - -inline std::string -serialize_multipart_formdata_get_content_type(const std::string &boundary) { - return "multipart/form-data; boundary=" + boundary; -} - -inline std::string -serialize_multipart_formdata(const MultipartFormDataItems &items, - const std::string &boundary, bool finish = true) { - std::string body; - - for (const auto &item : items) { - body += serialize_multipart_formdata_item_begin(item, boundary); - body += item.content + serialize_multipart_formdata_item_end(); - } - - if (finish) { body += serialize_multipart_formdata_finish(boundary); } - - return body; -} - -inline bool range_error(Request &req, Response &res) { - if (!req.ranges.empty() && 200 <= res.status && res.status < 300) { - ssize_t content_len = static_cast( - res.content_length_ ? res.content_length_ : res.body.size()); - - ssize_t prev_first_pos = -1; - ssize_t prev_last_pos = -1; - size_t overwrapping_count = 0; - - // NOTE: The following Range check is based on '14.2. Range' in RFC 9110 - // 'HTTP Semantics' to avoid potential denial-of-service attacks. - // https://www.rfc-editor.org/rfc/rfc9110#section-14.2 - - // Too many ranges - if (req.ranges.size() > CPPHTTPLIB_RANGE_MAX_COUNT) { return true; } - - for (auto &r : req.ranges) { - auto &first_pos = r.first; - auto &last_pos = r.second; - - if (first_pos == -1 && last_pos == -1) { - first_pos = 0; - last_pos = content_len; - } - - if (first_pos == -1) { - first_pos = content_len - last_pos; - last_pos = content_len - 1; - } - - // NOTE: RFC-9110 '14.1.2. Byte Ranges': - // A client can limit the number of bytes requested without knowing the - // size of the selected representation. 
If the last-pos value is absent, - // or if the value is greater than or equal to the current length of the - // representation data, the byte range is interpreted as the remainder of - // the representation (i.e., the server replaces the value of last-pos - // with a value that is one less than the current length of the selected - // representation). - // https://www.rfc-editor.org/rfc/rfc9110.html#section-14.1.2-6 - if (last_pos == -1 || last_pos >= content_len) { - last_pos = content_len - 1; - } - - // Range must be within content length - if (!(0 <= first_pos && first_pos <= last_pos && - last_pos <= content_len - 1)) { - return true; - } - - // Ranges must be in ascending order - if (first_pos <= prev_first_pos) { return true; } - - // Request must not have more than two overlapping ranges - if (first_pos <= prev_last_pos) { - overwrapping_count++; - if (overwrapping_count > 2) { return true; } - } - - prev_first_pos = (std::max)(prev_first_pos, first_pos); - prev_last_pos = (std::max)(prev_last_pos, last_pos); - } - } - - return false; -} - -inline std::pair -get_range_offset_and_length(Range r, size_t content_length) { - assert(r.first != -1 && r.second != -1); - assert(0 <= r.first && r.first < static_cast(content_length)); - assert(r.first <= r.second && - r.second < static_cast(content_length)); - (void)(content_length); - return std::make_pair(r.first, static_cast(r.second - r.first) + 1); -} - -inline std::string make_content_range_header_field( - const std::pair &offset_and_length, size_t content_length) { - auto st = offset_and_length.first; - auto ed = st + offset_and_length.second - 1; - - std::string field = "bytes "; - field += std::to_string(st); - field += "-"; - field += std::to_string(ed); - field += "/"; - field += std::to_string(content_length); - return field; -} - -template -bool process_multipart_ranges_data(const Request &req, - const std::string &boundary, - const std::string &content_type, - size_t content_length, SToken stoken, - CToken ctoken, Content content) { - for (size_t i = 0; i < req.ranges.size(); i++) { - ctoken("--"); - stoken(boundary); - ctoken("\r\n"); - if (!content_type.empty()) { - ctoken("Content-Type: "); - stoken(content_type); - ctoken("\r\n"); - } - - auto offset_and_length = - get_range_offset_and_length(req.ranges[i], content_length); - - ctoken("Content-Range: "); - stoken(make_content_range_header_field(offset_and_length, content_length)); - ctoken("\r\n"); - ctoken("\r\n"); - - if (!content(offset_and_length.first, offset_and_length.second)) { - return false; - } - ctoken("\r\n"); - } - - ctoken("--"); - stoken(boundary); - ctoken("--"); - - return true; -} - -inline void make_multipart_ranges_data(const Request &req, Response &res, - const std::string &boundary, - const std::string &content_type, - size_t content_length, - std::string &data) { - process_multipart_ranges_data( - req, boundary, content_type, content_length, - [&](const std::string &token) { data += token; }, - [&](const std::string &token) { data += token; }, - [&](size_t offset, size_t length) { - assert(offset + length <= content_length); - data += res.body.substr(offset, length); - return true; - }); -} - -inline size_t get_multipart_ranges_data_length(const Request &req, - const std::string &boundary, - const std::string &content_type, - size_t content_length) { - size_t data_length = 0; - - process_multipart_ranges_data( - req, boundary, content_type, content_length, - [&](const std::string &token) { data_length += token.size(); }, - [&](const std::string 
&token) { data_length += token.size(); }, - [&](size_t /*offset*/, size_t length) { - data_length += length; - return true; - }); - - return data_length; -} - -template -inline bool -write_multipart_ranges_data(Stream &strm, const Request &req, Response &res, - const std::string &boundary, - const std::string &content_type, - size_t content_length, const T &is_shutting_down) { - return process_multipart_ranges_data( - req, boundary, content_type, content_length, - [&](const std::string &token) { strm.write(token); }, - [&](const std::string &token) { strm.write(token); }, - [&](size_t offset, size_t length) { - return write_content(strm, res.content_provider_, offset, length, - is_shutting_down); - }); -} - -inline bool expect_content(const Request &req) { - if (req.method == "POST" || req.method == "PUT" || req.method == "PATCH" || - req.method == "DELETE") { - return true; - } - if (req.has_header("Content-Length") && - req.get_header_value_u64("Content-Length") > 0) { - return true; - } - if (is_chunked_transfer_encoding(req.headers)) { return true; } - return false; -} - -inline bool has_crlf(const std::string &s) { - auto p = s.c_str(); - while (*p) { - if (*p == '\r' || *p == '\n') { return true; } - p++; - } - return false; -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline std::string message_digest(const std::string &s, const EVP_MD *algo) { - auto context = std::unique_ptr( - EVP_MD_CTX_new(), EVP_MD_CTX_free); - - unsigned int hash_length = 0; - unsigned char hash[EVP_MAX_MD_SIZE]; - - EVP_DigestInit_ex(context.get(), algo, nullptr); - EVP_DigestUpdate(context.get(), s.c_str(), s.size()); - EVP_DigestFinal_ex(context.get(), hash, &hash_length); - - std::stringstream ss; - for (auto i = 0u; i < hash_length; ++i) { - ss << std::hex << std::setw(2) << std::setfill('0') - << static_cast(hash[i]); - } - - return ss.str(); -} - -inline std::string MD5(const std::string &s) { - return message_digest(s, EVP_md5()); -} - -inline std::string SHA_256(const std::string &s) { - return message_digest(s, EVP_sha256()); -} - -inline std::string SHA_512(const std::string &s) { - return message_digest(s, EVP_sha512()); -} - -inline std::pair make_digest_authentication_header( - const Request &req, const std::map &auth, - size_t cnonce_count, const std::string &cnonce, const std::string &username, - const std::string &password, bool is_proxy = false) { - std::string nc; - { - std::stringstream ss; - ss << std::setfill('0') << std::setw(8) << std::hex << cnonce_count; - nc = ss.str(); - } - - std::string qop; - if (auth.find("qop") != auth.end()) { - qop = auth.at("qop"); - if (qop.find("auth-int") != std::string::npos) { - qop = "auth-int"; - } else if (qop.find("auth") != std::string::npos) { - qop = "auth"; - } else { - qop.clear(); - } - } - - std::string algo = "MD5"; - if (auth.find("algorithm") != auth.end()) { algo = auth.at("algorithm"); } - - std::string response; - { - auto H = algo == "SHA-256" ? detail::SHA_256 - : algo == "SHA-512" ? detail::SHA_512 - : detail::MD5; - - auto A1 = username + ":" + auth.at("realm") + ":" + password; - - auto A2 = req.method + ":" + req.path; - if (qop == "auth-int") { A2 += ":" + H(req.body); } - - if (qop.empty()) { - response = H(H(A1) + ":" + auth.at("nonce") + ":" + H(A2)); - } else { - response = H(H(A1) + ":" + auth.at("nonce") + ":" + nc + ":" + cnonce + - ":" + qop + ":" + H(A2)); - } - } - - auto opaque = (auth.find("opaque") != auth.end()) ? 
auth.at("opaque") : ""; - - auto field = "Digest username=\"" + username + "\", realm=\"" + - auth.at("realm") + "\", nonce=\"" + auth.at("nonce") + - "\", uri=\"" + req.path + "\", algorithm=" + algo + - (qop.empty() ? ", response=\"" - : ", qop=" + qop + ", nc=" + nc + ", cnonce=\"" + - cnonce + "\", response=\"") + - response + "\"" + - (opaque.empty() ? "" : ", opaque=\"" + opaque + "\""); - - auto key = is_proxy ? "Proxy-Authorization" : "Authorization"; - return std::make_pair(key, field); -} - -inline bool is_ssl_peer_could_be_closed(SSL *ssl, socket_t sock) { - detail::set_nonblocking(sock, true); - auto se = detail::scope_exit([&]() { detail::set_nonblocking(sock, false); }); - - char buf[1]; - return !SSL_peek(ssl, buf, 1) && - SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN; -} - -#ifdef _WIN32 -// NOTE: This code came up with the following stackoverflow post: -// https://stackoverflow.com/questions/9507184/can-openssl-on-windows-use-the-system-certificate-store -inline bool load_system_certs_on_windows(X509_STORE *store) { - auto hStore = CertOpenSystemStoreW((HCRYPTPROV_LEGACY)NULL, L"ROOT"); - if (!hStore) { return false; } - - auto result = false; - PCCERT_CONTEXT pContext = NULL; - while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) != - nullptr) { - auto encoded_cert = - static_cast(pContext->pbCertEncoded); - - auto x509 = d2i_X509(NULL, &encoded_cert, pContext->cbCertEncoded); - if (x509) { - X509_STORE_add_cert(store, x509); - X509_free(x509); - result = true; - } - } - - CertFreeCertificateContext(pContext); - CertCloseStore(hStore, 0); - - return result; -} -#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__) -#if TARGET_OS_OSX -template -using CFObjectPtr = - std::unique_ptr::type, void (*)(CFTypeRef)>; - -inline void cf_object_ptr_deleter(CFTypeRef obj) { - if (obj) { CFRelease(obj); } -} - -inline bool retrieve_certs_from_keychain(CFObjectPtr &certs) { - CFStringRef keys[] = {kSecClass, kSecMatchLimit, kSecReturnRef}; - CFTypeRef values[] = {kSecClassCertificate, kSecMatchLimitAll, - kCFBooleanTrue}; - - CFObjectPtr query( - CFDictionaryCreate(nullptr, reinterpret_cast(keys), values, - sizeof(keys) / sizeof(keys[0]), - &kCFTypeDictionaryKeyCallBacks, - &kCFTypeDictionaryValueCallBacks), - cf_object_ptr_deleter); - - if (!query) { return false; } - - CFTypeRef security_items = nullptr; - if (SecItemCopyMatching(query.get(), &security_items) != errSecSuccess || - CFArrayGetTypeID() != CFGetTypeID(security_items)) { - return false; - } - - certs.reset(reinterpret_cast(security_items)); - return true; -} - -inline bool retrieve_root_certs_from_keychain(CFObjectPtr &certs) { - CFArrayRef root_security_items = nullptr; - if (SecTrustCopyAnchorCertificates(&root_security_items) != errSecSuccess) { - return false; - } - - certs.reset(root_security_items); - return true; -} - -inline bool add_certs_to_x509_store(CFArrayRef certs, X509_STORE *store) { - auto result = false; - for (auto i = 0; i < CFArrayGetCount(certs); ++i) { - const auto cert = reinterpret_cast( - CFArrayGetValueAtIndex(certs, i)); - - if (SecCertificateGetTypeID() != CFGetTypeID(cert)) { continue; } - - CFDataRef cert_data = nullptr; - if (SecItemExport(cert, kSecFormatX509Cert, 0, nullptr, &cert_data) != - errSecSuccess) { - continue; - } - - CFObjectPtr cert_data_ptr(cert_data, cf_object_ptr_deleter); - - auto encoded_cert = static_cast( - CFDataGetBytePtr(cert_data_ptr.get())); - - auto x509 = - d2i_X509(NULL, &encoded_cert, CFDataGetLength(cert_data_ptr.get())); 
- - if (x509) { - X509_STORE_add_cert(store, x509); - X509_free(x509); - result = true; - } - } - - return result; -} - -inline bool load_system_certs_on_macos(X509_STORE *store) { - auto result = false; - CFObjectPtr certs(nullptr, cf_object_ptr_deleter); - if (retrieve_certs_from_keychain(certs) && certs) { - result = add_certs_to_x509_store(certs.get(), store); - } - - if (retrieve_root_certs_from_keychain(certs) && certs) { - result = add_certs_to_x509_store(certs.get(), store) || result; - } - - return result; -} -#endif // TARGET_OS_OSX -#endif // _WIN32 -#endif // CPPHTTPLIB_OPENSSL_SUPPORT - -#ifdef _WIN32 -class WSInit { -public: - WSInit() { - WSADATA wsaData; - if (WSAStartup(0x0002, &wsaData) == 0) is_valid_ = true; - } - - ~WSInit() { - if (is_valid_) WSACleanup(); - } - - bool is_valid_ = false; -}; - -static WSInit wsinit_; -#endif - -inline bool parse_www_authenticate(const Response &res, - std::map &auth, - bool is_proxy) { - auto auth_key = is_proxy ? "Proxy-Authenticate" : "WWW-Authenticate"; - if (res.has_header(auth_key)) { - thread_local auto re = - std::regex(R"~((?:(?:,\s*)?(.+?)=(?:"(.*?)"|([^,]*))))~"); - auto s = res.get_header_value(auth_key); - auto pos = s.find(' '); - if (pos != std::string::npos) { - auto type = s.substr(0, pos); - if (type == "Basic") { - return false; - } else if (type == "Digest") { - s = s.substr(pos + 1); - auto beg = std::sregex_iterator(s.begin(), s.end(), re); - for (auto i = beg; i != std::sregex_iterator(); ++i) { - const auto &m = *i; - auto key = s.substr(static_cast(m.position(1)), - static_cast(m.length(1))); - auto val = m.length(2) > 0 - ? s.substr(static_cast(m.position(2)), - static_cast(m.length(2))) - : s.substr(static_cast(m.position(3)), - static_cast(m.length(3))); - auth[key] = val; - } - return true; - } - } - } - return false; -} - -class ContentProviderAdapter { -public: - explicit ContentProviderAdapter( - ContentProviderWithoutLength &&content_provider) - : content_provider_(content_provider) {} - - bool operator()(size_t offset, size_t, DataSink &sink) { - return content_provider_(offset, sink); - } - -private: - ContentProviderWithoutLength content_provider_; -}; - -} // namespace detail - -inline std::string hosted_at(const std::string &hostname) { - std::vector addrs; - hosted_at(hostname, addrs); - if (addrs.empty()) { return std::string(); } - return addrs[0]; -} - -inline void hosted_at(const std::string &hostname, - std::vector &addrs) { - struct addrinfo hints; - struct addrinfo *result; - - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = 0; - - if (getaddrinfo(hostname.c_str(), nullptr, &hints, &result)) { -#if defined __linux__ && !defined __ANDROID__ - res_init(); -#endif - return; - } - auto se = detail::scope_exit([&] { freeaddrinfo(result); }); - - for (auto rp = result; rp; rp = rp->ai_next) { - const auto &addr = - *reinterpret_cast(rp->ai_addr); - std::string ip; - auto dummy = -1; - if (detail::get_ip_and_port(addr, sizeof(struct sockaddr_storage), ip, - dummy)) { - addrs.push_back(ip); - } - } -} - -inline std::string append_query_params(const std::string &path, - const Params ¶ms) { - std::string path_with_query = path; - thread_local const std::regex re("[^?]+\\?.*"); - auto delm = std::regex_match(path, re) ? 
'&' : '?'; - path_with_query += delm + detail::params_to_query_str(params); - return path_with_query; -} - -// Header utilities -inline std::pair -make_range_header(const Ranges &ranges) { - std::string field = "bytes="; - auto i = 0; - for (const auto &r : ranges) { - if (i != 0) { field += ", "; } - if (r.first != -1) { field += std::to_string(r.first); } - field += '-'; - if (r.second != -1) { field += std::to_string(r.second); } - i++; - } - return std::make_pair("Range", std::move(field)); -} - -inline std::pair -make_basic_authentication_header(const std::string &username, - const std::string &password, bool is_proxy) { - auto field = "Basic " + detail::base64_encode(username + ":" + password); - auto key = is_proxy ? "Proxy-Authorization" : "Authorization"; - return std::make_pair(key, std::move(field)); -} - -inline std::pair -make_bearer_token_authentication_header(const std::string &token, - bool is_proxy = false) { - auto field = "Bearer " + token; - auto key = is_proxy ? "Proxy-Authorization" : "Authorization"; - return std::make_pair(key, std::move(field)); -} - -// Request implementation -inline bool Request::has_header(const std::string &key) const { - return detail::has_header(headers, key); -} - -inline std::string Request::get_header_value(const std::string &key, - const char *def, size_t id) const { - return detail::get_header_value(headers, key, def, id); -} - -inline size_t Request::get_header_value_count(const std::string &key) const { - auto r = headers.equal_range(key); - return static_cast(std::distance(r.first, r.second)); -} - -inline void Request::set_header(const std::string &key, - const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { - headers.emplace(key, val); - } -} - -inline bool Request::has_param(const std::string &key) const { - return params.find(key) != params.end(); -} - -inline std::string Request::get_param_value(const std::string &key, - size_t id) const { - auto rng = params.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second; } - return std::string(); -} - -inline size_t Request::get_param_value_count(const std::string &key) const { - auto r = params.equal_range(key); - return static_cast(std::distance(r.first, r.second)); -} - -inline bool Request::is_multipart_form_data() const { - const auto &content_type = get_header_value("Content-Type"); - return !content_type.rfind("multipart/form-data", 0); -} - -inline bool Request::has_file(const std::string &key) const { - return files.find(key) != files.end(); -} - -inline MultipartFormData Request::get_file_value(const std::string &key) const { - auto it = files.find(key); - if (it != files.end()) { return it->second; } - return MultipartFormData(); -} - -inline std::vector -Request::get_file_values(const std::string &key) const { - std::vector values; - auto rng = files.equal_range(key); - for (auto it = rng.first; it != rng.second; it++) { - values.push_back(it->second); - } - return values; -} - -// Response implementation -inline bool Response::has_header(const std::string &key) const { - return headers.find(key) != headers.end(); -} - -inline std::string Response::get_header_value(const std::string &key, - const char *def, - size_t id) const { - return detail::get_header_value(headers, key, def, id); -} - -inline size_t Response::get_header_value_count(const std::string &key) const { - auto r = headers.equal_range(key); - return static_cast(std::distance(r.first, 
r.second)); -} - -inline void Response::set_header(const std::string &key, - const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { - headers.emplace(key, val); - } -} - -inline void Response::set_redirect(const std::string &url, int stat) { - if (detail::fields::is_field_value(url)) { - set_header("Location", url); - if (300 <= stat && stat < 400) { - this->status = stat; - } else { - this->status = StatusCode::Found_302; - } - } -} - -inline void Response::set_content(const char *s, size_t n, - const std::string &content_type) { - body.assign(s, n); - - auto rng = headers.equal_range("Content-Type"); - headers.erase(rng.first, rng.second); - set_header("Content-Type", content_type); -} - -inline void Response::set_content(const std::string &s, - const std::string &content_type) { - set_content(s.data(), s.size(), content_type); -} - -inline void Response::set_content(std::string &&s, - const std::string &content_type) { - body = std::move(s); - - auto rng = headers.equal_range("Content-Type"); - headers.erase(rng.first, rng.second); - set_header("Content-Type", content_type); -} - -inline void Response::set_content_provider( - size_t in_length, const std::string &content_type, ContentProvider provider, - ContentProviderResourceReleaser resource_releaser) { - set_header("Content-Type", content_type); - content_length_ = in_length; - if (in_length > 0) { content_provider_ = std::move(provider); } - content_provider_resource_releaser_ = std::move(resource_releaser); - is_chunked_content_provider_ = false; -} - -inline void Response::set_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser) { - set_header("Content-Type", content_type); - content_length_ = 0; - content_provider_ = detail::ContentProviderAdapter(std::move(provider)); - content_provider_resource_releaser_ = std::move(resource_releaser); - is_chunked_content_provider_ = false; -} - -inline void Response::set_chunked_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser) { - set_header("Content-Type", content_type); - content_length_ = 0; - content_provider_ = detail::ContentProviderAdapter(std::move(provider)); - content_provider_resource_releaser_ = std::move(resource_releaser); - is_chunked_content_provider_ = true; -} - -inline void Response::set_file_content(const std::string &path, - const std::string &content_type) { - file_content_path_ = path; - file_content_content_type_ = content_type; -} - -inline void Response::set_file_content(const std::string &path) { - file_content_path_ = path; -} - -// Result implementation -inline bool Result::has_request_header(const std::string &key) const { - return request_headers_.find(key) != request_headers_.end(); -} - -inline std::string Result::get_request_header_value(const std::string &key, - const char *def, - size_t id) const { - return detail::get_header_value(request_headers_, key, def, id); -} - -inline size_t -Result::get_request_header_value_count(const std::string &key) const { - auto r = request_headers_.equal_range(key); - return static_cast(std::distance(r.first, r.second)); -} - -// Stream implementation -inline ssize_t Stream::write(const char *ptr) { - return write(ptr, strlen(ptr)); -} - -inline ssize_t Stream::write(const std::string &s) { - return write(s.data(), s.size()); -} - -namespace detail { - -inline void 
[elided: detail::calc_actual_timeout and the SocketStream implementation (timeout-aware buffered read/write over a socket), the BufferStream implementation, and the start of the PathParamsMatcher constructor, all deleted with the file]
[elided: the remainder of the PathParamsMatcher constructor and PathParamsMatcher::match (static-fragment and ":param" segment matching), deleted with the file]
-
-inline bool RegexMatcher::match(Request &request) const {
-  request.path_params.clear();
-  return std::regex_match(request.path, request.matches, regex_);
-}
-
-} // namespace detail
-
-// HTTP server implementation
-inline Server::Server()
-    : new_task_queue(
-          [] { return new ThreadPool(CPPHTTPLIB_THREAD_POOL_COUNT); }) {
-#ifndef _WIN32
-  signal(SIGPIPE, SIG_IGN);
-#endif
-}
-
-inline Server::~Server() = default;
-
-inline std::unique_ptr<detail::MatcherBase>
-Server::make_matcher(const std::string &pattern) {
-  if (pattern.find("/:") != std::string::npos) {
-    return detail::make_unique<detail::PathParamsMatcher>(pattern);
-  } else {
-    return detail::make_unique<detail::RegexMatcher>(pattern);
-  }
-}
-
-inline Server &Server::Get(const std::string &pattern, Handler handler) {
-  get_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
-  return *this;
-}
[elided: the Server::Post registration overloads, deleted with the file]
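The matcher split deleted above is what makes ":param" routes work; continuing the illustrative svr, a route that relies on PathParamsMatcher:

    svr.Get("/users/:id", [](const httplib::Request &req, httplib::Response &res) {
      // Segments declared as ":id" are captured into req.path_params.
      res.set_content("user " + req.path_params.at("id"), "text/plain");
    });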
[elided: the Server::Put/Patch/Delete/Options registration overloads and Server::set_base_dir, deleted with the file]
-
-inline bool Server::set_mount_point(const std::string &mount_point,
-                                    const std::string &dir, Headers headers) {
-  detail::FileStat stat(dir);
-  if (stat.is_dir()) {
-    std::string mnt = !mount_point.empty() ? mount_point : "/";
-    if (!mnt.empty() && mnt[0] == '/') {
-      base_dirs_.push_back({mnt, dir, std::move(headers)});
-      return true;
-    }
-  }
-  return false;
-}
[elided: remove_mount_point, the MIME-type mapping setters, set_file_request_handler, the set_error_handler_core overloads, set_exception_handler, the pre-/post-routing handler setters, and set_logger, deleted with the file]
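A sketch of the mount-point and error-handler API whose implementation is deleted above (the directory and error body are illustrative):

    svr.set_mount_point("/public", "./www");  // serve static files from ./www
    svr.set_error_handler([](const httplib::Request &, httplib::Response &res) {
      res.set_content("<p>something went wrong</p>", "text/html");
    });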
[elided: set_expect_100_continue_handler and the remaining Server setters (address family, TCP_NODELAY, IPv6-only, socket options, default headers, header writer, keep-alive count and timeout, read/write timeouts, idle interval, payload limit), deleted with the file]
-
-inline bool Server::bind_to_port(const std::string &host, int port,
-                                 int socket_flags) {
-  auto ret = bind_internal(host, port, socket_flags);
-  if (ret == -1) { is_decommissioned = true; }
-  return ret >= 0;
-}
-
-inline bool Server::listen(const std::string &host, int port,
-                           int socket_flags) {
-  return bind_to_port(host, port, socket_flags) && listen_internal();
-}
[elided: bind_to_any_port, listen_after_bind, is_running, wait_until_ready, stop, decommission, and the start of Server::parse_request_line, deleted with the file]
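A sketch of the server tuning and startup calls deleted above (all values illustrative):

    svr.set_read_timeout(5, 0);                   // seconds, microseconds
    svr.set_keep_alive_max_count(100);            // requests per connection
    svr.set_payload_max_length(8 * 1024 * 1024);  // 8 MiB request bodies
    svr.listen("0.0.0.0", 8080);                  // blocks until stop()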
[elided: the remainder of Server::parse_request_line (method and HTTP-version validation, URL-fragment stripping, query parsing), Server::write_response, Server::write_response_with_content, Server::write_response_core (error-handler dispatch, Connection/Keep-Alive headers, Content-Type and Content-Length defaults, logging), and Server::write_content_with_provider (single-range, multipart/byteranges, chunked, and compressed content paths), all deleted with the file]
[elided: the remainder of Server::read_content (urlencoded-form and multipart file-count limits), read_content_with_content_receiver, read_content_core (multipart boundary parsing), handle_file_request (mount-point resolution and mmap-backed content providers), and the start of Server::create_server_socket, all deleted with the file]
[elided: the bind/listen callback of create_server_socket, Server::bind_internal (ephemeral-port lookup via getsockname), Server::listen_internal (the accept loop with EMFILE/EINTR handling and per-connection socket timeouts), and Server::routing (file handler, content-reader dispatch, per-method handler dispatch), deleted with the file]
-
-inline bool Server::dispatch_request(Request &req, Response &res,
-                                     const Handlers &handlers) const {
-  for (const auto &x : handlers) {
-    const auto &matcher = x.first;
-    const auto &handler = x.second;
-
-    if (matcher->match(req)) {
-      handler(req, res);
-      return true;
-    }
-  }
-  return false;
-}
[elided: the start of Server::apply_ranges (multipart/byteranges header setup), deleted with the file]
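Server::routing, deleted above, is also where handler exceptions are caught; a sketch of the corresponding public hook (the error bodies are illustrative):

    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res,
                                 std::exception_ptr ep) {
      try {
        std::rethrow_exception(ep);
      } catch (const std::exception &e) {
        res.set_content(e.what(), "text/plain");
      } catch (...) {
        res.set_content("unknown error", "text/plain");
      }
      res.status = httplib::StatusCode::InternalServerError_500;
    });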
[elided: the remainder of Server::apply_ranges (Content-Range and Content-Length computation, response-body compression), dispatch_request_for_content_reader, Server::process_request (request-line and header parsing, Expect: 100-continue handling, routing with exception capture, file-content providers, range-error checks), Server::is_valid, and Server::process_and_close_socket, deleted with the file]
-
-// HTTP client implementation
-inline ClientImpl::ClientImpl(const std::string &host)
-    : ClientImpl(host, 80, std::string(), std::string()) {}
-
-inline ClientImpl::ClientImpl(const std::string &host, int port)
-    : ClientImpl(host, port, std::string(), std::string()) {}
-
-inline ClientImpl::ClientImpl(const std::string &host, int port,
-                              const std::string &client_cert_path,
-                              const std::string &client_key_path)
-    : host_(detail::escape_abstract_namespace_unix_domain(host)), port_(port),
-      host_and_port_(adjust_host_string(host_) + ":" + std::to_string(port)),
-      client_cert_path_(client_cert_path), client_key_path_(client_key_path) {}
[elided: the start of the ClientImpl destructor (waiting for in-flight requests), deleted with the file]
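The ClientImpl constructors restored above are the blocking-client entry point; a minimal client sketch (host, port, and path are illustrative):

    httplib::Client cli("localhost", 8080);
    if (auto res = cli.Get("/users/42")) {
      // res->status, res->body, and res->get_header_value(...) are usable here
    } else {
      auto err = res.error();  // httplib::Error, printable via httplib::to_string(err)
    }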
[elided: the remainder of the ClientImpl destructor, copy_settings, create_client_socket and create_and_connect_socket, shutdown_ssl / shutdown_socket / close_socket (with their in-flight-request assertions), read_response_line (status-line parsing and 100-continue skipping), and the send / send_ machinery that guards the shared socket across threads and reconnects dead keep-alive connections, all deleted with the file]
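The send_ socket bookkeeping deleted above is what makes connection reuse safe; continuing the client sketch:

    cli.set_keep_alive(true);       // reuse one connection across requests
    cli.set_connection_timeout(2);  // seconds
    auto a = cli.Get("/a");
    auto b = cli.Get("/b");         // reuses the socket when it is still alive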
[elided: ClientImpl::handle_request (proxy path rewriting, connection-close handling, digest-auth retry), ClientImpl::redirect (Location parsing and scheme/host/port resolution), the client-side write_content_with_provider, and most of write_request (Host/Accept/Accept-Encoding/User-Agent defaults and Content-Length handling), all deleted with the file]
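ClientImpl::redirect, deleted above, implements Location following; it is enabled per client like this:

    cli.set_follow_location(true);  // follow 3xx responses, bounded by the redirect limit
    auto moved = cli.Get("/old-path");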
(!req.has_header("Authorization")) { - req.headers.insert(make_basic_authentication_header( - basic_auth_username_, basic_auth_password_, false)); - } - } - - if (!proxy_basic_auth_username_.empty() && - !proxy_basic_auth_password_.empty()) { - if (!req.has_header("Proxy-Authorization")) { - req.headers.insert(make_basic_authentication_header( - proxy_basic_auth_username_, proxy_basic_auth_password_, true)); - } - } - - if (!bearer_token_auth_token_.empty()) { - if (!req.has_header("Authorization")) { - req.headers.insert(make_bearer_token_authentication_header( - bearer_token_auth_token_, false)); - } - } - - if (!proxy_bearer_token_auth_token_.empty()) { - if (!req.has_header("Proxy-Authorization")) { - req.headers.insert(make_bearer_token_authentication_header( - proxy_bearer_token_auth_token_, true)); - } - } - - // Request line and headers - { - detail::BufferStream bstrm; - - const auto &path_with_query = - req.params.empty() ? req.path - : append_query_params(req.path, req.params); - - const auto &path = - url_encode_ ? detail::encode_url(path_with_query) : path_with_query; - - detail::write_request_line(bstrm, req.method, path); - - header_writer_(bstrm, req.headers); - - // Flush buffer - auto &data = bstrm.get_buffer(); - if (!detail::write_data(strm, data.data(), data.size())) { - error = Error::Write; - return false; - } - } - - // Body - if (req.body.empty()) { - return write_content_with_provider(strm, req, error); - } - - if (!detail::write_data(strm, req.body.data(), req.body.size())) { - error = Error::Write; - return false; - } - - return true; -} - -inline std::unique_ptr ClientImpl::send_with_content_provider( - Request &req, const char *body, size_t content_length, - ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Error &error) { - if (!content_type.empty()) { req.set_header("Content-Type", content_type); } - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - if (compress_) { req.set_header("Content-Encoding", "gzip"); } -#endif - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - if (compress_ && !content_provider_without_length) { - // TODO: Brotli support - detail::gzip_compressor compressor; - - if (content_provider) { - auto ok = true; - size_t offset = 0; - DataSink data_sink; - - data_sink.write = [&](const char *data, size_t data_len) -> bool { - if (ok) { - auto last = offset + data_len == content_length; - - auto ret = compressor.compress( - data, data_len, last, - [&](const char *compressed_data, size_t compressed_data_len) { - req.body.append(compressed_data, compressed_data_len); - return true; - }); - - if (ret) { - offset += data_len; - } else { - ok = false; - } - } - return ok; - }; - - while (ok && offset < content_length) { - if (!content_provider(offset, content_length - offset, data_sink)) { - error = Error::Canceled; - return nullptr; - } - } - } else { - if (!compressor.compress(body, content_length, true, - [&](const char *data, size_t data_len) { - req.body.append(data, data_len); - return true; - })) { - error = Error::Compression; - return nullptr; - } - } - } else -#endif - { - if (content_provider) { - req.content_length_ = content_length; - req.content_provider_ = std::move(content_provider); - req.is_chunked_content_provider_ = false; - } else if (content_provider_without_length) { - req.content_length_ = 0; - req.content_provider_ = detail::ContentProviderAdapter( - std::move(content_provider_without_length)); - req.is_chunked_content_provider_ = true; - 
req.set_header("Transfer-Encoding", "chunked"); - } else { - req.body.assign(body, content_length); - } - } - - auto res = detail::make_unique(); - return send(req, *res, error) ? std::move(res) : nullptr; -} - -inline Result ClientImpl::send_with_content_provider( - const std::string &method, const std::string &path, const Headers &headers, - const char *body, size_t content_length, ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Progress progress) { - Request req; - req.method = method; - req.headers = headers; - req.path = path; - req.progress = progress; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - auto error = Error::Success; - - auto res = send_with_content_provider( - req, body, content_length, std::move(content_provider), - std::move(content_provider_without_length), content_type, error); - - return Result{std::move(res), error, std::move(req.headers)}; -} - -inline std::string -ClientImpl::adjust_host_string(const std::string &host) const { - if (host.find(':') != std::string::npos) { return "[" + host + "]"; } - return host; -} - -inline bool ClientImpl::process_request(Stream &strm, Request &req, - Response &res, bool close_connection, - Error &error) { - // Send request - if (!write_request(strm, req, close_connection, error)) { return false; } - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (is_ssl()) { - auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1; - if (!is_proxy_enabled) { - if (detail::is_ssl_peer_could_be_closed(socket_.ssl, socket_.sock)) { - error = Error::SSLPeerCouldBeClosed_; - return false; - } - } - } -#endif - - // Receive response and headers - if (!read_response_line(strm, req, res) || - !detail::read_headers(strm, res.headers)) { - error = Error::Read; - return false; - } - - // Body - if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" && - req.method != "CONNECT") { - auto redirect = 300 < res.status && res.status < 400 && - res.status != StatusCode::NotModified_304 && - follow_location_; - - if (req.response_handler && !redirect) { - if (!req.response_handler(res)) { - error = Error::Canceled; - return false; - } - } - - auto out = - req.content_receiver - ? 
static_cast( - [&](const char *buf, size_t n, uint64_t off, uint64_t len) { - if (redirect) { return true; } - auto ret = req.content_receiver(buf, n, off, len); - if (!ret) { error = Error::Canceled; } - return ret; - }) - : static_cast( - [&](const char *buf, size_t n, uint64_t /*off*/, - uint64_t /*len*/) { - assert(res.body.size() + n <= res.body.max_size()); - res.body.append(buf, n); - return true; - }); - - auto progress = [&](uint64_t current, uint64_t total) { - if (!req.progress || redirect) { return true; } - auto ret = req.progress(current, total); - if (!ret) { error = Error::Canceled; } - return ret; - }; - - if (res.has_header("Content-Length")) { - if (!req.content_receiver) { - auto len = res.get_header_value_u64("Content-Length"); - if (len > res.body.max_size()) { - error = Error::Read; - return false; - } - res.body.reserve(static_cast(len)); - } - } - - if (res.status != StatusCode::NotModified_304) { - int dummy_status; - if (!detail::read_content(strm, res, (std::numeric_limits::max)(), - dummy_status, std::move(progress), - std::move(out), decompress_)) { - if (error != Error::Canceled) { error = Error::Read; } - return false; - } - } - } - - // Log - if (logger_) { logger_(req, res); } - - return true; -} - -inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider( - const std::string &boundary, const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) const { - size_t cur_item = 0; - size_t cur_start = 0; - // cur_item and cur_start are copied to within the std::function and maintain - // state between successive calls - return [&, cur_item, cur_start](size_t offset, - DataSink &sink) mutable -> bool { - if (!offset && !items.empty()) { - sink.os << detail::serialize_multipart_formdata(items, boundary, false); - return true; - } else if (cur_item < provider_items.size()) { - if (!cur_start) { - const auto &begin = detail::serialize_multipart_formdata_item_begin( - provider_items[cur_item], boundary); - offset += begin.size(); - cur_start = offset; - sink.os << begin; - } - - DataSink cur_sink; - auto has_data = true; - cur_sink.write = sink.write; - cur_sink.done = [&]() { has_data = false; }; - - if (!provider_items[cur_item].provider(offset - cur_start, cur_sink)) { - return false; - } - - if (!has_data) { - sink.os << detail::serialize_multipart_formdata_item_end(); - cur_item++; - cur_start = 0; - } - return true; - } else { - sink.os << detail::serialize_multipart_formdata_finish(boundary); - sink.done(); - return true; - } - }; -} - -inline bool ClientImpl::process_socket( - const Socket &socket, - std::chrono::time_point start_time, - std::function callback) { - return detail::process_client_socket( - socket.sock, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, - write_timeout_usec_, max_timeout_msec_, start_time, std::move(callback)); -} - -inline bool ClientImpl::is_ssl() const { return false; } - -inline Result ClientImpl::Get(const std::string &path) { - return Get(path, Headers(), Progress()); -} - -inline Result ClientImpl::Get(const std::string &path, Progress progress) { - return Get(path, Headers(), std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers) { - return Get(path, headers, Progress()); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - Progress progress) { - Request req; - req.method = "GET"; - req.path = path; - req.headers = headers; - req.progress = std::move(progress); - if 
(max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline Result ClientImpl::Get(const std::string &path, - ContentReceiver content_receiver) { - return Get(path, Headers(), nullptr, std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, Headers(), nullptr, std::move(content_receiver), - std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver) { - return Get(path, headers, nullptr, std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, headers, nullptr, std::move(content_receiver), - std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return Get(path, Headers(), std::move(response_handler), - std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return Get(path, headers, std::move(response_handler), - std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, Headers(), std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, - Progress progress) { - Request req; - req.method = "GET"; - req.path = path; - req.headers = headers; - req.response_handler = std::move(response_handler); - req.content_receiver = - [content_receiver](const char *data, size_t data_length, - uint64_t /*offset*/, uint64_t /*total_length*/) { - return content_receiver(data, data_length); - }; - req.progress = std::move(progress); - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline Result ClientImpl::Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress) { - if (params.empty()) { return Get(path, headers); } - - std::string path_with_query = append_query_params(path, params); - return Get(path_with_query, headers, std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, params, headers, nullptr, std::move(content_receiver), - std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, - Progress progress) { - if (params.empty()) { - return Get(path, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); - } - - std::string path_with_query = append_query_params(path, params); - return Get(path_with_query, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} - -inline Result ClientImpl::Head(const std::string &path) { - return Head(path, 
Headers()); -} - -inline Result ClientImpl::Head(const std::string &path, - const Headers &headers) { - Request req; - req.method = "HEAD"; - req.headers = headers; - req.path = path; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline Result ClientImpl::Post(const std::string &path) { - return Post(path, std::string(), std::string()); -} - -inline Result ClientImpl::Post(const std::string &path, - const Headers &headers) { - return Post(path, headers, nullptr, 0, std::string()); -} - -inline Result ClientImpl::Post(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Post(path, Headers(), body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, body, content_length, - nullptr, nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("POST", path, headers, body, content_length, - nullptr, nullptr, content_type, progress); -} - -inline Result ClientImpl::Post(const std::string &path, const std::string &body, - const std::string &content_type) { - return Post(path, Headers(), body, content_type); -} - -inline Result ClientImpl::Post(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return Post(path, Headers(), body, content_type, progress); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("POST", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); -} - -inline Result ClientImpl::Post(const std::string &path, const Params ¶ms) { - return Post(path, Headers(), params); -} - -inline Result ClientImpl::Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return Post(path, Headers(), content_length, std::move(content_provider), - content_type); -} - -inline Result ClientImpl::Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return Post(path, Headers(), std::move(content_provider), content_type); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, nullptr, - content_length, std::move(content_provider), - nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, nullptr, 0, nullptr, - 
std::move(content_provider), content_type, - nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const Params ¶ms) { - auto query = detail::params_to_query_str(params); - return Post(path, headers, query, "application/x-www-form-urlencoded"); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - auto query = detail::params_to_query_str(params); - return Post(path, headers, query, "application/x-www-form-urlencoded", - progress); -} - -inline Result ClientImpl::Post(const std::string &path, - const MultipartFormDataItems &items) { - return Post(path, Headers(), items); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - if (!detail::is_multipart_boundary_chars_valid(boundary)) { - return Result{nullptr, Error::UnsupportedMultipartBoundaryChars}; - } - - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type); -} - -inline Result -ClientImpl::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - return send_with_content_provider( - "POST", path, headers, nullptr, 0, nullptr, - get_multipart_content_provider(boundary, items, provider_items), - content_type, nullptr); -} - -inline Result ClientImpl::Put(const std::string &path) { - return Put(path, std::string(), std::string()); -} - -inline Result ClientImpl::Put(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Put(path, Headers(), body, content_length, content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, body, content_length, - nullptr, nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PUT", path, headers, body, content_length, - nullptr, nullptr, content_type, progress); -} - -inline Result ClientImpl::Put(const std::string &path, const std::string &body, - const std::string &content_type) { - return Put(path, Headers(), body, content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return Put(path, Headers(), body, content_type, progress); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const std::string 
&body, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PUT", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); -} - -inline Result ClientImpl::Put(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return Put(path, Headers(), content_length, std::move(content_provider), - content_type); -} - -inline Result ClientImpl::Put(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return Put(path, Headers(), std::move(content_provider), content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, nullptr, - content_length, std::move(content_provider), - nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type, - nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Params ¶ms) { - return Put(path, Headers(), params); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const Params ¶ms) { - auto query = detail::params_to_query_str(params); - return Put(path, headers, query, "application/x-www-form-urlencoded"); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - auto query = detail::params_to_query_str(params); - return Put(path, headers, query, "application/x-www-form-urlencoded", - progress); -} - -inline Result ClientImpl::Put(const std::string &path, - const MultipartFormDataItems &items) { - return Put(path, Headers(), items); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Put(path, headers, body, content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - if (!detail::is_multipart_boundary_chars_valid(boundary)) { - return Result{nullptr, Error::UnsupportedMultipartBoundaryChars}; - } - - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Put(path, headers, body, content_type); -} - -inline Result -ClientImpl::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - 
detail::serialize_multipart_formdata_get_content_type(boundary); - return send_with_content_provider( - "PUT", path, headers, nullptr, 0, nullptr, - get_multipart_content_provider(boundary, items, provider_items), - content_type, nullptr); -} -inline Result ClientImpl::Patch(const std::string &path) { - return Patch(path, std::string(), std::string()); -} - -inline Result ClientImpl::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Patch(path, Headers(), body, content_length, content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return Patch(path, Headers(), body, content_length, content_type, progress); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return Patch(path, headers, body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PATCH", path, headers, body, - content_length, nullptr, nullptr, - content_type, progress); -} - -inline Result ClientImpl::Patch(const std::string &path, - const std::string &body, - const std::string &content_type) { - return Patch(path, Headers(), body, content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Patch(path, Headers(), body, content_type, progress); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return Patch(path, headers, body, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PATCH", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); -} - -inline Result ClientImpl::Patch(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return Patch(path, Headers(), content_length, std::move(content_provider), - content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return Patch(path, Headers(), std::move(content_provider), content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return send_with_content_provider("PATCH", path, headers, nullptr, - content_length, std::move(content_provider), - nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return send_with_content_provider("PATCH", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type, - nullptr); -} - -inline Result ClientImpl::Delete(const std::string &path) { - return Delete(path, Headers(), std::string(), std::string()); -} - -inline Result 
ClientImpl::Delete(const std::string &path, - const Headers &headers) { - return Delete(path, headers, std::string(), std::string()); -} - -inline Result ClientImpl::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Delete(path, Headers(), body, content_length, content_type); -} - -inline Result ClientImpl::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return Delete(path, Headers(), body, content_length, content_type, progress); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, const char *body, - size_t content_length, - const std::string &content_type) { - return Delete(path, headers, body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - Request req; - req.method = "DELETE"; - req.headers = headers; - req.path = path; - req.progress = progress; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - if (!content_type.empty()) { req.set_header("Content-Type", content_type); } - req.body.assign(body, content_length); - - return send_(std::move(req)); -} - -inline Result ClientImpl::Delete(const std::string &path, - const std::string &body, - const std::string &content_type) { - return Delete(path, Headers(), body.data(), body.size(), content_type); -} - -inline Result ClientImpl::Delete(const std::string &path, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Delete(path, Headers(), body.data(), body.size(), content_type, - progress); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, - const std::string &body, - const std::string &content_type) { - return Delete(path, headers, body.data(), body.size(), content_type); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Delete(path, headers, body.data(), body.size(), content_type, - progress); -} - -inline Result ClientImpl::Options(const std::string &path) { - return Options(path, Headers()); -} - -inline Result ClientImpl::Options(const std::string &path, - const Headers &headers) { - Request req; - req.method = "OPTIONS"; - req.headers = headers; - req.path = path; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline void ClientImpl::stop() { - std::lock_guard guard(socket_mutex_); - - // If there is anything ongoing right now, the ONLY thread-safe thing we can - // do is to shutdown_socket, so that threads using this socket suddenly - // discover they can't read/write any more and error out. Everything else - // (closing the socket, shutting ssl down) is unsafe because these actions are - // not thread-safe. - if (socket_requests_in_flight_ > 0) { - shutdown_socket(socket_); - - // Aside from that, we set a flag for the socket to be closed when we're - // done. 
- socket_should_be_closed_when_request_is_done_ = true; - return; - } - - // Otherwise, still holding the mutex, we can shut everything down ourselves - shutdown_ssl(socket_, true); - shutdown_socket(socket_); - close_socket(socket_); -} - -inline std::string ClientImpl::host() const { return host_; } - -inline int ClientImpl::port() const { return port_; } - -inline size_t ClientImpl::is_socket_open() const { - std::lock_guard guard(socket_mutex_); - return socket_.is_open(); -} - -inline socket_t ClientImpl::socket() const { return socket_.sock; } - -inline void ClientImpl::set_connection_timeout(time_t sec, time_t usec) { - connection_timeout_sec_ = sec; - connection_timeout_usec_ = usec; -} - -inline void ClientImpl::set_read_timeout(time_t sec, time_t usec) { - read_timeout_sec_ = sec; - read_timeout_usec_ = usec; -} - -inline void ClientImpl::set_write_timeout(time_t sec, time_t usec) { - write_timeout_sec_ = sec; - write_timeout_usec_ = usec; -} - -inline void ClientImpl::set_max_timeout(time_t msec) { - max_timeout_msec_ = msec; -} - -inline void ClientImpl::set_basic_auth(const std::string &username, - const std::string &password) { - basic_auth_username_ = username; - basic_auth_password_ = password; -} - -inline void ClientImpl::set_bearer_token_auth(const std::string &token) { - bearer_token_auth_token_ = token; -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void ClientImpl::set_digest_auth(const std::string &username, - const std::string &password) { - digest_auth_username_ = username; - digest_auth_password_ = password; -} -#endif - -inline void ClientImpl::set_keep_alive(bool on) { keep_alive_ = on; } - -inline void ClientImpl::set_follow_location(bool on) { follow_location_ = on; } - -inline void ClientImpl::set_url_encode(bool on) { url_encode_ = on; } - -inline void -ClientImpl::set_hostname_addr_map(std::map addr_map) { - addr_map_ = std::move(addr_map); -} - -inline void ClientImpl::set_default_headers(Headers headers) { - default_headers_ = std::move(headers); -} - -inline void ClientImpl::set_header_writer( - std::function const &writer) { - header_writer_ = writer; -} - -inline void ClientImpl::set_address_family(int family) { - address_family_ = family; -} - -inline void ClientImpl::set_tcp_nodelay(bool on) { tcp_nodelay_ = on; } - -inline void ClientImpl::set_ipv6_v6only(bool on) { ipv6_v6only_ = on; } - -inline void ClientImpl::set_socket_options(SocketOptions socket_options) { - socket_options_ = std::move(socket_options); -} - -inline void ClientImpl::set_compress(bool on) { compress_ = on; } - -inline void ClientImpl::set_decompress(bool on) { decompress_ = on; } - -inline void ClientImpl::set_interface(const std::string &intf) { - interface_ = intf; -} - -inline void ClientImpl::set_proxy(const std::string &host, int port) { - proxy_host_ = host; - proxy_port_ = port; -} - -inline void ClientImpl::set_proxy_basic_auth(const std::string &username, - const std::string &password) { - proxy_basic_auth_username_ = username; - proxy_basic_auth_password_ = password; -} - -inline void ClientImpl::set_proxy_bearer_token_auth(const std::string &token) { - proxy_bearer_token_auth_token_ = token; -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void ClientImpl::set_proxy_digest_auth(const std::string &username, - const std::string &password) { - proxy_digest_auth_username_ = username; - proxy_digest_auth_password_ = password; -} - -inline void ClientImpl::set_ca_cert_path(const std::string &ca_cert_file_path, - const std::string &ca_cert_dir_path) { - ca_cert_file_path_ = 
ca_cert_file_path;
-  ca_cert_dir_path_ = ca_cert_dir_path;
-}
-
-inline void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) {
-  if (ca_cert_store && ca_cert_store != ca_cert_store_) {
-    ca_cert_store_ = ca_cert_store;
-  }
-}
-
-inline X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert,
-                                                    std::size_t size) const {
-  auto mem = BIO_new_mem_buf(ca_cert, static_cast<int>(size));
-  auto se = detail::scope_exit([&] { BIO_free_all(mem); });
-  if (!mem) { return nullptr; }
-
-  auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr);
-  if (!inf) { return nullptr; }
-
-  auto cts = X509_STORE_new();
-  if (cts) {
-    for (auto i = 0; i < static_cast<int>(sk_X509_INFO_num(inf)); i++) {
-      auto itmp = sk_X509_INFO_value(inf, i);
-      if (!itmp) { continue; }
-
-      if (itmp->x509) { X509_STORE_add_cert(cts, itmp->x509); }
-      if (itmp->crl) { X509_STORE_add_crl(cts, itmp->crl); }
-    }
-  }
-
-  sk_X509_INFO_pop_free(inf, X509_INFO_free);
-  return cts;
-}
-
-inline void ClientImpl::enable_server_certificate_verification(bool enabled) {
-  server_certificate_verification_ = enabled;
-}
-
-inline void ClientImpl::enable_server_hostname_verification(bool enabled) {
-  server_hostname_verification_ = enabled;
-}
-
-inline void ClientImpl::set_server_certificate_verifier(
-    std::function<SSLVerifierResponse(SSL *ssl)> verifier) {
-  server_certificate_verifier_ = verifier;
-}
-#endif
-
-inline void ClientImpl::set_logger(Logger logger) {
-  logger_ = std::move(logger);
-}
-
-/*
- * SSL Implementation
- */
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-namespace detail {
-
-template <typename U, typename V>
-inline SSL *ssl_new(socket_t sock, SSL_CTX *ctx, std::mutex &ctx_mutex,
-                    U SSL_connect_or_accept, V setup) {
-  SSL *ssl = nullptr;
-  {
-    std::lock_guard<std::mutex> guard(ctx_mutex);
-    ssl = SSL_new(ctx);
-  }
-
-  if (ssl) {
-    set_nonblocking(sock, true);
-    auto bio = BIO_new_socket(static_cast<int>(sock), BIO_NOCLOSE);
-    BIO_set_nbio(bio, 1);
-    SSL_set_bio(ssl, bio, bio);
-
-    if (!setup(ssl) || SSL_connect_or_accept(ssl) != 1) {
-      SSL_shutdown(ssl);
-      {
-        std::lock_guard<std::mutex> guard(ctx_mutex);
-        SSL_free(ssl);
-      }
-      set_nonblocking(sock, false);
-      return nullptr;
-    }
-    BIO_set_nbio(bio, 0);
-    set_nonblocking(sock, false);
-  }
-
-  return ssl;
-}
-
-inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, socket_t sock,
-                       bool shutdown_gracefully) {
-  // sometimes we may want to skip this to try to avoid SIGPIPE if we know
-  // the remote has closed the network connection
-  // Note that it is not always possible to avoid SIGPIPE, this is merely a
-  // best-efforts.
-  if (shutdown_gracefully) {
-    (void)(sock);
-    // SSL_shutdown() returns 0 on first call (indicating close_notify alert
-    // sent) and 1 on subsequent call (indicating close_notify alert received)
-    if (SSL_shutdown(ssl) == 0) {
-      // Expected to return 1, but even if it doesn't, we free ssl
-      SSL_shutdown(ssl);
-    }
-  }
-
-  std::lock_guard<std::mutex> guard(ctx_mutex);
-  SSL_free(ssl);
-}
-
-template <typename U>
-bool ssl_connect_or_accept_nonblocking(socket_t sock, SSL *ssl,
-                                       U ssl_connect_or_accept,
-                                       time_t timeout_sec,
-                                       time_t timeout_usec) {
-  auto res = 0;
-  while ((res = ssl_connect_or_accept(ssl)) != 1) {
-    auto err = SSL_get_error(ssl, res);
-    switch (err) {
-    case SSL_ERROR_WANT_READ:
-      if (select_read(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    case SSL_ERROR_WANT_WRITE:
-      if (select_write(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    default: break;
-    }
-    return false;
-  }
-  return true;
-}
-
-template <typename T>
-inline bool process_server_socket_ssl(
-    const std::atomic<socket_t> &svr_sock, SSL *ssl, socket_t sock,
-    size_t keep_alive_max_count, time_t keep_alive_timeout_sec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
-    time_t write_timeout_usec, T callback) {
-  return process_server_socket_core(
-      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
-      [&](bool close_connection, bool &connection_closed) {
-        SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                             write_timeout_sec, write_timeout_usec);
-        return callback(strm, close_connection, connection_closed);
-      });
-}
-
-template <typename T>
-inline bool process_client_socket_ssl(
-    SSL *ssl, socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec,
-    time_t write_timeout_sec, time_t write_timeout_usec,
-    time_t max_timeout_msec,
-    std::chrono::time_point<std::chrono::steady_clock> start_time, T callback) {
-  SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                       write_timeout_sec, write_timeout_usec, max_timeout_msec,
-                       start_time);
-  return callback(strm);
-}
-
-// SSL socket stream implementation
-inline SSLSocketStream::SSLSocketStream(
-    socket_t sock, SSL *ssl, time_t read_timeout_sec, time_t read_timeout_usec,
-    time_t write_timeout_sec, time_t write_timeout_usec,
-    time_t max_timeout_msec,
-    std::chrono::time_point<std::chrono::steady_clock> start_time)
-    : sock_(sock), ssl_(ssl), read_timeout_sec_(read_timeout_sec),
-      read_timeout_usec_(read_timeout_usec),
-      write_timeout_sec_(write_timeout_sec),
-      write_timeout_usec_(write_timeout_usec),
-      max_timeout_msec_(max_timeout_msec), start_time_(start_time) {
-  SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY);
-}
-
-inline SSLSocketStream::~SSLSocketStream() = default;
-
-inline bool SSLSocketStream::is_readable() const {
-  return SSL_pending(ssl_) > 0;
-}
-
-inline bool SSLSocketStream::wait_readable() const {
-  if (max_timeout_msec_ <= 0) {
-    return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
-  }
-
-  time_t read_timeout_sec;
-  time_t read_timeout_usec;
-  calc_actual_timeout(max_timeout_msec_, duration(), read_timeout_sec_,
-                      read_timeout_usec_, read_timeout_sec, read_timeout_usec);
-
-  return select_read(sock_, read_timeout_sec, read_timeout_usec) > 0;
-}
-
-inline bool SSLSocketStream::wait_writable() const {
-  return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 &&
-         is_socket_alive(sock_) && !is_ssl_peer_could_be_closed(ssl_, sock_);
-}
-
-inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
-  if (SSL_pending(ssl_) > 0) {
-    return SSL_read(ssl_, ptr, static_cast<int>(size));
-  } else if (wait_readable()) {
-    auto ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      auto n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_READ ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_READ) {
-#endif
-        if (SSL_pending(ssl_) > 0) {
-          return SSL_read(ssl_, ptr, static_cast<int>(size));
-        } else if (wait_readable()) {
-          std::this_thread::sleep_for(std::chrono::microseconds{10});
-          ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          return -1;
-        }
-      }
-    }
-    return ret;
-  } else {
-    return -1;
-  }
-}
-
-inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
-  if (wait_writable()) {
-    auto handle_size = static_cast<int>(
-        std::min<size_t>(size, (std::numeric_limits<int>::max)()));
-
-    auto ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      auto n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_WRITE ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_WRITE) {
-#endif
-        if (wait_writable()) {
-          std::this_thread::sleep_for(std::chrono::microseconds{10});
-          ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          return -1;
-        }
-      }
-    }
-    return ret;
-  }
-  return -1;
-}
-
-inline void SSLSocketStream::get_remote_ip_and_port(std::string &ip,
-                                                    int &port) const {
-  detail::get_remote_ip_and_port(sock_, ip, port);
-}
-
-inline void SSLSocketStream::get_local_ip_and_port(std::string &ip,
-                                                   int &port) const {
-  detail::get_local_ip_and_port(sock_, ip, port);
-}
-
-inline socket_t SSLSocketStream::socket() const { return sock_; }
-
-inline time_t SSLSocketStream::duration() const {
-  return std::chrono::duration_cast<std::chrono::milliseconds>(
-             std::chrono::steady_clock::now() - start_time_)
-      .count();
-}
-
-} // namespace detail
-
-// SSL HTTP server implementation
-inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path,
-                            const char *client_ca_cert_file_path,
-                            const char *client_ca_cert_dir_path,
-                            const char *private_key_password) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
-
-    if (private_key_password != nullptr && (private_key_password[0] != '\0')) {
-      SSL_CTX_set_default_passwd_cb_userdata(
-          ctx_,
-          reinterpret_cast<void *>(const_cast<char *>(private_key_password)));
-    }
-
-    if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 ||
-        SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) !=
-            1 ||
-        SSL_CTX_check_private_key(ctx_) != 1) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    } else if (client_ca_cert_file_path || client_ca_cert_dir_path) {
-      SSL_CTX_load_verify_locations(ctx_, client_ca_cert_file_path,
-                                    client_ca_cert_dir_path);
-
-      SSL_CTX_set_verify(
-          ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr);
-    }
-  }
-}
-
-inline SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key,
-                            X509_STORE *client_ca_cert_store) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
-
-    if (SSL_CTX_use_certificate(ctx_, cert) != 1 ||
SSL_CTX_use_PrivateKey(ctx_, private_key) != 1) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } else if (client_ca_cert_store) { - SSL_CTX_set_cert_store(ctx_, client_ca_cert_store); - - SSL_CTX_set_verify( - ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr); - } - } -} - -inline SSLServer::SSLServer( - const std::function &setup_ssl_ctx_callback) { - ctx_ = SSL_CTX_new(TLS_method()); - if (ctx_) { - if (!setup_ssl_ctx_callback(*ctx_)) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } - } -} - -inline SSLServer::~SSLServer() { - if (ctx_) { SSL_CTX_free(ctx_); } -} - -inline bool SSLServer::is_valid() const { return ctx_; } - -inline SSL_CTX *SSLServer::ssl_context() const { return ctx_; } - -inline void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key, - X509_STORE *client_ca_cert_store) { - - std::lock_guard guard(ctx_mutex_); - - SSL_CTX_use_certificate(ctx_, cert); - SSL_CTX_use_PrivateKey(ctx_, private_key); - - if (client_ca_cert_store != nullptr) { - SSL_CTX_set_cert_store(ctx_, client_ca_cert_store); - } -} - -inline bool SSLServer::process_and_close_socket(socket_t sock) { - auto ssl = detail::ssl_new( - sock, ctx_, ctx_mutex_, - [&](SSL *ssl2) { - return detail::ssl_connect_or_accept_nonblocking( - sock, ssl2, SSL_accept, read_timeout_sec_, read_timeout_usec_); - }, - [](SSL * /*ssl2*/) { return true; }); - - auto ret = false; - if (ssl) { - std::string remote_addr; - int remote_port = 0; - detail::get_remote_ip_and_port(sock, remote_addr, remote_port); - - std::string local_addr; - int local_port = 0; - detail::get_local_ip_and_port(sock, local_addr, local_port); - - ret = detail::process_server_socket_ssl( - svr_sock_, ssl, sock, keep_alive_max_count_, keep_alive_timeout_sec_, - read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, - write_timeout_usec_, - [&](Stream &strm, bool close_connection, bool &connection_closed) { - return process_request(strm, remote_addr, remote_port, local_addr, - local_port, close_connection, - connection_closed, - [&](Request &req) { req.ssl = ssl; }); - }); - - // Shutdown gracefully if the result seemed successful, non-gracefully if - // the connection appeared to be closed. 
- const bool shutdown_gracefully = ret; - detail::ssl_delete(ctx_mutex_, ssl, sock, shutdown_gracefully); - } - - detail::shutdown_socket(sock); - detail::close_socket(sock); - return ret; -} - -// SSL HTTP client implementation -inline SSLClient::SSLClient(const std::string &host) - : SSLClient(host, 443, std::string(), std::string()) {} - -inline SSLClient::SSLClient(const std::string &host, int port) - : SSLClient(host, port, std::string(), std::string()) {} - -inline SSLClient::SSLClient(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path, - const std::string &private_key_password) - : ClientImpl(host, port, client_cert_path, client_key_path) { - ctx_ = SSL_CTX_new(TLS_client_method()); - - SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); - - detail::split(&host_[0], &host_[host_.size()], '.', - [&](const char *b, const char *e) { - host_components_.emplace_back(b, e); - }); - - if (!client_cert_path.empty() && !client_key_path.empty()) { - if (!private_key_password.empty()) { - SSL_CTX_set_default_passwd_cb_userdata( - ctx_, reinterpret_cast( - const_cast(private_key_password.c_str()))); - } - - if (SSL_CTX_use_certificate_file(ctx_, client_cert_path.c_str(), - SSL_FILETYPE_PEM) != 1 || - SSL_CTX_use_PrivateKey_file(ctx_, client_key_path.c_str(), - SSL_FILETYPE_PEM) != 1) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } - } -} - -inline SSLClient::SSLClient(const std::string &host, int port, - X509 *client_cert, EVP_PKEY *client_key, - const std::string &private_key_password) - : ClientImpl(host, port) { - ctx_ = SSL_CTX_new(TLS_client_method()); - - detail::split(&host_[0], &host_[host_.size()], '.', - [&](const char *b, const char *e) { - host_components_.emplace_back(b, e); - }); - - if (client_cert != nullptr && client_key != nullptr) { - if (!private_key_password.empty()) { - SSL_CTX_set_default_passwd_cb_userdata( - ctx_, reinterpret_cast( - const_cast(private_key_password.c_str()))); - } - - if (SSL_CTX_use_certificate(ctx_, client_cert) != 1 || - SSL_CTX_use_PrivateKey(ctx_, client_key) != 1) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } - } -} - -inline SSLClient::~SSLClient() { - if (ctx_) { SSL_CTX_free(ctx_); } - // Make sure to shut down SSL since shutdown_ssl will resolve to the - // base function rather than the derived function once we get to the - // base class destructor, and won't free the SSL (causing a leak). 
- shutdown_ssl_impl(socket_, true); -} - -inline bool SSLClient::is_valid() const { return ctx_; } - -inline void SSLClient::set_ca_cert_store(X509_STORE *ca_cert_store) { - if (ca_cert_store) { - if (ctx_) { - if (SSL_CTX_get_cert_store(ctx_) != ca_cert_store) { - // Free memory allocated for old cert and use new store `ca_cert_store` - SSL_CTX_set_cert_store(ctx_, ca_cert_store); - } - } else { - X509_STORE_free(ca_cert_store); - } - } -} - -inline void SSLClient::load_ca_cert_store(const char *ca_cert, - std::size_t size) { - set_ca_cert_store(ClientImpl::create_ca_cert_store(ca_cert, size)); -} - -inline long SSLClient::get_openssl_verify_result() const { - return verify_result_; -} - -inline SSL_CTX *SSLClient::ssl_context() const { return ctx_; } - -inline bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) { - return is_valid() && ClientImpl::create_and_connect_socket(socket, error); -} - -// Assumes that socket_mutex_ is locked and that there are no requests in flight -inline bool SSLClient::connect_with_proxy( - Socket &socket, - std::chrono::time_point start_time, - Response &res, bool &success, Error &error) { - success = true; - Response proxy_res; - if (!detail::process_client_socket( - socket.sock, read_timeout_sec_, read_timeout_usec_, - write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, - start_time, [&](Stream &strm) { - Request req2; - req2.method = "CONNECT"; - req2.path = host_and_port_; - if (max_timeout_msec_ > 0) { - req2.start_time_ = std::chrono::steady_clock::now(); - } - return process_request(strm, req2, proxy_res, false, error); - })) { - // Thread-safe to close everything because we are assuming there are no - // requests in flight - shutdown_ssl(socket, true); - shutdown_socket(socket); - close_socket(socket); - success = false; - return false; - } - - if (proxy_res.status == StatusCode::ProxyAuthenticationRequired_407) { - if (!proxy_digest_auth_username_.empty() && - !proxy_digest_auth_password_.empty()) { - std::map auth; - if (detail::parse_www_authenticate(proxy_res, auth, true)) { - proxy_res = Response(); - if (!detail::process_client_socket( - socket.sock, read_timeout_sec_, read_timeout_usec_, - write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, - start_time, [&](Stream &strm) { - Request req3; - req3.method = "CONNECT"; - req3.path = host_and_port_; - req3.headers.insert(detail::make_digest_authentication_header( - req3, auth, 1, detail::random_string(10), - proxy_digest_auth_username_, proxy_digest_auth_password_, - true)); - if (max_timeout_msec_ > 0) { - req3.start_time_ = std::chrono::steady_clock::now(); - } - return process_request(strm, req3, proxy_res, false, error); - })) { - // Thread-safe to close everything because we are assuming there are - // no requests in flight - shutdown_ssl(socket, true); - shutdown_socket(socket); - close_socket(socket); - success = false; - return false; - } - } - } - } - - // If status code is not 200, proxy request is failed. 
- // Set error to ProxyConnection and return proxy response - // as the response of the request - if (proxy_res.status != StatusCode::OK_200) { - error = Error::ProxyConnection; - res = std::move(proxy_res); - // Thread-safe to close everything because we are assuming there are - // no requests in flight - shutdown_ssl(socket, true); - shutdown_socket(socket); - close_socket(socket); - return false; - } - - return true; -} - -inline bool SSLClient::load_certs() { - auto ret = true; - - std::call_once(initialize_cert_, [&]() { - std::lock_guard guard(ctx_mutex_); - if (!ca_cert_file_path_.empty()) { - if (!SSL_CTX_load_verify_locations(ctx_, ca_cert_file_path_.c_str(), - nullptr)) { - ret = false; - } - } else if (!ca_cert_dir_path_.empty()) { - if (!SSL_CTX_load_verify_locations(ctx_, nullptr, - ca_cert_dir_path_.c_str())) { - ret = false; - } - } else { - auto loaded = false; -#ifdef _WIN32 - loaded = - detail::load_system_certs_on_windows(SSL_CTX_get_cert_store(ctx_)); -#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__) -#if TARGET_OS_OSX - loaded = detail::load_system_certs_on_macos(SSL_CTX_get_cert_store(ctx_)); -#endif // TARGET_OS_OSX -#endif // _WIN32 - if (!loaded) { SSL_CTX_set_default_verify_paths(ctx_); } - } - }); - - return ret; -} - -inline bool SSLClient::initialize_ssl(Socket &socket, Error &error) { - auto ssl = detail::ssl_new( - socket.sock, ctx_, ctx_mutex_, - [&](SSL *ssl2) { - if (server_certificate_verification_) { - if (!load_certs()) { - error = Error::SSLLoadingCerts; - return false; - } - SSL_set_verify(ssl2, SSL_VERIFY_NONE, nullptr); - } - - if (!detail::ssl_connect_or_accept_nonblocking( - socket.sock, ssl2, SSL_connect, connection_timeout_sec_, - connection_timeout_usec_)) { - error = Error::SSLConnection; - return false; - } - - if (server_certificate_verification_) { - auto verification_status = SSLVerifierResponse::NoDecisionMade; - - if (server_certificate_verifier_) { - verification_status = server_certificate_verifier_(ssl2); - } - - if (verification_status == SSLVerifierResponse::CertificateRejected) { - error = Error::SSLServerVerification; - return false; - } - - if (verification_status == SSLVerifierResponse::NoDecisionMade) { - verify_result_ = SSL_get_verify_result(ssl2); - - if (verify_result_ != X509_V_OK) { - error = Error::SSLServerVerification; - return false; - } - - auto server_cert = SSL_get1_peer_certificate(ssl2); - auto se = detail::scope_exit([&] { X509_free(server_cert); }); - - if (server_cert == nullptr) { - error = Error::SSLServerVerification; - return false; - } - - if (server_hostname_verification_) { - if (!verify_host(server_cert)) { - error = Error::SSLServerHostnameVerification; - return false; - } - } - } - } - - return true; - }, - [&](SSL *ssl2) { -#if defined(OPENSSL_IS_BORINGSSL) - SSL_set_tlsext_host_name(ssl2, host_.c_str()); -#else - // NOTE: Direct call instead of using the OpenSSL macro to suppress - // -Wold-style-cast warning - SSL_ctrl(ssl2, SSL_CTRL_SET_TLSEXT_HOSTNAME, TLSEXT_NAMETYPE_host_name, - static_cast(const_cast(host_.c_str()))); -#endif - return true; - }); - - if (ssl) { - socket.ssl = ssl; - return true; - } - - shutdown_socket(socket); - close_socket(socket); - return false; -} - -inline void SSLClient::shutdown_ssl(Socket &socket, bool shutdown_gracefully) { - shutdown_ssl_impl(socket, shutdown_gracefully); -} - -inline void SSLClient::shutdown_ssl_impl(Socket &socket, - bool shutdown_gracefully) { - if (socket.sock == INVALID_SOCKET) { - assert(socket.ssl == nullptr); 
- return; - } - if (socket.ssl) { - detail::ssl_delete(ctx_mutex_, socket.ssl, socket.sock, - shutdown_gracefully); - socket.ssl = nullptr; - } - assert(socket.ssl == nullptr); -} - -inline bool SSLClient::process_socket( - const Socket &socket, - std::chrono::time_point start_time, - std::function callback) { - assert(socket.ssl); - return detail::process_client_socket_ssl( - socket.ssl, socket.sock, read_timeout_sec_, read_timeout_usec_, - write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, start_time, - std::move(callback)); -} - -inline bool SSLClient::is_ssl() const { return true; } - -inline bool SSLClient::verify_host(X509 *server_cert) const { - /* Quote from RFC2818 section 3.1 "Server Identity" - - If a subjectAltName extension of type dNSName is present, that MUST - be used as the identity. Otherwise, the (most specific) Common Name - field in the Subject field of the certificate MUST be used. Although - the use of the Common Name is existing practice, it is deprecated and - Certification Authorities are encouraged to use the dNSName instead. - - Matching is performed using the matching rules specified by - [RFC2459]. If more than one identity of a given type is present in - the certificate (e.g., more than one dNSName name, a match in any one - of the set is considered acceptable.) Names may contain the wildcard - character * which is considered to match any single domain name - component or component fragment. E.g., *.a.com matches foo.a.com but - not bar.foo.a.com. f*.com matches foo.com but not bar.com. - - In some cases, the URI is specified as an IP address rather than a - hostname. In this case, the iPAddress subjectAltName must be present - in the certificate and must exactly match the IP in the URI. - - */ - return verify_host_with_subject_alt_name(server_cert) || - verify_host_with_common_name(server_cert); -} - -inline bool -SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const { - auto ret = false; - - auto type = GEN_DNS; - - struct in6_addr addr6 = {}; - struct in_addr addr = {}; - size_t addr_len = 0; - -#ifndef __MINGW32__ - if (inet_pton(AF_INET6, host_.c_str(), &addr6)) { - type = GEN_IPADD; - addr_len = sizeof(struct in6_addr); - } else if (inet_pton(AF_INET, host_.c_str(), &addr)) { - type = GEN_IPADD; - addr_len = sizeof(struct in_addr); - } -#endif - - auto alt_names = static_cast( - X509_get_ext_d2i(server_cert, NID_subject_alt_name, nullptr, nullptr)); - - if (alt_names) { - auto dsn_matched = false; - auto ip_matched = false; - - auto count = sk_GENERAL_NAME_num(alt_names); - - for (decltype(count) i = 0; i < count && !dsn_matched; i++) { - auto val = sk_GENERAL_NAME_value(alt_names, i); - if (val->type == type) { - auto name = - reinterpret_cast(ASN1_STRING_get0_data(val->d.ia5)); - auto name_len = static_cast(ASN1_STRING_length(val->d.ia5)); - - switch (type) { - case GEN_DNS: dsn_matched = check_host_name(name, name_len); break; - - case GEN_IPADD: - if (!memcmp(&addr6, name, addr_len) || - !memcmp(&addr, name, addr_len)) { - ip_matched = true; - } - break; - } - } - } - - if (dsn_matched || ip_matched) { ret = true; } - } - - GENERAL_NAMES_free(const_cast( - reinterpret_cast(alt_names))); - return ret; -} - -inline bool SSLClient::verify_host_with_common_name(X509 *server_cert) const { - const auto subject_name = X509_get_subject_name(server_cert); - - if (subject_name != nullptr) { - char name[BUFSIZ]; - auto name_len = X509_NAME_get_text_by_NID(subject_name, NID_commonName, - name, sizeof(name)); - - if (name_len != -1) { - 
return check_host_name(name, static_cast(name_len)); - } - } - - return false; -} - -inline bool SSLClient::check_host_name(const char *pattern, - size_t pattern_len) const { - if (host_.size() == pattern_len && host_ == pattern) { return true; } - - // Wildcard match - // https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484 - std::vector pattern_components; - detail::split(&pattern[0], &pattern[pattern_len], '.', - [&](const char *b, const char *e) { - pattern_components.emplace_back(b, e); - }); - - if (host_components_.size() != pattern_components.size()) { return false; } - - auto itr = pattern_components.begin(); - for (const auto &h : host_components_) { - auto &p = *itr; - if (p != h && p != "*") { - auto partial_match = (p.size() > 0 && p[p.size() - 1] == '*' && - !p.compare(0, p.size() - 1, h)); - if (!partial_match) { return false; } - } - ++itr; - } - - return true; -} -#endif - -// Universal client implementation -inline Client::Client(const std::string &scheme_host_port) - : Client(scheme_host_port, std::string(), std::string()) {} - -inline Client::Client(const std::string &scheme_host_port, - const std::string &client_cert_path, - const std::string &client_key_path) { - const static std::regex re( - R"((?:([a-z]+):\/\/)?(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)"); - - std::smatch m; - if (std::regex_match(scheme_host_port, m, re)) { - auto scheme = m[1].str(); - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (!scheme.empty() && (scheme != "http" && scheme != "https")) { -#else - if (!scheme.empty() && scheme != "http") { -#endif -#ifndef CPPHTTPLIB_NO_EXCEPTIONS - std::string msg = "'" + scheme + "' scheme is not supported."; - throw std::invalid_argument(msg); -#endif - return; - } - - auto is_ssl = scheme == "https"; - - auto host = m[2].str(); - if (host.empty()) { host = m[3].str(); } - - auto port_str = m[4].str(); - auto port = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80); - - if (is_ssl) { -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - cli_ = detail::make_unique(host, port, client_cert_path, - client_key_path); - is_ssl_ = is_ssl; -#endif - } else { - cli_ = detail::make_unique(host, port, client_cert_path, - client_key_path); - } - } else { - // NOTE: Update TEST(UniversalClientImplTest, Ipv6LiteralAddress) - // if port param below changes. 
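As the NOTE above hints, inputs that do not match the scheme regex at all, such as an unbracketed IPv6 literal, fall through to the branch below and are treated as a bare host on the default port 80. A minimal sketch of that behavior (the address is illustrative):

    #include "httplib.h"

    int main() {
        // "::1" cannot match the host alternative ([^:/?#]+) because of the
        // leading ':', so regex_match fails and the whole string is passed
        // through as the host with the default port 80.
        httplib::Client cli("::1");  // roughly ClientImpl("::1", 80, ...)
        return cli.is_valid() ? 0 : 1;
    }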
- cli_ = detail::make_unique(scheme_host_port, 80, - client_cert_path, client_key_path); - } -} // namespace detail - -inline Client::Client(const std::string &host, int port) - : cli_(detail::make_unique(host, port)) {} - -inline Client::Client(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path) - : cli_(detail::make_unique(host, port, client_cert_path, - client_key_path)) {} - -inline Client::~Client() = default; - -inline bool Client::is_valid() const { - return cli_ != nullptr && cli_->is_valid(); -} - -inline Result Client::Get(const std::string &path) { return cli_->Get(path); } -inline Result Client::Get(const std::string &path, const Headers &headers) { - return cli_->Get(path, headers); -} -inline Result Client::Get(const std::string &path, Progress progress) { - return cli_->Get(path, std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - Progress progress) { - return cli_->Get(path, headers, std::move(progress)); -} -inline Result Client::Get(const std::string &path, - ContentReceiver content_receiver) { - return cli_->Get(path, std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver) { - return cli_->Get(path, headers, std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, std::move(content_receiver), std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, headers, std::move(content_receiver), - std::move(progress)); -} -inline Result Client::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return cli_->Get(path, std::move(response_handler), - std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return cli_->Get(path, headers, std::move(response_handler), - std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress) { - return cli_->Get(path, params, headers, std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, params, headers, std::move(content_receiver), - std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, params, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} - 
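The thin forwarders in this region delegate every HTTP verb to the underlying ClientImpl or SSLClient selected by the constructor above. A minimal usage sketch with a hypothetical local endpoint (the header is now supplied by the external cpp-httplib target rather than this deleted vendored copy):

    #include "httplib.h"
    #include <cstdio>

    int main() {
        // "http://" selects ClientImpl with default port 80;
        // "https://" would select SSLClient with default port 443.
        httplib::Client cli("http://localhost:8080");
        if (auto res = cli.Get("/health")) {  // forwards to cli_->Get(path)
            std::printf("status: %d\n", res->status);
        }
        return 0;
    }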
-inline Result Client::Head(const std::string &path) { return cli_->Head(path); } -inline Result Client::Head(const std::string &path, const Headers &headers) { - return cli_->Head(path, headers); -} - -inline Result Client::Post(const std::string &path) { return cli_->Post(path); } -inline Result Client::Post(const std::string &path, const Headers &headers) { - return cli_->Post(path, headers); -} -inline Result Client::Post(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Post(path, body, content_length, content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Post(path, headers, body, content_length, content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress) { - return cli_->Post(path, headers, body, content_length, content_type, - progress); -} -inline Result Client::Post(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Post(path, body, content_type); -} -inline Result Client::Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Post(path, body, content_type, progress); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Post(path, headers, body, content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Post(path, headers, body, content_type, progress); -} -inline Result Client::Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Post(path, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Post(path, std::move(content_provider), content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Post(path, headers, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Post(path, headers, std::move(content_provider), content_type); -} -inline Result Client::Post(const std::string &path, const Params ¶ms) { - return cli_->Post(path, params); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const Params ¶ms) { - return cli_->Post(path, headers, params); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - return cli_->Post(path, headers, params, progress); -} -inline Result Client::Post(const std::string &path, - const MultipartFormDataItems &items) { - return cli_->Post(path, items); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - return 
cli_->Post(path, headers, items); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - return cli_->Post(path, headers, items, boundary); -} -inline Result -Client::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - return cli_->Post(path, headers, items, provider_items); -} -inline Result Client::Put(const std::string &path) { return cli_->Put(path); } -inline Result Client::Put(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Put(path, body, content_length, content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Put(path, headers, body, content_length, content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress) { - return cli_->Put(path, headers, body, content_length, content_type, progress); -} -inline Result Client::Put(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Put(path, body, content_type); -} -inline Result Client::Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Put(path, body, content_type, progress); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Put(path, headers, body, content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Put(path, headers, body, content_type, progress); -} -inline Result Client::Put(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Put(path, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Put(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Put(path, std::move(content_provider), content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Put(path, headers, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Put(path, headers, std::move(content_provider), content_type); -} -inline Result Client::Put(const std::string &path, const Params ¶ms) { - return cli_->Put(path, params); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const Params ¶ms) { - return cli_->Put(path, headers, params); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - return cli_->Put(path, headers, params, progress); -} -inline Result Client::Put(const std::string &path, - const MultipartFormDataItems &items) { - return cli_->Put(path, items); -} -inline 
Result Client::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - return cli_->Put(path, headers, items); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - return cli_->Put(path, headers, items, boundary); -} -inline Result -Client::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - return cli_->Put(path, headers, items, provider_items); -} -inline Result Client::Patch(const std::string &path) { - return cli_->Patch(path); -} -inline Result Client::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Patch(path, body, content_length, content_type); -} -inline Result Client::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, body, content_length, content_type, progress); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Patch(path, headers, body, content_length, content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, headers, body, content_length, content_type, - progress); -} -inline Result Client::Patch(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Patch(path, body, content_type); -} -inline Result Client::Patch(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, body, content_type, progress); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Patch(path, headers, body, content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, headers, body, content_type, progress); -} -inline Result Client::Patch(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Patch(path, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Patch(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Patch(path, std::move(content_provider), content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Patch(path, headers, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Patch(path, headers, std::move(content_provider), content_type); -} -inline Result Client::Delete(const std::string &path) { - return cli_->Delete(path); -} -inline Result Client::Delete(const std::string &path, const Headers 
&headers) { - return cli_->Delete(path, headers); -} -inline Result Client::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Delete(path, body, content_length, content_type); -} -inline Result Client::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, body, content_length, content_type, progress); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Delete(path, headers, body, content_length, content_type); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, headers, body, content_length, content_type, - progress); -} -inline Result Client::Delete(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Delete(path, body, content_type); -} -inline Result Client::Delete(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, body, content_type, progress); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Delete(path, headers, body, content_type); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, headers, body, content_type, progress); -} -inline Result Client::Options(const std::string &path) { - return cli_->Options(path); -} -inline Result Client::Options(const std::string &path, const Headers &headers) { - return cli_->Options(path, headers); -} - -inline bool Client::send(Request &req, Response &res, Error &error) { - return cli_->send(req, res, error); -} - -inline Result Client::send(const Request &req) { return cli_->send(req); } - -inline void Client::stop() { cli_->stop(); } - -inline std::string Client::host() const { return cli_->host(); } - -inline int Client::port() const { return cli_->port(); } - -inline size_t Client::is_socket_open() const { return cli_->is_socket_open(); } - -inline socket_t Client::socket() const { return cli_->socket(); } - -inline void -Client::set_hostname_addr_map(std::map addr_map) { - cli_->set_hostname_addr_map(std::move(addr_map)); -} - -inline void Client::set_default_headers(Headers headers) { - cli_->set_default_headers(std::move(headers)); -} - -inline void Client::set_header_writer( - std::function const &writer) { - cli_->set_header_writer(writer); -} - -inline void Client::set_address_family(int family) { - cli_->set_address_family(family); -} - -inline void Client::set_tcp_nodelay(bool on) { cli_->set_tcp_nodelay(on); } - -inline void Client::set_socket_options(SocketOptions socket_options) { - cli_->set_socket_options(std::move(socket_options)); -} - -inline void Client::set_connection_timeout(time_t sec, time_t usec) { - cli_->set_connection_timeout(sec, usec); -} - -inline void Client::set_read_timeout(time_t sec, time_t usec) { - cli_->set_read_timeout(sec, usec); -} - -inline void Client::set_write_timeout(time_t sec, time_t usec) { - cli_->set_write_timeout(sec, usec); -} - -inline void 
Client::set_basic_auth(const std::string &username, - const std::string &password) { - cli_->set_basic_auth(username, password); -} -inline void Client::set_bearer_token_auth(const std::string &token) { - cli_->set_bearer_token_auth(token); -} -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::set_digest_auth(const std::string &username, - const std::string &password) { - cli_->set_digest_auth(username, password); -} -#endif - -inline void Client::set_keep_alive(bool on) { cli_->set_keep_alive(on); } -inline void Client::set_follow_location(bool on) { - cli_->set_follow_location(on); -} - -inline void Client::set_url_encode(bool on) { cli_->set_url_encode(on); } - -inline void Client::set_compress(bool on) { cli_->set_compress(on); } - -inline void Client::set_decompress(bool on) { cli_->set_decompress(on); } - -inline void Client::set_interface(const std::string &intf) { - cli_->set_interface(intf); -} - -inline void Client::set_proxy(const std::string &host, int port) { - cli_->set_proxy(host, port); -} -inline void Client::set_proxy_basic_auth(const std::string &username, - const std::string &password) { - cli_->set_proxy_basic_auth(username, password); -} -inline void Client::set_proxy_bearer_token_auth(const std::string &token) { - cli_->set_proxy_bearer_token_auth(token); -} -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::set_proxy_digest_auth(const std::string &username, - const std::string &password) { - cli_->set_proxy_digest_auth(username, password); -} -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::enable_server_certificate_verification(bool enabled) { - cli_->enable_server_certificate_verification(enabled); -} - -inline void Client::enable_server_hostname_verification(bool enabled) { - cli_->enable_server_hostname_verification(enabled); -} - -inline void Client::set_server_certificate_verifier( - std::function verifier) { - cli_->set_server_certificate_verifier(verifier); -} -#endif - -inline void Client::set_logger(Logger logger) { - cli_->set_logger(std::move(logger)); -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::set_ca_cert_path(const std::string &ca_cert_file_path, - const std::string &ca_cert_dir_path) { - cli_->set_ca_cert_path(ca_cert_file_path, ca_cert_dir_path); -} - -inline void Client::set_ca_cert_store(X509_STORE *ca_cert_store) { - if (is_ssl_) { - static_cast(*cli_).set_ca_cert_store(ca_cert_store); - } else { - cli_->set_ca_cert_store(ca_cert_store); - } -} - -inline void Client::load_ca_cert_store(const char *ca_cert, std::size_t size) { - set_ca_cert_store(cli_->create_ca_cert_store(ca_cert, size)); -} - -inline long Client::get_openssl_verify_result() const { - if (is_ssl_) { - return static_cast(*cli_).get_openssl_verify_result(); - } - return -1; // NOTE: -1 doesn't match any of X509_V_ERR_??? 
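A hedged caller-side sketch of how this verify result is typically consumed; the host is illustrative and CPPHTTPLIB_OPENSSL_SUPPORT is assumed to be defined before the include:

    #include "httplib.h"
    #include <openssl/x509_vfy.h>

    int main() {
        httplib::Client cli("https://example.com");  // illustrative host
        auto res = cli.Get("/");
        // -1 means no SSL client was active; any other value is one of
        // OpenSSL's X509_V_OK / X509_V_ERR_* codes.
        if (cli.get_openssl_verify_result() != X509_V_OK) {
            return 1;  // treat the connection as unverified
        }
        return 0;
    }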
-} - -inline SSL_CTX *Client::ssl_context() const { - if (is_ssl_) { return static_cast(*cli_).ssl_context(); } - return nullptr; -} -#endif - -// ---------------------------------------------------------------------------- - -} // namespace httplib - -#endif // CPPHTTPLIB_HTTPLIB_H diff --git a/llamacpp/native/src/server/utils.hpp b/llamacpp/native/src/server/server-common.cpp similarity index 59% rename from llamacpp/native/src/server/utils.hpp rename to llamacpp/native/src/server/server-common.cpp index b1ecc5af5..e2e41a0d5 100644 --- a/llamacpp/native/src/server/utils.hpp +++ b/llamacpp/native/src/server/server-common.cpp @@ -1,489 +1,737 @@ -#pragma once - #include "common.h" #include "log.h" #include "llama.h" -#include "arg.h" // common_remote_get_content -#include "base64.hpp" #include "mtmd.h" #include "mtmd-helper.h" #include "chat.h" +#include "arg.h" // for common_remote_get_content; TODO: use download.h only +#include "base64.hpp" -#include - -#define JSON_ASSERT GGML_ASSERT -#include +#include "server-common.h" #include #include -#include -#include -#include -#include - -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" - -using json = nlohmann::ordered_json; - -#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) - -#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) - -#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) - -using raw_buffer = std::vector; - -template -static T json_value(const json & body, const std::string & key, const T & default_value) { - // Fallback null to default value - if (body.contains(key) && !body.at(key).is_null()) { - try { - return body.at(key); - } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) { - LOG_WRN("Wrong type supplied for parameter '%s'. 
Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what()); - return default_value; - } - } else { - return default_value; +#include + +json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; + case ERROR_TYPE_EXCEED_CONTEXT_SIZE: + type_str = "exceed_context_size_error"; + code = 400; + break; } + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; } -const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); +// +// random string / id +// -// thin wrapper around common_grammar_trigger with (de)serialization functions -struct server_grammar_trigger { - common_grammar_trigger value; +std::string random_string() { + static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - server_grammar_trigger() = default; - server_grammar_trigger(const common_grammar_trigger & value) : value(value) {} - server_grammar_trigger(const json & in) { - value.type = (common_grammar_trigger_type) in.at("type").get(); - value.value = in.at("value").get(); - if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - value.token = (llama_token) in.at("token").get(); - } - } + std::random_device rd; + std::mt19937 generator(rd()); - json to_json() const { - json out { - {"type", (int) value.type}, - {"value", value.value}, - }; - if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - out["token"] = (int) value.token; - } - return out; + std::string result(32, ' '); + + for (int i = 0; i < 32; ++i) { + result[i] = str[generator() % str.size()]; } -}; + + return result; +} + +std::string gen_chatcmplid() { + return "chatcmpl-" + random_string(); +} + +std::string gen_tool_call_id() { + return random_string(); +} // -// tokenizer and input processing utils +// lora utils // -static bool json_is_array_of_numbers(const json & data) { - if (data.is_array()) { - for (const auto & e : data) { - if (!e.is_number_integer()) { +bool lora_all_alora(const std::vector & loras) { + bool found_alora = false; + for (const auto & lora : loras) { + if (lora.scale != 0) { + if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) { return false; } + found_alora = true; } - return true; } - return false; + return found_alora; } -// is array having BOTH numbers & strings? -static bool json_is_array_of_mixed_numbers_strings(const json & data) { - bool seen_string = false; - bool seen_number = false; - if (data.is_array()) { - for (const auto & e : data) { - seen_string |= e.is_string(); - seen_number |= e.is_number_integer(); - if (seen_number && seen_string) { - return true; - } +bool lora_should_clear_cache( + const std::vector & current, + const std::vector & next) { + + // This should always be called after determining that the two sets are + // _not_ equal. 
This assert is therefore some slightly wasted work and + // should be safe to remove as long as this method is called correctly. + GGML_ASSERT(!are_lora_equal(current, next)); + + return ( + !(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) || + !lora_all_alora(next)); +} + +std::vector parse_lora_request( + const std::vector & lora_base, + const json & data) { + std::vector lora(lora_base); + int max_idx = lora.size(); + + // clear existing value + for (auto & entry : lora) { + entry.scale = 0.0f; + } + + // set value + for (const auto & entry : data) { + int id = json_value(entry, "id", -1); + float scale = json_value(entry, "scale", 0.0f); + if (0 <= id && id < max_idx) { + lora[id].scale = scale; + } else { + throw std::runtime_error("invalid adapter id"); } } - return false; + + return lora; } -// does array have any individual integers/tokens? -static bool json_is_array_and_contains_numbers(const json & data) { - if (data.is_array()) { - for (const auto & e : data) { - if (e.is_number_integer()) { - return true; - } - } +bool are_lora_equal( + const std::vector & l1, + const std::vector & l2) { + if (l1.size() != l2.size()) { return false; } - return false; + for (size_t i = 0; i < l1.size(); ++i) { + // we don't check lora.path to reduce the time complexity + if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) { + return false; + } + } + return true; } -// get value by path(key1 / key2) -static json json_get_nested_values(const std::vector & paths, const json & js) { - json result = json::object(); - - for (const std::string & path : paths) { - json current = js; - const auto keys = string_split(path, /*separator*/ '/'); - bool valid_path = true; - for (const std::string & k : keys) { - if (valid_path && current.is_object() && current.contains(k)) { - current = current[k]; - } else { - valid_path = false; - } - } - if (valid_path) { - result[path] = current; +std::vector lora_get_enabled_ids(const std::vector & loras) { + std::vector enabled_ids; + for (size_t i = 0; i < loras.size(); ++i) { + if (loras[i].scale > 0) { + enabled_ids.push_back(i); } } - return result; + return enabled_ids; } -/** - * this handles 2 cases: - * - only string, example: "string" - * - mixed string and tokens, example: [12, 34, "string", 56, 78] - */ -static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. 
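The comment above summarizes the two prompt shapes this tokenizer accepts; a small sketch of both (token ids and text are arbitrary, and nlohmann/json is assumed available):

    #include <nlohmann/json.hpp>
    using json = nlohmann::ordered_json;

    int main() {
        json p1 = "Hello world";                          // plain string
        json p2 = json::parse(R"([12, 34, "abc", 56])");  // mixed tokens + text
        // With add_special == true, BOS is added for p1, and for p2 only if
        // its first element were a string, per the add_bos note above.
        (void)p1; (void)p2;
        return 0;
    }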
- llama_tokens prompt_tokens; +// +// base64 utils (TODO: use the base64::decode from base64.hpp) +// - if (json_prompt.is_array()) { - bool first = true; - for (const auto & p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; - llama_tokens p; - if (first) { - p = common_tokenize(vocab, s, add_special, parse_special); - first = false; - } else { - p = common_tokenize(vocab, s, false, parse_special); - } +static inline bool is_base64(uint8_t c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } +static inline raw_buffer base64_decode(const std::string & encoded_string) { + int i = 0; + int j = 0; + int in_ = 0; - prompt_tokens.push_back(p.template get()); + int in_len = encoded_string.size(); + + uint8_t char_array_4[4]; + uint8_t char_array_3[3]; + + raw_buffer ret; + + while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i == 4) { + for (i = 0; i < 4; i++) { + char_array_4[i] = base64_chars.find(char_array_4[i]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) { + ret.push_back(char_array_3[i]); } + + i = 0; } - } else { - auto s = json_prompt.template get(); - prompt_tokens = common_tokenize(vocab, s, add_special, parse_special); } - return prompt_tokens; -} + if (i) { + for (j = i; j < 4; j++) { + char_array_4[j] = 0; + } -// return the last index of character that can form a valid string -// if the last character is potentially cut in half, return the index before the cut -// if validate_utf8(text) == text.size(), then the whole text is valid utf8 -static size_t validate_utf8(const std::string& text) { - size_t len = text.size(); - if (len == 0) return 0; + for (j = 0; j < 4; j++) { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } - // Check the last few bytes to see if a multi-byte character is cut off - for (size_t i = 1; i <= 4 && i <= len; ++i) { - unsigned char c = text[len - i]; - // Check for start of a multi-byte sequence from the end - if ((c & 0xE0) == 0xC0) { - // 2-byte character start: 110xxxxx - // Needs at least 2 bytes - if (i < 2) return len - i; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character start: 1110xxxx - // Needs at least 3 bytes - if (i < 3) return len - i; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character start: 11110xxx - // Needs at least 4 bytes - if (i < 4) return len - i; + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; j < i - 1; j++) { + ret.push_back(char_array_3[j]); } } - // If no cut-off multi-byte character is found, return full length - return len; + return ret; } // -// template utils +// server_tokens implementation // -// format infill task -static llama_tokens format_infill( - const llama_vocab * vocab, - const json & input_prefix, - const json & input_suffix, - const json & input_extra, - const int n_batch, - const int n_predict, - const int n_ctx, - const 
bool spm_infill, - const llama_tokens & tokens_prompt - ) { - // TODO: optimize this block by reducing memory allocations and movement +server_tokens::server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) { + for (size_t i = 0; i < mtmd_chunks.size(); ++i) { + push_back(mtmd_chunks[i]); + } +} - // use FIM repo-level pattern: - // ref: https://arxiv.org/pdf/2409.12186 - // - // [FIM_REP]myproject - // [FIM_SEP]filename0 - // extra chunk 0 - // [FIM_SEP]filename1 - // extra chunk 1 - // ... - // [FIM_SEP]filename - // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt - // - llama_tokens extra_tokens; - extra_tokens.reserve(n_ctx); +server_tokens::server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) { +} - auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false); - auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false); +llama_pos server_tokens::pos_next() const { + if (!has_mtmd) { + return tokens.size(); + } - if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) { - // TODO: make project name an input - static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false); + llama_pos res = tokens.size(); - extra_tokens.push_back(llama_vocab_fim_rep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); + for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) { + const auto & chunk = it->second; + res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get()); } - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - const std::string text = json_value(chunk, "text", std::string()); - const std::string filename = json_value(chunk, "filename", std::string("tmp")); - if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { - const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false); + return res; +} - extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); +std::string server_tokens::str() const { + std::ostringstream oss; + oss << "tokens: "; + for (size_t idx = 0; idx < tokens.size(); ++idx) { + llama_token t = tokens[idx]; + oss << "idx:" << idx << " "; + if (t == LLAMA_TOKEN_NULL) { + oss << " "; } else { - // chunk separator in binary form to avoid confusing the AI - static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; - static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false); - - extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); + oss << t << " "; } - - const auto chunk_tokens = common_tokenize(vocab, text, false, false); - extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); } + oss << "\n"; + oss << "image idx: "; + for (const auto & it : map_idx_to_media) { + oss << it.first << ", "; + } + return oss.str(); +} - if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { - // TODO: current filename - static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false); - - extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); +const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const { + auto it = 
map_idx_to_media.find(idx); + if (it != map_idx_to_media.end()) { + return it->second; } + throw std::runtime_error("Chunk not found"); +} - // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) - const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4)); - const int n_suffix_take = std::min(tokens_suffix.size(), std::max(0, (n_batch/4) - (2 + tokens_prompt.size()))); +void server_tokens::push_back(llama_token tok) { + if (tok == LLAMA_TOKEN_NULL) { + throw std::runtime_error("Invalid token"); + } + tokens.emplace_back(tok); +} - SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take)); +void server_tokens::push_back(const mtmd_input_chunk * chunk) { + auto type = mtmd_input_chunk_get_type(chunk); + if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { + GGML_ASSERT(has_mtmd); + const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk); + size_t start_idx = tokens.size(); + for (size_t i = 0; i < n_tokens; ++i) { + tokens.emplace_back(LLAMA_TOKEN_NULL); + } + mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); + map_idx_to_media[start_idx] = std::move(new_chunk); + } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens; + const auto * text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); + for (size_t i = 0; i < n_tokens; ++i) { + push_back(text_tokens[i]); + } + } else { + GGML_ABORT("Invalid chunk type"); + } +} - // fill the rest of the context with extra chunks - const int n_extra_take = std::min(std::max(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size()); +void server_tokens::push_back(server_tokens & tokens) { + size_t start_idx = size(); + for (size_t i = 0; i < tokens.size(); i++) { + push_back(tokens[i]); + } + if (tokens.has_mtmd) { + // Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd. + // We could also just check, but this will prevent silently dropping MTMD data. + GGML_ASSERT(has_mtmd); + for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) { + auto * chunk = tokens.map_idx_to_media[it->first].get(); + mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); + map_idx_to_media[start_idx + it->first] = std::move(new_chunk); + } + } +} - tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); - tokens_suffix.resize(n_suffix_take); +void server_tokens::insert(const llama_tokens & inp_tokens) { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end()); +} - tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab)); - tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); - tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab)); +const llama_tokens & server_tokens::get_text_tokens() const { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + return tokens; +} - auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix; - auto embd_end = spm_infill ? 
tokens_prefix : tokens_suffix; +void server_tokens::set_token(llama_pos pos, llama_token id) { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + tokens[pos] = id; +} - if (llama_vocab_get_add_bos(vocab)) { - embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); +void server_tokens::keep_first(size_t n) { + GGML_ASSERT(n <= tokens.size()); + if (has_mtmd) { + if (n == tokens.size()) { + return; // nothing to do + } + // we throw an error if we try to remove a token in the middle of an image + // for ex. with input of 5 text tokens and 2 images: + // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] + // n 1 2 3 4 5 6 7 8 9 10 + // allowed to resize ^ ^ + // disallowed to resize ^ ^ ^ + if (n > 0) { + // make sure we never remove tokens in the middle of an image + // note that the case where we keep a full image at the end is allowed: + // tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL + if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) { + find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk + } + } + // remove all image chunks that are not used anymore + for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ) { + size_t idx = it->first; + if (idx >= n) { + it = map_idx_to_media.erase(it); + } else { + ++it; + } + } } + tokens.resize(n); +} - SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); - - // put the extra context before the FIM prefix - embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end()); - - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - embd_inp.push_back(llama_vocab_fim_mid(vocab)); - - return embd_inp; +std::string server_tokens::detokenize(const llama_context * ctx, bool special) const { + llama_tokens text_tokens; + text_tokens.reserve(tokens.size()); + for (const auto & t : tokens) { + if (t != LLAMA_TOKEN_NULL) { + text_tokens.push_back(t); + } + } + return common_detokenize(ctx, text_tokens, special); } -// -// base64 utils (TODO: move to common in the future) -// +size_t server_tokens::get_common_prefix(const server_tokens & b) const { + const size_t max_idx = std::min(tokens.size(), b.tokens.size()); -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; + if (!has_mtmd) { + for (size_t i = 0; i < max_idx; ++i) { + if (tokens[i] == b.tokens[i]) { + continue; + } -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} + return i; + } -static inline raw_buffer base64_decode(const std::string & encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; + return max_idx; + } - int in_len = encoded_string.size(); + for (size_t i = 0; i < max_idx; ++i) { + const llama_token ai = tokens[i]; + const llama_token bi = b.tokens[i]; - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; + if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) { + const auto & a_chunk = find_chunk(i); + const auto & b_chunk = b.find_chunk(i); - raw_buffer ret; + GGML_ASSERT(a_chunk && b_chunk); - while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } + const std::string id_ai = mtmd_input_chunk_get_id(a_chunk.get()); + const std::string id_bi = mtmd_input_chunk_get_id(b_chunk.get()); - 
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + const size_t n_tok_a = mtmd_input_chunk_get_n_tokens(a_chunk.get()); + const size_t n_tok_b = mtmd_input_chunk_get_n_tokens(b_chunk.get()); - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); + if (id_ai == id_bi && n_tok_a == n_tok_b) { + GGML_ASSERT(n_tok_a > 0 && "Invalid media chunk"); // should never happen + i += n_tok_a - 1; // will be +1 by the for loop + continue; } - i = 0; + return i; } - } - if (i) { - for (j = i; j < 4; j++) { - char_array_4[j] = 0; + if (ai == bi) { + continue; } - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } + return i; + } - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + return max_idx; // all tokens are equal +} - for (j = 0; j < i - 1; j++) { - ret.push_back(char_array_3[j]); +bool server_tokens::validate(const struct llama_context * ctx) const { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + for (size_t i = 0; i < tokens.size(); ++i) { + const auto & t = tokens[i]; + if (t == LLAMA_TOKEN_NULL) { + try { + const auto & chunk = find_chunk(i); + size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk.get()); + i += n_tokens - 1; // will be +1 by the for loop + } catch (const std::exception & e) { + return false; + } + } else if (t < 0 || t >= n_vocab) { + return false; } } + return true; +} - return ret; +int32_t server_tokens::process_chunk( + llama_context * ctx, + mtmd_context * mctx, + size_t idx, + llama_pos pos, + int32_t seq_id, + size_t & n_tokens_out) const { + const auto & chunk = find_chunk(idx); + const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE + ? 
"image" : "audio"; + SRV_INF("processing %s...\n", name); + int32_t n_batch = llama_n_batch(ctx); + int64_t t0 = ggml_time_ms(); + llama_pos new_n_past; // unused for now + int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx, + chunk.get(), + pos, + seq_id, + n_batch, + true, // logits last + &new_n_past); + SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0); + if (result != 0) { + LOG_ERR("mtmd_helper_eval failed with status %d", result); + n_tokens_out = 0; + return result; + } + n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get()); + return 0; } // -// random string / id +// tokenizer and input processing utils // -static std::string random_string() { - static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; +bool json_is_array_of_numbers(const json & data) { + if (data.is_array()) { + for (const auto & e : data) { + if (!e.is_number_integer()) { + return false; + } + } + return true; } - - return result; + return false; } -static std::string gen_chatcmplid() { - return "chatcmpl-" + random_string(); +bool json_is_array_of_mixed_numbers_strings(const json & data) { + bool seen_string = false; + bool seen_number = false; + if (data.is_array()) { + for (const auto & e : data) { + seen_string |= e.is_string(); + seen_number |= e.is_number_integer(); + if (seen_number && seen_string) { + return true; + } + } + } + return false; } -static std::string gen_tool_call_id() { - return random_string(); +bool json_is_array_and_contains_numbers(const json & data) { + if (data.is_array()) { + for (const auto & e : data) { + if (e.is_number_integer()) { + return true; + } + } + return false; + } + return false; } -// -// other common utils -// +json json_get_nested_values(const std::vector & paths, const json & js) { + json result = json::object(); -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += common_token_to_piece(ctx, *begin); + for (const std::string & path : paths) { + json current = js; + const auto keys = string_split(path, /*separator*/ '/'); + bool valid_path = true; + for (const std::string & k : keys) { + if (valid_path && current.is_object() && current.contains(k)) { + current = current[k]; + } else { + valid_path = false; + } + } + if (valid_path) { + result[path] = current; + } } - - return ret; + return result; } -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token); - - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } +llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. 
+ llama_tokens prompt_tokens; - return out; -} + if (json_prompt.is_array()) { + bool first = true; + for (const auto & p : json_prompt) { + if (p.is_string()) { + auto s = p.template get(); -// note: if data is a json array, it will be sent as multiple events, one per item -static bool server_sent_event(httplib::DataSink & sink, const json & data) { - static auto send_single = [](httplib::DataSink & sink, const json & data) -> bool { - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). + llama_tokens p; + if (first) { + p = common_tokenize(vocab, s, add_special, parse_special); + first = false; + } else { + p = common_tokenize(vocab, s, false, parse_special); + } - LOG_DBG("data stream, to_send: %s", str.c_str()); - return sink.write(str.c_str(), str.size()); - }; + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } else { + if (first) { + first = false; + } - if (data.is_array()) { - for (const auto & item : data) { - if (!send_single(sink, item)) { - return false; + prompt_tokens.push_back(p.template get()); } } } else { - return send_single(sink, data); + auto s = json_prompt.template get(); + prompt_tokens = common_tokenize(vocab, s, add_special, parse_special); } - return true; + return prompt_tokens; } -// -// OAI utils -// +size_t validate_utf8(const std::string& text) { + size_t len = text.size(); + if (len == 0) return 0; -// used by /completions endpoint -static json oaicompat_completion_params_parse(const json & body) { + // Check the last few bytes to see if a multi-byte character is cut off + for (size_t i = 1; i <= 4 && i <= len; ++i) { + unsigned char c = text[len - i]; + // Check for start of a multi-byte sequence from the end + if ((c & 0xE0) == 0xC0) { + // 2-byte character start: 110xxxxx + // Needs at least 2 bytes + if (i < 2) return len - i; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character start: 1110xxxx + // Needs at least 3 bytes + if (i < 3) return len - i; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character start: 11110xxx + // Needs at least 4 bytes + if (i < 4) return len - i; + } + } + + // If no cut-off multi-byte character is found, return full length + return len; +} + +// Computes FNV-1a hash of the data +static std::string fnv_hash(const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + uint64_t hash = 0xcbf29ce484222325ULL; + + for (size_t i = 0; i < len; ++i) { + hash ^= data[i]; + hash *= fnv_prime; + } + return std::to_string(hash); +} + +server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) { + mtmd::bitmaps bitmaps; + for (auto & file : files) { + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size())); + if (!bmp.ptr) { + throw std::runtime_error("Failed to load image or audio file"); + } + // calculate bitmap hash (for KV caching) + std::string hash = fnv_hash(bmp.data(), bmp.n_bytes()); + bmp.set_id(hash.c_str()); + bitmaps.entries.push_back(std::move(bmp)); + } + // process prompt + std::vector inputs; + // multimodal + mtmd_input_text inp_txt = { + prompt.c_str(), + /* add_special */ true, + /* parse_special */ true, + }; + mtmd::input_chunks chunks(mtmd_input_chunks_init()); + auto bitmaps_c_ptr = bitmaps.c_ptr(); + int32_t tokenized = mtmd_tokenize(mctx, + chunks.ptr.get(), + &inp_txt, + bitmaps_c_ptr.data(), + bitmaps_c_ptr.size()); + if (tokenized != 0) { + throw 
std::runtime_error("Failed to tokenize prompt"); + } + auto result = server_tokens(chunks, true); + return result; +} + +/** + * break the input "prompt" object into multiple prompt if needed, then tokenize them + * use tokenize_input_prompts() if the input could be an array. + * this supports these cases: + * - "prompt": "string" + * - "prompt": [12, 34, 56] + * - "prompt": [12, 34, "string", 56, 78] + * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] } + */ +static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) { + constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string"; + constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data"; + const bool has_mtmd = mctx != nullptr; + if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { + // string or mixed + llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special); + return server_tokens(tmp, false); + } else if (json_is_array_of_numbers(json_prompt)) { + // array of tokens + llama_tokens tmp = json_prompt.get(); + return server_tokens(tmp, false); + } else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) { + // JSON object with prompt key. + if (json_prompt.contains(JSON_MTMD_DATA_KEY)) { + if (!has_mtmd) + throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests."); + + // JSON object with prompt and multimodal key. + std::vector files; + for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) { + files.push_back(base64_decode(entry)); + } + return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files); + } else { + // Not multimodal, but contains a subobject. 
+
+std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
+    std::vector<server_tokens> result;
+    if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) {
+        result.reserve(json_prompt.size());
+        for (const auto & p : json_prompt) {
+            result.push_back(tokenize_input_subprompt(vocab, mctx, p, add_special, parse_special));
+        }
+    } else {
+        result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special));
+    }
+    if (result.empty()) {
+        throw std::runtime_error("\"prompt\" must not be empty");
+    }
+    return result;
+}
+
+//
+// OAI utils
+//
+
+// used by /completions endpoint
+json oaicompat_completion_params_parse(const json & body) {
     json llama_params;
 
     if (!body.contains("prompt")) {
@@ -527,19 +775,67 @@ static json oaicompat_completion_params_parse(const json & body) {
     return llama_params;
 }
 
-struct oaicompat_parser_options {
-    bool use_jinja;
-    bool prefill_assistant;
-    common_reasoning_format reasoning_format;
-    std::map<std::string, std::string> chat_template_kwargs;
-    common_chat_templates * tmpls;
-    bool allow_image;
-    bool allow_audio;
-    bool enable_thinking = true;
-};
+// media_path always ends with '/', see arg.cpp
+static void handle_media(
+        std::vector<raw_buffer> & out_files,
+        json & media_obj,
+        const std::string & media_path) {
+    std::string url = json_value(media_obj, "url", std::string());
+    if (string_starts_with(url, "http")) {
+        // download remote image
+        // TODO @ngxson : maybe make these params configurable
+        common_remote_params params;
+        params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+        params.max_size = 1024 * 1024 * 10; // 10MB
+        params.timeout  = 10; // seconds
+        SRV_INF("downloading image from '%s'\n", url.c_str());
+        auto res = common_remote_get_content(url, params);
+        if (200 <= res.first && res.first < 300) {
+            SRV_INF("downloaded %ld bytes\n", res.second.size());
+            raw_buffer data;
+            data.insert(data.end(), res.second.begin(), res.second.end());
+            out_files.push_back(data);
+        } else {
+            throw std::runtime_error("Failed to download image");
+        }
+
+    } else if (string_starts_with(url, "file://")) {
+        if (media_path.empty()) {
+            throw std::invalid_argument("file:// URLs are not allowed unless --media-path is specified");
+        }
+        // load local image file
+        std::string file_path = url.substr(7); // remove "file://"
+        raw_buffer data;
+        if (!fs_validate_filename(file_path, true)) {
+            throw std::invalid_argument("file path is not allowed: " + file_path);
+        }
+        SRV_INF("loading image from local file '%s'\n", (media_path + file_path).c_str());
+        std::ifstream file(media_path + file_path, std::ios::binary);
+        if (!file) {
+            throw std::invalid_argument("file does not exist or cannot be opened: " + file_path);
+        }
+        data.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+        out_files.push_back(data);
+
+    } else {
+        // try to decode base64 image
+        std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+        if (parts.size() != 2) {
+            throw std::runtime_error("Invalid url value");
+        } else if (!string_starts_with(parts[0], "data:image/")) {
+            throw std::runtime_error("Invalid url format: " + parts[0]);
+        } else if (!string_ends_with(parts[0], "base64")) {
+            throw std::runtime_error("url must be base64 encoded");
+        } else {
+            auto base64_data  = parts[1];
+            auto decoded_data = base64_decode(base64_data);
+            out_files.push_back(decoded_data);
+        }
+    }
+}
 
 // used by /chat/completions endpoint
-static json oaicompat_chat_params_parse(
+json oaicompat_chat_params_parse(
     json & body, /* openai api json semantics */
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files)
@@ -583,26 +879,26 @@ static json oaicompat_chat_params_parse(
             auto schema_wrapper = json_value(response_format, "json_schema", json::object());
             json_schema = json_value(schema_wrapper, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
-            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
+            throw std::invalid_argument("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
     }
 
     // get input files
     if (!body.contains("messages")) {
-        throw std::runtime_error("'messages' is required");
+        throw std::invalid_argument("'messages' is required");
     }
     json & messages = body.at("messages");
     if (!messages.is_array()) {
-        throw std::runtime_error("Expected 'messages' to be an array");
+        throw std::invalid_argument("Expected 'messages' to be an array");
     }
     for (auto & msg : messages) {
         std::string role = json_value(msg, "role", std::string());
         if (role != "assistant" && !msg.contains("content")) {
-            throw std::runtime_error("All non-assistant messages must contain 'content'");
+            throw std::invalid_argument("All non-assistant messages must contain 'content'");
         }
         if (role == "assistant") {
             if (!msg.contains("content") && !msg.contains("tool_calls")) {
-                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+                throw std::invalid_argument("Assistant message must contain either 'content' or 'tool_calls'!");
            }
            if (!msg.contains("content")) {
                continue; // avoid errors with no content
@@ -614,7 +910,7 @@ static json oaicompat_chat_params_parse(
         }
 
         if (!content.is_array()) {
-            throw std::runtime_error("Expected 'content' to be a string or an array");
+            throw std::invalid_argument("Expected 'content' to be a string or an array");
         }
 
         for (auto & p : content) {
@@ -624,41 +920,8 @@ static json oaicompat_chat_params_parse(
                     throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
                 }
 
-                json image_url = json_value(p, "image_url", json::object());
-                std::string url = json_value(image_url, "url", std::string());
-                if (string_starts_with(url, "http")) {
-                    // download remote image
-                    // TODO @ngxson : maybe make these params configurable
-                    common_remote_params params;
-                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
-                    params.max_size = 1024 * 1024 * 10; // 10MB
-                    params.timeout = 10; // seconds
-                    SRV_INF("downloading image from '%s'\n", url.c_str());
-                    auto res = common_remote_get_content(url, params);
-                    if (200 <= res.first && res.first < 300) {
-                        SRV_INF("downloaded %ld bytes\n", res.second.size());
-                        raw_buffer data;
-                        data.insert(data.end(), res.second.begin(), res.second.end());
-                        out_files.push_back(data);
-                    } else {
-                        throw std::runtime_error("Failed to download image");
-                    }
-
-                } else {
-                    // try to decode base64 image
-                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
-                    if (parts.size() != 2) {
-                        throw std::runtime_error("Invalid image_url.url value");
-                    } else if (!string_starts_with(parts[0], "data:image/")) {
-                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
-                    } else if (!string_ends_with(parts[0], "base64")) {
-                        throw std::runtime_error("image_url.url must be base64 encoded");
-                    } else {
-                        auto base64_data = parts[1];
-                        auto decoded_data = base64_decode(base64_data);
-                        out_files.push_back(decoded_data);
-                    }
-                }
+                json image_url = json_value(p, "image_url", json::object());
+                handle_media(out_files, image_url, opt.media_path);
 
                 // replace this chunk with a marker
                 p["type"] = "text";
@@ -675,18 +938,20 @@ static json oaicompat_chat_params_parse(
                 std::string format = json_value(input_audio, "format", std::string());
                 // while we also support flac, we don't allow it here so we matches the OAI spec
                 if (format != "wav" && format != "mp3") {
-                    throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
+                    throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'");
                 }
 
                 auto decoded_data = base64_decode(data); // expected to be base64 encoded
                 out_files.push_back(decoded_data);
 
+                // TODO: add audio_url support by reusing handle_media()
+
                 // replace this chunk with a marker
                 p["type"] = "text";
                 p["text"] = mtmd_default_marker();
                 p.erase("input_audio");
             } else if (type != "text") {
-                throw std::runtime_error("unsupported content[].type");
+                throw std::invalid_argument("unsupported content[].type");
             }
         }
     }
@@ -704,7 +969,7 @@ static json oaicompat_chat_params_parse(
     inputs.enable_thinking = opt.enable_thinking;
     if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
         if (body.contains("grammar")) {
-            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+            throw std::invalid_argument("Cannot use custom grammar constraints with tools.");
         }
         llama_params["parse_tool_calls"] = true;
     }
@@ -723,7 +988,7 @@ static json oaicompat_chat_params_parse(
     } else if (enable_thinking_kwarg == "false") {
         inputs.enable_thinking = false;
     } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
-        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
+        throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)");
     }
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
@@ -736,14 +1001,14 @@ static json oaicompat_chat_params_parse(
 
         /* sanity check, max one assistant message at the end of the list */
         if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
-            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+            throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
         }
 
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
         if ( inputs.enable_thinking ) {
-            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
+            throw std::invalid_argument("Assistant response prefill is incompatible with enable_thinking.");
         }
 
         inputs.add_generation_prompt = true;
@@ -784,18 +1049,18 @@ static json oaicompat_chat_params_parse(
     // Handle "n" field
     int n_choices = json_value(body, "n", 1);
     if (n_choices != 1) {
-        throw std::runtime_error("Only one completion choice is allowed");
+        throw std::invalid_argument("Only one completion choice is allowed");
     }
 
     // Handle "logprobs" field
     // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
     if (json_value(body, "logprobs", false)) {
         if (has_tools && stream) {
-            throw std::runtime_error("logprobs is not supported with tools + stream");
+            throw std::invalid_argument("logprobs is not supported with tools + stream");
         }
         llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
     } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
-        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
+        throw std::invalid_argument("top_logprobs requires logprobs to be set to true");
     }
 
     // Copy remaining properties to llama_params
@@ -811,7 +1076,227 @@ static json oaicompat_chat_params_parse(
     return llama_params;
 }
 
-static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
+json convert_anthropic_to_oai(const json & body) {
+    json oai_body;
+
+    // Convert system prompt
+    json oai_messages = json::array();
+    auto system_param = json_value(body, "system", json());
+    if (!system_param.is_null()) {
+        std::string system_content;
+
+        if (system_param.is_string()) {
+            system_content = system_param.get<std::string>();
+        } else if (system_param.is_array()) {
+            for (const auto & block : system_param) {
+                if (json_value(block, "type", std::string()) == "text") {
+                    system_content += json_value(block, "text", std::string());
+                }
+            }
+        }
+
+        oai_messages.push_back({
+            {"role", "system"},
+            {"content", system_content}
+        });
+    }
+
+    // Convert messages
+    if (!body.contains("messages")) {
+        throw std::runtime_error("'messages' is required");
+    }
+    const json & messages = body.at("messages");
+    if (messages.is_array()) {
+        for (const auto & msg : messages) {
+            std::string role = json_value(msg, "role", std::string());
+
+            if (!msg.contains("content")) {
+                if (role == "assistant") {
+                    continue;
+                }
+                oai_messages.push_back(msg);
+                continue;
+            }
+
+            const json & content = msg.at("content");
+
+            if (content.is_string()) {
+                oai_messages.push_back(msg);
+                continue;
+            }
+
+            if (!content.is_array()) {
+                oai_messages.push_back(msg);
+                continue;
+            }
+
+            json tool_calls = json::array();
+            json converted_content = json::array();
+            json tool_results = json::array();
+            bool has_tool_calls = false;
+
+            for (const auto & block : content) {
+                std::string type = json_value(block, "type", std::string());
+
+                if (type == "text") {
+                    converted_content.push_back(block);
+                } else if (type == "image") {
+                    json source = json_value(block, "source", json::object());
+                    std::string source_type = json_value(source, "type", std::string());
+
+                    if (source_type == "base64") {
+                        std::string media_type = json_value(source, "media_type", std::string("image/jpeg"));
+                        std::string data = json_value(source, "data", std::string());
+                        std::ostringstream ss;
+                        ss << "data:" << media_type << ";base64," << data;
+
+                        converted_content.push_back({
+                            {"type", "image_url"},
+                            {"image_url", {
+                                {"url", ss.str()}
+                            }}
+                        });
+                    } else if (source_type == "url") {
+                        std::string url = json_value(source, "url", std::string());
+                        converted_content.push_back({
+                            {"type", "image_url"},
+                            {"image_url", {
+                                {"url", url}
+                            }}
+                        });
+                    }
+                } else if (type == "tool_use") {
+                    tool_calls.push_back({
+                        {"id", json_value(block, "id", std::string())},
+                        {"type", "function"},
+                        {"function", {
+                            {"name", json_value(block, "name", std::string())},
+                            {"arguments", json_value(block, "input", json::object()).dump()}
+                        }}
+                    });
+                    has_tool_calls = true;
+                } else if (type == "tool_result") {
+                    std::string tool_use_id = json_value(block, "tool_use_id", std::string());
+
+                    auto result_content = json_value(block, "content", json());
+                    std::string result_text;
+                    if (result_content.is_string()) {
+                        result_text = result_content.get<std::string>();
+                    } else if (result_content.is_array()) {
+                        for (const auto & c : result_content) {
+                            if (json_value(c, "type", std::string()) == "text") {
+                                result_text += json_value(c, "text", std::string());
+                            }
+                        }
+                    }
+
+                    tool_results.push_back({
+                        {"role", "tool"},
+                        {"tool_call_id", tool_use_id},
+                        {"content", result_text}
+                    });
+                }
+            }
+
+            if (!converted_content.empty() || has_tool_calls) {
+                json new_msg = {{"role", role}};
+                if (!converted_content.empty()) {
+                    new_msg["content"] = converted_content;
+                } else if (has_tool_calls) {
+                    new_msg["content"] = "";
+                }
+                if (!tool_calls.empty()) {
+                    new_msg["tool_calls"] = tool_calls;
+                }
+                oai_messages.push_back(new_msg);
+            }
+
+            for (const auto & tool_msg : tool_results) {
+                oai_messages.push_back(tool_msg);
+            }
+        }
+    }
+
+    oai_body["messages"] = oai_messages;
+
+    // Convert tools
+    if (body.contains("tools")) {
+        const json & tools = body.at("tools");
+        if (tools.is_array()) {
+            json oai_tools = json::array();
+            for (const auto & tool : tools) {
+                oai_tools.push_back({
+                    {"type", "function"},
+                    {"function", {
+                        {"name", json_value(tool, "name", std::string())},
+                        {"description", json_value(tool, "description", std::string())},
+                        {"parameters", tool.contains("input_schema") ? tool.at("input_schema") : json::object()}
+                    }}
+                });
+            }
+            oai_body["tools"] = oai_tools;
+        }
+    }
+
+    // Convert tool_choice
+    if (body.contains("tool_choice")) {
+        const json & tc = body.at("tool_choice");
+        if (tc.is_object()) {
+            std::string type = json_value(tc, "type", std::string());
+            if (type == "auto") {
+                oai_body["tool_choice"] = "auto";
+            } else if (type == "any" || type == "tool") {
+                oai_body["tool_choice"] = "required";
+            }
+        }
+    }
+
+    // Convert stop_sequences to stop
+    if (body.contains("stop_sequences")) {
+        oai_body["stop"] = body.at("stop_sequences");
+    }
+
+    // Handle max_tokens (required in Anthropic, but we're permissive)
+    if (body.contains("max_tokens")) {
+        oai_body["max_tokens"] = body.at("max_tokens");
+    } else {
+        oai_body["max_tokens"] = 4096;
+    }
+
+    // Pass through common params
+    for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
+        if (body.contains(key)) {
+            oai_body[key] = body.at(key);
+        }
+    }
+
+    // Handle Anthropic-specific thinking param
+    if (body.contains("thinking")) {
+        json thinking = json_value(body, "thinking", json::object());
+        std::string thinking_type = json_value(thinking, "type", std::string());
+        if (thinking_type == "enabled") {
+            int budget_tokens = json_value(thinking, "budget_tokens", 10000);
+            oai_body["thinking_budget_tokens"] = budget_tokens;
+        }
+    }
+
+    // Handle Anthropic-specific metadata param
+    if (body.contains("metadata")) {
+        json metadata = json_value(body, "metadata", json::object());
+        std::string user_id = json_value(metadata, "user_id", std::string());
+        if (!user_id.empty()) {
+            oai_body["__metadata_user_id"] = user_id;
+        }
+    }
+
+    return oai_body;
+}
+
+json format_embeddings_response_oaicompat(
+        const json & request,
+        const std::string & model_name,
+        const json & embeddings,
+        bool use_base64) {
     json data = json::array();
     int32_t n_tokens = 0;
     int i = 0;
@@ -841,7 +1326,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     }
 
     json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json {
             {"prompt_tokens", n_tokens},
@@ -853,8 +1338,9 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_response_rerank(
+json format_response_rerank(
     const json & request,
+    const std::string & model_name,
     const json & ranks,
     bool is_tei_format,
     std::vector<std::string> & texts,
@@ -886,7 +1372,7 @@ static json format_response_rerank(
     if (is_tei_format) return results;
 
     json res = json{
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json{
             {"prompt_tokens", n_tokens},
@@ -898,74 +1384,19 @@ static json format_response_rerank(
     return res;
 }
 
-static bool is_valid_utf8(const std::string & str) {
-    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
-    const unsigned char* end = bytes + str.length();
-    while (bytes < end) {
-        if (*bytes <= 0x7F) {
-            // 1-byte sequence (0xxxxxxx)
-            bytes++;
-        } else if ((*bytes & 0xE0) == 0xC0) {
-            // 2-byte sequence (110xxxxx 10xxxxxx)
-            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
-                return false;
-            bytes += 2;
-        } else if ((*bytes & 0xF0) == 0xE0) {
-            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
-                return false;
-            bytes += 3;
-        } else if ((*bytes & 0xF8) == 0xF0) {
-            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
-                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
-                return false;
-            bytes += 4;
-        } else {
-            // Invalid UTF-8 lead byte
-            return false;
-        }
-    }
+//
+// other utils
+//
-
-    return true;
-}
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+    std::vector<llama_token_data> cur;
+    const auto * logits = llama_get_logits_ith(ctx, idx);
-
-static json format_tokenizer_response(const json & tokens) {
-    return json {
-        {"tokens", tokens}
-    };
-}
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-static json format_detokenized_response(const std::string & content) {
-    return json {
-        {"content", content}
-    };
-}
-
-static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
-    json data = json::array();
-    for (const auto & lb : logit_bias) {
-        data.push_back(json{
-            {"bias", lb.bias},
-            {"token", lb.token},
-        });
-    }
-    return data;
-}
-
-static std::string safe_json_to_str(const json & data) {
-    return data.dump(-1, ' ', false, json::error_handler_t::replace);
-}
-
-static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
-    std::vector<llama_token_data> cur;
-    const auto * logits = llama_get_logits_ith(ctx, idx);
-
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    const int n_vocab = llama_vocab_n_tokens(vocab);
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     cur.resize(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -992,538 +1423,226 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
     return cur;
 }
 
-static bool are_lora_equal(
-    const std::vector<common_adapter_lora_info> & l1,
-    const std::vector<common_adapter_lora_info> & l2) {
-    if (l1.size() != l2.size()) {
-        return false;
-    }
-    for (size_t i = 0; i < l1.size(); ++i) {
-        // we don't check lora.path to reduce the time complexity
-        if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
-            return false;
-        }
-    }
-    return true;
+std::string safe_json_to_str(const json & data) {
+    return data.dump(-1, ' ', false, json::error_handler_t::replace);
 }
 
-// get the ids of all enabled loras
-static std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras) {
-    std::vector<size_t> enabled_ids;
-    for (size_t i = 0; i < loras.size(); ++i) {
-        if (loras[i].scale > 0) {
-            enabled_ids.push_back(i);
-        }
+// TODO: reuse llama_detokenize
+template <typename Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+    std::string ret;
+    for (; begin != end; ++begin) {
+        ret += common_token_to_piece(ctx, *begin);
     }
-    return enabled_ids;
-}
-
-// check whether the given lora set has only aloras activated (empty => false)
-static bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras) {
-    bool found_alora = false;
-    for (const auto & lora : loras) {
-        if (lora.scale != 0) {
-            if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) {
-                return false;
-            }
-            found_alora = true;
-        }
-    }
-    return found_alora;
+    return ret;
 }
 
-// if the two sets of loras are different, they require a cache clear unless the
-// change is only from aloras to aloras.
-static bool lora_should_clear_cache(
-    const std::vector<common_adapter_lora_info> & current,
-    const std::vector<common_adapter_lora_info> & next) {
-
-    // This should always be called after determining that the two sets are
-    // _not_ equal. This assert is therefore some slightly wasted work and
-    // should be safe to remove as long as this method is called correctly.
-    GGML_ASSERT(!are_lora_equal(current, next));
-
-    return (
-        !(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) ||
-        !lora_all_alora(next));
+std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens) {
+    return tokens_to_str(ctx, tokens.begin(), tokens.end());
 }
 
-// parse lora config from JSON request, returned a copy of lora_base with updated scale
-static std::vector<common_adapter_lora_info> parse_lora_request(
-    const std::vector<common_adapter_lora_info> & lora_base,
-    const json & data) {
-    std::vector<common_adapter_lora_info> lora(lora_base);
-    int max_idx = lora.size();
-
-    // clear existing value
-    for (auto & entry : lora) {
-        entry.scale = 0.0f;
-    }
+// format incomplete utf-8 multibyte character for output
+std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
+    std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);
 
-    // set value
-    for (const auto & entry : data) {
-        int id = json_value(entry, "id", -1);
-        float scale = json_value(entry, "scale", 0.0f);
-        if (0 <= id && id < max_idx) {
-            lora[id].scale = scale;
-        } else {
-            throw std::runtime_error("invalid adapter id");
-        }
+    // if the size is 1 and first bit is 1, meaning it's a partial character
+    // (size > 1 meaning it's already a known token)
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
+        std::stringstream ss;
+        ss << std::hex << (out[0] & 0xff);
+        std::string res(ss.str());
+        out = "byte: \\x" + res;
     }
 
-    return lora;
+    return out;
 }
 
-//
-// utils for interacting with libmtmd
-// (may need to refactor in near future)
-//
-
-/**
- * server_tokens is a helper to manage the input tokens and image for the server.
- * it is made this way to simplify the logic of KV cache management.
- */
-struct server_tokens {
-    bool has_mtmd = false;
-
-private: // disallow accessing these members directly, risking out-of-sync
-
-    // map a **start** index in tokens to the image chunk
-    // note: the order need to be in-sync with tokens
-    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
-
-    // list of tokens
-    // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
-    // otherwise, it is a normal text token
-    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
-    // note(2): for M-RoPE, an image can occupy different number of pos; do not assume 1-to-1 mapping tokens <-> pos
-    llama_tokens tokens;
-
-    // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
-    // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
-    // idx  0   1   2   3   4    5      6      7      8      9     10
-    // pos  0   1   2   3   4    5      5      5      7      7      7
-    // map_idx_to_media will contain: {5, img0}, {8, img1}
-
-public:
-    server_tokens() = default;
-    ~server_tokens() = default;
-
-    // Prevent copying
-    // TODO: server_tokens should be copyable - remove this:
-    server_tokens(const server_tokens&) = delete;
-    server_tokens& operator=(const server_tokens&) = delete;
-
-    // Allow moving (usually implicitly generated if members are movable)
-    server_tokens(server_tokens&&) = default;
-    server_tokens& operator=(server_tokens&&) = default;
-
-    // Allow accessing elements using [] operator
-    llama_token operator[](size_t index) { return tokens[index]; }
-    const llama_token& operator[](size_t index) const { return tokens[index]; }
+// format server-sent event (SSE), return the formatted string to send
+// note: if data is a json array, it will be sent as multiple events, one per item
+std::string format_oai_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & data) {
+        ss << "data: " <<
+            safe_json_to_str(data) <<
+            "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
+    };
-
-    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
-        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
-            push_back(mtmd_chunks[i]);
-        }
-    }
+
+    if (data.is_array()) {
+        for (const auto & item : data) {
+            send_single(item);
+        }
+    } else {
+        send_single(data);
+    }
-
-    server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
-    }
-
-    llama_pos pos_next() const {
-        if (!has_mtmd) {
-            return tokens.size();
-        }
+
+    return ss.str();
+}
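On the wire, each payload framed by format_oai_sse() is a single "data:" event terminated by a blank line; an array input yields one event per element. A minimal sketch of the framing (payload contents are hypothetical):

    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        json chunk = {{"object", "chat.completion.chunk"}, {"choices", json::array()}};
        std::cout << "data: " << chunk.dump() << "\n\n"; // one SSE event
        std::cout << "data: [DONE]\n\n";                 // conventional OAI end-of-stream marker
        return 0;
    }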
-        llama_pos res = tokens.size();
+
+std::string format_anthropic_sse(const json & data) {
+    std::ostringstream ss;
-
-        for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) {
-            const auto & chunk = it->second;
-            res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
+
+    auto send_event = [&ss](const json & event_obj) {
+        if (event_obj.contains("event") && event_obj.contains("data")) {
+            ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+            ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+        } else {
+            ss << "data: " << safe_json_to_str(event_obj) << "\n\n";
+        }
+    };
-
-        return res;
-    }
-
-    // for debugging
-    std::string str() const {
-        std::ostringstream oss;
-        oss << "tokens: ";
-        for (size_t idx = 0; idx < tokens.size(); ++idx) {
-            llama_token t = tokens[idx];
-            oss << "idx:" << idx << " ";
-            if (t == LLAMA_TOKEN_NULL) {
-                oss << "<embd> ";
-            } else {
-                oss << t << " ";
-            }
-        }
-        oss << "\n";
-        oss << "image idx: ";
-        for (const auto & it : map_idx_to_media) {
-            oss << it.first << ", ";
+
+    if (data.is_array()) {
+        for (const auto & event : data) {
+            send_event(event);
         }
-        return oss.str();
+    } else {
+        send_event(data);
     }
-
-    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const {
-        auto it = map_idx_to_media.find(idx);
-        if (it != map_idx_to_media.end()) {
-            return it->second;
        }
-        throw std::runtime_error("Chunk not found");
-    }
+
+    return ss.str();
+}
 
-    void push_back(llama_token tok) {
-        if (tok == LLAMA_TOKEN_NULL) {
-            throw std::runtime_error("Invalid token");
        }
-        tokens.emplace_back(tok);
-    }
+bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
-
-    // will create a copy of the chunk if it contains non-text data
-    void push_back(const mtmd_input_chunk * chunk) {
-        auto type = mtmd_input_chunk_get_type(chunk);
-        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
-            GGML_ASSERT(has_mtmd);
-            const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
-            size_t start_idx = tokens.size();
-            for (size_t i = 0; i < n_tokens; ++i) {
-                tokens.emplace_back(LLAMA_TOKEN_NULL);
-            }
-            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
-            map_idx_to_media[start_idx] = std::move(new_chunk);
-        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            size_t n_tokens;
-            const auto * text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
-            for (size_t i = 0; i < n_tokens; ++i) {
-                push_back(text_tokens[i]);
-            }
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
-            GGML_ABORT("Invalid chunk type");
-        }
-    }
-
-    // appends server tokens, updates the media map. copies media chunks.
-    void push_back(server_tokens & tokens) {
-        size_t start_idx = size();
-        for (size_t i = 0; i < tokens.size(); i++) {
-            push_back(tokens[i]);
-        }
-        if (tokens.has_mtmd) {
-            // Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
-            // We could also just check, but this will prevent silently dropping MTMD data.
-            GGML_ASSERT(has_mtmd);
-            for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) {
-                auto * chunk = tokens.map_idx_to_media[it->first].get();
-                mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
-                map_idx_to_media[start_idx + it->first] = std::move(new_chunk);
-            }
-        }
-    }
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
 
-    // for compatibility with context shift and prompt truncation
-    void insert(const llama_tokens & inp_tokens) {
-        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
-        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
-    }
-
-    // for compatibility with speculative decoding, ctx shift, slot save/load
-    const llama_tokens & get_text_tokens() const {
-        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
-        return tokens;
-    }
-
-    // for compatibility with speculative decoding
-    void set_token(llama_pos pos, llama_token id) {
-        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
-        tokens[pos] = id;
-    }
+    return true;
+}
 
-    size_t size() const {
-        return tokens.size();
-    }
+llama_tokens format_prompt_infill(
+    const llama_vocab * vocab,
+    const json & input_prefix,
+    const json & input_suffix,
+    const json & input_extra,
+    const int n_batch,
+    const int n_predict,
+    const int n_ctx,
+    const bool spm_infill,
+    const llama_tokens & tokens_prompt
+    ) {
+    // TODO: optimize this block by reducing memory allocations and movement
-
-    bool empty() const {
-        return tokens.empty();
-    }
+
+    // use FIM repo-level pattern:
+    // ref: https://arxiv.org/pdf/2409.12186
+    //
+    // [FIM_REP]myproject
+    // [FIM_SEP]filename0
+    // extra chunk 0
+    // [FIM_SEP]filename1
+    // extra chunk 1
+    // ...
+    // [FIM_SEP]filename
+    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+    //
+    llama_tokens extra_tokens;
+    extra_tokens.reserve(n_ctx);
-
-    void clear() {
-        map_idx_to_media.clear();
-        tokens.clear();
-    }
+
+    auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
-
-    void keep_first(size_t n) {
-        GGML_ASSERT(n <= tokens.size());
-        if (has_mtmd) {
-            if (n == tokens.size()) {
-                return; // nothing to do
-            }
-            // we throw an error if we try to remove a token in the middle of an image
-            // for ex. with input of 5 text tokens and 2 images:
-            //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
-            // n   1   2   3   4   5   6      7      8      9     10
-            // allowed to resize         ^                        ^
-            // disallowed to resize          ^      ^             ^
-            if (n > 0) {
-                // make sure we never remove tokens in the middle of an image
-                // note that the case where we keep a full image at the end is allowed:
-                //   tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL
-                if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) {
-                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
-                }
-            }
-            // remove all image chunks that are not used anymore
-            for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ) {
-                size_t idx = it->first;
-                if (idx >= n) {
-                    it = map_idx_to_media.erase(it);
-                } else {
-                    ++it;
-                }
-            }
-        }
-        tokens.resize(n);
-    }
+
+    if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
+        // TODO: make project name an input
+        static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
-
-    std::string detokenize(const llama_context * ctx, bool special) const {
-        llama_tokens text_tokens;
-        text_tokens.reserve(tokens.size());
-        for (const auto & t : tokens) {
-            if (t != LLAMA_TOKEN_NULL) {
-                text_tokens.push_back(t);
-            }
-        }
-        return common_detokenize(ctx, text_tokens, special);
+
+        extra_tokens.push_back(llama_vocab_fim_rep(vocab));
+        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
     }
+
+    for (const auto & chunk : input_extra) {
+        // { "text": string, "filename": string }
+        const std::string text     = json_value(chunk, "text",     std::string());
+        const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+        if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
+
+            extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
+            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+        } else {
+            // chunk separator in binary form to avoid confusing the AI
+            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+            static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
+
+            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+        }
+
+        const auto chunk_tokens = common_tokenize(vocab, text, false, false);
+        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+    }
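At this point extra_tokens holds the full repo-level preamble. With two hypothetical extra chunks, the detokenized layout the loop above has built would be:

    [FIM_REP]myproject
    [FIM_SEP]utils.h
    int add(int a, int b);
    [FIM_SEP]main.cpp
    int main() { return add(1, 2); }

The [FIM_SEP]filename block for the file being completed is appended next, and the prefix/suffix/middle sandwich assembled below is placed after this preamble.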
-    size_t get_common_prefix(const server_tokens & b) const {
-        const size_t max_idx = std::min(tokens.size(), b.tokens.size());
+
+    if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
+        // TODO: current filename
+        static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
-
-        if (!has_mtmd) {
-            for (size_t i = 0; i < max_idx; ++i) {
-                if (tokens[i] == b.tokens[i]) {
-                    continue;
-                }
+
+        extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
+        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+    }
-
-                return i;
-            }
+
+    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+    const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4));
+    const int n_suffix_take = std::min(tokens_suffix.size(), std::max(0, (n_batch/4) - (2 + tokens_prompt.size())));
-
-            return max_idx;
-        }
+
+    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
-
-        for (size_t i = 0; i < max_idx; ++i) {
-            const llama_token ai = tokens[i];
-            const llama_token bi = b.tokens[i];
-
-            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
-                const auto & a_chunk = find_chunk(i);
-                const auto & b_chunk = b.find_chunk(i);
-
-                GGML_ASSERT(a_chunk && b_chunk);
-
-                const std::string id_ai = mtmd_input_chunk_get_id(a_chunk.get());
-                const std::string id_bi = mtmd_input_chunk_get_id(b_chunk.get());
-
-                const size_t n_tok_a = mtmd_input_chunk_get_n_tokens(a_chunk.get());
-                const size_t n_tok_b = mtmd_input_chunk_get_n_tokens(b_chunk.get());
-
-                if (id_ai == id_bi && n_tok_a == n_tok_b) {
-                    GGML_ASSERT(n_tok_a > 0 && "Invalid media chunk"); // should never happen
-                    i += n_tok_a - 1; // will be +1 by the for loop
-                    continue;
-                }
+
+    // fill the rest of the context with extra chunks
+    const int n_extra_take = std::min(std::max(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
-
-                return i;
-            }
-
-            if (ai == bi) {
-                continue;
-            }
-
-            return i;
-        }
+
+    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+    tokens_suffix.resize(n_suffix_take);
-
-        return max_idx; // all tokens are equal
-    }
+
+    tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
+    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
+    tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
-
-    // make sure all text tokens are within the vocab range
-    bool validate(const struct llama_context * ctx) const {
-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
-        for (size_t i = 0; i < tokens.size(); ++i) {
-            const auto & t = tokens[i];
-            if (t == LLAMA_TOKEN_NULL) {
-                try {
-                    const auto & chunk = find_chunk(i);
-                    size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk.get());
-                    i += n_tokens - 1; // will be +1 by the for loop
-                } catch (const std::exception & e) {
-                    return false;
-                }
-            } else if (t < 0 || t >= n_vocab) {
-                return false;
-            }
-        }
-        return true;
-    }
+
+    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
-
-    // encode and decode the image chunk
-    int32_t process_chunk(
-        llama_context * ctx,
-        mtmd_context * mctx,
-        size_t idx,
-        llama_pos pos,
-        int32_t seq_id,
-        size_t & n_tokens_out) const {
-        const auto & chunk = find_chunk(idx);
-        const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
-            ? "image" : "audio";
-        SRV_INF("processing %s...\n", name);
-        int32_t n_batch = llama_n_batch(ctx);
-        int64_t t0 = ggml_time_ms();
-        llama_pos new_n_past; // unused for now
-        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
-            chunk.get(),
-            pos,
-            seq_id,
-            n_batch,
-            true, // logits last
-            &new_n_past);
-        SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
-        if (result != 0) {
-            LOG_ERR("mtmd_helper_eval failed with status %d", result);
-            n_tokens_out = 0;
-            return result;
-        }
-        n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
-        return 0;
-    }
-};
+
+    if (llama_vocab_get_add_bos(vocab)) {
+        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+    }
-
-// Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t * data, size_t len) {
-    const uint64_t fnv_prime = 0x100000001b3ULL;
-    uint64_t hash = 0xcbf29ce484222325ULL;
-
-    for (size_t i = 0; i < len; ++i) {
-        hash ^= data[i];
-        hash *= fnv_prime;
-    }
-    return std::to_string(hash);
-}
+
+    SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
-
-static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) {
-    mtmd::bitmaps bitmaps;
-    for (auto & file : files) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
-        if (!bmp.ptr) {
-            throw std::runtime_error("Failed to load image or audio file");
-        }
-        // calculate bitmap hash (for KV caching)
-        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-        bmp.set_id(hash.c_str());
-        bitmaps.entries.push_back(std::move(bmp));
-    }
-    // process prompt
-    std::vector inputs;
-    // multimodal
-    mtmd_input_text inp_txt = {
-        prompt.c_str(),
-        /* add_special */   true,
-        /* parse_special */ true,
-    };
-    mtmd::input_chunks chunks(mtmd_input_chunks_init());
-    auto bitmaps_c_ptr = bitmaps.c_ptr();
-    int32_t tokenized = mtmd_tokenize(mctx,
-                                      chunks.ptr.get(),
-                                      &inp_txt,
-                                      bitmaps_c_ptr.data(),
-                                      bitmaps_c_ptr.size());
-    if (tokenized != 0) {
-        throw std::runtime_error("Failed to tokenize prompt");
-    }
-    auto result = server_tokens(chunks, true);
-    return result;
-}
+
+    // put the extra context before the FIM prefix
+    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
-
-/**
- * break the input "prompt" object into multiple prompt if needed, then tokenize them
- * use tokenize_input_prompts() if the input could be an array.
- * this supports these cases:
- * - "prompt": "string"
- * - "prompt": [12, 34, 56]
- * - "prompt": [12, 34, "string", 56, 78]
- * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
- */
-static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
-    constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string";
-    constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data";
-    const bool has_mtmd = mctx != nullptr;
-    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
-        // string or mixed
-        llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
-        return server_tokens(tmp, false);
-    } else if (json_is_array_of_numbers(json_prompt)) {
-        // array of tokens
-        llama_tokens tmp = json_prompt.get();
-        return server_tokens(tmp, false);
-    } else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) {
-        // JSON object with prompt key.
-        if (json_prompt.contains(JSON_MTMD_DATA_KEY)) {
-            if (!has_mtmd)
-                throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests.");
-
-            // JSON object with prompt and multimodal key.
-            std::vector files;
-            for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
-                files.push_back(base64_decode(entry));
-            }
-            return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files);
-        } else {
-            // Not multimodal, but contains a subobject.
-            llama_tokens tmp = tokenize_mixed(vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), add_special, parse_special);
-            return server_tokens(tmp, false);
-        }
-    } else {
-        throw std::runtime_error("\"prompt\" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens.");
-    }
-}
+
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    embd_inp.push_back(llama_vocab_fim_mid(vocab));
-
-/**
- * break the input "prompt" object into multiple prompt if needed, then tokenize them
- * this supports these cases:
- * - "prompt": "string"
- * - "prompt": [12, 34, 56]
- * - "prompt": [12, 34, "string", 56, 78]
- * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
- * and multiple prompts (multi-tasks):
- * - "prompt": ["string1", "string2"]
- * - "prompt": ["string1", [12, 34, 56]]
- * - "prompt": [[12, 34, 56], [78, 90, 12]]
- * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
- */
-static std::vector tokenize_input_prompts(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
-    std::vector result;
-    if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) {
-        result.reserve(json_prompt.size());
-        for (const auto & p : json_prompt) {
-            result.push_back(tokenize_input_subprompt(vocab, mctx, p,add_special, parse_special));
-        }
-    } else {
-        result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special));
-    }
-    if (result.empty()) {
-        throw std::runtime_error("\"prompt\" must not be empty");
-    }
-    return result;
+    return embd_inp;
 }
 
-// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
-static server_tokens format_rerank(const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, const std::string & query, const std::string & doc) {
+server_tokens format_prompt_rerank(
+    const struct llama_model * model,
+    const struct llama_vocab * vocab,
+    mtmd_context * mctx,
+    const std::string & query,
+    const std::string & doc) {
     server_tokens result = {};
 
     const char * rerank_prompt = llama_model_chat_template(model, "rerank");
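Tracing the assembly above with hypothetical inputs: for prefix "int main() {", suffix "}" and an empty prompt, the final token stream is equivalent to

    // PSM (default, spm_infill == false):
    [extra chunks][BOS][FIM_PRE]int main() {[FIM_SUF]}[FIM_MID]
    // SPM (spm_infill == true):
    [extra chunks][BOS][FIM_SUF]}[FIM_PRE]int main() {[FIM_MID]

Note that the repo-level extra context ends up before the BOS token, because the extra-context insert at the front happens after the BOS insert.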
diff --git a/llamacpp/native/src/server/server-common.h b/llamacpp/native/src/server/server-common.h
new file mode 100644
index 000000000..bb04e82b4
--- /dev/null
+++ b/llamacpp/native/src/server/server-common.h
@@ -0,0 +1,359 @@
+#pragma once
+
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "chat.h"
+#include "mtmd.h"
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+#include
+#include
+#include
+
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
+using json = nlohmann::ordered_json;
+
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+
+#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+using raw_buffer = std::vector<uint8_t>;
+
+template <typename T>
+static T json_value(const json & body, const std::string & key, const T & default_value) {
+    // Fallback null to default value
+    if (body.contains(key) && !body.at(key).is_null()) {
+        try {
+            return body.at(key);
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
+            return default_value;
+        }
+    } else {
+        return default_value;
+    }
+}
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE,         // custom error
+    ERROR_TYPE_NOT_SUPPORTED,       // custom error
+    ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error
+};
+
+// thin wrapper around common_grammar_trigger with (de)serialization functions
+struct server_grammar_trigger {
+    common_grammar_trigger value;
+
+    server_grammar_trigger() = default;
+    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+    server_grammar_trigger(const json & in) {
+        value.type  = (common_grammar_trigger_type) in.at("type").get<int>();
+        value.value = in.at("value").get<std::string>();
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            value.token = (llama_token) in.at("token").get<int>();
+        }
+    }
+
+    json to_json() const {
+        json out {
+            {"type",  (int) value.type},
+            {"value", value.value},
+        };
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            out["token"] = (int) value.token;
+        }
+        return out;
+    }
+};
+
+json format_error_response(const std::string & message, const enum error_type type);
+
+//
+// random string / id
+//
+
+std::string random_string();
+std::string gen_chatcmplid();
+std::string gen_tool_call_id();
+
+//
+// lora utils
+//
+
+// check whether the given lora set has only aloras activated (empty => false)
+bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);
+
+// if the two sets of loras are different, they require a cache clear unless the
+// change is only from aloras to aloras.
+bool lora_should_clear_cache(
+    const std::vector<common_adapter_lora_info> & current,
+    const std::vector<common_adapter_lora_info> & next);
+
+std::vector<common_adapter_lora_info> parse_lora_request(
+    const std::vector<common_adapter_lora_info> & lora_base,
+    const json & data);
+
+bool are_lora_equal(
+    const std::vector<common_adapter_lora_info> & l1,
+    const std::vector<common_adapter_lora_info> & l2);
+
+// get the ids of all enabled loras
+std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
+
+//
+// server_tokens
+//
+
+/**
+ * server_tokens is a helper to manage the input tokens and images for the server.
+ * it is made this way to simplify the logic of KV cache management.
+ */
+struct server_tokens {
+    bool has_mtmd = false;
+
+private: // disallow accessing these members directly, risking out-of-sync
+
+    // map a **start** index in tokens to the image chunk
+    // note: the order needs to be in sync with tokens
+    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
+
+    // list of tokens
+    // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by a media chunk
+    // otherwise, it is a normal text token
+    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
+    // note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
+    llama_tokens tokens;
+
+    // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
+    //     [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
+    // idx  0   1   2   3   4    5      6      7      8      9     10
+    // pos  0   1   2   3   4    5      5      5      7      7      7
+    // map_idx_to_media will contain: {5, img0}, {8, img1}
+
+public:
+    server_tokens() = default;
+    ~server_tokens() = default;
+
+    // Prevent copying
+    // TODO: server_tokens should be copyable - remove this:
+    server_tokens(const server_tokens&) = delete;
+    server_tokens& operator=(const server_tokens&) = delete;
+
+    // Allow moving (usually implicitly generated if members are movable)
+    server_tokens(server_tokens&&) = default;
+    server_tokens& operator=(server_tokens&&) = default;
+
+    // Allow accessing elements using [] operator
+    llama_token operator[](size_t index) { return tokens[index]; }
+    const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
+    server_tokens(const llama_tokens & tokens, bool has_mtmd);
+
+    // for debugging
+    std::string str() const;
+
+    llama_pos pos_next() const;
+    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
+
+    void push_back(llama_token tok);
+
+    // will create a copy of the chunk if it contains non-text data
+    void push_back(const mtmd_input_chunk * chunk);
+
+    // appends server tokens, updates the media map. copies media chunks.
+    void push_back(server_tokens & tokens);
+
+    // for compatibility with context shift and prompt truncation
+    void insert(const llama_tokens & inp_tokens);
+
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const;
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id);
+
+    size_t size() const { return tokens.size(); }
+
+    bool empty() const { return tokens.empty(); }
+
+    void clear() {
+        map_idx_to_media.clear();
+        tokens.clear();
+    }
+
+    void keep_first(size_t n);
+
+    std::string detokenize(const llama_context * ctx, bool special) const;
+
+    size_t get_common_prefix(const server_tokens & b) const;
+
+    // make sure all text tokens are within the vocab range
+    bool validate(const struct llama_context * ctx) const;
+
+    // encode and decode the image chunk
+    int32_t process_chunk(
+        llama_context * ctx,
+        mtmd_context * mctx,
+        size_t idx,
+        llama_pos pos,
+        int32_t seq_id,
+        size_t & n_tokens_out) const;
+};
+
+
+//
+// tokenizer and input processing utils
+//
+
+bool json_is_array_of_numbers(const json & data);
+
+// does the array contain BOTH numbers & strings?
+bool json_is_array_of_mixed_numbers_strings(const json & data);
+
+// does the array contain any individual integers/tokens?
+bool json_is_array_and_contains_numbers(const json & data);
+
+// get value by path (key1 / key2)
+json json_get_nested_values(const std::vector<std::string> & paths, const json & js);
+
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);
+
+// return the last index of character that can form a valid string
+// if the last character is potentially cut in half, return the index before the cut
+// if validate_utf8(text) == text.size(), then the whole text is valid utf8
+size_t validate_utf8(const std::string& text);
+
+// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
+server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
+
+/**
+ * break the input "prompt" object into multiple prompts if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, 56], [78, 90, 12]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
+ */
+std::vector<server_tokens> tokenize_input_prompts(
+    const llama_vocab * vocab,
+    mtmd_context * mctx,
+    const json & json_prompt,
+    bool add_special,
+    bool parse_special);
+
+//
+// OAI utils
+//
+
+// used by /completions endpoint
+json oaicompat_completion_params_parse(const json & body);
+
+struct oaicompat_parser_options {
+    bool use_jinja;
+    bool prefill_assistant;
+    common_reasoning_format reasoning_format;
+    std::map<std::string, std::string> chat_template_kwargs;
+    common_chat_templates * tmpls;
+    bool allow_image;
+    bool allow_audio;
+    bool enable_thinking = true;
+    std::string media_path;
+};
+
+// used by /chat/completions endpoint
+json oaicompat_chat_params_parse(
+    json & body, /* openai api json semantics */
+    const oaicompat_parser_options & opt,
+    std::vector<raw_buffer> & out_files);
+
+// convert Anthropic Messages API format to OpenAI Chat Completions API format
+json convert_anthropic_to_oai(const json & body);
+
+// TODO: move it to server-task.cpp
+json format_embeddings_response_oaicompat(
+    const json & request,
+    const std::string & model_name,
+    const json & embeddings,
+    bool use_base64 = false);
+
+// TODO: move it to server-task.cpp
+json format_response_rerank(
+    const json & request,
+    const std::string & model_name,
+    const json & ranks,
+    bool is_tei_format,
+    std::vector<std::string> & texts,
+    int top_n);
+
+//
+// other utils
+//
+
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
+
+std::string safe_json_to_str(const json & data);
+
+std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
+
+// format incomplete utf-8 multibyte character for output
+std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);
+
+// format server-sent event (SSE), return the formatted string to send
+// note: if data is a json array, it will be sent as multiple events, one per item
+std::string format_oai_sse(const json & data);
+
+// format Anthropic-style SSE with event types
+std::string format_anthropic_sse(const json & data);
+
+bool is_valid_utf8(const std::string & str);
+
+//
+// formatting output responses
+// TODO: move these to server-task.cpp
+//
+
+llama_tokens format_prompt_infill(
+    const llama_vocab * vocab,
+    const json & input_prefix,
+    const json & input_suffix,
+    const json & input_extra,
+    const int n_batch,
+    const int n_predict,
+    const int n_ctx,
+    const bool spm_infill,
+    const llama_tokens & tokens_prompt);
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
+server_tokens format_prompt_rerank(
+    const struct llama_model * model,
+    const struct llama_vocab * vocab,
+    mtmd_context * mctx,
+    const std::string & query,
+    const std::string & doc);
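validate_utf8() exists for streaming: a partial generation chunk can end in the middle of a multibyte sequence, and the server must hold those bytes back rather than emit invalid UTF-8. A minimal usage sketch (take_sendable_prefix is a hypothetical helper, not part of this API):

    // returns the longest valid-UTF-8 prefix of `pending` and keeps the cut-off tail
    static std::string take_sendable_prefix(std::string & pending) {
        const size_t n = validate_utf8(pending); // index before any half-transmitted character
        std::string out = pending.substr(0, n);
        pending.erase(0, n);
        return out;
    }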
diff --git a/llamacpp/native/src/server/server-context.cpp b/llamacpp/native/src/server/server-context.cpp
new file mode 100644
index 000000000..c92457457
--- /dev/null
+++ b/llamacpp/native/src/server/server-context.cpp
@@ -0,0 +1,3637 @@
+#include "server-context.h"
+#include "server-common.h"
+#include "server-http.h"
+#include "server-task.h"
+#include "server-queue.h"
+
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+
+#include
+#include
+#include
+#include
+#include
+
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#    define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+using json = nlohmann::ordered_json;
+
+constexpr int HTTP_POLLING_SECONDS = 1;
+
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
+enum slot_state {
+    SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
+};
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,         // Server is ready and model is loaded
+};
+
+static bool server_task_type_need_embd(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_EMBEDDING:
+        case SERVER_TASK_TYPE_RERANK:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool server_task_type_need_logits(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_COMPLETION:
+        case SERVER_TASK_TYPE_INFILL:
+            return true;
+        default:
+            return false;
+    }
+}
+
+struct server_slot {
+    int id;
+
+    llama_batch batch_spec = {};
+
+    // TODO: change to unique_ptrs for consistency:
+    llama_context * ctx = nullptr;
+    llama_context * ctx_dft = nullptr;
+
+    // multimodal
+    mtmd_context * mctx = nullptr;
+
+    common_speculative * spec = nullptr;
+
+    std::unique_ptr<server_task> task;
+    std::unique_ptr<server_task> task_prev; // used for debugging
+
+    // used to determine the slot that has been used the longest
+    int64_t t_last_used = -1;
+
+    // generation props
+    int32_t n_ctx       = 0; // context size per slot
+    int32_t n_keep      = 0;
+    int32_t n_decoded   = 0;
+    int32_t n_remaining = -1;
+    int32_t i_batch     = -1;
+
+    int32_t n_prompt_tokens_cache     = 0;
+    int32_t n_prompt_tokens_processed = 0;
+
+    size_t last_nl_pos = 0;
+
+    std::string  generated_text;
+    llama_tokens generated_tokens;
+
+    common_chat_msg chat_msg;
+
+    std::vector<completion_token_output> generated_token_probs;
+
+    bool has_next_token = true;
+    bool has_new_line   = false;
+    bool truncated      = false;
+
+    stop_type stop;
+
+    std::string stopping_word;
+
+    // state
+    slot_state state = SLOT_STATE_IDLE;
+
+    server_prompt prompt;
+
+    void prompt_save(server_prompt_cache & prompt_cache) const {
+        GGML_ASSERT(prompt.data.size() == 0);
+
+        const size_t cur_size = llama_state_seq_get_size_ext(ctx, id, 0);
+
+        SRV_WRN(" - saving prompt with length %d, total state size = %.3f MiB\n",
+                (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0));
+
+        auto * cur = prompt_cache.alloc(prompt, cur_size);
+        if (cur == nullptr) {
+            return;
+        }
+
+        llama_state_seq_get_data_ext(ctx, cur->data.data(), cur_size, id, 0);
+    }
+
+    bool prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
+        bool res = prompt_cache.load(prompt, tokens, ctx, id);
+        if (!res) {
+            SLT_WRN(*this, "%s", "failed to load prompt from cache\n");
+        }
+
+        return res;
+    }
+
+    std::vector<common_adapter_lora_info> lora;
+    int32_t alora_invocation_start = -1;
+
+    // sampling
+    json json_schema;
+
+    struct common_sampler * smpl = nullptr;
+
+    llama_token sampled;
+
+    common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    std::vector<std::string> generated_tool_call_ids;
+
+    // stats
+    size_t n_sent_text = 0; // number of sent text characters
+
+    int64_t t_start_process_prompt;
+    int64_t t_start_generation;
+
+    double t_prompt_processing; // ms
+    double t_token_generation;  // ms
+
+    std::function<void(int)> callback_on_release;
+
+    // Speculative decoding stats
+    int32_t n_draft_total = 0;    // Total draft tokens generated
+    int32_t n_draft_accepted = 0; // Draft tokens actually accepted
+
+    void reset() {
+        SLT_DBG(*this, "%s", "\n");
+
+        n_prompt_tokens_cache = 0;
+
+        last_nl_pos    = 0;
+        generated_text = "";
+        has_new_line   = false;
+        truncated      = false;
+        stop           = STOP_TYPE_NONE;
+        stopping_word  = "";
+        n_sent_text    = 0;
+        chat_format    = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+
+        generated_tokens.clear();
+        generated_token_probs.clear();
+        chat_msg = {};
+        json_schema = json();
+        generated_tool_call_ids.clear();
+
+        // clear speculative decoding stats
+        n_draft_total = 0;
+        n_draft_accepted = 0;
+
+        task.reset();
+        task_prev.reset();
+
+        // clear alora start
+        alora_invocation_start = -1;
+    }
+
+    bool need_embd() const {
+        GGML_ASSERT(task);
+
+        return server_task_type_need_embd(task->type);
+    }
+
+    bool need_logits() const {
+        GGML_ASSERT(task);
+
+        return server_task_type_need_logits(task->type);
+    }
+
+    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+    // also we cannot split if the pooling would require any past tokens
+    bool can_split() const {
+        return
+            !need_embd() ||
+            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
+    }
+
+    bool can_batch_with(server_slot & other_slot) const {
+        GGML_ASSERT(task);
+
+        return task->type == other_slot.task->type && are_lora_equal(lora, other_slot.lora);
+    }
+
+    bool has_budget(const common_params & global_params) {
+        GGML_ASSERT(task);
+
+        if (task->params.n_predict == -1 && global_params.n_predict == -1) {
+            return true; // limitless
+        }
+
+        n_remaining = -1;
+
+        if (task->params.n_predict != -1) {
+            n_remaining = task->params.n_predict - n_decoded;
+        } else if (global_params.n_predict != -1) {
+            n_remaining = global_params.n_predict - n_decoded;
+        }
+
+        return n_remaining > 0; // false means no budget left
+    }
+
+    bool is_processing() const {
+        return state != SLOT_STATE_IDLE;
+    }
+
+    bool can_speculate() const {
+        return ctx_dft;
+    }
+
+    void add_token(const completion_token_output & token) {
+        if (!is_processing()) {
+            SLT_WRN(*this, "%s", "slot is not processing\n");
+            return;
+        }
+        generated_token_probs.push_back(token);
+    }
+
+    void release() {
+        if (is_processing()) {
+            GGML_ASSERT(task);
+
+            SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);
+
+            t_last_used = ggml_time_us();
+            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
+            state = SLOT_STATE_IDLE;
+
+            task_prev = std::move(task);
+            task.reset();
+
+            callback_on_release(id);
+        }
+    }
+
+    result_timings get_timings() const {
+        result_timings timings;
+        timings.cache_n = n_prompt_tokens_cache;
+
+        timings.prompt_n            = n_prompt_tokens_processed;
+        timings.prompt_ms           = t_prompt_processing;
+        timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
+        timings.prompt_per_second   = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+        timings.predicted_n            = n_decoded;
+        timings.predicted_ms           = t_token_generation;
+        timings.predicted_per_token_ms = t_token_generation / n_decoded;
+        timings.predicted_per_second   = 1e3 / t_token_generation * n_decoded;
+
+        // Add speculative metrics
+        if (n_draft_total > 0) {
+            timings.draft_n          = n_draft_total;
+            timings.draft_n_accepted = n_draft_accepted;
+        }
+
+        return timings;
+    }
+
+    const common_chat_msg & update_chat_msg(std::vector<common_chat_msg_diff> & diffs) {
+        GGML_ASSERT(task);
+
+        auto previous_msg = chat_msg;
+        SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
+        auto new_msg = common_chat_parse(
+            generated_text,
+            /* is_partial= */ stop != STOP_TYPE_EOS,
+            task->params.oaicompat_chat_syntax);
+        if (!new_msg.empty()) {
+            new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
+            chat_msg = new_msg;
+            diffs = common_chat_msg_diff::compute_diffs(previous_msg, new_msg.empty() ? previous_msg : new_msg);
+        }
+        return chat_msg;
+    }
+
+    size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) {
+        GGML_ASSERT(task);
+
+        size_t stop_pos = std::string::npos;
+
+        for (const std::string & word : task->params.antiprompt) {
+            size_t pos;
+
+            if (is_full_stop) {
+                const size_t tmp      = word.size() + last_token_size;
+                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
+
+                pos = text.find(word, from_pos);
+            } else {
+                // otherwise, partial stop
+                pos = string_find_partial_stop(text, word);
+            }
+
+            if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
+                if (is_full_stop) {
+                    stop           = STOP_TYPE_WORD;
+                    stopping_word  = word;
+                    has_next_token = false;
+                }
+                stop_pos = pos;
+            }
+        }
+
+        return stop_pos;
+    }
+
+    void print_timings() const {
+        const double t_prompt        = t_prompt_processing / n_prompt_tokens_processed;
+        const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+        const double t_gen        = t_token_generation / n_decoded;
+        const double n_gen_second = 1e3 / t_token_generation * n_decoded;
+
+        SLT_INF(*this,
+                "\n"
+                "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "      total time = %10.2f ms / %5d tokens\n",
+                t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
+                t_token_generation, n_decoded, t_gen, n_gen_second,
+                t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
+
+        if (n_draft_total > 0) {
+            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            SLT_INF(*this,
+                    "\n"
+                    "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total
+            );
+        }
+    }
+
+    json to_json(bool only_metrics = false) const {
+        json res;
+
+        res = {
+            {"id",            id},
+            {"n_ctx",         n_ctx},
+            {"speculative",   can_speculate()},
+            {"is_processing", is_processing()},
+        };
+
+        const auto & ptask = task ? task : task_prev;
+
+        if (ptask) {
+            res["id_task"] = ptask->id;
+            res["params"]  = ptask->params.to_json(only_metrics);
+            res["next_token"] = {
+                {
+                    {"has_next_token", has_next_token},
+                    {"has_new_line",   has_new_line},
+                    {"n_remain",       n_remaining},
+                    {"n_decoded",      n_decoded},
+                }
+            };
+
+            if (!only_metrics) {
+                res["prompt"]    = ptask->tokens.detokenize(ctx, true);
+                res["generated"] = generated_text;
+            }
+        }
+
+        return res;
+    }
+};
+
+
+
+//
+// server_metrics
+//
+
+struct server_metrics {
+    int64_t t_start = 0;
+
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total       = 0;
+    uint64_t n_tokens_predicted_total        = 0;
+    uint64_t t_tokens_generation_total       = 0;
+
+    uint64_t n_tokens_max = 0;
+
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing       = 0;
+
+    uint64_t n_tokens_predicted  = 0;
+    uint64_t t_tokens_generation = 0;
+
+    uint64_t n_decode_total     = 0;
+    uint64_t n_busy_slots_total = 0;
+
+    void init() {
+        t_start = ggml_time_us();
+    }
+
+    void on_prompt_eval(const server_slot & slot) {
+        n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
+        n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
+        t_prompt_processing             += slot.t_prompt_processing;
+        t_prompt_processing_total       += slot.t_prompt_processing;
+
+        n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens());
+    }
+
+    void on_prediction(const server_slot & slot) {
+        n_tokens_predicted_total  += slot.n_decoded;
+        n_tokens_predicted        += slot.n_decoded;
+        t_tokens_generation       += slot.t_token_generation;
+        t_tokens_generation_total += slot.t_token_generation;
+    }
+
+    void on_decoded(const std::vector<server_slot> & slots) {
+        n_decode_total++;
+        for (const auto & slot : slots) {
+            if (slot.is_processing()) {
+                n_busy_slots_total++;
+            }
+            n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens());
+        }
+    }
+
+    void reset_bucket() {
n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; + + +// +// server_context_impl (private implementation) +// + +struct server_context_impl { + common_params params_base; + + // note: keep these alive - they determine the lifetime of the model, context, etc. + common_init_result llama_init; + common_init_result llama_init_dft; + + llama_model * model = nullptr; + llama_context * ctx = nullptr; + + // multimodal + mtmd_context * mctx = nullptr; + + const llama_vocab * vocab = nullptr; + bool vocab_dft_compatible = true; + + llama_model * model_dft = nullptr; + + llama_context_params cparams_dft; + + llama_batch batch {}; + + bool add_bos_token = true; + + int32_t n_ctx; // total context for all clients / slots + + // slots / clients + std::vector slots; + + int slots_debug = 0; + + server_queue queue_tasks; + server_response queue_results; + + std::unique_ptr prompt_cache; + + server_metrics metrics; + + // Necessary similarity of prompt for slot selection + float slot_prompt_similarity = 0.0f; + + std::string model_name; // name of the loaded model, to be used by API + + common_chat_templates_ptr chat_templates; + oaicompat_parser_options oai_parser_opt; + + ~server_context_impl() { + mtmd_free(mctx); + + // Clear any sampling context + for (server_slot & slot : slots) { + common_sampler_free(slot.smpl); + slot.smpl = nullptr; + + llama_free(slot.ctx_dft); + slot.ctx_dft = nullptr; + + common_speculative_free(slot.spec); + slot.spec = nullptr; + + llama_batch_free(slot.batch_spec); + } + + llama_batch_free(batch); + } + + // load the model and initialize llama_context + bool load_model(const common_params & params) { + SRV_INF("loading model '%s'\n", params.model.path.c_str()); + + params_base = params; + + llama_init = common_init_from_params(params_base); + + model = llama_init.model.get(); + ctx = llama_init.context.get(); + + if (model == nullptr) { + SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); + return false; + } + + vocab = llama_model_get_vocab(model); + + n_ctx = llama_n_ctx(ctx); + + add_bos_token = llama_vocab_get_add_bos(vocab); + + if (params_base.has_speculative()) { + SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str()); + + auto params_dft = params_base; + + params_dft.devices = params_base.speculative.devices; + params_dft.model = params_base.speculative.model; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx; + params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; + params_dft.n_parallel = 1; + params_dft.cache_type_k = params_base.speculative.cache_type_k; + params_dft.cache_type_v = params_base.speculative.cache_type_v; + + params_dft.cpuparams.n_threads = params_base.speculative.cpuparams.n_threads; + params_dft.cpuparams_batch.n_threads = params_base.speculative.cpuparams_batch.n_threads; + params_dft.tensor_buft_overrides = params_base.speculative.tensor_buft_overrides; + + llama_init_dft = common_init_from_params(params_dft); + + model_dft = llama_init_dft.model.get(); + + if (model_dft == nullptr) { + SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); + return false; + } + + vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); + if (!vocab_dft_compatible) { + SRV_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); + } + + const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); + + cparams_dft = common_context_params_to_llama(params_dft); + cparams_dft.n_batch = n_ctx_dft; + + // the context is not needed - we will create one for each slot + llama_init_dft.context.reset(); + } + + chat_templates = common_chat_templates_init(model, params_base.chat_template); + try { + common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs); + } catch (const std::exception & e) { + SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); + SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); + chat_templates = common_chat_templates_init(model, "chatml"); + } + + std::string & mmproj_path = params_base.mmproj.path; + if (!mmproj_path.empty()) { + mtmd_helper_log_set(common_log_default_callback, nullptr); + + mtmd_context_params mparams = mtmd_context_params_default(); + mparams.use_gpu = params_base.mmproj_use_gpu; + mparams.print_timings = false; + mparams.n_threads = params_base.cpuparams.n_threads; + mparams.flash_attn_type = params_base.flash_attn_type; + mparams.warmup = params_base.warmup; + mparams.image_min_tokens = params_base.image_min_tokens; + mparams.image_max_tokens = params_base.image_max_tokens; + mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); + if (mctx == nullptr) { + SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); + return false; + } + SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str()); + + if (params_base.ctx_shift) { + params_base.ctx_shift = false; + SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); + } + + if (params_base.n_cache_reuse) { + params_base.n_cache_reuse = 0; + SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); + } + + if (params_base.has_speculative()) { + SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal"); + return false; + } + } + + if (!llama_memory_can_shift(llama_get_memory(ctx))) { + if (params_base.ctx_shift) { + params_base.ctx_shift = false; + SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled"); + } + + if (params_base.n_cache_reuse) { + params_base.n_cache_reuse = 0; + SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled"); + } + } + + return true; + } + + // initialize slots and server-related data + void init() { + // wiring up server queues + queue_tasks.on_new_task([this](server_task && task) { + process_single_task(std::move(task)); + }); + queue_tasks.on_update_slots([this]() { + update_slots(); + }); + + // Necessary similarity of prompt for slot selection + slot_prompt_similarity = params_base.slot_prompt_similarity; + + // setup slots + SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); + + const int n_ctx_train = llama_model_n_ctx_train(model); + + int n_ctx_slot = llama_n_ctx_seq(ctx); + if (n_ctx_slot > n_ctx_train) { + SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); + n_ctx_slot = n_ctx_train; + } + + for (int i = 0; i < params_base.n_parallel; i++) { + server_slot slot; + + slot.id = i; + slot.ctx = ctx; + slot.n_ctx = n_ctx_slot; + slot.mctx = 
mctx; + slot.prompt.tokens.has_mtmd = mctx != nullptr; + + if (model_dft) { + slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); + + // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] + slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); + if (slot.ctx_dft == nullptr) { + SRV_ERR("%s", "failed to create draft context\n"); + return; + } + + slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft); + if (slot.spec == nullptr) { + SRV_ERR("%s", "failed to create speculator\n"); + return; + } + for (auto & pair : params_base.speculative.replacements) { + common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); + } + } + + SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); + + slot.callback_on_release = [this](int) { + queue_tasks.pop_deferred_task(); + }; + + slot.reset(); + + slots.push_back(std::move(slot)); + } + + { + const char * LLAMA_SERVER_SLOTS_DEBUG = getenv("LLAMA_SERVER_SLOTS_DEBUG"); + slots_debug = LLAMA_SERVER_SLOTS_DEBUG ? atoi(LLAMA_SERVER_SLOTS_DEBUG) : 0; + + if (slots_debug) { + SRV_WRN("slots debug = %d\n", slots_debug); + } + } + + // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens + // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) + { + const int32_t n_batch = llama_n_batch(ctx); + batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + } + + metrics.init(); + + if (params_base.cache_ram_mib != 0) { + if (params_base.cache_ram_mib < 0) { + SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit"); + } else { + SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib); + } + SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n"); + + prompt_cache = std::make_unique(params_base.cache_ram_mib, n_ctx); + } else { + SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n"); + } + SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); + + if (!params_base.model_alias.empty()) { + // user explicitly specified model name + model_name = params_base.model_alias; + } else if (!params_base.model.name.empty()) { + // use model name in registry format (for models in cache) + model_name = params_base.model.name; + } else { + // fallback: derive model name from file name + auto model_path = std::filesystem::path(params_base.model.path); + model_name = model_path.filename().string(); + } + + // thinking is enabled if: + // 1. It's not explicitly disabled (reasoning_budget == 0) + // 2. The chat template supports it + const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); + SRV_INF("thinking = %d\n", enable_thinking); + + oai_parser_opt = { + /* use_jinja */ params_base.use_jinja, + /* prefill_assistant */ params_base.prefill_assistant, + /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, + /* common_chat_templates */ chat_templates.get(), + /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, + /* allow_audio */ mctx ? 
mtmd_support_audio (mctx) : false, + /* enable_thinking */ enable_thinking, + /* media_path */ params_base.media_path, + }; + + // print sample chat example to make it clear which template is used + LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, + common_chat_templates_source(chat_templates.get()), + common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); + } + + server_slot * get_slot_by_id(int id) { + for (server_slot & slot : slots) { + if (slot.id == id) { + return &slot; + } + } + + return nullptr; + } + + server_slot * get_available_slot(const server_task & task) { + server_slot * ret = nullptr; + + bool update_cache = false; + + // find the slot that has at least n% prompt similarity + if (ret == nullptr && slot_prompt_similarity != 0.0f) { + float sim_best = 0; + + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (slot.is_processing()) { + continue; + } + + const auto & tokens = slot.prompt.tokens; + + // skip the slot if it does not contain cached tokens + if (tokens.empty()) { + continue; + } + + // fraction of the Longest Common Prefix length with respect to the input prompt length + const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size(); + + // select the current slot if the criteria match + if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) { + sim_best = sim_cur; + + ret = &slot; + } + } + + if (ret != nullptr) { + const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size(); + + SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n", + sim_best, slot_prompt_similarity, f_keep); + + // if we are about to lose a large portion of the existing context - save it in the prompt cache + if (f_keep < 0.5f) { + update_cache = true; + } + } + } + + // find the slot that has been least recently used + if (ret == nullptr) { + int64_t t_last = -1; + + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (slot.is_processing()) { + continue; + } + + // select the current slot if the criteria match + if (!ret || slot.t_last_used <= t_last) { + t_last = slot.t_last_used; + ret = &slot; + } + } + + if (ret != nullptr) { + SLT_INF(*ret, "selected slot by LRU, t_last = %" PRId64 "\n", t_last); + + update_cache = true; + } + } + + if (ret) { + const auto & tokens = ret->prompt.tokens; + + update_cache = update_cache && prompt_cache; + + // cache prompts only for completion tasks + update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION; + + // don't update the cache if the slot's context is empty + update_cache = update_cache && tokens.size() > 0; + + // TODO: mtmd does not support prompt cache + update_cache = update_cache && (ret->mctx == nullptr); + + if (update_cache) { + SRV_WRN("%s", "updating prompt cache\n"); + + const int64_t t_start = ggml_time_us(); + + ret->prompt_save(*prompt_cache); + + if (!ret->prompt_load(*prompt_cache, task.tokens)) { + clear_slot(*ret); + } + + prompt_cache->update(); + + SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0); + } + } + + return ret; + } + + void clear_slot(server_slot & slot) const { + GGML_ASSERT(!slot.is_processing()); + + SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size()); + + llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); + slot.prompt.tokens.clear(); + } + + // return true if at least one slot has 
been cleared + // TODO: improve logic + // - smarter decision which slot to clear (LRU or longest prompt?) + // - move slot to level 2 cache instead of removing? + // - instead of purging, try to store and resume later? + bool try_clear_idle_slots() { + bool res = false; + + if (!params_base.kv_unified) { + return res; + } + + for (auto & slot : slots) { + if (slot.is_processing()) { + continue; + } + + if (slot.prompt.n_tokens() > 0) { + SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size()); + + clear_slot(slot); + + res = true; + + // clear slots one by one + break; + } + } + + return res; + } + + bool launch_slot_with_task(server_slot & slot, server_task && task) { + slot.reset(); + + if (!are_lora_equal(task.params.lora, slot.lora)) { + // if lora has changed, check to see if the cache should be cleared + if (lora_should_clear_cache(slot.lora, task.params.lora)) { + SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size()); + slot.prompt.tokens.clear(); + } else { + SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task.params.lora.size()); + } + slot.lora = task.params.lora; + } + + // if using alora, make sure it's only a single one requested and active + size_t alora_invocation_start = task.tokens.size(); + if (lora_all_alora(slot.lora)) { + const auto & enabled_ids = lora_get_enabled_ids(slot.lora); + // TODO: This will error out if a user requests two aloras, but only + // provides the activation string for one. We could, instead search + // for all requested alora activation strings and then either keep + // only the last one, or reject if multiple are found. + if (enabled_ids.size() != 1) { + send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST); + return false; + } + const auto & lora = slot.lora[enabled_ids[0]].ptr; + + // get the pointer and count for the invocation tokens + const uint64_t n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora); + const llama_token * invocation_tokens = llama_adapter_get_alora_invocation_tokens (lora); + + // scan backwards through the prompt tokens to find the last + // occurrence of the invocation sequence + int match_idx = static_cast(n_invocation_tokens) - 1; + for (int i = task.tokens.size() - 1; i >= 0; --i) { + // the token in this position matches the next token to find in + // the invocation sequence + if (task.tokens[i] == invocation_tokens[match_idx]) { + // if it's a full match, we've found the start + if (match_idx == 0) { + alora_invocation_start = i; + break; + } + // otherwise, check the next token in the sequence + --match_idx; + } else { + // no match in this position, so start looking over again + match_idx = static_cast(n_invocation_tokens) - 1; + } + } + + // if the activation string is not found, disable the alora + if (alora_invocation_start == task.tokens.size()) { + SLT_DBG(slot, "alora %zu requested, but not found. 
deactivating\n", enabled_ids[0]); + slot.lora[enabled_ids[0]].scale = 0.0f; + } else { + SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start); + slot.alora_invocation_start = alora_invocation_start; + } + } + + if (!task.tokens.validate(ctx)) { + send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); + + // initialize samplers + { + if (slot.smpl != nullptr) { + common_sampler_free(slot.smpl); + } + + slot.smpl = common_sampler_init(model, task.params.sampling); + if (slot.smpl == nullptr) { + // for now, the only error that may happen here is invalid grammar + send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str()); + } + + // initialize draft batch + // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] + if (slot.ctx_dft) { + llama_batch_free(slot.batch_spec); + + slot.batch_spec = llama_batch_init(task.params.speculative.n_max + 1, 0, 1); + } + + slot.task = std::make_unique<server_task>(std::move(task)); + + slot.state = SLOT_STATE_STARTED; + + SLT_INF(slot, "%s", "processing task\n"); + + return true; + } + + bool process_token(completion_token_output & result, server_slot & slot) { + // remember which tokens were sampled - used for repetition penalties during sampling + const std::string token_str = result.text_to_send; + slot.sampled = result.tok; + + slot.generated_text += token_str; + if (slot.task->params.return_tokens) { + slot.generated_tokens.push_back(result.tok); + } + slot.has_next_token = true; + + // check if there is an incomplete UTF-8 character at the end + bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size(); + + // search stop word and delete it + if (!incomplete) { + size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); + + const std::string str_test = slot.generated_text.substr(pos); + bool send_text = true; + + size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); + if (stop_pos != std::string::npos) { + slot.generated_text.erase( + slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.n_sent_text, slot.generated_text.size()); + } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok)) { + stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); + send_text = stop_pos == std::string::npos; + } + + // check if there is any token to predict + if (send_text) { + // do not send the stop word in the response + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.n_sent_text += result.text_to_send.size(); + // add the token to slot queue and cache + } else { + result.text_to_send = ""; + } + + slot.add_token(result); + if (slot.task->params.stream) { + send_partial_response(slot, result, false); + } + } + + if (incomplete) { + slot.has_next_token = true; + } + + // if context shifting is disabled, make sure that we don't run out of context + if (!params_base.ctx_shift && slot.prompt.n_tokens() + 1 >= slot.n_ctx) { + slot.truncated = true; + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped due to running out of context capacity, prompt.n_tokens() = %d, task.n_tokens = %d, n_decoded = %d, n_ctx = %d\n", + slot.prompt.n_tokens(), slot.task->n_tokens(), slot.n_decoded, slot.n_ctx); + } + 
+ // check the limits + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.task->params.n_predict); + } + + if (slot.has_new_line) { + // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent + if (slot.task->params.n_indent > 0) { + // check the current indentation + // TODO: improve by not doing it more than once for each new line + if (slot.last_nl_pos > 0) { + size_t pos = slot.last_nl_pos; + + int n_indent = 0; + while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) { + n_indent++; + pos++; + } + + if (pos < slot.generated_text.size() && n_indent < slot.task->params.n_indent) { + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + // cut the last line + slot.generated_text.erase(pos, std::string::npos); + + SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent); + } + } + + // find the next new line + { + const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos); + + if (pos != std::string::npos) { + slot.last_nl_pos = pos + 1; + } + } + } + } + + // check if there is a new line in the generated text + if (result.text_to_send.find('\n') != std::string::npos) { + slot.has_new_line = true; + + // if we have seen a new line, we stop after a certain time limit, but only upon another new line + if (slot.task->params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.task->params.t_max_predict_ms)) { + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.task->params.t_max_predict_ms); + } + } + + if (llama_vocab_is_eog(vocab, result.tok)) { + slot.stop = STOP_TYPE_EOS; + slot.has_next_token = false; + + SLT_DBG(slot, "%s", "stopped by EOS\n"); + } + + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); + + return slot.has_next_token; // continue + } + + void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const { + size_t n_probs = slot.task->params.sampling.n_probs; + size_t n_vocab = llama_vocab_n_tokens(vocab); + + if (post_sampling) { + const auto * cur_p = common_sampler_get_candidates(slot.smpl, true); + const size_t max_probs = cur_p->size; + + // set probability for sampled token + for (size_t i = 0; i < max_probs; i++) { + if (cur_p->data[i].id == result.tok) { + result.prob = cur_p->data[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(max_probs); + for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { + result.probs.push_back({ + cur_p->data[i].id, + common_token_to_piece(ctx, cur_p->data[i].id, special), + cur_p->data[i].p + }); + } + } else { + // TODO: optimize this with min-p optimization + std::vector cur = get_token_probabilities(ctx, idx); + + // set probability for sampled token + for (size_t i = 0; i < n_vocab; i++) { + // set probability for sampled token + if (cur[i].id == result.tok) { + result.prob = cur[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(n_probs); + for (size_t i = 0; i < std::min(n_vocab, 
n_probs); i++) { + result.probs.push_back({ + cur[i].id, + common_token_to_piece(ctx, cur[i].id, special), + cur[i].p + }); + } + } + } + + void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + send_error(task.id, error, type); + } + + void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + send_error(slot.task->id, error, type, slot.task->n_tokens(), slot.n_ctx); + } + + void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER, const int32_t n_prompt_tokens = 0, const int32_t n_ctx = 0) { + SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); + + if (type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { + GGML_ASSERT(n_ctx > 0 && n_prompt_tokens > 0); + } + + auto res = std::make_unique(); + res->id = id_task; + res->err_type = type; + res->err_msg = error; + res->n_prompt_tokens = n_prompt_tokens; + res->n_ctx = n_ctx; + + queue_results.send(std::move(res)); + } + + // if multimodal is enabled, send an error and return false + bool check_no_mtmd(const int id_task) { + if (mctx) { + send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); + return false; + } + return true; + } + + void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) { + auto res = std::make_unique(); + + res->id = slot.task->id; + res->index = slot.task->index; + + if (is_progress) { + res->is_progress = true; + res->progress.total = slot.task->n_tokens(); + res->progress.cache = slot.n_prompt_tokens_cache; + res->progress.processed = slot.prompt.tokens.size(); + res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; + } else { + res->content = tkn.text_to_send; + res->tokens = { tkn.tok }; + + slot.update_chat_msg(res->oaicompat_msg_diffs); + } + + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.task->n_tokens(); + res->post_sampling_probs = slot.task->params.post_sampling_probs; + + res->verbose = slot.task->params.verbose; + res->res_type = slot.task->params.res_type; + res->oaicompat_model = slot.task->params.oaicompat_model; + res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; + + // populate res.probs_output + if (slot.task->params.sampling.n_probs > 0) { + res->prob_output = tkn; // copy the token probs + } + + // populate timings if this is final response or timings_per_token is enabled + if (slot.stop != STOP_TYPE_NONE || slot.task->params.timings_per_token) { + res->timings = slot.get_timings(); + } + + queue_results.send(std::move(res)); + } + + void send_final_response(server_slot & slot) { + auto res = std::make_unique(); + + res->id = slot.task->id; + res->id_slot = slot.id; + + res->index = slot.task->index; + res->content = slot.generated_text; + res->tokens = std::move(slot.generated_tokens); + res->timings = slot.get_timings(); + res->prompt = slot.task->tokens.detokenize(ctx, true); + res->response_fields = std::move(slot.task->params.response_fields); + + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.task->n_tokens(); + res->n_tokens_cached = slot.prompt.n_tokens(); + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + res->post_sampling_probs = slot.task->params.post_sampling_probs; + + res->verbose = slot.task->params.verbose; + res->stream = slot.task->params.stream; + 
res->include_usage = slot.task->params.include_usage; + res->res_type = slot.task->params.res_type; + res->oaicompat_model = slot.task->params.oaicompat_model; + res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; + res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs); + + // populate res.probs_output + if (slot.task->params.sampling.n_probs > 0) { + if (!slot.task->params.stream && slot.stop == STOP_TYPE_WORD) { + const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); + + size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); + res->probs_output = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end() - safe_offset); + } else { + res->probs_output = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end()); + } + } + + res->generation_params = slot.task->params; // copy the parameters + + queue_results.send(std::move(res)); + } + + void send_embedding(const server_slot & slot, const llama_batch & batch) { + auto res = std::make_unique(); + res->id = slot.task->id; + res->index = slot.task->index; + res->n_tokens = slot.task->n_tokens(); + res->res_type = slot.task->params.res_type; + + const int n_embd = llama_model_n_embd(model); + + std::vector embd_res(n_embd, 0.0f); + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float * embd = nullptr; + if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) { + embd = llama_get_embeddings_ith(ctx, i); + } else { + embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + } + + if (embd == nullptr) { + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); + + res->embedding.push_back(std::vector(n_embd, 0.0f)); + continue; + } + + // normalize only when there is pooling + if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { + common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize); + res->embedding.push_back(embd_res); + break; + } + + res->embedding.emplace_back(embd, embd + n_embd); + } + + SLT_DBG(slot, "%s", "sending embeddings\n"); + + queue_results.send(std::move(res)); + } + + void send_rerank(const server_slot & slot, const llama_batch & batch) { + auto res = std::make_unique(); + res->id = slot.task->id; + res->index = slot.task->index; + res->n_tokens = slot.task->n_tokens(); + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } + + if (embd == NULL) { + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); + + res->score = -1e6; + continue; + } + + res->score = embd[0]; + } + + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); + + queue_results.send(std::move(res)); + } + + // + // Functions to process the task + // + + void process_single_task(server_task && task) { + switch (task.type) { + case SERVER_TASK_TYPE_COMPLETION: + case SERVER_TASK_TYPE_INFILL: + case SERVER_TASK_TYPE_EMBEDDING: + case SERVER_TASK_TYPE_RERANK: + { + const int id_slot = task.id_slot; + + server_slot * slot = id_slot != -1 ? 
get_slot_by_id(id_slot) : get_available_slot(task); + + if (slot == nullptr) { + // if no slot is available, we defer this task for processing later + SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + if (!launch_slot_with_task(*slot, std::move(task))) { + SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); + break; + } + } break; + case SERVER_TASK_TYPE_CANCEL: + { + // release slot linked with the task id + for (auto & slot : slots) { + if (slot.task && slot.task->id == task.id_target) { + slot.release(); + break; + } + } + } break; + case SERVER_TASK_TYPE_NEXT_RESPONSE: + { + // do nothing + } break; + case SERVER_TASK_TYPE_METRICS: + { + json slots_data = json::array(); + + int n_idle_slots = 0; + int n_processing_slots = 0; + + for (server_slot & slot : slots) { + json slot_data = slot.to_json(slots_debug == 0); + + if (slot.is_processing()) { + n_processing_slots++; + } else { + n_idle_slots++; + } + + slots_data.push_back(slot_data); + } + SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); + + auto res = std::make_unique(); + res->id = task.id; + res->slots_data = std::move(slots_data); + res->n_idle_slots = n_idle_slots; + res->n_processing_slots = n_processing_slots; + res->n_tasks_deferred = queue_tasks.queue_tasks_deferred_size(); + res->t_start = metrics.t_start; + + res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res->t_prompt_processing_total = metrics.t_prompt_processing_total; + res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res->t_tokens_generation_total = metrics.t_tokens_generation_total; + + res->n_tokens_max = metrics.n_tokens_max; + + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res->t_prompt_processing = metrics.t_prompt_processing; + res->n_tokens_predicted = metrics.n_tokens_predicted; + res->t_tokens_generation = metrics.t_tokens_generation; + + res->n_decode_total = metrics.n_decode_total; + res->n_busy_slots_total = metrics.n_busy_slots_total; + + if (task.metrics_reset_bucket) { + metrics.reset_bucket(); + } + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SLOT_SAVE: + { + if (!check_no_mtmd(task.id)) { + break; + } + + int id_slot = task.slot_action.slot_id; + server_slot * slot = get_slot_by_id(id_slot); + if (slot == nullptr) { + send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); + break; + } + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + const size_t token_count = slot->prompt.tokens.size(); + const int64_t t_start = ggml_time_us(); + + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; + + const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens(); + const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); + + const int64_t t_end = ggml_time_us(); + const double t_save_ms = (t_end - t_start) / 1000.0; + + auto res = std::make_unique(); + res->id = task.id; + 
res->id_slot = id_slot; + res->filename = filename; + res->is_save = true; + res->n_tokens = token_count; + res->n_bytes = nwrite; + res->t_ms = t_save_ms; + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SLOT_RESTORE: + { + if (!check_no_mtmd(task.id)) break; + int id_slot = task.slot_action.slot_id; + server_slot * slot = get_slot_by_id(id_slot); + if (slot == nullptr) { + send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); + break; + } + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + const int64_t t_start = ggml_time_us(); + + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; + + llama_tokens tokens; + tokens.resize(slot->n_ctx); + size_t token_count = 0; + size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count); + if (nread == 0) { + slot->prompt.tokens.clear(); // KV may have already been invalidated? + send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); + break; + } + tokens.resize(token_count); + slot->prompt.tokens.clear(); + slot->prompt.tokens.insert(tokens); + + const int64_t t_end = ggml_time_us(); + const double t_restore_ms = (t_end - t_start) / 1000.0; + + auto res = std::make_unique<server_task_result_slot_save_load>(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = false; + res->n_tokens = token_count; + res->n_bytes = nread; + res->t_ms = t_restore_ms; + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SLOT_ERASE: + { + if (!check_no_mtmd(task.id)) { + break; + } + int id_slot = task.slot_action.slot_id; + server_slot * slot = get_slot_by_id(id_slot); + if (slot == nullptr) { + send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); + break; + } + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + // Erase token cache + const size_t n_erased = slot->prompt.tokens.size(); + + clear_slot(*slot); + + auto res = std::make_unique<server_task_result_slot_erase>(); + res->id = task.id; + res->id_slot = id_slot; + res->n_erased = n_erased; + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SET_LORA: + { + params_base.lora_adapters = std::move(task.set_lora); + auto res = std::make_unique<server_task_result_apply_lora>(); + res->id = task.id; + queue_results.send(std::move(res)); + } break; + + } + } + + void update_slots() { + // check if all slots are idle + { + bool all_idle = true; + + for (auto & slot : slots) { + if (slot.is_processing()) { + all_idle = false; + break; + } + } + + if (all_idle) { + SRV_INF("%s", "all slots are idle\n"); + + return; + } + } + + { + SRV_DBG("%s", "posting NEXT_RESPONSE\n"); + + server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); + task.id = queue_tasks.get_new_id(); + queue_tasks.post(std::move(task)); + } + + // apply context-shift if needed + // TODO: simplify and improve + for (server_slot & slot : slots) { + if (slot.state == SLOT_STATE_GENERATING && slot.prompt.n_tokens() + 1 >= slot.n_ctx) { + if (!params_base.ctx_shift) { + // this check is redundant (for good) + // we should never get here, because generation should have already stopped in 
process_token() + send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + if (mctx) { + // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded + // we don't support ctx_shift because an image chunk may contain multiple tokens + GGML_ABORT("not supported by multimodal"); + } + + // Shift context + int n_keep = slot.task->params.n_keep < 0 ? slot.task->n_tokens() : slot.task->params.n_keep; + + if (add_bos_token) { + n_keep += 1; + } + + n_keep = std::min(slot.n_ctx - 4, n_keep); + + const int n_left = slot.prompt.n_tokens() - n_keep; + const int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); + + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); + + llama_memory_seq_rm (llama_get_memory(ctx), slot.id, n_keep, n_keep + n_discard); + llama_memory_seq_add(llama_get_memory(ctx), slot.id, n_keep + n_discard, slot.prompt.n_tokens(), -n_discard); + + // add generated tokens to cache + // ref: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2473269481 + { + GGML_ASSERT(!slot.prompt.tokens.has_mtmd); + + llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy + for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) { + new_tokens[i - n_discard] = new_tokens[i]; + } + + new_tokens.resize(slot.prompt.tokens.size() - n_discard); + + slot.prompt.tokens.clear(); + slot.prompt.tokens.insert(new_tokens); + } + + slot.truncated = true; + } + } + + // start populating the batch for this iteration + common_batch_clear(batch); + + // track if given slot can be batched with slots already in the batch + server_slot * slot_batched = nullptr; + + auto accept_special_token = [&](server_slot & slot, llama_token token) { + return params_base.special || + slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end(); + }; + + // first, add sampled tokens from any ongoing sequences + for (auto & slot : slots) { + if (slot.state != SLOT_STATE_GENERATING) { + continue; + } + + // check if we can batch this slot with the previous one + if (!slot_batched) { + slot_batched = &slot; + } else if (!slot_batched->can_batch_with(slot)) { + continue; + } + + slot.i_batch = batch.n_tokens; + + common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); + + slot.prompt.tokens.push_back(slot.sampled); + + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.prompt.n_tokens(), slot.truncated); + } + + // process in chunks of params.n_batch + int32_t n_batch = llama_n_batch(ctx); + int32_t n_ubatch = llama_n_ubatch(ctx); + + float alora_scale = -1.0f; + size_t alora_disabled_id = 0; + + // next, batch any pending prompts without exceeding n_batch + if (params_base.cont_batching || batch.n_tokens == 0) { + for (auto & slot : slots) { + if (!slot.is_processing()) { + continue; + } + + // check if we can batch this slot with the previous one + if (slot_batched && !slot_batched->can_batch_with(slot)) { + continue; + } + + // this slot still has a prompt to be processed + if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { + const auto & input_tokens = slot.task->tokens; + + // TODO: maybe move branch to outside of this loop in the future + if (slot.state == SLOT_STATE_STARTED) { + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_generation = 
0; + + slot.state = SLOT_STATE_PROCESSING_PROMPT; + + SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n", + slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens()); + + // print prompt tokens (for debugging) + /*if (1) { + // first 16 tokens (avoid flooding logs) + for (int i = 0; i < std::min(16, input_tokens.size()); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str()); + } + } else { + // all + for (int i = 0; i < (int) input_tokens.size(); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str()); + } + }*/ + + // keep track of how many tokens we can reuse from the previous state + int n_past = 0; + + // empty prompt passed -> release the slot and send empty response + if (input_tokens.empty()) { + SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); + + slot.print_timings(); + send_final_response(slot); + slot.release(); + + continue; + } + + // TODO: support memory-less logits computation + if (slot.need_logits() && !llama_get_memory(ctx)) { + send_error(slot, "the current context does not support logits computation. skipping", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + if (!slot.can_split()) { + if (slot.task->n_tokens() > n_ubatch) { + send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + if (slot.task->n_tokens() > slot.n_ctx) { + send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_EXCEED_CONTEXT_SIZE); + slot.release(); + continue; + } + } else { + if (slot.task->n_tokens() >= slot.n_ctx) { + send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE); + slot.release(); + continue; + } + + if (slot.task->params.cache_prompt) { + // reuse any previously computed tokens that are common with the new prompt + n_past = slot.prompt.tokens.get_common_prefix(input_tokens); + + // if an alora is invoked, don't cache after the invocation start + if (slot.alora_invocation_start > 0) { + SLT_DBG(slot, "only caching to alora invocation start (n_past = %d, alora_invocation_start = %d)\n", n_past, slot.alora_invocation_start); + n_past = std::min(n_past, slot.alora_invocation_start - 1); + } + + // reuse chunks from the cached prompt by shifting their KV cache to the new position + if (params_base.n_cache_reuse > 0) { + GGML_ASSERT(!slot.prompt.tokens.has_mtmd); + + size_t head_c = n_past; // cache + size_t head_p = n_past; // current prompt + + if (mctx) { + // we should never reach this + GGML_ABORT("not supported by multimodal"); + } + + SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", params_base.n_cache_reuse, n_past); + + while (head_c < slot.prompt.tokens.size() && + head_p < input_tokens.size()) { + + size_t n_match = 0; + while (head_c + n_match < slot.prompt.tokens.size() && + head_p + n_match < input_tokens.size() && + slot.prompt.tokens[head_c + n_match] == input_tokens[head_p + n_match]) { + + n_match++; + } + + if (n_match >= (size_t) params_base.n_cache_reuse) { + SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + //for (size_t i = head_p; i < head_p + n_match; i++) { + // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + 
//} + + const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; + + llama_memory_seq_rm (llama_get_memory(ctx), slot.id, head_p, head_c); + llama_memory_seq_add(llama_get_memory(ctx), slot.id, head_c, head_c + n_match, kv_shift); + + for (size_t i = 0; i < n_match; i++) { + slot.prompt.tokens.set_token(head_p + i, slot.prompt.tokens[head_c + i]); + n_past++; + } + + head_c += n_match; + head_p += n_match; + } else { + head_c += 1; + } + } + + SLT_DBG(slot, "after context reuse, new n_past = %d\n", n_past); + } + } else { + // if we don't cache the prompt, we have to remove all previous tokens + n_past = 0; + } + + // note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1 + const auto n_swa = std::max(1, llama_model_n_swa(model)); + + // the largest pos_min required for a checkpoint to be useful + const auto pos_min_thold = std::max(0, n_past - n_swa); + + // note: disallow with mtmd contexts for now + // https://github.com/ggml-org/llama.cpp/issues/17043 + if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) { + const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); + if (pos_min == -1) { + SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); + GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237"); + } + + // when the prompt prefix does not match, print the tokens around the mismatch + // this is useful for debugging prompt caching + if (slots_debug) { + const int np0 = std::max(n_past - 4, 0); + const int np1 = std::min(n_past + 6, std::min(slot.prompt.tokens.size(), slot.task->tokens.size())); + + std::stringstream ss0; + std::stringstream ss1; + + std::stringstream st0; + std::stringstream st1; + + ss0 << "old: ... "; + ss1 << "new: ... "; + + for (int i = np0; i < np1; i++) { + if (i == n_past) { + ss0 << " | "; + ss1 << " | "; + } + + { + const auto token = slot.prompt.tokens[i]; + const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; + ss0 << piece; + st0 << std::setw(8) << token; + } + + { + const auto token = slot.task->tokens[i]; + const auto piece = token != LLAMA_TOKEN_NULL ? 
common_token_to_piece(ctx, token) : "[mtmd]"; + ss1 << piece; + st1 << std::setw(8) << token; + } + } + + SLT_WRN(slot, "%s\n", ss0.str().c_str()); + SLT_WRN(slot, "%s\n", ss1.str().c_str()); + + SLT_WRN(slot, "%s\n", st0.str().c_str()); + SLT_WRN(slot, "%s\n", st1.str().c_str()); + } + + if (pos_min > pos_min_thold) { + // TODO: support can be added in the future when corresponding vision models get released + GGML_ASSERT(!slot.prompt.tokens.has_mtmd); + + SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa); + + // search for a context checkpoint + const auto it = std::find_if( + slot.prompt.checkpoints.rbegin(), + slot.prompt.checkpoints.rend(), + [&](const auto & cur) { + // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS] + return cur.pos_min < pos_min_thold; + } + ); + + bool do_reset = it == slot.prompt.checkpoints.rend(); + + if (!do_reset) { + // restore the context checkpoint + const size_t checkpoint_size = it->data.size(); + const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + + if (n != checkpoint_size) { + SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); + do_reset = true; + //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint"); + } else { + n_past = std::min(n_past, std::max(it->pos_min + 1, it->pos_max)); + SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); + } + } + + if (do_reset) { + SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n", + "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); + n_past = 0; + } + } + } + + { + // erase any checkpoints with pos_min > pos_min_thold + for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) { + const auto & cur = *it; + if (cur.pos_min > pos_min_thold) { + SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024); + it = slot.prompt.checkpoints.erase(it); + } else { + ++it; + } + } + } + } + + // [TAG_PROMPT_LOGITS] + if (n_past == slot.task->n_tokens() && n_past > 0) { + SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, task.n_tokens() = %d)\n", n_past, slot.task->n_tokens()); + n_past--; + SLT_WRN(slot, "n_past was set to %d\n", n_past); + } + + slot.n_prompt_tokens_cache = n_past; + slot.n_prompt_tokens_processed = 0; + + slot.prompt.tokens.keep_first(n_past); + } + + if (!slot.can_split()) { + // cannot fit the prompt in the current batch - will try next iter + if (batch.n_tokens + slot.task->n_tokens() > n_batch) { + continue; + } + } + + // truncate any tokens that are beyond n_past for this slot + const llama_pos p0 = slot.prompt.tokens.pos_next(); + + SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); + + if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { + SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); + + clear_slot(slot); + + // there 
is no common part left + slot.n_prompt_tokens_cache = 0; + } + + // check if we should process the image + if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { + // process the image + size_t n_tokens_out = 0; + int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); + if (res != 0) { + SLT_ERR(slot, "failed to process image, res = %d\n", res); + send_error(slot, "failed to process image", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + slot.n_prompt_tokens_processed += n_tokens_out; + + // add the image chunk to cache + { + const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); + slot.prompt.tokens.push_back(chunk.get()); // copy + } + } + + // If using an alora, there may be uncached tokens that come + // before the invocation sequence. When this happens, the + // tokens before the invocation sequence need to be + // processed without the adapter in a separate batch, then + // the adapter needs to be enabled for the remaining tokens. + if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.prompt.n_tokens()) { + SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); + const auto & enabled_loras = lora_get_enabled_ids(slot.lora); + GGML_ASSERT(enabled_loras.size() == 1); + alora_scale = slot.lora[enabled_loras[0]].scale; + slot.lora[enabled_loras[0]].scale = 0.0f; + alora_disabled_id = enabled_loras[0]; + } + + bool do_checkpoint = params_base.n_ctx_checkpoints > 0; + + // make checkpoints only for completion tasks + do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION; + + // make a checkpoint of the parts of the memory that cannot be rolled back. + // checkpoints are created only if: + // - the model uses SWA and we are not using `swa_full` + // - the model architecture is marked as recurrent or hybrid + // + // TODO: try to make this conditional on the context or the memory module, instead of the model type + do_checkpoint = do_checkpoint && ( + llama_model_is_recurrent(model) || + llama_model_is_hybrid(model) || + (llama_model_n_swa(model) > 0 && !params_base.swa_full) + ); + + // add prompt tokens for processing in the current batch + while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) { + // get next token to process + llama_token cur_tok = input_tokens[slot.prompt.n_tokens()]; + if (cur_tok == LLAMA_TOKEN_NULL) { + break; // end of text chunk + } + + // if this is an alora request with pre-invocation + // tokens that are not cached, we need to stop filling + // this batch at those pre-invocation tokens. + if (alora_scale > 0 && slot.prompt.n_tokens() == slot.alora_invocation_start - 1) { + SLT_DBG(slot, "stop prompt batch filling at (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); + break; + } + + // embedding requires all tokens in the batch to be output + common_batch_add(batch, + cur_tok, + slot.prompt.tokens.pos_next(), + { slot.id }, + slot.need_embd()); + slot.prompt.tokens.push_back(cur_tok); + + slot.n_prompt_tokens_processed++; + + // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. 
+                    if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) { + break; + } + } + + // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.prompt.tokens.str().c_str()); + + SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); + + // entire prompt has been processed + if (slot.prompt.n_tokens() == slot.task->n_tokens()) { + slot.state = SLOT_STATE_DONE_PROMPT; + + GGML_ASSERT(batch.n_tokens > 0); + + common_sampler_reset(slot.smpl); + + // Process all prompt tokens through sampler system + for (int i = 0; i < slot.task->n_tokens(); ++i) { + llama_token id = input_tokens[i]; + if (id != LLAMA_TOKEN_NULL) { + common_sampler_accept(slot.smpl, id, false); + } + } + + // extract the logits only for the last token + batch.logits[batch.n_tokens - 1] = true; + + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + + SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens); + + const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); + const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id); + + // no need for empty or small checkpoints + do_checkpoint = do_checkpoint && (pos_min >= 0 && pos_max >= 64); + + // no need to create checkpoints that are too close together + do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || pos_max > slot.prompt.checkpoints.back().pos_max + 64); + + if (do_checkpoint) { + while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) { + // make room for the new checkpoint, if needed + const auto & cur = slot.prompt.checkpoints.front(); + + SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", + cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); + + slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin()); + } + + const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + + auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{ + /*.pos_min = */ pos_min, + /*.pos_max = */ pos_max, + /*.data = */ std::vector<uint8_t>(checkpoint_size), + }); + + llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + + SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", + (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); + } + } + } + + if (!slot_batched) { + slot_batched = &slot; + } + + if (batch.n_tokens >= n_batch) { + break; + } + } + } + + if (batch.n_tokens == 0) { + SRV_WRN("%s", "no tokens to decode\n"); + return; + } + + SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); + + if (slot_batched) { + // apply lora, only need to do it once per batch + common_set_adapter_lora(ctx, slot_batched->lora); + + // if the lora is temporarily disabled for an alora, re-enable it + // for next time + if (alora_scale > 0.0f) { + SRV_DBG("re-enabling alora with scale %f\n", alora_scale); + slot_batched->lora[alora_disabled_id].scale = alora_scale; + } + + llama_set_embeddings(ctx, slot_batched->need_embd()); + } + + int32_t i_next = 0; + + // process the created batch of tokens + for (int32_t i = 0; i < batch.n_tokens; i = i_next) { + const int32_t n_tokens = std::min(n_batch,
batch.n_tokens - i); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + + const int ret = llama_decode(ctx, batch_view); + + metrics.on_decoded(slots); + + if (ret != 0) { + { + std::string err; + + if (n_batch == 1 && ret == 1) { + // TODO: try to terminate only the largest active slot/sequence and continue with the rest + // need to remove the tokens from the current batch too + err = "Context size has been exceeded."; + } + + if (ret == -1) { + err = "Invalid input batch."; + } + + if (ret < -1) { + // TODO: update slot state based on llama_memory_seq_pos_min() and llama_memory_seq_pos_max() + err = "Compute error."; + } + + // TODO: handle ret == 2 (abort) when we start aborting + + if (!err.empty()) { + SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + + for (auto & slot : slots) { + if (slot.is_processing()) { + send_error(slot, err); + slot.release(); + + // note: it's complicated to keep track of how much of the current batch has been + // processed before the error occurred, so we simply clear the entire context + clear_slot(slot); + } + } + + break; + } + } + + // retry with half the batch size to try to find a free slot in the KV cache + if (!try_clear_idle_slots()) { + n_batch /= 2; + } + + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + + continue; // continue loop of n_batch + } + + // move the head of the batch forward with the number of tokens we just processed + i_next = i + n_tokens; + + // on successful decode, restore the original batch size + n_batch = llama_n_batch(ctx); + + for (auto & slot : slots) { + // optionally send prompt processing progress + if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { + if (slot.task->params.stream && slot.task->params.return_progress) { + send_partial_response(slot, {}, true); + } + } + + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { + continue; // continue loop of slots + } + + if (slot.state == SLOT_STATE_DONE_PROMPT) { + if (slot.task->type == SERVER_TASK_TYPE_EMBEDDING) { + // prompt evaluated for embedding + send_embedding(slot, batch_view); + slot.release(); + slot.i_batch = -1; + continue; // continue loop of slots + } + + if (slot.task->type == SERVER_TASK_TYPE_RERANK) { + send_rerank(slot, batch_view); + slot.release(); + slot.i_batch = -1; + continue; // continue loop of slots + } + + // prompt evaluated for next-token prediction + slot.state = SLOT_STATE_GENERATING; + } else if (slot.state != SLOT_STATE_GENERATING) { + continue; // continue loop of slots + } + + const int tok_idx = slot.i_batch - i; + + llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); + + slot.i_batch = -1; + + common_sampler_accept(slot.smpl, id, true); + + slot.n_decoded += 1; + + const int64_t t_current = ggml_time_us(); + + if (slot.n_decoded == 1) { + slot.t_start_generation = t_current; + slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } + + slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; + + completion_token_output result; + result.tok = id; + result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); + result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs + + if 
(slot.task->params.sampling.n_probs > 0) { + populate_token_probs(slot, result, slot.task->params.post_sampling_probs, params_base.special, tok_idx); + } + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + slot.release(); + + continue; + } + } + + // do speculative decoding + // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] + // perform the speculative drafting for all sequences at the same time in a single batch + for (auto & slot : slots) { + if (!slot.is_processing() || !slot.can_speculate()) { + continue; + } + + if (slot.state != SLOT_STATE_GENERATING) { + continue; + } + + if (mctx) { + // we should never reach this, as speculative is automatically disabled if mmproj is loaded + GGML_ABORT("not supported by multimodal"); + } + + // determine the max draft that fits the current slot state + int n_draft_max = slot.task->params.speculative.n_max; + + // note: slot.prompt is not yet expanded with the `id` token sampled above + // also, need to leave space for 1 extra token to allow context shifts + n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.prompt.n_tokens() - 2); + + if (slot.n_remaining > 0) { + n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); + } + + SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); + + if (n_draft_max < slot.task->params.speculative.n_min) { + SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.task->params.speculative.n_min); + + continue; + } + + llama_token id = slot.sampled; + + struct common_speculative_params params_spec; + params_spec.n_draft = n_draft_max; + params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max; + params_spec.p_min = slot.task->params.speculative.p_min; + + const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens(); + llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id); + + // ignore small drafts + if (slot.task->params.speculative.n_min > (int) draft.size()) { + SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min); + + continue; + } + + // keep track of total number of drafted tokens tested + slot.n_draft_total += draft.size(); + + // construct the speculation batch + common_batch_clear(slot.batch_spec); + common_batch_add (slot.batch_spec, id, slot.prompt.tokens.pos_next(), { slot.id }, true); + + for (size_t i = 0; i < draft.size(); ++i) { + common_batch_add(slot.batch_spec, draft[i], slot.prompt.tokens.pos_next() + 1 + i, { slot.id }, true); + } + + SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); + + llama_decode(ctx, slot.batch_spec); + + // the accepted tokens from the speculation + const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); + + slot.n_decoded += ids.size(); + + // update how many tokens out of those tested were accepted + slot.n_draft_accepted += ids.size() - 1; + + slot.prompt.tokens.push_back(id); + slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); + + llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1); + + for (size_t i = 0; i < ids.size(); ++i) { + completion_token_output result; + + result.tok = ids[i]; + result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); + result.prob = 
1.0f; // set later + + // TODO: set result.probs + + if (!process_token(result, slot)) { + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + slot.release(); + + break; + } + } + + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.prompt.n_tokens()); + } + } + + SRV_DBG("%s", "run slots completed\n"); + } + + json model_meta() const { + return json { + {"vocab_type", llama_vocab_type (vocab)}, + {"n_vocab", llama_vocab_n_tokens (vocab)}, + {"n_ctx_train", llama_model_n_ctx_train(model)}, + {"n_embd", llama_model_n_embd (model)}, + {"n_params", llama_model_n_params (model)}, + {"size", llama_model_size (model)}, + }; + } + + int get_slot_n_ctx() { + return slots.back().n_ctx; + } +}; + +// +// server_context (public API) +// + +server_context::server_context() : impl(new server_context_impl()) {} +server_context::~server_context() = default; + +void server_context::init() { + impl->init(); +} + +bool server_context::load_model(const common_params & params) { + return impl->load_model(params); +} + +void server_context::start_loop() { + impl->queue_tasks.start_loop(); +} + +void server_context::terminate() { + impl->queue_tasks.terminate(); +} + +llama_context * server_context::get_llama_context() const { + return impl->ctx; +} + +std::pair server_context::get_queues() { + return { impl->queue_tasks, impl->queue_results }; +} + + + +// generator-like API for HTTP response generation +struct server_res_generator : server_http_res { + server_response_reader rd; + server_res_generator(server_context_impl & ctx_server) + : rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS) {} + void ok(const json & response_data) { + status = 200; + data = safe_json_to_str(response_data); + } + void error(const json & error_data) { + status = json_value(error_data, "code", 500); + data = safe_json_to_str({{ "error", error_data }}); + } +}; + + + +// +// server_routes +// + +static std::unique_ptr handle_completions_impl( + server_context_impl & ctx_server, + server_task_type type, + const json & data, + const std::vector & files, + const std::function & should_stop, + task_response_type res_type) { + GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); + + auto res = std::make_unique(ctx_server); + auto completion_id = gen_chatcmplid(); + auto & rd = res->rd; + + try { + std::vector tasks; + + const auto & prompt = data.at("prompt"); + // TODO: this log can become very long, put it behind a flag or think about a more compact format + //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); + + // process prompt + std::vector inputs; + + if (res_type != TASK_RESPONSE_TYPE_NONE && ctx_server.mctx != nullptr) { + // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below. + inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); + } else { + // Everything else, including multimodal completions. 
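+            // (sketch of the accepted "prompt" shapes, assuming the usual llama.cpp server
+            // JSON conventions; the values are illustrative:
+            //   "prompt": "Hello"            -> one input, one task
+            //   "prompt": [12, 34, 56]       -> one pre-tokenized input
+            //   "prompt": ["Hello", "World"] -> two inputs, one task per element)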
+ inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + } + tasks.reserve(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + server_task task = server_task(type); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + + task.tokens = std::move(inputs[i]); + task.params = server_task::params_from_json_cmpl( + ctx_server.ctx, + ctx_server.params_base, + data); + task.id_slot = json_value(data, "id_slot", -1); + + // OAI-compat + task.params.res_type = res_type; + task.params.oaicompat_cmpl_id = completion_id; + task.params.oaicompat_model = ctx_server.model_name; + + tasks.push_back(std::move(task)); + } + + rd.post_tasks(std::move(tasks)); + } catch (const std::exception & e) { + res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + bool stream = json_value(data, "stream", false); + + if (!stream) { + // non-stream, wait for the results + auto all_results = rd.wait_for_all(should_stop); + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + json arr = json::array(); + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + arr.push_back(res->to_json()); + } + // if single request, return single object instead of array + res->ok(arr.size() == 1 ? arr[0] : arr); + } + + } else { + // in streaming mode, the first error must be treated as non-stream response + // this is to match the OAI API behavior + // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309 + server_task_result_ptr first_result = rd.next(should_stop); + if (first_result == nullptr) { + return res; // connection is closed + } else if (first_result->is_error()) { + res->error(first_result->to_json()); + return res; + } else { + GGML_ASSERT( + dynamic_cast(first_result.get()) != nullptr + || dynamic_cast(first_result.get()) != nullptr + ); + } + + // next responses are streamed + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + res->data = format_anthropic_sse(first_result->to_json()); + } else { + res->data = format_oai_sse(first_result->to_json()); // to be sent immediately + } + res->status = 200; + res->content_type = "text/event-stream"; + res->next = [res_this = res.get(), res_type, &should_stop](std::string & output) -> bool { + if (should_stop()) { + SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); + return false; // should_stop condition met + } + + if (!res_this->data.empty()) { + // flush the first chunk + output = std::move(res_this->data); + res_this->data.clear(); + return true; + } + + server_response_reader & rd = res_this->rd; + + // check if there is more data + if (!rd.has_next()) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + // Anthropic doesn't send [DONE], message_stop was already sent + output = ""; + } else if (res_type != TASK_RESPONSE_TYPE_NONE) { + output = "data: [DONE]\n\n"; + } else { + output = ""; + } + SRV_DBG("%s", "all results received, terminating stream\n"); + return false; // no more data, terminate + } + + // receive subsequent results + auto result = rd.next(should_stop); + if (result == nullptr) { + SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); + return false; // should_stop condition met + } + + // send the results + json res_json = result->to_json(); + if (result->is_error()) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + output = 
format_anthropic_sse({ + {"event", "error"}, + {"data", res_json}, + }); + } else { + output = format_oai_sse(json {{ "error", res_json }}); + } + SRV_DBG("%s", "error received during streaming, terminating stream\n"); + return false; // terminate on error + } else { + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + output = format_anthropic_sse(res_json); + } else { + output = format_oai_sse(res_json); + } + } + + // has next data, continue + return true; + }; + } + + return res; +} + +void server_routes::init_routes() { + this->get_health = [this](const server_http_req &) { + // error and loading states are handled by middleware + auto res = std::make_unique(ctx_server); + res->ok({{"status", "ok"}}); + return res; + }; + + this->get_metrics = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_metrics) { + res->error(format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + // TODO: use server_response_reader + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} + }, { + {"name", "prompt_seconds_total"}, + {"help", "Prompt process time"}, + {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", (uint64_t) res_task->n_tokens_predicted_total} + }, { + {"name", "tokens_predicted_seconds_total"}, + {"help", "Predict process time"}, + {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} + }, { + {"name", "n_decode_total"}, + {"help", "Total number of llama_decode() calls"}, + {"value", res_task->n_decode_total} + }, { + {"name", "n_tokens_max"}, + {"help", "Largest observed n_tokens."}, + {"value", res_task->n_tokens_max} + }, { + {"name", "n_busy_slots_per_decode"}, + {"help", "Average number of busy slots per llama_decode() call"}, + {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} + },{ + {"name", "requests_processing"}, + {"help", "Number of requests processing."}, + {"value", (uint64_t) res_task->n_processing_slots} + },{ + {"name", "requests_deferred"}, + {"help", "Number of requests deferred."}, + {"value", (uint64_t) res_task->n_tasks_deferred} + }}} + }; + + std::stringstream prometheus; + + for (const auto & el : all_metrics_def.items()) { + const auto & type = el.key(); + const auto & metrics_def = el.value(); + + for (const auto & metric_def : metrics_def) { + const std::string name = metric_def.at("name"); + const std::string help = metric_def.at("help"); + + auto value = json_value(metric_def, "value", 0.); + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << value << "\n"; + } + } + + res->headers["Process-Start-Time-Unix"] = std::to_string(res_task->t_start); + res->content_type = "text/plain; version=0.0.4"; + res->status = 200; + res->data = prometheus.str(); + return res; + }; + + this->get_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_slots) { + res->error(format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // optionally return "fail_on_no_slot" error + if (!req.get_param("fail_on_no_slot").empty()) { + if (res_task->n_idle_slots == 0) { + res->error(format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); + return res; + } + } + + res->ok(res_task->slots_data); + return res; + }; + + this->post_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + if (params.slot_save_path.empty()) { + res->error(format_error_response("This server does not support slots action. 
Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + std::string id_slot_str = req.get_param("id_slot"); + int id_slot; + + try { + id_slot = std::stoi(id_slot_str); + } catch (const std::exception &) { + res->error(format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::string action = req.get_param("action"); + + if (action == "save") { + return handle_slots_save(req, id_slot); + } else if (action == "restore") { + return handle_slots_restore(req, id_slot); + } else if (action == "erase") { + return handle_slots_erase(req, id_slot); + } else { + res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + }; + + this->get_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + json default_generation_settings_for_props; + + { + task_params params; + + params.sampling = ctx_server.params_base.sampling; + + default_generation_settings_for_props = json { + {"params", params.to_json(true)}, + {"n_ctx", ctx_server.get_slot_n_ctx()}, + }; + } + + // this endpoint is publicly available, please only return what is safe to be exposed + json data = { + { "default_generation_settings", default_generation_settings_for_props }, + { "total_slots", ctx_server.params_base.n_parallel }, + { "model_alias", ctx_server.model_name }, + { "model_path", ctx_server.params_base.model.path }, + { "modalities", json { + {"vision", ctx_server.oai_parser_opt.allow_image}, + {"audio", ctx_server.oai_parser_opt.allow_audio}, + } }, + { "endpoint_slots", params.endpoint_slots }, + { "endpoint_props", params.endpoint_props }, + { "endpoint_metrics", params.endpoint_metrics }, + { "webui", params.webui }, + { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, + { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, + { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, + { "build_info", build_info }, + }; + if (ctx_server.params_base.use_jinja) { + if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { + data["chat_template_tool_use"] = tool_use_src; + } + } + + res->ok(data); + return res; + }; + + this->post_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_props) { + res->error(format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + // update any props here + + res->ok({{ "success", true }}); + return res; + }; + + this->get_api_show = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + bool has_mtmd = ctx_server.mctx != nullptr; + json data = { + { + "template", common_chat_templates_source(ctx_server.chat_templates.get()), + }, + { + "model_info", { + { "llama.context_length", ctx_server.get_slot_n_ctx() }, + } + }, + {"modelfile", ""}, + {"parameters", ""}, + {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }}, + {"model_info", ""}, + {"capabilities", has_mtmd ? 
json({"completion","multimodal"}) : json({"completion"})} + }; + + res->ok(data); + return res; + }; + + this->post_infill = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + // check model compatibility + std::string err; + if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + if (!err.empty()) { + res->error(format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // validate input + json data = json::parse(req.body); + if (data.contains("prompt") && !data.at("prompt").is_string()) { + // prompt is optional + res->error(format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_prefix")) { + res->error(format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_suffix")) { + res->error(format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + // input_extra is optional + res->error(format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + json input_extra = json_value(data, "input_extra", json::array()); + for (const auto & chunk : input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk.at("text").is_string()) { + res->error(format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + // filename is optional + if (chunk.contains("filename") && !chunk.at("filename").is_string()) { + res->error(format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + data["input_extra"] = input_extra; // default to empty array if it's not exist + + std::string prompt = json_value(data, "prompt", std::string()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + data["prompt"] = format_prompt_infill( + ctx_server.vocab, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + ctx_server.params_base.n_batch, + ctx_server.params_base.n_predict, + ctx_server.get_slot_n_ctx(), + ctx_server.params_base.spm_infill, + tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. 
+ ); + + std::vector files; // dummy + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_INFILL, + data, + files, + req.should_stop, + TASK_RESPONSE_TYPE_NONE); // infill is not OAI compatible + }; + + this->post_completions = [this](const server_http_req & req) { + std::vector files; // dummy + const json body = json::parse(req.body); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body, + files, + req.should_stop, + TASK_RESPONSE_TYPE_NONE); + }; + + this->post_completions_oai = [this](const server_http_req & req) { + std::vector files; // dummy + const json body = json::parse(req.body); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body, + files, + req.should_stop, + TASK_RESPONSE_TYPE_OAI_CMPL); + }; + + this->post_chat_completions = [this](const server_http_req & req) { + std::vector files; + json body = json::parse(req.body); + json body_parsed = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + req.should_stop, + TASK_RESPONSE_TYPE_OAI_CHAT); + }; + + this->post_anthropic_messages = [this](const server_http_req & req) { + std::vector files; + json body = convert_anthropic_to_oai(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + req.should_stop, + TASK_RESPONSE_TYPE_ANTHROPIC); + }; + + this->post_anthropic_count_tokens = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + std::vector files; + json body = convert_anthropic_to_oai(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + + json prompt = body_parsed.at("prompt"); + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true); + + res->ok({{"input_tokens", static_cast(tokens.size())}}); + return res; + }; + + // same with handle_chat_completions, but without inference part + this->post_apply_template = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + std::vector files; // dummy, unused + json body = json::parse(req.body); + json data = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + res->ok({{ "prompt", std::move(data.at("prompt")) }}); + return res; + }; + + this->get_models = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + json model_meta = nullptr; + if (is_ready()) { + model_meta = ctx_server.model_meta(); + } + bool has_mtmd = ctx_server.mctx != nullptr; + json models = { + {"models", { + { + {"name", ctx_server.model_name}, + {"model", ctx_server.model_name}, + {"modified_at", ""}, + {"size", ""}, + {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash + {"type", "model"}, + {"description", ""}, + {"tags", {""}}, + {"capabilities", has_mtmd ? 
json({"completion","multimodal"}) : json({"completion"})}, + {"parameters", ""}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }} + } + }}, + {"object", "list"}, + {"data", { + { + {"id", ctx_server.model_name}, + {"object", "model"}, + {"created", std::time(0)}, + {"owned_by", "llamacpp"}, + {"meta", model_meta}, + }, + }} + }; + + res->ok(models); + return res; + }; + + this->post_tokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + const json body = json::parse(req.body); + json tokens_response = json::array(); + if (body.count("content") != 0) { + const bool add_special = json_value(body, "add_special", false); + const bool parse_special = json_value(body, "parse_special", true); + const bool with_pieces = json_value(body, "with_pieces", false); + + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); + + if (with_pieces) { + for (const auto& token : tokens) { + std::string piece = common_token_to_piece(ctx_server.ctx, token); + json piece_json; + + // Check if the piece is valid UTF-8 + if (is_valid_utf8(piece)) { + piece_json = piece; + } else { + // If not valid UTF-8, store as array of byte values + piece_json = json::array(); + for (unsigned char c : piece) { + piece_json.push_back(static_cast(c)); + } + } + + tokens_response.push_back({ + {"id", token}, + {"piece", piece_json} + }); + } + } else { + tokens_response = tokens; + } + } + + res->ok(json{{"tokens", std::move(tokens_response)}}); + return res; + }; + + this->post_detokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + const json body = json::parse(req.body); + + std::string content; + if (body.count("tokens") != 0) { + const llama_tokens tokens = body.at("tokens"); + content = tokens_to_str(ctx_server.ctx, tokens); + } + + res->ok(json{{"content", std::move(content)}}); + return res; + }; + + this->post_embeddings = [this](const server_http_req & req) { + return handle_embeddings_impl(req, TASK_RESPONSE_TYPE_NONE); + }; + + this->post_embeddings_oai = [this](const server_http_req & req) { + return handle_embeddings_impl(req, TASK_RESPONSE_TYPE_OAI_EMBD); + }; + + this->post_rerank = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { + res->error(format_error_response("This server does not support reranking. 
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + const json body = json::parse(req.body); + + // if true, use TEI API format, otherwise use Jina API format + // Jina: https://jina.ai/reranker/ + // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank + bool is_tei_format = body.contains("texts"); + + json query; + if (body.count("query") == 1) { + query = body.at("query"); + if (!query.is_string()) { + res->error(format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } else { + res->error(format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::vector documents = json_value(body, "documents", + json_value(body, "texts", std::vector())); + if (documents.empty()) { + res->error(format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int top_n = json_value(body, "top_n", (int)documents.size()); + + // create and queue the task + json responses = json::array(); + server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS); + { + std::vector tasks; + tasks.reserve(documents.size()); + for (size_t i = 0; i < documents.size(); i++) { + auto tmp = format_prompt_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); + server_task task = server_task(SERVER_TASK_TYPE_RERANK); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tmp); + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = format_response_rerank( + body, + ctx_server.model_name, + responses, + is_tei_format, + documents, + top_n); + + res->ok(root); + return res; + }; + + this->get_lora_adapters = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + json result = json::array(); + const auto & loras = ctx_server.params_base.lora_adapters; + for (size_t i = 0; i < loras.size(); ++i) { + auto & lora = loras[i]; + json entry = { + {"id", i}, + {"path", lora.path}, + {"scale", lora.scale}, + {"task_name", lora.task_name}, + {"prompt_prefix", lora.prompt_prefix}, + }; + std::string alora_invocation_string = ""; + const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); + std::vector alora_invocation_tokens; + if (n_alora_tokens) { + const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); + for (uint64_t i = 0; i < n_alora_tokens; ++i) { + alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); + alora_invocation_tokens.push_back(alora_tokens[i]); + } + entry["alora_invocation_string"] = alora_invocation_string; + entry["alora_invocation_tokens"] = alora_invocation_tokens; + } + result.push_back(std::move(entry)); + } + res->ok(result); + return res; + }; + + this->post_lora_adapters = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + const json body = 
json::parse(req.body); + if (!body.is_array()) { + res->error(format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SET_LORA); + task.id = task_id; + task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + }; +} + +std::unique_ptr server_routes::handle_slots_save(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json request_data = json::parse(req.body); + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_SAVE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + res->ok(result->to_json()); + return res; +} + +std::unique_ptr server_routes::handle_slots_restore(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json request_data = json::parse(req.body); + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; +} + +std::unique_ptr server_routes::handle_slots_erase(const server_http_req &, int id_slot) { + auto res = std::make_unique(ctx_server); + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_ERASE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } 
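+    // (usage sketch; the port and slot id are illustrative, assuming the default
+    // route wiring: curl -X POST 'http://localhost:8080/slots/0?action=erase'
+    // recv() below then blocks until the main loop has processed the erase task)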
+ + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; +} + +std::unique_ptr server_routes::handle_embeddings_impl(const server_http_req & req, task_response_type res_type) { + auto res = std::make_unique(ctx_server); + if (!ctx_server.params_base.embedding) { + res->error(format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + if (res_type != TASK_RESPONSE_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + res->error(format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + const json body = json::parse(req.body); + + // for the shape of input/content, see tokenize_input_prompts() + json prompt; + if (body.count("input") != 0) { + prompt = body.at("input"); + } else if (body.contains("content")) { + res_type = TASK_RESPONSE_TYPE_NONE; // "content" field is not OAI compatible + prompt = body.at("content"); + } else { + res->error(format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + bool use_base64 = false; + if (body.count("encoding_format") != 0) { + const std::string& format = body.at("encoding_format"); + if (format == "base64") { + use_base64 = true; + } else if (format != "float") { + res->error(format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + for (const auto & tokens : tokenized_prompts) { + // this check is necessary for models that do not add BOS token to the input + if (tokens.empty()) { + res->error(format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + int embd_normalize = 2; // default to Euclidean/L2 norm + if (body.count("embd_normalize") != 0) { + embd_normalize = body.at("embd_normalize"); + if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); + } + } + + // create and queue the task + json responses = json::array(); + server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS); + { + std::vector tasks; + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tokenized_prompts[i]); + + // OAI-compat + task.params.res_type = res_type; + task.params.embd_normalize = embd_normalize; + + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + 
responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD + ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64) + : json(responses); + res->ok(root); + return res; +} diff --git a/llamacpp/native/src/server/server-context.h b/llamacpp/native/src/server/server-context.h new file mode 100644 index 000000000..05b4afaee --- /dev/null +++ b/llamacpp/native/src/server/server-context.h @@ -0,0 +1,83 @@ +#include "server-http.h" +#include "server-task.h" +#include "server-queue.h" + +#include + +#include +#include + +struct server_context_impl; // private implementation + +struct server_context { + std::unique_ptr<server_context_impl> impl; + + server_context(); + ~server_context(); + + // initialize slots and server-related data + void init(); + + // load the model and initialize llama_context + // returns true on success + bool load_model(const common_params & params); + + // this function will block the main thread until termination + void start_loop(); + + // terminate main loop (will unblock start_loop) + void terminate(); + + // get the underlying llama_context + llama_context * get_llama_context() const; + + // get the underlying queue_tasks and queue_results + // used by the CLI application + std::pair get_queues(); +}; + + +// forward declarations +struct server_res_generator; + +struct server_routes { + server_routes(const common_params & params, server_context & ctx_server, std::function<bool()> is_ready = []() { return true; }) + : params(params), ctx_server(*ctx_server.impl), is_ready(is_ready) { + init_routes(); + } + + void init_routes(); + // handlers use lambda functions so that they can capture `this` without `std::bind` + server_http_context::handler_t get_health; + server_http_context::handler_t get_metrics; + server_http_context::handler_t get_slots; + server_http_context::handler_t post_slots; + server_http_context::handler_t get_props; + server_http_context::handler_t post_props; + server_http_context::handler_t get_api_show; + server_http_context::handler_t post_infill; + server_http_context::handler_t post_completions; + server_http_context::handler_t post_completions_oai; + server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_anthropic_messages; + server_http_context::handler_t post_anthropic_count_tokens; + server_http_context::handler_t post_apply_template; + server_http_context::handler_t get_models; + server_http_context::handler_t post_tokenize; + server_http_context::handler_t post_detokenize; + server_http_context::handler_t post_embeddings; + server_http_context::handler_t post_embeddings_oai; + server_http_context::handler_t post_rerank; + server_http_context::handler_t get_lora_adapters; + server_http_context::handler_t post_lora_adapters; +private: + // TODO: move these outside of server_routes?
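+    // (wiring sketch for the handler_t members above, assuming the application
+    // entry point connects them roughly like this; the names are illustrative:
+    //   server_http_context http;
+    //   server_routes routes(params, ctx_server, [&] { return http.is_ready.load(); });
+    //   http.get ("/health", routes.get_health);
+    //   http.post("/completions", routes.post_completions);)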
+    std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot); + std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot); + std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot); + std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type); + + const common_params & params; + server_context_impl & ctx_server; + std::function<bool()> is_ready; +}; diff --git a/llamacpp/native/src/server/server-http.cpp b/llamacpp/native/src/server/server-http.cpp new file mode 100644 index 000000000..77e54d192 --- /dev/null +++ b/llamacpp/native/src/server/server-http.cpp @@ -0,0 +1,380 @@ +#include "common.h" +#include "server-http.h" +#include "server-common.h" + +#include + +#include +#include +#include + +// +// HTTP implementation using cpp-httplib +// + +class server_http_context::Impl { +public: + std::unique_ptr<httplib::Server> srv; +}; + +server_http_context::server_http_context() + : pimpl(std::make_unique<Impl>()) +{} + +server_http_context::~server_http_context() = default; + +static void log_server_request(const httplib::Request & req, const httplib::Response & res) { + // skip GH copilot requests when using default port + if (req.path == "/v1/health") { + return; + } + + // reminder: this function is not covered by httplib's exception handler; if it ever does anything more complicated, consider wrapping it in a try-catch + + SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); + + SRV_DBG("request: %s\n", req.body.c_str()); + SRV_DBG("response: %s\n", res.body.c_str()); +} + +bool server_http_context::init(const common_params & params) { + path_prefix = params.api_prefix; + port = params.port; + hostname = params.hostname; + + auto & srv = pimpl->srv; + +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + if (params.ssl_file_key != "" && params.ssl_file_cert != "") { + LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); + srv.reset( + new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) + ); + } else { + LOG_INF("Running without SSL\n"); + srv.reset(new httplib::Server()); + } +#else + if (params.ssl_file_key != "" && params.ssl_file_cert != "") { + LOG_ERR("Server is built without SSL support\n"); + return false; + } + srv.reset(new httplib::Server()); +#endif + + srv->set_default_headers({{"Server", "llama.cpp"}}); + srv->set_logger(log_server_request); + srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { + // this is fail-safe; exceptions should already be handled by `ex_wrapper` + + std::string message; + try { + std::rethrow_exception(ep); + } catch (const std::exception & e) { + message = e.what(); + } catch (...)
{ + message = "Unknown Exception"; + } + + res.status = 500; + res.set_content(message, "text/plain"); + LOG_ERR("got exception: %s\n", message.c_str()); + }); + + srv->set_error_handler([](const httplib::Request &, httplib::Response & res) { + if (res.status == 404) { + res.set_content( + safe_json_to_str(json { + {"error", { + {"message", "File Not Found"}, + {"type", "not_found_error"}, + {"code", 404} + }} + }), + "application/json; charset=utf-8" + ); + } + // for other error codes, we skip processing here because it's already done by res->error() + }); + + // set timeouts and change hostname and port + srv->set_read_timeout (params.timeout_read); + srv->set_write_timeout(params.timeout_write); + + if (params.api_keys.size() == 1) { + auto key = params.api_keys[0]; + std::string substr = key.substr(std::max((int)(key.length() - 4), 0)); + LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str()); + } else if (params.api_keys.size() > 1) { + LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size()); + } + + // + // Middlewares + // + + auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) { + static const std::unordered_set public_endpoints = { + "/health", + "/v1/health", + "/models", + "/v1/models", + "/api/tags" + }; + + // If API key is not set, skip validation + if (api_keys.empty()) { + return true; + } + + // If path is public or is static file, skip validation + if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { + return true; + } + + // Check for API key in the Authorization header + std::string req_api_key = req.get_header_value("Authorization"); + if (req_api_key.empty()) { + // retry with anthropic header + req_api_key = req.get_header_value("X-Api-Key"); + } + + // remove the "Bearer " prefix if needed + std::string prefix = "Bearer "; + if (req_api_key.substr(0, prefix.size()) == prefix) { + req_api_key = req_api_key.substr(prefix.size()); + } + + // validate the API key + if (std::find(api_keys.begin(), api_keys.end(), req_api_key) != api_keys.end()) { + return true; // API key is valid + } + + // API key is invalid or not provided + res.status = 401; + res.set_content( + safe_json_to_str(json { + {"error", { + {"message", "Invalid API Key"}, + {"type", "authentication_error"}, + {"code", 401} + }} + }), + "application/json; charset=utf-8" + ); + + LOG_WRN("Unauthorized: Invalid API Key\n"); + + return false; + }; + + auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { + bool ready = is_ready.load(); + if (!ready) { + res.status = 503; + res.set_content( + safe_json_to_str(json { + {"error", { + {"message", "Loading model"}, + {"type", "unavailable_error"}, + {"code", 503} + }} + }), + "application/json; charset=utf-8" + ); + return false; + } + return true; + }; + + // register server middlewares + srv->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + // If this is OPTIONS request, skip validation because browsers don't include Authorization header + if (req.method == "OPTIONS") { + res.set_header("Access-Control-Allow-Credentials", "true"); + res.set_header("Access-Control-Allow-Methods", "GET, POST"); + res.set_header("Access-Control-Allow-Headers", "*"); + res.set_content("", "text/html"); // blank response, no data + return 
httplib::Server::HandlerResponse::Handled; // skip further processing + } + if (!middleware_server_state(req, res)) { + return httplib::Server::HandlerResponse::Handled; + } + if (!middleware_validate_api_key(req, res)) { + return httplib::Server::HandlerResponse::Handled; + } + return httplib::Server::HandlerResponse::Unhandled; + }); + + int n_threads_http = params.n_threads_http; + if (n_threads_http < 1) { + // +2 threads for monitoring endpoints + n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); + } + LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http); + srv->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); }; + + // + // Web UI setup + // + + if (!params.webui) { + LOG_INF("Web UI is disabled\n"); + } else { + // register static assets routes + if (!params.public_path.empty()) { + // Set the base directory for serving static files + bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path); + if (!is_found) { + LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); + return 1; + } + } else { + // using embedded static index.html + srv->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { + if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { + res.set_content("Error: gzip is not supported by this browser", "text/plain"); + } else { + res.set_header("Content-Encoding", "gzip"); + // COEP and COOP headers, required by pyodide (python interpreter) + res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); + } + return false; + }); + } + } + return true; +} + +bool server_http_context::start() { + // Bind and listen + + auto & srv = pimpl->srv; + bool was_bound = false; + bool is_sock = false; + if (string_ends_with(std::string(hostname), ".sock")) { + is_sock = true; + LOG_INF("%s: setting address family to AF_UNIX\n", __func__); + srv->set_address_family(AF_UNIX); + // bind_to_port requires a second arg, any value other than 0 should + // simply get ignored + was_bound = srv->bind_to_port(hostname, 8080); + } else { + LOG_INF("%s: binding port with default address family\n", __func__); + // bind HTTP listen port + if (port == 0) { + int bound_port = srv->bind_to_any_port(hostname); + was_bound = (bound_port >= 0); + if (was_bound) { + port = bound_port; + } + } else { + was_bound = srv->bind_to_port(hostname, port); + } + } + + if (!was_bound) { + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port); + return false; + } + + // run the HTTP server in a thread + thread = std::thread([this]() { pimpl->srv->listen_after_bind(); }); + srv->wait_until_ready(); + + listening_address = is_sock ? 
string_format("unix://%s", hostname.c_str()) + : string_format("http://%s:%d", hostname.c_str(), port); + return true; +} + +void server_http_context::stop() const { + if (pimpl->srv) { + pimpl->srv->stop(); + } +} + +static void set_headers(httplib::Response & res, const std::map & headers) { + for (const auto & [key, value] : headers) { + res.set_header(key, value); + } +} + +static std::map get_params(const httplib::Request & req) { + std::map params; + for (const auto & [key, value] : req.params) { + params[key] = value; + } + for (const auto & [key, value] : req.path_params) { + params[key] = value; + } + return params; +} + +static std::map get_headers(const httplib::Request & req) { + std::map headers; + for (const auto & [key, value] : req.headers) { + headers[key] = value; + } + return headers; +} + +static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) { + if (response->is_stream()) { + res.status = response->status; + set_headers(res, response->headers); + std::string content_type = response->content_type; + // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it + std::shared_ptr r_ptr = std::move(response); + const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { + std::string chunk; + bool has_next = response->next(chunk); + if (!chunk.empty()) { + // TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() + sink.write(chunk.data(), chunk.size()); + SRV_DBG("http: streamed chunk: %s\n", chunk.c_str()); + } + if (!has_next) { + sink.done(); + SRV_DBG("%s", "http: stream ended\n"); + } + return has_next; + }; + const auto on_complete = [response = r_ptr](bool) mutable { + response.reset(); // trigger the destruction of the response object + }; + res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); + } else { + res.status = response->status; + set_headers(res, response->headers); + res.set_content(response->data, response->content_type); + } +} + +void server_http_context::get(const std::string & path, const server_http_context::handler_t & handler) const { + pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_res_ptr response = handler(server_http_req{ + get_params(req), + get_headers(req), + req.path, + req.body, + req.is_connection_closed + }); + process_handler_response(response, res); + }); +} + +void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const { + pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_res_ptr response = handler(server_http_req{ + get_params(req), + get_headers(req), + req.path, + req.body, + req.is_connection_closed + }); + process_handler_response(response, res); + }); +} + diff --git a/llamacpp/native/src/server/server-http.h b/llamacpp/native/src/server/server-http.h new file mode 100644 index 000000000..24c0b4011 --- /dev/null +++ b/llamacpp/native/src/server/server-http.h @@ -0,0 +1,78 @@ +#pragma once + +#include +#include +#include +#include +#include + +struct common_params; + +// generator-like API for HTTP response generation +// this object response with one of the 2 modes: +// 1) normal response: `data` contains the full response body +// 2) streaming response: each call to next(output) generates the next chunk +// when next(output) returns false, no more data after the 
current chunk +// note: some chunks can be empty, in which case no data is sent for that chunk +struct server_http_res { + std::string content_type = "application/json; charset=utf-8"; + int status = 200; + std::string data; + std::map headers; + + // TODO: move this to a virtual function once we have proper polymorphism support + std::function next = nullptr; + bool is_stream() const { + return next != nullptr; + } + + virtual ~server_http_res() = default; +}; + +// unique pointer, used by set_chunked_content_provider +// httplib requires the stream provider to be stored in heap +using server_http_res_ptr = std::unique_ptr; + +struct server_http_req { + std::map params; // path_params + query_params + std::map headers; // reserved for future use + std::string path; // reserved for future use + std::string body; + const std::function & should_stop; + + std::string get_param(const std::string & key, const std::string & def = "") const { + auto it = params.find(key); + if (it != params.end()) { + return it->second; + } + return def; + } +}; + +struct server_http_context { + class Impl; + std::unique_ptr pimpl; + + std::thread thread; // server thread + std::atomic is_ready = false; + + std::string path_prefix; + std::string hostname; + int port; + + server_http_context(); + ~server_http_context(); + + bool init(const common_params & params); + bool start(); + void stop() const; + + // note: the handler should never throw exceptions + using handler_t = std::function; + + void get(const std::string & path, const handler_t & handler) const; + void post(const std::string & path, const handler_t & handler) const; + + // for debugging + std::string listening_address; +}; diff --git a/llamacpp/native/src/server/server-http.patch b/llamacpp/native/src/server/server-http.patch new file mode 100644 index 000000000..900dae89b --- /dev/null +++ b/llamacpp/native/src/server/server-http.patch @@ -0,0 +1,61 @@ +diff --git a/llamacpp/native/src/server/server-http.cpp b/llamacpp/native/src/server/server-http.cpp +index 62250571..77e54d19 100644 +--- a/llamacpp/native/src/server/server-http.cpp ++++ b/llamacpp/native/src/server/server-http.cpp +@@ -8,10 +8,6 @@ + #include + #include + +-// auto generated files (see README.md for details) +-#include "index.html.gz.hpp" +-#include "loading.html.hpp" +- + // + // HTTP implementation using cpp-httplib + // +@@ -175,26 +171,17 @@ bool server_http_context::init(const common_params & params) { + auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { + bool ready = is_ready.load(); + if (!ready) { +- auto tmp = string_split(req.path, '.'); +- if (req.path == "/" || tmp.back() == "html") { +- res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); +- res.status = 503; +- } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { +- // allow the models endpoint to be accessed during loading +- return true; +- } else { +- res.status = 503; +- res.set_content( +- safe_json_to_str(json { +- {"error", { +- {"message", "Loading model"}, +- {"type", "unavailable_error"}, +- {"code", 503} +- }} +- }), +- "application/json; charset=utf-8" +- ); +- } ++ res.status = 503; ++ res.set_content( ++ safe_json_to_str(json { ++ {"error", { ++ {"message", "Loading model"}, ++ {"type", "unavailable_error"}, ++ {"code", 503} ++ }} ++ }), ++ "application/json; charset=utf-8" ++ ); + return false; + } + return true; +@@ -253,7 +240,6 @@ bool server_http_context::init(const 
common_params & params) { + // COEP and COOP headers, required by pyodide (python interpreter) + res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); +- res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); + } + return false; + }); diff --git a/llamacpp/native/src/server/server-models.cpp b/llamacpp/native/src/server/server-models.cpp new file mode 100644 index 000000000..ac7f6b86b --- /dev/null +++ b/llamacpp/native/src/server/server-models.cpp @@ -0,0 +1,975 @@ +#include "server-common.h" +#include "server-models.h" + +#include "download.h" + +#include // TODO: remove this once we use HTTP client from download.h +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#include +#endif + +#if defined(__APPLE__) && defined(__MACH__) +// macOS: use _NSGetExecutablePath to get the executable path +#include +#include +#endif + +#define CMD_EXIT "exit" + +static std::filesystem::path get_server_exec_path() { +#if defined(_WIN32) + wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths + DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf)); + if (len == 0 || len >= _countof(buf)) { + throw std::runtime_error("GetModuleFileNameW failed or path too long"); + } + return std::filesystem::path(buf); +#elif defined(__APPLE__) && defined(__MACH__) + char small_path[PATH_MAX]; + uint32_t size = sizeof(small_path); + + if (_NSGetExecutablePath(small_path, &size) == 0) { + // resolve any symlinks to get absolute path + try { + return std::filesystem::canonical(std::filesystem::path(small_path)); + } catch (...) { + return std::filesystem::path(small_path); + } + } else { + // buffer was too small, allocate required size and call again + std::vector buf(size); + if (_NSGetExecutablePath(buf.data(), &size) == 0) { + try { + return std::filesystem::canonical(std::filesystem::path(buf.data())); + } catch (...) { + return std::filesystem::path(buf.data()); + } + } + throw std::runtime_error("_NSGetExecutablePath failed after buffer resize"); + } +#else + char path[FILENAME_MAX]; + ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX); + if (count <= 0) { + throw std::runtime_error("failed to resolve /proc/self/exe"); + } + return std::filesystem::path(std::string(path, count)); +#endif +} + +struct local_model { + std::string name; + std::string path; + std::string path_mmproj; +}; + +static std::vector list_local_models(const std::string & dir) { + if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { + throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); + } + + std::vector models; + auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } + } + // single file model + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? 
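+            // e.g. a split model ships as "foo-00001-of-00003.gguf" (illustrative name); pointing + // llama.cpp at the first shard is enough, the remaining shards are discovered from it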
model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); + } + }; + + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); + local_model model{ + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" + }; + models.push_back(model); + } + } + return models; +} + +// +// server_models +// + +server_models::server_models( + const common_params & params, + int argc, + char ** argv, + char ** envp) : base_params(params) { + for (int i = 0; i < argc; i++) { + base_args.push_back(std::string(argv[i])); + } + for (char ** env = envp; *env != nullptr; env++) { + base_env.push_back(std::string(*env)); + } + GGML_ASSERT(!base_args.empty()); + // set binary path + try { + base_args[0] = get_server_exec_path().string(); + } catch (const std::exception & e) { + LOG_WRN("failed to get server executable path: %s\n", e.what()); + LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str()); + } + // TODO: allow refreshing cached model list + // add cached models + auto cached_models = common_list_cached_models(); + for (const auto & model : cached_models) { + server_model_meta meta{ + /* name */ model.to_string(), + /* path */ model.manifest_path, + /* path_mmproj */ "", // auto-detected when loading + /* in_cache */ true, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 + }; + mapping[meta.name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ meta + }; + } + // add local models specified via --models-dir + if (!params.models_dir.empty()) { + auto local_models = list_local_models(params.models_dir); + for (const auto & model : local_models) { + if (mapping.find(model.name) != mapping.end()) { + // already exists in cached models, skip + continue; + } + server_model_meta meta{ + /* name */ model.name, + /* path */ model.path, + /* path_mmproj */ model.path_mmproj, + /* in_cache */ false, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 + }; + mapping[meta.name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ meta + }; + } + } +} + +void server_models::update_meta(const std::string & name, const server_model_meta & meta) { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + it->second.meta = meta; + } + cv.notify_all(); // notify wait_until_loaded +} + +bool server_models::has_model(const std::string & name) { + std::lock_guard lk(mutex); + return mapping.find(name) != mapping.end(); +} + +std::optional server_models::get_meta(const std::string & name) { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + return it->second.meta; + } + return std::nullopt; +} + +static int get_free_port() { +#ifdef _WIN32 + WSADATA wsaData; + if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { + return -1; + } + typedef SOCKET native_socket_t; +#define INVALID_SOCKET_VAL INVALID_SOCKET +#define CLOSE_SOCKET(s) closesocket(s) +#else + typedef int native_socket_t; +#define INVALID_SOCKET_VAL -1 +#define CLOSE_SOCKET(s) close(s) +#endif + + native_socket_t sock = 
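+    // ephemeral-port trick: bind to port 0 so the OS picks a free port, read it back via + // getsockname() below, then close the socket; note the small race window before the child re-binds it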
socket(AF_INET, SOCK_STREAM, 0); + if (sock == INVALID_SOCKET_VAL) { +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + + struct sockaddr_in serv_addr; + std::memset(&serv_addr, 0, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(0); + + if (bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) != 0) { + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + +#ifdef _WIN32 + int namelen = sizeof(serv_addr); +#else + socklen_t namelen = sizeof(serv_addr); +#endif + if (getsockname(sock, (struct sockaddr*)&serv_addr, &namelen) != 0) { + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + + int port = ntohs(serv_addr.sin_port); + + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + + return port; +} + +// helper to convert vector to char ** +// pointers are only valid as long as the original vector is valid +static std::vector to_char_ptr_array(const std::vector & vec) { + std::vector result; + result.reserve(vec.size() + 1); + for (const auto & s : vec) { + result.push_back(const_cast(s.c_str())); + } + result.push_back(nullptr); + return result; +} + +std::vector server_models::get_all_meta() { + std::lock_guard lk(mutex); + std::vector result; + result.reserve(mapping.size()); + for (const auto & [name, inst] : mapping) { + result.push_back(inst.meta); + } + return result; +} + +void server_models::unload_lru() { + if (base_params.models_max <= 0) { + return; // no limit + } + // remove one of the servers if we passed the models_max (least recently used - LRU) + std::string lru_model_name = ""; + int64_t lru_last_used = ggml_time_ms(); + size_t count_active = 0; + { + std::lock_guard lk(mutex); + for (const auto & m : mapping) { + if (m.second.meta.is_active()) { + count_active++; + if (m.second.meta.last_used < lru_last_used) { + lru_model_name = m.first; + lru_last_used = m.second.meta.last_used; + } + } + } + } + if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) { + SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str()); + unload(lru_model_name); + } +} + +static void add_or_replace_arg(std::vector & args, const std::string & key, const std::string & value) { + for (size_t i = 0; i < args.size(); i++) { + if (args[i] == key && i + 1 < args.size()) { + args[i + 1] = value; + return; + } + } + // not found, append + args.push_back(key); + args.push_back(value); +} + +void server_models::load(const std::string & name, bool auto_load) { + if (!has_model(name)) { + throw std::runtime_error("model name=" + name + " is not found"); + } + unload_lru(); + + std::lock_guard lk(mutex); + + auto meta = mapping[name].meta; + if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { + SRV_INF("model %s is not ready\n", name.c_str()); + return; + } + + // prepare new instance info + instance_t inst; + inst.meta = meta; + inst.meta.port = get_free_port(); + inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta.last_used = ggml_time_ms(); + + if (inst.meta.port <= 0) { + throw std::runtime_error("failed to get a port number"); + } + + inst.subproc = std::make_shared(); + { + SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); + + std::vector child_args; + if (auto_load && !meta.args.empty()) { + child_args = meta.args; // copy previous args + } else { + child_args = base_args; // copy + if (inst.meta.in_cache) { + add_or_replace_arg(child_args, 
"-hf", inst.meta.name); + } else { + add_or_replace_arg(child_args, "-m", inst.meta.path); + if (!inst.meta.path_mmproj.empty()) { + add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj); + } + } + } + + // set model args + add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port)); + add_or_replace_arg(child_args, "--alias", inst.meta.name); + + std::vector child_env = base_env; // copy + child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); + + SRV_INF("%s", "spawning server instance with args:\n"); + for (const auto & arg : child_args) { + SRV_INF(" %s\n", arg.c_str()); + } + inst.meta.args = child_args; // save for debugging + + std::vector argv = to_char_ptr_array(child_args); + std::vector envp = to_char_ptr_array(child_env); + + int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr; + int result = subprocess_create_ex(argv.data(), options, envp.data(), inst.subproc.get()); + if (result != 0) { + throw std::runtime_error("failed to spawn server instance"); + } + + inst.stdin_file = subprocess_stdin(inst.subproc.get()); + } + + // start a thread to manage the child process + // captured variables are guaranteed to be destroyed only after the thread is joined + inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() { + // read stdout/stderr and forward to main server log + FILE * p_stdout_stderr = subprocess_stdout(child_proc.get()); + if (p_stdout_stderr) { + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) { + LOG("[%5d] %s", port, buffer); + } + } else { + SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str()); + } + // we reach here when the child process exits + int exit_code = 0; + subprocess_join(child_proc.get(), &exit_code); + subprocess_destroy(child_proc.get()); + // update PID and status + { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + auto & meta = it->second.meta; + meta.exit_code = exit_code; + meta.status = SERVER_MODEL_STATUS_UNLOADED; + } + cv.notify_all(); + } + SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code); + }); + + // clean up old process/thread if exists + { + auto & old_instance = mapping[name]; + // old process should have exited already, but just in case, we clean it up here + if (subprocess_alive(old_instance.subproc.get())) { + SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str()); + subprocess_terminate(old_instance.subproc.get()); // force kill + } + if (old_instance.th.joinable()) { + old_instance.th.join(); + } + } + + mapping[name] = std::move(inst); + cv.notify_all(); +} + +static void interrupt_subprocess(FILE * stdin_file) { + // because subprocess.h does not provide a way to send SIGINT, + // we will send a command to the child process to exit gracefully + if (stdin_file) { + fprintf(stdin_file, "%s\n", CMD_EXIT); + fflush(stdin_file); + } +} + +void server_models::unload(const std::string & name) { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + if (it->second.meta.is_active()) { + SRV_INF("unloading model instance name=%s\n", name.c_str()); + interrupt_subprocess(it->second.stdin_file); + // status change will be handled by the managing thread + } else { + SRV_WRN("model instance name=%s is not loaded\n", name.c_str()); + } + } +} + +void server_models::unload_all() { + std::vector to_join; + { + 
std::lock_guard lk(mutex); + for (auto & [name, inst] : mapping) { + if (inst.meta.is_active()) { + SRV_INF("unloading model instance name=%s\n", name.c_str()); + interrupt_subprocess(inst.stdin_file); + // status change will be handled by the managing thread + } + // moving the thread to join list to avoid deadlock + to_join.push_back(std::move(inst.th)); + } + } + for (auto & th : to_join) { + if (th.joinable()) { + th.join(); + } + } +} + +void server_models::update_status(const std::string & name, server_model_status status) { + // for now, we only allow updating to LOADED status + if (status != SERVER_MODEL_STATUS_LOADED) { + throw std::runtime_error("invalid status value"); + } + auto meta = get_meta(name); + if (meta.has_value()) { + meta->status = status; + update_meta(name, meta.value()); + } +} + +void server_models::wait_until_loaded(const std::string & name) { + std::unique_lock lk(mutex); + cv.wait(lk, [this, &name]() { + auto it = mapping.find(name); + if (it != mapping.end()) { + return it->second.meta.status != SERVER_MODEL_STATUS_LOADING; + } + return false; + }); +} + +bool server_models::ensure_model_loaded(const std::string & name) { + auto meta = get_meta(name); + if (!meta.has_value()) { + throw std::runtime_error("model name=" + name + " is not found"); + } + if (meta->status == SERVER_MODEL_STATUS_LOADED) { + return false; // already loaded + } + if (meta->status == SERVER_MODEL_STATUS_UNLOADED) { + SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); + load(name, true); + } + + SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); + wait_until_loaded(name); + + // check final status + meta = get_meta(name); + if (!meta.has_value() || meta->is_failed()) { + throw std::runtime_error("model name=" + name + " failed to load"); + } + + return true; +} + +server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used) { + auto meta = get_meta(name); + if (!meta.has_value()) { + throw std::runtime_error("model name=" + name + " is not found"); + } + if (meta->status != SERVER_MODEL_STATUS_LOADED) { + throw std::invalid_argument("model name=" + name + " is not loaded"); + } + if (update_last_used) { + std::unique_lock lk(mutex); + mapping[name].meta.last_used = ggml_time_ms(); + } + SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port); + auto proxy = std::make_unique( + method, + base_params.hostname, + meta->port, + req.path, + req.headers, + req.body, + req.should_stop); + return proxy; +} + +std::thread server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler) { + // send a notification to the router server that a model instance is ready + // TODO @ngxson : use HTTP client from libcommon + httplib::Client cli(base_params.hostname, router_port); + cli.set_connection_timeout(0, 200000); // 200 milliseconds + + httplib::Request req; + req.method = "POST"; + req.path = "/models/status"; + req.set_header("Content-Type", "application/json"); + if (!base_params.api_keys.empty()) { + req.set_header("Authorization", "Bearer " + base_params.api_keys[0]); + } + + json body; + body["model"] = name; + body["value"] = server_model_status_to_string(SERVER_MODEL_STATUS_LOADED); + req.body = body.dump(); + + SRV_INF("notifying router server (port=%d) that model %s is ready\n", router_port, name.c_str()); + auto result = cli.send(std::move(req)); + if 
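+    // (any transport error here means the router is unreachable, so the child logs and exits below rather than lingering without an owner)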
(result.error() != httplib::Error::Success) { + auto err_str = httplib::to_string(result.error()); + SRV_ERR("failed to notify router server: %s\n", err_str.c_str()); + exit(1); // force exit + } + + // setup thread for monitoring stdin + return std::thread([shutdown_handler]() { + // wait for EOF on stdin + SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n"); + bool eof = false; + while (true) { + std::string line; + if (!std::getline(std::cin, line)) { + // EOF detected, which means the router server unexpectedly exited or was killed + eof = true; + break; + } + if (line.find(CMD_EXIT) != std::string::npos) { + SRV_INF("%s", "exit command received, exiting...\n"); + shutdown_handler(0); + break; + } + } + if (eof) { + SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n"); + exit(1); + } + }); +} + + + +// +// server_models_routes +// + +static void res_ok(std::unique_ptr & res, const json & response_data) { + res->status = 200; + res->data = safe_json_to_str(response_data); +} + +static void res_err(std::unique_ptr & res, const json & error_data) { + res->status = json_value(error_data, "code", 500); + res->data = safe_json_to_str({{ "error", error_data }}); +} + +static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { + if (name.empty()) { + res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + auto meta = models.get_meta(name); + if (!meta.has_value()) { + res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + if (models_autoload) { + models.ensure_model_loaded(name); + } else { + if (meta->status != SERVER_MODEL_STATUS_LOADED) { + res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + } + return true; +} + +static bool is_autoload(const common_params & params, const server_http_req & req) { + std::string autoload = req.get_param("autoload"); + if (autoload.empty()) { + return params.models_autoload; + } else { + return autoload == "true" || autoload == "1"; + } +} + +void server_models_routes::init_routes() { + this->get_router_props = [this](const server_http_req & req) { + std::string name = req.get_param("model"); + if (name.empty()) { + // main instance + auto res = std::make_unique(); + res_ok(res, { + // TODO: add support for this on web UI + {"role", "router"}, + {"max_instances", 4}, // dummy value for testing + // this is a dummy response to make sure webui doesn't break + {"model_alias", "llama-server"}, + {"model_path", "none"}, + {"default_generation_settings", { + {"params", json{}}, + {"n_ctx", 0}, + }}, + }); + return res; + } + return proxy_get(req); + }; + + this->proxy_get = [this](const server_http_req & req) { + std::string method = "GET"; + std::string name = req.get_param("model"); + bool autoload = is_autoload(params, req); + auto error_res = std::make_unique(); + if (!router_validate_model(name, models, autoload, error_res)) { + return error_res; + } + return models.proxy_request(req, method, name, false); + }; + + this->proxy_post = [this](const server_http_req & req) { + std::string method = "POST"; + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + bool autoload = is_autoload(params, req); + auto error_res = std::make_unique(); + if (!router_validate_model(name, models, autoload, error_res)) { + return error_res; + } + 
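+        // POSTs refresh last_used so unload_lru() evicts idle models first; plain GETs (see proxy_get above) deliberately do not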
return models.proxy_request(req, method, name, true); // update last usage for POST request only + }; + + this->get_router_models = [this](const server_http_req &) { + auto res = std::make_unique(); + json models_json = json::array(); + auto all_models = models.get_all_meta(); + std::time_t t = std::time(0); + for (const auto & meta : all_models) { + json status { + {"value", server_model_status_to_string(meta.status)}, + {"args", meta.args}, + }; + if (meta.is_failed()) { + status["exit_code"] = meta.exit_code; + status["failed"] = true; + } + models_json.push_back(json { + {"id", meta.name}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat + {"in_cache", meta.in_cache}, + {"path", meta.path}, + {"status", status}, + // TODO: add other fields, may require reading GGUF metadata + }); + } + res_ok(res, { + {"data", models_json}, + {"object", "list"}, + }); + return res; + }; + + this->post_router_models_load = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + auto model = models.get_meta(name); + if (!model.has_value()) { + res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); + return res; + } + if (model->status == SERVER_MODEL_STATUS_LOADED) { + res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + models.load(name, false); + res_ok(res, {{"success", true}}); + return res; + }; + + // used by child process to notify the router about status change + // TODO @ngxson : maybe implement authentication for this endpoint in the future + this->post_router_models_status = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string model = json_value(body, "model", std::string()); + std::string value = json_value(body, "value", std::string()); + models.update_status(model, server_model_status_from_string(value)); + res_ok(res, {{"success", true}}); + return res; + }; + + this->post_router_models_unload = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + auto model = models.get_meta(name); + if (!model.has_value()) { + res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + if (model->status != SERVER_MODEL_STATUS_LOADED) { + res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + 
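+        // unload() only writes CMD_EXIT to the child's stdin; the status flips to "unloaded" asynchronously once the managing thread reaps the process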
models.unload(name); + res_ok(res, {{"success", true}}); + return res; + }; +} + + + +// +// server_http_proxy +// + +// simple implementation of a pipe +// used for streaming data between threads +template +struct pipe_t { + std::mutex mutex; + std::condition_variable cv; + std::queue queue; + std::atomic writer_closed{false}; + std::atomic reader_closed{false}; + void close_write() { + writer_closed.store(true, std::memory_order_relaxed); + cv.notify_all(); + } + void close_read() { + reader_closed.store(true, std::memory_order_relaxed); + cv.notify_all(); + } + bool read(T & output, const std::function & should_stop) { + std::unique_lock lk(mutex); + constexpr auto poll_interval = std::chrono::milliseconds(500); + while (true) { + if (!queue.empty()) { + output = std::move(queue.front()); + queue.pop(); + return true; + } + if (writer_closed.load()) { + return false; // clean EOF + } + if (should_stop()) { + close_read(); // signal broken pipe to writer + return false; // cancelled / reader no longer alive + } + cv.wait_for(lk, poll_interval); + } + } + bool write(T && data) { + std::lock_guard lk(mutex); + if (reader_closed.load()) { + return false; // broken pipe + } + queue.push(std::move(data)); + cv.notify_one(); + return true; + } +}; + +server_http_proxy::server_http_proxy( + const std::string & method, + const std::string & host, + int port, + const std::string & path, + const std::map & headers, + const std::string & body, + const std::function should_stop) { + // shared between reader and writer threads + auto cli = std::make_shared(host, port); + auto pipe = std::make_shared>(); + + // setup Client + cli->set_connection_timeout(0, 200000); // 200 milliseconds + this->status = 500; // to be overwritten upon response + this->cleanup = [pipe]() { + pipe->close_read(); + pipe->close_write(); + }; + + // wire up the receive end of the pipe + this->next = [pipe, should_stop](std::string & out) -> bool { + msg_t msg; + bool has_next = pipe->read(msg, should_stop); + if (!msg.data.empty()) { + out = std::move(msg.data); + } + return has_next; // false if EOF or pipe broken + }; + + // wire up the HTTP client + // note: do NOT capture `this` pointer, as it may be destroyed before the thread ends + httplib::ResponseHandler response_handler = [pipe, cli](const httplib::Response & response) { + msg_t msg; + msg.status = response.status; + for (const auto & [key, value] : response.headers) { + msg.headers[key] = value; + } + return pipe->write(std::move(msg)); // send headers first + }; + httplib::ContentReceiverWithProgress content_receiver = [pipe](const char * data, size_t data_length, size_t, size_t) { + // send data chunks + // returns false if pipe is closed / broken (signal to stop receiving) + return pipe->write({{}, 0, std::string(data, data_length)}); + }; + + // prepare the request to destination server + httplib::Request req; + { + req.method = method; + req.path = path; + for (const auto & [key, value] : headers) { + req.set_header(key, value); + } + req.body = body; + req.response_handler = response_handler; + req.content_receiver = content_receiver; + } + + // start the proxy thread + SRV_DBG("start proxy thread %s %s\n", req.method.c_str(), req.path.c_str()); + this->thread = std::thread([cli, pipe, req]() { + auto result = cli->send(std::move(req)); + if (result.error() != httplib::Error::Success) { + auto err_str = httplib::to_string(result.error()); + SRV_ERR("http client error: %s\n", err_str.c_str()); + pipe->write({{}, 500, ""}); // header + pipe->write({{}, 0, 
"proxy error: " + err_str}); // body + } + pipe->close_write(); // signal EOF to reader + SRV_DBG("%s", "client request thread ended\n"); + }); + this->thread.detach(); + + // wait for the first chunk (headers) + msg_t header; + if (pipe->read(header, should_stop)) { + SRV_DBG("%s", "received response headers\n"); + this->status = header.status; + this->headers = header.headers; + } else { + SRV_DBG("%s", "no response headers received (request cancelled?)\n"); + } +} diff --git a/llamacpp/native/src/server/server-models.h b/llamacpp/native/src/server/server-models.h new file mode 100644 index 000000000..b9bec983e --- /dev/null +++ b/llamacpp/native/src/server/server-models.h @@ -0,0 +1,174 @@ +#pragma once + +#include "common.h" +#include "server-http.h" + +#include +#include +#include +#include + +/** + * state diagram: + * + * UNLOADED ──► LOADING ──► LOADED + * â–² │ │ + * └───failed───┘ │ + * â–² │ + * └────────unloaded─────────┘ + */ +enum server_model_status { + // TODO: also add downloading state when the logic is added + SERVER_MODEL_STATUS_UNLOADED, + SERVER_MODEL_STATUS_LOADING, + SERVER_MODEL_STATUS_LOADED +}; + +static server_model_status server_model_status_from_string(const std::string & status_str) { + if (status_str == "unloaded") { + return SERVER_MODEL_STATUS_UNLOADED; + } + if (status_str == "loading") { + return SERVER_MODEL_STATUS_LOADING; + } + if (status_str == "loaded") { + return SERVER_MODEL_STATUS_LOADED; + } + throw std::runtime_error("invalid server model status"); +} + +static std::string server_model_status_to_string(server_model_status status) { + switch (status) { + case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; + case SERVER_MODEL_STATUS_LOADING: return "loading"; + case SERVER_MODEL_STATUS_LOADED: return "loaded"; + default: return "unknown"; + } +} + +struct server_model_meta { + std::string name; + std::string path; + std::string path_mmproj; // only available if in_cache=false + bool in_cache = false; // if true, use -hf; use -m otherwise + int port = 0; + server_model_status status = SERVER_MODEL_STATUS_UNLOADED; + int64_t last_used = 0; // for LRU unloading + std::vector args; // additional args passed to the model instance (used for debugging) + int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) + + bool is_active() const { + return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; + } + + bool is_failed() const { + return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0; + } +}; + +struct subprocess_s; + +struct server_models { +private: + struct instance_t { + std::shared_ptr subproc; // shared between main thread and monitoring thread + std::thread th; + server_model_meta meta; + FILE * stdin_file = nullptr; + }; + + std::mutex mutex; + std::condition_variable cv; + std::map mapping; + + common_params base_params; + std::vector base_args; + std::vector base_env; + + void update_meta(const std::string & name, const server_model_meta & meta); + + // unload least recently used models if the limit is reached + void unload_lru(); + +public: + server_models(const common_params & params, int argc, char ** argv, char ** envp); + + // check if a model instance exists + bool has_model(const std::string & name); + + // return a copy of model metadata + std::optional get_meta(const std::string & name); + + // return a copy of all model metadata + std::vector get_all_meta(); + + // if auto_load is true, load the model with previous args if any + void load(const std::string 
& name, bool auto_load); + void unload(const std::string & name); + void unload_all(); + + // update the status of a model instance + void update_status(const std::string & name, server_model_status status); + + // wait until the model instance is fully loaded + // return when the model is loaded or failed to load + void wait_until_loaded(const std::string & name); + + // load the model if not loaded, otherwise do nothing + // return false if model is already loaded; return true otherwise (meta may need to be refreshed) + bool ensure_model_loaded(const std::string & name); + + // proxy an HTTP request to the model instance + server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); + + // notify the router server that a model instance is ready + // return the monitoring thread (to be joined by the caller) + static std::thread setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler); +}; + +struct server_models_routes { + common_params params; + server_models models; + server_models_routes(const common_params & params, int argc, char ** argv, char ** envp) + : params(params), models(params, argc, argv, envp) { + init_routes(); + } + + void init_routes(); + // handlers using lambda function, so that they can capture `this` without `std::bind` + server_http_context::handler_t get_router_props; + server_http_context::handler_t proxy_get; + server_http_context::handler_t proxy_post; + server_http_context::handler_t get_router_models; + server_http_context::handler_t post_router_models_load; + server_http_context::handler_t post_router_models_status; + server_http_context::handler_t post_router_models_unload; +}; + +/** + * A simple HTTP proxy that forwards requests to another server + * and relays the responses back. + */ +struct server_http_proxy : server_http_res { + std::function cleanup = nullptr; +public: + server_http_proxy(const std::string & method, + const std::string & host, + int port, + const std::string & path, + const std::map & headers, + const std::string & body, + const std::function should_stop); + ~server_http_proxy() { + if (cleanup) { + cleanup(); + } + } +private: + std::thread thread; + struct msg_t { + std::map headers; + int status = 0; + std::string data; + }; +}; diff --git a/llamacpp/native/src/server/server-queue.cpp b/llamacpp/native/src/server/server-queue.cpp new file mode 100644 index 000000000..38a485852 --- /dev/null +++ b/llamacpp/native/src/server/server-queue.cpp @@ -0,0 +1,351 @@ +#include "server-task.h" +#include "server-queue.h" + +#include "log.h" + +#include + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define RES_INF(fmt, ...) LOG_INF("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define RES_WRN(fmt, ...) LOG_WRN("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define RES_ERR(fmt, ...) LOG_ERR("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define RES_DBG(fmt, ...) 
LOG_DBG("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +// +// server_queue +// + +int server_queue::post(server_task && task, bool front) { + std::unique_lock lock(mutex_tasks); + GGML_ASSERT(task.id != -1); + // if this is cancel task make sure to clean up pending tasks + if (task.type == SERVER_TASK_TYPE_CANCEL) { + cleanup_pending_task(task.id_target); + } + const int task_id = task.id; + QUE_DBG("new task, id = %d, front = %d\n", task_id, front); + if (front) { + queue_tasks.push_front(std::move(task)); + } else { + queue_tasks.push_back(std::move(task)); + } + condition_tasks.notify_one(); + return task_id; +} + +int server_queue::post(std::vector && tasks, bool front) { + std::unique_lock lock(mutex_tasks); + for (auto & task : tasks) { + if (task.id == -1) { + task.id = id++; + } + // if this is cancel task make sure to clean up pending tasks + if (task.type == SERVER_TASK_TYPE_CANCEL) { + cleanup_pending_task(task.id_target); + } + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); + if (front) { + queue_tasks.push_front(std::move(task)); + } else { + queue_tasks.push_back(std::move(task)); + } + } + condition_tasks.notify_one(); + return 0; +} + +void server_queue::defer(server_task && task) { + std::unique_lock lock(mutex_tasks); + QUE_DBG("defer task, id = %d\n", task.id); + queue_tasks_deferred.push_back(std::move(task)); + condition_tasks.notify_one(); +} + +int server_queue::get_new_id() { + std::unique_lock lock(mutex_tasks); + int new_id = id++; + return new_id; +} + +void server_queue::on_new_task(std::function callback) { + callback_new_task = std::move(callback); +} + +void server_queue::on_update_slots(std::function callback) { + callback_update_slots = std::move(callback); +} + +void server_queue::pop_deferred_task() { + std::unique_lock lock(mutex_tasks); + if (!queue_tasks_deferred.empty()) { + queue_tasks.emplace_front(std::move(queue_tasks_deferred.front())); + queue_tasks_deferred.pop_front(); + } + condition_tasks.notify_one(); +} + +void server_queue::terminate() { + std::unique_lock lock(mutex_tasks); + running = false; + condition_tasks.notify_all(); +} + +void server_queue::start_loop() { + running = true; + + while (true) { + QUE_DBG("%s", "processing new tasks\n"); + + while (true) { + std::unique_lock lock(mutex_tasks); + if (!running) { + QUE_DBG("%s", "terminate\n"); + return; + } + if (queue_tasks.empty()) { + lock.unlock(); + break; + } + server_task task = std::move(queue_tasks.front()); + queue_tasks.pop_front(); + lock.unlock(); + + QUE_DBG("processing task, id = %d\n", task.id); + callback_new_task(std::move(task)); + } + + // all tasks in the current loop is processed, slots data is now ready + QUE_DBG("%s", "update slots\n"); + + callback_update_slots(); + + QUE_DBG("%s", "waiting for new tasks\n"); + { + std::unique_lock lock(mutex_tasks); + if (!running) { + QUE_DBG("%s", "terminate\n"); + return; + } + if (queue_tasks.empty()) { + condition_tasks.wait(lock, [&]{ + return (!queue_tasks.empty() || !running); + }); + } + } + } +} + +void server_queue::cleanup_pending_task(int id_target) { + // no need lock because this is called exclusively by post() + auto rm_func = [id_target](const server_task & task) { + return task.id == id_target; + }; + queue_tasks.erase( + std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), + queue_tasks.end()); + queue_tasks_deferred.erase( + std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func), + queue_tasks_deferred.end()); +} + +// +// 
server_response +// + +void server_response::add_waiting_task_id(int id_task) { + RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); + + std::unique_lock lock(mutex_results); + waiting_task_ids.insert(id_task); +} + +void server_response::add_waiting_tasks(const std::vector & tasks) { + std::unique_lock lock(mutex_results); + + for (const auto & task : tasks) { + RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); + waiting_task_ids.insert(task.id); + } +} + +void server_response::remove_waiting_task_id(int id_task) { + RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + + std::unique_lock lock(mutex_results); + waiting_task_ids.erase(id_task); + // make sure to clean up all pending results + queue_results.erase( + std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { + return res->id == id_task; + }), + queue_results.end()); +} + +void server_response::remove_waiting_task_ids(const std::unordered_set & id_tasks) { + std::unique_lock lock(mutex_results); + + for (const auto & id_task : id_tasks) { + RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + waiting_task_ids.erase(id_task); + } +} + +server_task_result_ptr server_response::recv(const std::unordered_set & id_tasks) { + while (true) { + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + if (!running) { + RES_DBG("%s : queue result stop\n", "recv"); + std::terminate(); // we cannot return here since the caller is HTTP code + } + return !queue_results.empty(); + }); + + for (size_t i = 0; i < queue_results.size(); i++) { + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + server_task_result_ptr res = std::move(queue_results[i]); + queue_results.erase(queue_results.begin() + i); + return res; + } + } + } + + // should never reach here +} + +server_task_result_ptr server_response::recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { + while (true) { + std::unique_lock lock(mutex_results); + + for (int i = 0; i < (int) queue_results.size(); i++) { + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + server_task_result_ptr res = std::move(queue_results[i]); + queue_results.erase(queue_results.begin() + i); + return res; + } + } + + std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); + if (!running) { + RES_DBG("%s : queue result stop\n", __func__); + std::terminate(); // we cannot return here since the caller is HTTP code + } + if (cr_res == std::cv_status::timeout) { + return nullptr; + } + } + + // should never reach here +} + +server_task_result_ptr server_response::recv(int id_task) { + std::unordered_set id_tasks = {id_task}; + return recv(id_tasks); +} + +void server_response::send(server_task_result_ptr && result) { + RES_DBG("sending result for task id = %d\n", result->id); + + std::unique_lock lock(mutex_results); + for (const auto & id_task : waiting_task_ids) { + if (result->id == id_task) { + RES_DBG("task id = %d pushed to result queue\n", result->id); + + queue_results.emplace_back(std::move(result)); + condition_results.notify_all(); + return; + } + } +} + +void server_response::terminate() { + running = false; + condition_results.notify_all(); +} + +// +// server_response_reader +// + +void 
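+// expected usage: post_tasks(), then drain results via next()/wait_for_all(); stop() also runs + // from the destructor, so an HTTP handler that returns early cannot leak running tasks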
server_response_reader::post_tasks(std::vector && tasks) { + id_tasks = server_task::get_list_id(tasks); + queue_results.add_waiting_tasks(tasks); + queue_tasks.post(std::move(tasks)); +} + +bool server_response_reader::has_next() const { + return !cancelled && received_count < id_tasks.size(); +} + +// return nullptr if should_stop() is true before receiving a result +// note: if one error is received, it will stop further processing and return error result +server_task_result_ptr server_response_reader::next(const std::function & should_stop) { + while (true) { + server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, polling_interval_seconds); + if (result == nullptr) { + // timeout, check stop condition + if (should_stop()) { + SRV_DBG("%s", "stopping wait for next result due to should_stop condition\n"); + return nullptr; + } + } else { + if (result->is_error()) { + stop(); // cancel remaining tasks + SRV_DBG("%s", "received error result, stopping further processing\n"); + return result; + } + if (result->is_stop()) { + received_count++; + } + return result; + } + } + + // should not reach here +} + +server_response_reader::batch_response server_response_reader::wait_for_all(const std::function & should_stop) { + batch_response batch_res; + batch_res.results.resize(id_tasks.size()); + while (has_next()) { + auto res = next(should_stop); + if (res == nullptr) { + batch_res.is_terminated = true; + return batch_res; + } + if (res->is_error()) { + batch_res.error = std::move(res); + return batch_res; + } + const size_t idx = res->get_index(); + GGML_ASSERT(idx < batch_res.results.size() && "index out of range"); + GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received"); + batch_res.results[idx] = std::move(res); + } + return batch_res; +} + +void server_response_reader::stop() { + queue_results.remove_waiting_task_ids(id_tasks); + if (has_next() && !cancelled) { + // if tasks is not finished yet, cancel them + cancelled = true; + std::vector cancel_tasks; + cancel_tasks.reserve(id_tasks.size()); + for (const auto & id_task : id_tasks) { + SRV_WRN("cancel task, id_task = %d\n", id_task); + server_task task(SERVER_TASK_TYPE_CANCEL); + task.id_target = id_task; + queue_results.remove_waiting_task_id(id_task); + cancel_tasks.push_back(std::move(task)); + } + // push to beginning of the queue, so it has highest priority + queue_tasks.post(std::move(cancel_tasks), true); + } else { + SRV_DBG("%s", "all tasks already finished, no need to cancel\n"); + } +} diff --git a/llamacpp/native/src/server/server-queue.h b/llamacpp/native/src/server/server-queue.h new file mode 100644 index 000000000..209d2017c --- /dev/null +++ b/llamacpp/native/src/server/server-queue.h @@ -0,0 +1,146 @@ +#pragma once + +#include "server-task.h" + +#include +#include +#include +#include + +struct server_queue { +private: + int id = 0; + bool running; + + // queues + std::deque queue_tasks; + std::deque queue_tasks_deferred; + + std::mutex mutex_tasks; + std::condition_variable condition_tasks; + + // callback functions + std::function callback_new_task; + std::function callback_update_slots; + +public: + // Add a new task to the end of the queue + int post(server_task && task, bool front = false); + + // multi-task version of post() + int post(std::vector && tasks, bool front = false); + + // Add a new task, but defer until one slot is available + void defer(server_task && task); + + // Get the next id for creating a new task + int get_new_id(); + + // Register function to process 
a new task + void on_new_task(std::function callback); + + // Register the function to be called when all slots data is ready to be processed + void on_update_slots(std::function callback); + + // Call when the state of one slot is changed; it will move one task from the deferred queue to the main queue + void pop_deferred_task(); + + // end the start_loop routine + void terminate(); + + /** + * Main loop consists of these steps: + * - Wait until a new task arrives + * - Process the task (i.e. maybe copy data into slot) + * - Check if multitask is finished + * - Update all slots + */ + void start_loop(); + + // for metrics + size_t queue_tasks_deferred_size() { + std::unique_lock lock(mutex_tasks); + return queue_tasks_deferred.size(); + } + +private: + void cleanup_pending_task(int id_target); +}; + +struct server_response { +private: + bool running = true; + + // for keeping track of all tasks waiting for the result + std::unordered_set waiting_task_ids; + + // the main result queue (using ptr for polymorphism) + std::vector queue_results; + + std::mutex mutex_results; + std::condition_variable condition_results; + +public: + // add the id_task to the list of tasks waiting for response + void add_waiting_task_id(int id_task); + + void add_waiting_tasks(const std::vector & tasks); + + // when the request is finished, we can remove the task associated with it + void remove_waiting_task_id(int id_task); + + // remove multiple tasks from the waiting list + void remove_waiting_task_ids(const std::unordered_set & id_tasks); + + // This function blocks the thread until there is a response for one of the id_tasks + server_task_result_ptr recv(const std::unordered_set & id_tasks); + + // same as recv(), but with a timeout in seconds + // if the timeout is reached, nullptr is returned + server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout); + + // single-task version of recv() + server_task_result_ptr recv(int id_task); + + // Send a new result to a waiting id_task + void send(server_task_result_ptr && result); + + // terminate the waiting loop + void terminate(); +}; + +// utility class to make working with server_queue and server_response easier +// it provides a generator-like API for server responses +// supports polling the connection state and aggregating multiple results +struct server_response_reader { + std::unordered_set id_tasks; + server_queue & queue_tasks; + server_response & queue_results; + size_t received_count = 0; + bool cancelled = false; + int polling_interval_seconds; + + // should_stop function will be called every polling_interval_seconds + server_response_reader(std::pair server_queues, int polling_interval_seconds) + : queue_tasks(server_queues.first), queue_results(server_queues.second), polling_interval_seconds(polling_interval_seconds) {} + ~server_response_reader() { + stop(); + } + + void post_tasks(std::vector && tasks); + bool has_next() const; + + // return nullptr if should_stop() is true before receiving a result + // note: if one error is received, it will stop further processing and return the error result + server_task_result_ptr next(const std::function & should_stop); + + struct batch_response { + bool is_terminated = false; // if true, indicates that processing was stopped before all results were received + std::vector results; + server_task_result_ptr error; // nullptr if no error + }; + // aggregate multiple results + batch_response wait_for_all(const std::function & should_stop); + + void stop(); +}; diff --git 
a/llamacpp/native/src/server/server-task.cpp b/llamacpp/native/src/server/server-task.cpp new file mode 100644 index 000000000..3f59127fb --- /dev/null +++ b/llamacpp/native/src/server/server-task.cpp @@ -0,0 +1,1471 @@ +#include "server-common.h" +#include "server-task.h" + +#include "common.h" +#include "llama.h" +#include "chat.h" +#include "sampling.h" +#include "json-schema-to-grammar.h" + +using json = nlohmann::ordered_json; + +// +// task_params +// + +json task_params::format_logit_bias(const std::vector & logit_bias) const { + json data = json::array(); + for (const auto & lb : logit_bias) { + data.push_back(json{ + {"bias", lb.bias}, + {"token", lb.token}, + }); + } + return data; +} + +json task_params::to_json(bool only_metrics) const { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + json lora = json::array(); + for (size_t i = 0; i < this->lora.size(); ++i) { + lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); + } + + if (only_metrics) { + return json { + {"seed", sampling.seed}, + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"top_n_sigma", sampling.top_n_sigma}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"max_tokens", n_predict}, + {"n_predict", n_predict}, // TODO: deduplicate? 
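+            // ("max_tokens" is the OAI-compatible alias and "n_predict" the native llama.cpp name; both mirror the same field until the TODO above lands)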
+ {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, + {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, + {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, + {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, + {"samplers", samplers}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + {"post_sampling_probs", post_sampling_probs}, + {"lora", lora}, + }; + } + + auto grammar_triggers = json::array(); + for (const auto & trigger : sampling.grammar_triggers) { + server_grammar_trigger ct(trigger); + grammar_triggers.push_back(ct.to_json()); + } + + return json { + {"seed", sampling.seed}, + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"top_n_sigma", sampling.top_n_sigma}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, + {"n_predict", n_predict}, // TODO: deduplicate? 
+ {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + {"logit_bias", format_logit_bias(sampling.logit_bias)}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"grammar_lazy", sampling.grammar_lazy}, + {"grammar_triggers", grammar_triggers}, + {"preserved_tokens", sampling.preserved_tokens}, + {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, + {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, + {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, + {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, + {"samplers", samplers}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + {"post_sampling_probs", post_sampling_probs}, + {"lora", lora}, + }; +} + +// +// server_task +// + +task_params server_task::params_from_json_cmpl( + const llama_context * ctx, + const common_params & params_base, + const json & data) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + task_params params; + + // Sampling parameter defaults are loaded from the global server context (but individual requests can still them) + task_params defaults; + defaults.sampling = params_base.sampling; + defaults.speculative = params_base.speculative; + defaults.n_keep = params_base.n_keep; + defaults.n_predict = params_base.n_predict; + defaults.antiprompt = params_base.antiprompt; + + // enabling this will output extra debug information in the HTTP responses from the server + params.verbose = params_base.verbosity > 9; + params.timings_per_token = json_value(data, "timings_per_token", false); + + params.stream = json_value(data, "stream", false); + auto stream_opt = json_value(data, "stream_options", json::object()); + params.include_usage = json_value(stream_opt, "include_usage", false); + params.cache_prompt = json_value(data, "cache_prompt", true); + params.return_tokens = json_value(data, "return_tokens", false); + params.return_progress = json_value(data, "return_progress", false); + params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); + params.n_indent = json_value(data, "n_indent", defaults.n_indent); + params.n_keep = json_value(data, "n_keep", defaults.n_keep); + params.n_discard = json_value(data, "n_discard", defaults.n_discard); + //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement + params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + params.response_fields = json_value(data, "response_fields", std::vector()); + + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, 
"temperature", defaults.sampling.temp); + params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); + + params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); + params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); + params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); + + params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); + params.speculative.n_min = std::max(params.speculative.n_min, 0); + params.speculative.n_max = std::max(params.speculative.n_max, 0); + + // Use OpenAI API logprobs only if n_probs wasn't provided + if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ + params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs); + } + + if (data.contains("lora")) { + if (data.at("lora").is_array()) { + params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); + } else { + throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); + } + } else { + params.lora = params_base.lora_adapters; + } + + // TODO: add more sanity checks for the input parameters + + if (params.sampling.penalty_last_n < -1) { + throw std::runtime_error("Error: repeat_last_n must be >= -1"); + } + + if (params.sampling.dry_penalty_last_n < -1) { + throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); + } + + if (params.sampling.penalty_last_n == -1) { + // note: should be the slot's context and not the full context, but it's ok + params.sampling.penalty_last_n = llama_n_ctx(ctx); + } + + if (params.sampling.dry_penalty_last_n == -1) { + params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); + } + + if (params.sampling.dry_base < 1.0f) { + params.sampling.dry_base = defaults.sampling.dry_base; + } + + // sequence breakers 
for DRY
+    {
+        // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format
+        // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
+
+        if (data.contains("dry_sequence_breakers")) {
+            params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+            if (params.sampling.dry_sequence_breakers.empty()) {
+                throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings");
+            }
+        }
+    }
+
+    // process "json_schema" and "grammar"
+    if (data.contains("json_schema") && !data.contains("grammar")) {
+        try {
+            auto schema = json_value(data, "json_schema", json::object());
+            SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
+            params.sampling.grammar = json_schema_to_grammar(schema);
+            SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+        } catch (const std::exception & e) {
+            throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
+        }
+    } else {
+        params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
+        SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+        params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
+        SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
+    }
+
+    {
+        auto it = data.find("chat_format");
+        if (it != data.end()) {
+            params.oaicompat_chat_syntax.format = static_cast<common_chat_format>(it->get<int>());
+            SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format));
+        } else {
+            params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
+        }
+        common_reasoning_format reasoning_format = params_base.reasoning_format;
+        if (data.contains("reasoning_format")) {
+            reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
+        }
+        params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
+        params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
+        params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+        params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
+    }
+
+    {
+        const auto preserved_tokens = data.find("preserved_tokens");
+        if (preserved_tokens != data.end()) {
+            for (const auto & t : *preserved_tokens) {
+                auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
+                if (ids.size() == 1) {
+                    SRV_DBG("Preserved token: %d\n", ids[0]);
+                    params.sampling.preserved_tokens.insert(ids[0]);
+                } else {
+                    // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
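+                    // e.g. preserving a marker like "<|tool_call|>" (hypothetical) on a vocab that splits it into several
+                    // pieces lands here; the token is skipped, and a grammar trigger word that depends on it will fail
+                    // the preserved-token check further down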
+ SRV_DBG("Not preserved because more than 1 token: %s\n", t.get().c_str()); + } + } + } + const auto grammar_triggers = data.find("grammar_triggers"); + if (grammar_triggers != data.end()) { + for (const auto & t : *grammar_triggers) { + server_grammar_trigger ct(t); + if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + const auto & word = ct.value.value; + auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); + if (ids.size() == 1) { + auto token = ids[0]; + if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) { + throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); + } + SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); + common_grammar_trigger trigger; + trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; + trigger.value = word; + trigger.token = token; + params.sampling.grammar_triggers.push_back(std::move(trigger)); + } else { + SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); + params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); + } + } else { + if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) { + SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str()); + } else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) { + SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str()); + } else { + throw std::runtime_error("Unknown grammar trigger type"); + } + params.sampling.grammar_triggers.emplace_back(std::move(ct.value)); + } + } + } + if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) { + throw std::runtime_error("Error: no triggers set for lazy grammar!"); + } + } + + { + params.sampling.logit_bias.clear(); + + const auto & logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) { + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & el : *logit_bias) { + // TODO: we may want to throw errors here, in case "el" is incorrect + if (el.is_array() && el.size() == 2) { + float bias; + if (el[1].is_number()) { + bias = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + bias = -INFINITY; + } else { + continue; + } + + if (el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else if (el[0].is_string()) { + auto toks = common_tokenize(vocab, el[0].get(), false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } + } + } else if (logit_bias != data.end() && logit_bias->is_object()) { + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & el : logit_bias->items()) { + float bias; + const auto & key = el.key(); + const auto & value = el.value(); + if (value.is_number()) { + bias = value.get(); + } else if (value.is_boolean() && !value.get()) { + bias = -INFINITY; + } else { + continue; + } + + char *end; + llama_token tok = strtol(key.c_str(), &end, 10); + if (*end == 0) { + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else { + auto toks = common_tokenize(vocab, key, false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } + } + + params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos); + if (params.sampling.ignore_eos) { + 
params.sampling.logit_bias.insert( + params.sampling.logit_bias.end(), + defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); + } + } + + { + params.antiprompt.clear(); + + const auto & stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) { + for (const auto & word : *stop) { + if (!word.empty()) { + params.antiprompt.push_back(word); + } + } + } + // set reverse prompt from cli args if not set in the request + if (params.antiprompt.empty()) { + params.antiprompt = defaults.antiprompt; + } + } + + { + const auto samplers = data.find("samplers"); + if (samplers != data.end()) { + if (samplers->is_array()) { + params.sampling.samplers = common_sampler_types_from_names(*samplers, false); + } else if (samplers->is_string()){ + params.sampling.samplers = common_sampler_types_from_chars(samplers->get()); + } + } else { + params.sampling.samplers = defaults.sampling.samplers; + } + } + + return params; +} + +// +// result_timings +// + +json result_timings::to_json() const { + json base = { + {"cache_n", cache_n}, + + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + + if (draft_n > 0) { + base["draft_n"] = draft_n; + base["draft_n_accepted"] = draft_n_accepted; + } + + return base; +} + +// +// result_prompt_progress +// +json result_prompt_progress::to_json() const { + return json { + {"total", total}, + {"cache", cache}, + {"processed", processed}, + {"time_ms", time_ms}, + }; +} + +static inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +// +// completion_token_output +// + +json completion_token_output::to_json(bool post_sampling_probs) const { + json probs_for_token = json::array(); + for (const auto & p : probs) { + std::string txt(p.txt); + txt.resize(validate_utf8(txt)); + probs_for_token.push_back(json { + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.txt)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, + }); + } + return probs_for_token; +} + +json completion_token_output::probs_vector_to_json(const std::vector & probs, bool post_sampling_probs) { + json out = json::array(); + for (const auto & p : probs) { + std::string txt(p.text_to_send); + txt.resize(validate_utf8(txt)); + out.push_back(json { + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.text_to_send)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, + { + post_sampling_probs ? "top_probs" : "top_logprobs", + p.to_json(post_sampling_probs) + }, + }); + } + return out; +} + +float completion_token_output::logarithm(float x) { + // nlohmann::json converts -inf to null, so we need to prevent that + return x == 0.0f ? 
std::numeric_limits::lowest() : std::log(x); +} + +std::vector completion_token_output::str_to_bytes(const std::string & str) { + std::vector bytes; + for (unsigned char c : str) { + bytes.push_back(c); + } + return bytes; +} + +// +// server_task_result_cmpl_final +// +json server_task_result_cmpl_final::to_json() { + switch (res_type) { + case TASK_RESPONSE_TYPE_NONE: + return to_json_non_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CMPL: + return to_json_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CHAT: + return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_ANTHROPIC: + return stream ? to_json_anthropic_stream() : to_json_anthropic(); + default: + GGML_ASSERT(false && "Invalid task_response_type"); + } +} + +json server_task_result_cmpl_final::to_json_non_oaicompat() { + json res = json { + {"index", index}, + {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk + {"tokens", stream ? llama_tokens {} : tokens}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", oaicompat_model}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + {"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + if (!stream && !probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); + } + return response_fields.empty() ? res : json_get_nested_values(response_fields, res); +} + +json server_task_result_cmpl_final::to_json_oaicompat() { + std::time_t t = std::time(0); + json logprobs = json(nullptr); // OAI default to null + if (!stream && probs_output.size() > 0) { + logprobs = json{ + {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, + }; + } + json finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + json res = json { + {"choices", json::array({ + json{ + {"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk + {"index", index}, + {"logprobs", logprobs}, + {"finish_reason", finish_reason}, + } + })}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "text_completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; +} + +json server_task_result_cmpl_final::to_json_oaicompat_chat() { + std::string finish_reason = "length"; + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } else { + msg.role = "assistant"; + msg.content = content; + } + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = msg.tool_calls.empty() ? 
"stop" : "tool_calls"; + } + + json choice { + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", msg.to_json_oaicompat()}, + }; + + if (!stream && probs_output.size() > 0) { + choice["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, + }; + } + + std::time_t t = std::time(0); + + json res = json { + {"choices", json::array({choice})}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; +} + +json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { + std::time_t t = std::time(0); + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = oaicompat_msg.tool_calls.empty() ? "stop" : "tool_calls"; + } + + json deltas = json::array(); + for (const auto & diff : oaicompat_msg_diffs) { + deltas.push_back({ + {"choices", json::array({ + json { + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", common_chat_msg_diff_to_json_oaicompat(diff)}, + }, + })}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + }); + } + + deltas.push_back({ + {"choices", json::array({ + json { + {"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}, + }, + })}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + }); + + if (include_usage) { + // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage + // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices + deltas.push_back({ + {"choices", json::array()}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}, + }); + } + + if (timings.prompt_n >= 0) { + deltas.back().push_back({"timings", timings.to_json()}); + } + + // extra fields for debugging purposes + if (verbose && !deltas.empty()) { + deltas.front()["__verbose"] = to_json_non_oaicompat(); + } + + return deltas; +} + +json server_task_result_cmpl_final::to_json_anthropic() { + std::string stop_reason = "max_tokens"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + stop_reason = oaicompat_msg.tool_calls.empty() ? 
"end_turn" : "tool_use"; + } + + json content_blocks = json::array(); + + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } else { + msg.role = "assistant"; + msg.content = content; + } + + if (!msg.content.empty()) { + content_blocks.push_back({ + {"type", "text"}, + {"text", msg.content} + }); + } + + for (const auto & tool_call : msg.tool_calls) { + json tool_use_block = { + {"type", "tool_use"}, + {"id", tool_call.id}, + {"name", tool_call.name} + }; + + try { + tool_use_block["input"] = json::parse(tool_call.arguments); + } catch (const std::exception &) { + tool_use_block["input"] = json::object(); + } + + content_blocks.push_back(tool_use_block); + } + + json res = { + {"id", oaicompat_cmpl_id}, + {"type", "message"}, + {"role", "assistant"}, + {"content", content_blocks}, + {"model", oaicompat_model}, + {"stop_reason", stop_reason}, + {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}, + {"usage", { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded} + }} + }; + + return res; +} + +json server_task_result_cmpl_final::to_json_anthropic_stream() { + json events = json::array(); + + std::string stop_reason = "max_tokens"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use"; + } + + bool has_text = !oaicompat_msg.content.empty(); + size_t num_tool_calls = oaicompat_msg.tool_calls.size(); + + bool text_block_started = false; + std::unordered_set tool_calls_started; + + for (const auto & diff : oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (!text_block_started) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", 0}, + {"content_block", { + {"type", "text"}, + {"text", ""} + }} + }} + }); + text_block_started = true; + } + + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", 0}, + {"delta", { + {"type", "text_delta"}, + {"text", diff.content_delta} + }} + }} + }); + } + + if (diff.tool_call_index != std::string::npos) { + size_t content_block_index = (has_text ? 1 : 0) + diff.tool_call_index; + + if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) { + const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index]; + + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", content_block_index}, + {"content_block", { + {"type", "tool_use"}, + {"id", full_tool_call.id}, + {"name", full_tool_call.name} + }} + }} + }); + tool_calls_started.insert(diff.tool_call_index); + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", content_block_index}, + {"delta", { + {"type", "input_json_delta"}, + {"partial_json", diff.tool_call_delta.arguments} + }} + }} + }); + } + } + } + + if (has_text) { + events.push_back({ + {"event", "content_block_stop"}, + {"data", { + {"type", "content_block_stop"}, + {"index", 0} + }} + }); + } + + for (size_t i = 0; i < num_tool_calls; i++) { + size_t content_block_index = (has_text ? 
1 : 0) + i; + events.push_back({ + {"event", "content_block_stop"}, + {"data", { + {"type", "content_block_stop"}, + {"index", content_block_index} + }} + }); + } + + events.push_back({ + {"event", "message_delta"}, + {"data", { + {"type", "message_delta"}, + {"delta", { + {"stop_reason", stop_reason}, + {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)} + }}, + {"usage", { + {"output_tokens", n_decoded} + }} + }} + }); + + events.push_back({ + {"event", "message_stop"}, + {"data", { + {"type", "message_stop"} + }} + }); + + return events; +} + +// +// server_task_result_cmpl_partial +// +json server_task_result_cmpl_partial::to_json() { + switch (res_type) { + case TASK_RESPONSE_TYPE_NONE: + return to_json_non_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CMPL: + return to_json_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CHAT: + return to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_ANTHROPIC: + return to_json_anthropic(); + default: + GGML_ASSERT(false && "Invalid task_response_type"); + } +} + +json server_task_result_cmpl_partial::to_json_non_oaicompat() { + // non-OAI-compat JSON + json res = json { + {"index", index}, + {"content", content}, + {"tokens", tokens}, + {"stop", false}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + }; + // populate the timings object when needed (usually for the last response or with timings_per_token enabled) + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + if (is_progress) { + res.push_back({"prompt_progress", progress.to_json()}); + } + if (!prob_output.probs.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); + } + return res; +} + +json server_task_result_cmpl_partial::to_json_oaicompat() { + std::time_t t = std::time(0); + json logprobs = json(nullptr); // OAI default to null + if (prob_output.probs.size() > 0) { + logprobs = json{ + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + json res = json { + {"choices", json::array({ + json{ + {"text", content}, + {"index", index}, + {"logprobs", logprobs}, + {"finish_reason", nullptr}, + } + })}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "text_completion"}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + if (is_progress) { + res.push_back({"prompt_progress", progress.to_json()}); + } + + return res; +} + +json server_task_result_cmpl_partial::to_json_oaicompat_chat() { + bool first = n_decoded == 1; + std::time_t t = std::time(0); + json choices; + + std::vector deltas; + auto add_delta = [&](const json & delta) { + deltas.push_back({ + {"choices", json::array({ + json { + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", delta}, + }, + })}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + }); + }; + // We have to send an initial update to conform to openai behavior + if (first || is_progress) { + add_delta({ + {"role", "assistant"}, + {"content", nullptr}, + }); + } + + for (const auto & diff : oaicompat_msg_diffs) { + add_delta(common_chat_msg_diff_to_json_oaicompat(diff)); + } + + if (!deltas.empty()) { + auto & 
last_json = deltas[deltas.size() - 1]; + GGML_ASSERT(last_json.at("choices").size() >= 1); + + if (prob_output.probs.size() > 0) { + last_json.at("choices").at(0)["logprobs"] = json { + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + + if (timings.prompt_n >= 0) { + last_json.push_back({"timings", timings.to_json()}); + } + if (is_progress) { + last_json.push_back({"prompt_progress", progress.to_json()}); + } + } + + return deltas; +} + +// +// server_task_result_embd +// +json server_task_result_embd::to_json() { + return res_type == TASK_RESPONSE_TYPE_OAI_EMBD + ? to_json_oaicompat() + : to_json_non_oaicompat(); +} + +json server_task_result_embd::to_json_non_oaicompat() { + return json { + {"index", index}, + {"embedding", embedding}, + }; +} + +json server_task_result_embd::to_json_oaicompat() { + return json { + {"index", index}, + {"embedding", embedding[0]}, + {"tokens_evaluated", n_tokens}, + }; +} + +// +// server_task_result_rerank +// +json server_task_result_rerank::to_json() { + return json { + {"index", index}, + {"score", score}, + {"tokens_evaluated", n_tokens}, + }; +} + +json server_task_result_cmpl_partial::to_json_anthropic() { + json events = json::array(); + bool first = (n_decoded == 1); + static bool text_block_started = false; + + if (first) { + text_block_started = false; + + events.push_back({ + {"event", "message_start"}, + {"data", { + {"type", "message_start"}, + {"message", { + {"id", oaicompat_cmpl_id}, + {"type", "message"}, + {"role", "assistant"}, + {"content", json::array()}, + {"model", oaicompat_model}, + {"stop_reason", nullptr}, + {"stop_sequence", nullptr}, + {"usage", { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", 0} + }} + }} + }} + }); + } + + for (const auto & diff : oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (!text_block_started) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", 0}, + {"content_block", { + {"type", "text"}, + {"text", ""} + }} + }} + }); + text_block_started = true; + } + + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", 0}, + {"delta", { + {"type", "text_delta"}, + {"text", diff.content_delta} + }} + }} + }); + } + + if (diff.tool_call_index != std::string::npos) { + size_t content_block_index = (text_block_started ? 
1 : 0) + diff.tool_call_index; + + if (!diff.tool_call_delta.name.empty()) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", content_block_index}, + {"content_block", { + {"type", "tool_use"}, + {"id", diff.tool_call_delta.id}, + {"name", diff.tool_call_delta.name} + }} + }} + }); + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", content_block_index}, + {"delta", { + {"type", "input_json_delta"}, + {"partial_json", diff.tool_call_delta.arguments} + }} + }} + }); + } + } + } + + return events; +} + +// +// server_task_result_error +// +json server_task_result_error::to_json() { + json res = format_error_response(err_msg, err_type); + if (err_type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { + res["n_prompt_tokens"] = n_prompt_tokens; + res["n_ctx"] = n_ctx; + } + return res; +} + +// +// server_task_result_metrics +// +json server_task_result_metrics::to_json() { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { "t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_tokens_max", n_tokens_max }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "slots", slots_data }, + }; +} + +// +// server_task_result_slot_save_load +// +json server_task_result_slot_save_load::to_json() { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } + + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; +} + +// +// server_task_result_slot_erase +// +json server_task_result_slot_erase::to_json() { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; +} + +// +// server_task_result_apply_lora +// + +json server_task_result_apply_lora::to_json() { + return json {{ "success", true }}; +} + +// +// server_prompt_cache +// +size_t server_prompt_cache::size() const { + size_t res = 0; + + for (const auto & state : states) { + res += state.size(); + } + + return res; +} + +size_t server_prompt_cache::n_tokens() const { + size_t res = 0; + + for (const auto & state : states) { + res += state.n_tokens(); + } + + return res; +} + +server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t state_size) { + // first check if the current state is contained fully in the cache + for (auto it = states.begin(); it != states.end(); ++it) { + const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens); + + if (cur_lcp_len == (int) prompt.tokens.size()) { + SRV_WRN("%s", " - prompt is already in the cache, skipping\n"); + return nullptr; + } + } + + // next, remove any cached prompts that are fully contained in the current prompt + for (auto it = states.begin(); it != states.end();) { + 
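+        // a cached entry whose tokens are a prefix of the new prompt is strictly subsumed
+        // by the state about to be stored, so it is evicted here to free memory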
const int len = it->tokens.get_common_prefix(prompt.tokens); + + if (len == (int) it->tokens.size()) { + SRV_WRN(" - removing obsolete cached prompt with length %d\n", len); + + it = states.erase(it); + } else { + ++it; + } + } + + std::vector state_data; + + // check if we can allocate enough memory for the new state + try { + state_data.resize(state_size); + } catch (const std::bad_alloc & e) { + SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what()); + + limit_size = std::max(1, 0.4*size()); + + SRV_WRN(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0)); + + update(); + + return nullptr; + } + + // TODO: for some reason we can't copy server_tokens, so we have to do this workaround + auto & cur = states.emplace_back(); + cur = { + /*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false), + /*.data =*/ std::move(state_data), + /*.checkpoints =*/ prompt.checkpoints, + }; + + return &cur; +} + +bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) { + const int lcp_best = prompt.tokens.get_common_prefix(tokens_new); + + float f_keep_best = float(lcp_best) / prompt.tokens.size(); + float sim_best = float(lcp_best) / tokens_new.size(); + + SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); + + auto it_best = states.end(); + + // find the most similar cached prompt, that would also preserve the most context + for (auto it = states.begin(); it != states.end(); ++it) { + const int lcp_cur = it->tokens.get_common_prefix(tokens_new); + + const float f_keep_cur = float(lcp_cur) / it->tokens.size(); + const float sim_cur = float(lcp_cur) / tokens_new.size(); + + // don't trash large prompts + if (f_keep_cur < 0.25f) { + continue; + } + + if (f_keep_best < f_keep_cur && sim_best < sim_cur) { + f_keep_best = f_keep_cur; + sim_best = sim_cur; + + it_best = it; + } + } + + if (it_best != states.end()) { + SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); + + const size_t size = it_best->data.size(); + const size_t n = llama_state_seq_set_data_ext(ctx, it_best->data.data(), size, id_slot, 0); + if (n != size) { + SRV_WRN("failed to restore state with size %zu\n", size); + + return false; + } + + it_best->data.clear(); + it_best->data.shrink_to_fit(); + + prompt = std::move(*it_best); + + states.erase(it_best); + } + + return true; +} + +void server_prompt_cache::update() { + if (limit_size > 0) { + // always keep at least one state, regardless of the limits + while (states.size() > 1 && size() > limit_size) { + if (states.empty()) { + break; + } + + SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0)); + + states.pop_front(); + } + } + + // average size per token + const float size_per_token = std::max(1.0f, float(size()) / (std::max(1, n_tokens()))); + + // dynamically increase the token limit if it can fit in the memory limit + const size_t limit_tokens_cur = limit_size > 0 ? 
std::max(limit_tokens, limit_size/size_per_token) : limit_tokens; + + if (limit_tokens > 0) { + while (states.size() > 1 && n_tokens() > limit_tokens_cur) { + if (states.empty()) { + break; + } + + SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n", + limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0)); + + states.pop_front(); + } + } + + SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n", + states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur); + + for (const auto & state : states) { + SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", + (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); + } +} diff --git a/llamacpp/native/src/server/server-task.h b/llamacpp/native/src/server/server-task.h new file mode 100644 index 000000000..a22d7cab1 --- /dev/null +++ b/llamacpp/native/src/server/server-task.h @@ -0,0 +1,460 @@ +#pragma once + +#include "common.h" +#include "llama.h" + +#include +#include +#include + +// TODO: prevent including the whole server-common.h as we only use server_tokens +#include "server-common.h" + +using json = nlohmann::ordered_json; + +enum server_task_type { + SERVER_TASK_TYPE_COMPLETION, + SERVER_TASK_TYPE_EMBEDDING, + SERVER_TASK_TYPE_RERANK, + SERVER_TASK_TYPE_INFILL, + SERVER_TASK_TYPE_CANCEL, + SERVER_TASK_TYPE_NEXT_RESPONSE, + SERVER_TASK_TYPE_METRICS, + SERVER_TASK_TYPE_SLOT_SAVE, + SERVER_TASK_TYPE_SLOT_RESTORE, + SERVER_TASK_TYPE_SLOT_ERASE, + SERVER_TASK_TYPE_SET_LORA, +}; + +// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common +enum task_response_type { + TASK_RESPONSE_TYPE_NONE, // llama.cpp native format + TASK_RESPONSE_TYPE_OAI_CHAT, + TASK_RESPONSE_TYPE_OAI_CMPL, + TASK_RESPONSE_TYPE_OAI_EMBD, + TASK_RESPONSE_TYPE_ANTHROPIC, +}; + +enum stop_type { + STOP_TYPE_NONE, + STOP_TYPE_EOS, + STOP_TYPE_WORD, + STOP_TYPE_LIMIT, +}; + +struct task_params { + bool stream = true; + bool include_usage = false; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + bool return_tokens = false; + bool return_progress = false; + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector lora; + + std::vector antiprompt; + std::vector response_fields; + bool timings_per_token = false; + bool post_sampling_probs = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; + + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + common_chat_syntax oaicompat_chat_syntax; + + // Embeddings + int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm) + + json format_logit_bias(const std::vector & logit_bias) const; + json to_json(bool only_metrics = false) const; +}; + +struct server_task { + int id = -1; // to be filled by 
server_queue + int index = -1; // used when there are multiple prompts (batch request) + + // used by SERVER_TASK_TYPE_CANCEL + int id_target = -1; + int id_slot = -1; + + // used by SERVER_TASK_TYPE_INFERENCE + task_params params; + server_tokens tokens; + + server_task_type type; + + // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE + struct slot_action { + int slot_id; + std::string filename; + std::string filepath; + }; + slot_action slot_action; + + // used by SERVER_TASK_TYPE_METRICS + bool metrics_reset_bucket = false; + + // used by SERVER_TASK_TYPE_SET_LORA + std::vector set_lora; + + server_task() = default; + + server_task(server_task_type type) : type(type) {} + + int32_t n_tokens() const { + return tokens.size(); + } + + static task_params params_from_json_cmpl( + const llama_context * ctx, + const common_params & params_base, + const json & data); + + // utility function + static std::unordered_set get_list_id(const std::vector & tasks) { + std::unordered_set ids(tasks.size()); + for (size_t i = 0; i < tasks.size(); i++) { + ids.insert(tasks[i].id); + } + return ids; + } +}; + +struct result_timings { + int32_t cache_n = -1; + + int32_t prompt_n = -1; + double prompt_ms; + double prompt_per_token_ms; + double prompt_per_second; + + int32_t predicted_n = -1; + double predicted_ms; + double predicted_per_token_ms; + double predicted_per_second; + + // Optional speculative metrics - only included when > 0 + int32_t draft_n = 0; + int32_t draft_n_accepted = 0; + + json to_json() const; +}; + +struct result_prompt_progress { + int32_t total = 0; + int32_t cache = 0; + int32_t processed = 0; + int64_t time_ms = 0; + + json to_json() const; +}; + +struct server_task_result { + int id = -1; + int id_slot = -1; + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_* + return true; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual ~server_task_result() = default; +}; + +// using shared_ptr for polymorphism of server_task_result +using server_task_result_ptr = std::unique_ptr; + +struct completion_token_output { + llama_token tok; + float prob; + std::string text_to_send; + struct prob_info { + llama_token tok; + std::string txt; + float prob; + }; + std::vector probs; + + json to_json(bool post_sampling_probs) const; + + static json probs_vector_to_json(const std::vector & probs, bool post_sampling_probs); + + static float logarithm(float x); + + static std::vector str_to_bytes(const std::string & str); + +}; + +struct server_task_result_cmpl_final : server_task_result { + int index = 0; + + std::string content; + llama_tokens tokens; + + bool stream; + bool include_usage; + result_timings timings; + std::string prompt; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + int32_t n_tokens_cached; + bool has_new_line; + std::string stopping_word; + stop_type stop = STOP_TYPE_NONE; + + bool post_sampling_probs; + std::vector probs_output; + std::vector response_fields; + + task_params generation_params; + + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + common_chat_msg oaicompat_msg; + + std::vector oaicompat_msg_diffs; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return true; // in stream mode, final 
responses are considered stop + } + + virtual json to_json() override; + + json to_json_non_oaicompat(); + + json to_json_oaicompat(); + + json to_json_oaicompat_chat(); + + json to_json_oaicompat_chat_stream(); + + json to_json_anthropic(); + + json to_json_anthropic_stream(); +}; + +struct server_task_result_cmpl_partial : server_task_result { + int index = 0; + + std::string content; + llama_tokens tokens; + + int32_t n_decoded; + int32_t n_prompt_tokens; + + bool post_sampling_probs; + bool is_progress = false; + completion_token_output prob_output; + result_timings timings; + result_prompt_progress progress; + + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + std::vector oaicompat_msg_diffs; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return false; // in stream mode, partial responses are not considered stop + } + + virtual json to_json() override; + + json to_json_non_oaicompat(); + + json to_json_oaicompat(); + + json to_json_oaicompat_chat(); + + json to_json_anthropic(); +}; + +struct server_task_result_embd : server_task_result { + int index = 0; + std::vector> embedding; + + int32_t n_tokens; + + // response formatting + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override; + + json to_json_non_oaicompat(); + + json to_json_oaicompat(); +}; + +struct server_task_result_rerank : server_task_result { + int index = 0; + float score = -1e6; + + int32_t n_tokens; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override; +}; + +struct server_task_result_error : server_task_result { + int index = 0; + error_type err_type = ERROR_TYPE_SERVER; + std::string err_msg; + + // for ERROR_TYPE_EXCEED_CONTEXT_SIZE + int32_t n_prompt_tokens = 0; + int32_t n_ctx = 0; + + virtual bool is_error() override { + return true; + } + + virtual json to_json() override; +}; + +struct server_task_result_metrics : server_task_result { + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; + + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; + + uint64_t n_tokens_max = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // while we can also use std::vector this requires copying the slot object which can be quite messy + // therefore, we use json to temporarily store the slot.to_json() result + json slots_data = json::array(); + + virtual json to_json() override; +}; + +struct server_task_result_slot_save_load : server_task_result { + std::string filename; + bool is_save; // true = save, false = load + + size_t n_tokens; + size_t n_bytes; + double t_ms; + + virtual json to_json() override; +}; + +struct server_task_result_slot_erase : server_task_result { + size_t n_erased; + + virtual json to_json() override; +}; + +struct server_task_result_apply_lora : server_task_result { + virtual json to_json() override; +}; + +struct server_prompt_checkpoint { + llama_pos pos_min; + llama_pos pos_max; 
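+    // pos_min/pos_max delimit the sequence positions covered by this checkpoint's saved state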
+ + std::vector data; + + size_t size() const { + return data.size(); + } +}; + +struct server_prompt { + server_tokens tokens; + + std::vector data; + + std::list checkpoints; + + size_t size() const { + size_t res = data.size(); + + for (const auto & checkpoint : checkpoints) { + res += checkpoint.size(); + } + + return res; + } + + int n_tokens() const { + return tokens.size(); + } +}; + +struct server_prompt_cache { + server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) { + this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib); + this->limit_tokens = limit_tokens; + } + + std::list states; + + // in bytes, 0 = no limit + size_t limit_size = 0; + + // in tokens, 0 = no limit + size_t limit_tokens = 0; + + size_t size() const; + + size_t n_tokens() const; + + server_prompt * alloc(const server_prompt & prompt, size_t state_size); + + bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot); + + void update(); +}; diff --git a/llamacpp/native/src/server/server.cpp b/llamacpp/native/src/server/server.cpp index bea951b97..d5bef3df4 100644 --- a/llamacpp/native/src/server/server.cpp +++ b/llamacpp/native/src/server/server.cpp @@ -1,5762 +1,259 @@ -#include "chat.h" -#include "utils.hpp" +#include "server-context.h" +#include "server-http.h" +#include "server-models.h" #include "arg.h" #include "common.h" -#include "json-schema-to-grammar.h" #include "llama.h" #include "log.h" -#include "sampling.h" -#include "speculative.h" -#include "mtmd.h" -// mime type for sending response -#define MIMETYPE_JSON "application/json; charset=utf-8" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using json = nlohmann::ordered_json; - -constexpr int HTTP_POLLING_SECONDS = 1; - -enum stop_type { - STOP_TYPE_NONE, - STOP_TYPE_EOS, - STOP_TYPE_WORD, - STOP_TYPE_LIMIT, -}; - -// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 -enum slot_state { - SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future - SLOT_STATE_PROCESSING_PROMPT, - SLOT_STATE_DONE_PROMPT, - SLOT_STATE_GENERATING, -}; - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded -}; - -enum server_task_type { - SERVER_TASK_TYPE_COMPLETION, - SERVER_TASK_TYPE_EMBEDDING, - SERVER_TASK_TYPE_RERANK, - SERVER_TASK_TYPE_INFILL, - SERVER_TASK_TYPE_CANCEL, - SERVER_TASK_TYPE_NEXT_RESPONSE, - SERVER_TASK_TYPE_METRICS, - SERVER_TASK_TYPE_SLOT_SAVE, - SERVER_TASK_TYPE_SLOT_RESTORE, - SERVER_TASK_TYPE_SLOT_ERASE, - SERVER_TASK_TYPE_SET_LORA, -}; - -enum oaicompat_type { - OAICOMPAT_TYPE_NONE, - OAICOMPAT_TYPE_CHAT, - OAICOMPAT_TYPE_COMPLETION, - OAICOMPAT_TYPE_EMBEDDING, -}; - -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error - ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error -}; - -static bool server_task_type_need_embd(server_task_type task_type) { - switch (task_type) { - case SERVER_TASK_TYPE_EMBEDDING: - case SERVER_TASK_TYPE_RERANK: - return true; - default: - return false; - } -} - -static 
bool server_task_type_need_logits(server_task_type task_type) { - switch (task_type) { - case SERVER_TASK_TYPE_COMPLETION: - case SERVER_TASK_TYPE_INFILL: - return true; - default: - return false; - } -} - -struct slot_params { - bool stream = true; - bool include_usage = false; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - bool return_tokens = false; - bool return_progress = false; - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters - - int64_t t_max_prompt_ms = -1; // TODO: implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit - - std::vector lora; - - std::vector antiprompt; - std::vector response_fields; - bool timings_per_token = false; - bool post_sampling_probs = false; - - struct common_params_sampling sampling; - struct common_params_speculative speculative; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - common_chat_syntax oaicompat_chat_syntax; - - // Embeddings - int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm) - - json to_json(bool only_metrics = false) const { - std::vector samplers; - samplers.reserve(sampling.samplers.size()); - for (const auto & sampler : sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - json lora = json::array(); - for (size_t i = 0; i < this->lora.size(); ++i) { - lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); - } - - if (only_metrics) { - return json { - {"seed", sampling.seed}, - {"temperature", sampling.temp}, - {"dynatemp_range", sampling.dynatemp_range}, - {"dynatemp_exponent", sampling.dynatemp_exponent}, - {"top_k", sampling.top_k}, - {"top_p", sampling.top_p}, - {"min_p", sampling.min_p}, - {"top_n_sigma", sampling.top_n_sigma}, - {"xtc_probability", sampling.xtc_probability}, - {"xtc_threshold", sampling.xtc_threshold}, - {"typical_p", sampling.typ_p}, - {"repeat_last_n", sampling.penalty_last_n}, - {"repeat_penalty", sampling.penalty_repeat}, - {"presence_penalty", sampling.penalty_present}, - {"frequency_penalty", sampling.penalty_freq}, - {"dry_multiplier", sampling.dry_multiplier}, - {"dry_base", sampling.dry_base}, - {"dry_allowed_length", sampling.dry_allowed_length}, - {"dry_penalty_last_n", sampling.dry_penalty_last_n}, - {"mirostat", sampling.mirostat}, - {"mirostat_tau", sampling.mirostat_tau}, - {"mirostat_eta", sampling.mirostat_eta}, - {"max_tokens", n_predict}, - {"n_predict", n_predict}, // TODO: deduplicate? 
- {"n_keep", n_keep}, - {"n_discard", n_discard}, - {"ignore_eos", sampling.ignore_eos}, - {"stream", stream}, - {"n_probs", sampling.n_probs}, - {"min_keep", sampling.min_keep}, - {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, - {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, - {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, - {"samplers", samplers}, - {"speculative.n_max", speculative.n_max}, - {"speculative.n_min", speculative.n_min}, - {"speculative.p_min", speculative.p_min}, - {"timings_per_token", timings_per_token}, - {"post_sampling_probs", post_sampling_probs}, - {"lora", lora}, - }; - } - - auto grammar_triggers = json::array(); - for (const auto & trigger : sampling.grammar_triggers) { - server_grammar_trigger ct(trigger); - grammar_triggers.push_back(ct.to_json()); - } - - return json { - {"seed", sampling.seed}, - {"temperature", sampling.temp}, - {"dynatemp_range", sampling.dynatemp_range}, - {"dynatemp_exponent", sampling.dynatemp_exponent}, - {"top_k", sampling.top_k}, - {"top_p", sampling.top_p}, - {"min_p", sampling.min_p}, - {"top_n_sigma", sampling.top_n_sigma}, - {"xtc_probability", sampling.xtc_probability}, - {"xtc_threshold", sampling.xtc_threshold}, - {"typical_p", sampling.typ_p}, - {"repeat_last_n", sampling.penalty_last_n}, - {"repeat_penalty", sampling.penalty_repeat}, - {"presence_penalty", sampling.penalty_present}, - {"frequency_penalty", sampling.penalty_freq}, - {"dry_multiplier", sampling.dry_multiplier}, - {"dry_base", sampling.dry_base}, - {"dry_allowed_length", sampling.dry_allowed_length}, - {"dry_penalty_last_n", sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", sampling.dry_sequence_breakers}, - {"mirostat", sampling.mirostat}, - {"mirostat_tau", sampling.mirostat_tau}, - {"mirostat_eta", sampling.mirostat_eta}, - {"stop", antiprompt}, - {"max_tokens", n_predict}, - {"n_predict", n_predict}, // TODO: deduplicate? 
- {"n_keep", n_keep}, - {"n_discard", n_discard}, - {"ignore_eos", sampling.ignore_eos}, - {"stream", stream}, - {"logit_bias", format_logit_bias(sampling.logit_bias)}, - {"n_probs", sampling.n_probs}, - {"min_keep", sampling.min_keep}, - {"grammar", sampling.grammar}, - {"grammar_lazy", sampling.grammar_lazy}, - {"grammar_triggers", grammar_triggers}, - {"preserved_tokens", sampling.preserved_tokens}, - {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, - {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, - {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, - {"samplers", samplers}, - {"speculative.n_max", speculative.n_max}, - {"speculative.n_min", speculative.n_min}, - {"speculative.p_min", speculative.p_min}, - {"timings_per_token", timings_per_token}, - {"post_sampling_probs", post_sampling_probs}, - {"lora", lora}, - }; - } -}; - -struct server_task { - int id = -1; // to be filled by server_queue - int index = -1; // used when there are multiple prompts (batch request) - - // used by SERVER_TASK_TYPE_CANCEL - int id_target = -1; - int id_slot = -1; - - // used by SERVER_TASK_TYPE_INFERENCE - slot_params params; - server_tokens tokens; - - server_task_type type; - - // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE - struct slot_action { - int slot_id; - std::string filename; - std::string filepath; - }; - slot_action slot_action; - - // used by SERVER_TASK_TYPE_METRICS - bool metrics_reset_bucket = false; - - // used by SERVER_TASK_TYPE_SET_LORA - std::vector set_lora; - - server_task() = default; - - server_task(server_task_type type) : type(type) {} - - int32_t n_tokens() const { - return tokens.size(); - } - - static slot_params params_from_json_cmpl( - const llama_context * ctx, - const common_params & params_base, - const json & data) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - slot_params params; - - // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - slot_params defaults; - defaults.sampling = params_base.sampling; - defaults.speculative = params_base.speculative; - defaults.n_keep = params_base.n_keep; - defaults.n_predict = params_base.n_predict; - defaults.antiprompt = params_base.antiprompt; - - // enabling this will output extra debug information in the HTTP responses from the server - params.verbose = params_base.verbosity > 9; - params.timings_per_token = json_value(data, "timings_per_token", false); - - params.stream = json_value(data, "stream", false); - auto stream_opt = json_value(data, "stream_options", json::object()); - params.include_usage = json_value(stream_opt, "include_usage", false); - params.cache_prompt = json_value(data, "cache_prompt", true); - params.return_tokens = json_value(data, "return_tokens", false); - params.return_progress = json_value(data, "return_progress", false); - params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); - params.n_indent = json_value(data, "n_indent", defaults.n_indent); - params.n_keep = json_value(data, "n_keep", defaults.n_keep); - params.n_discard = json_value(data, "n_discard", defaults.n_discard); - //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement - 
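// The json_value() calls throughout this parsing code read a request field
// and fall back to a server-side default when the field is absent or null.
// The real helper lives in the server common code; a rough, illustrative
// equivalent (not the project's actual definition) could look like:

#include <nlohmann/json.hpp>
#include <string>

template <typename T>
static T json_value_sketch(const nlohmann::ordered_json & body,
                           const std::string & key, const T & def) {
    const auto it = body.find(key);
    if (it == body.end() || it->is_null()) {
        return def; // field missing or null: keep the server-side default
    }
    try {
        return it->get<T>();
    } catch (const nlohmann::json::exception &) {
        return def; // wrong type: keep the server-side default
    }
}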
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); - params.response_fields = json_value(data, "response_fields", std::vector()); - - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); - - params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); - params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); - params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); - - params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); - params.speculative.n_min = std::max(params.speculative.n_min, 0); - params.speculative.n_max = std::max(params.speculative.n_max, 0); - - // Use OpenAI API logprobs only if n_probs wasn't provided - if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ - params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs); - } - - if (data.contains("lora")) { - if (data.at("lora").is_array()) { - params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); - } else { - throw std::runtime_error("Error: 'lora' must be an 
array of objects with 'id' and 'scale' fields"); - } - } else { - params.lora = params_base.lora_adapters; - } - - // TODO: add more sanity checks for the input parameters - - if (params.sampling.penalty_last_n < -1) { - throw std::runtime_error("Error: repeat_last_n must be >= -1"); - } - - if (params.sampling.dry_penalty_last_n < -1) { - throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); - } - - if (params.sampling.penalty_last_n == -1) { - // note: should be the slot's context and not the full context, but it's ok - params.sampling.penalty_last_n = llama_n_ctx(ctx); - } - - if (params.sampling.dry_penalty_last_n == -1) { - params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); - } - - if (params.sampling.dry_base < 1.0f) { - params.sampling.dry_base = defaults.sampling.dry_base; - } - - // sequence breakers for DRY - { - // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format - // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 - - if (data.contains("dry_sequence_breakers")) { - params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (params.sampling.dry_sequence_breakers.empty()) { - throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings"); - } - } - } - - // process "json_schema" and "grammar" - if (data.contains("json_schema") && !data.contains("grammar")) { - try { - auto schema = json_value(data, "json_schema", json::object()); - SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str()); - params.sampling.grammar = json_schema_to_grammar(schema); - SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str()); - } catch (const std::exception & e) { - throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); - } - } else { - params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); - SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str()); - params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy); - SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? 
"true" : "false"); - } - - { - auto it = data.find("chat_format"); - if (it != data.end()) { - params.oaicompat_chat_syntax.format = static_cast(it->get()); - SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format)); - } else { - params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; - } - common_reasoning_format reasoning_format = params_base.reasoning_format; - if (data.contains("reasoning_format")) { - reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get()); - } - params.oaicompat_chat_syntax.reasoning_format = reasoning_format; - params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); - params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false); - params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false); - } - - { - const auto preserved_tokens = data.find("preserved_tokens"); - if (preserved_tokens != data.end()) { - for (const auto & t : *preserved_tokens) { - auto ids = common_tokenize(vocab, t.get(), /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - SRV_DBG("Preserved token: %d\n", ids[0]); - params.sampling.preserved_tokens.insert(ids[0]); - } else { - // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens. - SRV_DBG("Not preserved because more than 1 token: %s\n", t.get().c_str()); - } - } - } - const auto grammar_triggers = data.find("grammar_triggers"); - if (grammar_triggers != data.end()) { - for (const auto & t : *grammar_triggers) { - server_grammar_trigger ct(t); - if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { - const auto & word = ct.value.value; - auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - auto token = ids[0]; - if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) { - throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); - } - SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); - common_grammar_trigger trigger; - trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; - trigger.value = word; - trigger.token = token; - params.sampling.grammar_triggers.push_back(std::move(trigger)); - } else { - SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); - params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); - } - } else { - if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) { - SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str()); - } else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) { - SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str()); - } else { - throw std::runtime_error("Unknown grammar trigger type"); - } - params.sampling.grammar_triggers.emplace_back(std::move(ct.value)); - } - } - } - if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) { - throw std::runtime_error("Error: no triggers set for lazy grammar!"); - } - } - - { - params.sampling.logit_bias.clear(); - - const auto & logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & el : *logit_bias) { - // 
TODO: we may want to throw errors here, in case "el" is incorrect - if (el.is_array() && el.size() == 2) { - float bias; - if (el[1].is_number()) { - bias = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - bias = -INFINITY; - } else { - continue; - } - - if (el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } else if (el[0].is_string()) { - auto toks = common_tokenize(vocab, el[0].get(), false); - for (auto tok : toks) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - } else if (logit_bias != data.end() && logit_bias->is_object()) { - const int n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & el : logit_bias->items()) { - float bias; - const auto & key = el.key(); - const auto & value = el.value(); - if (value.is_number()) { - bias = value.get(); - } else if (value.is_boolean() && !value.get()) { - bias = -INFINITY; - } else { - continue; - } - - char *end; - llama_token tok = strtol(key.c_str(), &end, 10); - if (*end == 0) { - if (tok >= 0 && tok < n_vocab) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } else { - auto toks = common_tokenize(vocab, key, false); - for (auto tok : toks) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - - params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos); - if (params.sampling.ignore_eos) { - params.sampling.logit_bias.insert( - params.sampling.logit_bias.end(), - defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); - } - } - - { - params.antiprompt.clear(); - - const auto & stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto & word : *stop) { - if (!word.empty()) { - params.antiprompt.push_back(word); - } - } - } - // set reverse prompt from cli args if not set in the request - if (params.antiprompt.empty()) { - params.antiprompt = defaults.antiprompt; - } - } - - { - const auto samplers = data.find("samplers"); - if (samplers != data.end()) { - if (samplers->is_array()) { - params.sampling.samplers = common_sampler_types_from_names(*samplers, false); - } else if (samplers->is_string()){ - params.sampling.samplers = common_sampler_types_from_chars(samplers->get()); - } - } else { - params.sampling.samplers = defaults.sampling.samplers; - } - } - - std::string model_name = params_base.model_alias.empty() ? 
DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; - params.oaicompat_model = json_value(data, "model", model_name); - - return params; - } - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - -struct result_timings { - int32_t cache_n = -1; - - int32_t prompt_n = -1; - double prompt_ms; - double prompt_per_token_ms; - double prompt_per_second; - - int32_t predicted_n = -1; - double predicted_ms; - double predicted_per_token_ms; - double predicted_per_second; - - // Optional speculative metrics - only included when > 0 - int32_t draft_n = 0; - int32_t draft_n_accepted = 0; - - json to_json() const { - json base = { - {"cache_n", cache_n}, - - {"prompt_n", prompt_n}, - {"prompt_ms", prompt_ms}, - {"prompt_per_token_ms", prompt_per_token_ms}, - {"prompt_per_second", prompt_per_second}, - - {"predicted_n", predicted_n}, - {"predicted_ms", predicted_ms}, - {"predicted_per_token_ms", predicted_per_token_ms}, - {"predicted_per_second", predicted_per_second}, - }; - - if (draft_n > 0) { - base["draft_n"] = draft_n; - base["draft_n_accepted"] = draft_n_accepted; - } - - return base; - } -}; - -struct result_prompt_progress { - int32_t total = 0; - int32_t cache = 0; - int32_t processed = 0; - int64_t time_ms = 0; - - json to_json() const { - return json { - {"total", total}, - {"cache", cache}, - {"processed", processed}, - {"time_ms", time_ms}, - }; - } -}; - -struct server_task_result { - int id = -1; - int id_slot = -1; - virtual bool is_error() { - // only used by server_task_result_error - return false; - } - virtual bool is_stop() { - // only used by server_task_result_cmpl_* - return true; - } - virtual int get_index() { - return -1; - } - virtual json to_json() = 0; - virtual ~server_task_result() = default; -}; - -// using shared_ptr for polymorphism of server_task_result -using server_task_result_ptr = std::unique_ptr; - -static inline std::string stop_type_to_str(stop_type type) { - switch (type) { - case STOP_TYPE_EOS: return "eos"; - case STOP_TYPE_WORD: return "word"; - case STOP_TYPE_LIMIT: return "limit"; - default: return "none"; - } -} - -struct completion_token_output { - llama_token tok; - float prob; - std::string text_to_send; - struct prob_info { - llama_token tok; - std::string txt; - float prob; - }; - std::vector probs; - - json to_json(bool post_sampling_probs) const { - json probs_for_token = json::array(); - for (const auto & p : probs) { - std::string txt(p.txt); - txt.resize(validate_utf8(txt)); - probs_for_token.push_back(json { - {"id", p.tok}, - {"token", txt}, - {"bytes", str_to_bytes(p.txt)}, - { - post_sampling_probs ? "prob" : "logprob", - post_sampling_probs ? p.prob : logarithm(p.prob) - }, - }); - } - return probs_for_token; - } - - static json probs_vector_to_json(const std::vector & probs, bool post_sampling_probs) { - json out = json::array(); - for (const auto & p : probs) { - std::string txt(p.text_to_send); - txt.resize(validate_utf8(txt)); - out.push_back(json { - {"id", p.tok}, - {"token", txt}, - {"bytes", str_to_bytes(p.text_to_send)}, - { - post_sampling_probs ? "prob" : "logprob", - post_sampling_probs ? p.prob : logarithm(p.prob) - }, - { - post_sampling_probs ? 
"top_probs" : "top_logprobs", - p.to_json(post_sampling_probs) - }, - }); - } - return out; - } - - static float logarithm(float x) { - // nlohmann::json converts -inf to null, so we need to prevent that - return x == 0.0f ? std::numeric_limits::lowest() : std::log(x); - } - - static std::vector str_to_bytes(const std::string & str) { - std::vector bytes; - for (unsigned char c : str) { - bytes.push_back(c); - } - return bytes; - } -}; - -struct server_task_result_cmpl_final : server_task_result { - int index = 0; - - std::string content; - llama_tokens tokens; - - bool stream; - bool include_usage; - result_timings timings; - std::string prompt; - - bool truncated; - int32_t n_decoded; - int32_t n_prompt_tokens; - int32_t n_tokens_cached; - bool has_new_line; - std::string stopping_word; - stop_type stop = STOP_TYPE_NONE; - - bool post_sampling_probs; - std::vector probs_output; - std::vector response_fields; - - slot_params generation_params; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - common_chat_msg oaicompat_msg; - - std::vector oaicompat_msg_diffs; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return true; // in stream mode, final responses are considered stop - } - - virtual json to_json() override { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: - return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: - return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: - return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); - default: - GGML_ASSERT(false && "Invalid oaicompat_type"); - } - } - - json to_json_non_oaicompat() { - json res = json { - {"index", index}, - {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk - {"tokens", stream ? llama_tokens {} : tokens}, - {"id_slot", id_slot}, - {"stop", true}, - {"model", oaicompat_model}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - {"generation_settings", generation_params.to_json()}, - {"prompt", prompt}, - {"has_new_line", has_new_line}, - {"truncated", truncated}, - {"stop_type", stop_type_to_str(stop)}, - {"stopping_word", stopping_word}, - {"tokens_cached", n_tokens_cached}, - {"timings", timings.to_json()}, - }; - if (!stream && !probs_output.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); - } - return response_fields.empty() ? res : json_get_nested_values(response_fields, res); - } - - json to_json_oaicompat() { - std::time_t t = std::time(0); - json logprobs = json(nullptr); // OAI default to null - if (!stream && probs_output.size() > 0) { - logprobs = json{ - {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, - }; - } - json finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - json res = json { - {"choices", json::array({ - json{ - {"text", stream ? 
"" : content}, // in stream mode, content is already in last partial chunk - {"index", index}, - {"logprobs", logprobs}, - {"finish_reason", finish_reason}, - } - })}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "text_completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat() { - std::string finish_reason = "length"; - common_chat_msg msg; - if (!oaicompat_msg.empty()) { - msg = oaicompat_msg; - } else { - msg.role = "assistant"; - msg.content = content; - } - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls"; - } - - json choice { - {"finish_reason", finish_reason}, - {"index", 0}, - {"message", msg.to_json_oaicompat()}, - }; - - if (!stream && probs_output.size() > 0) { - choice["logprobs"] = json{ - {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, - }; - } - - std::time_t t = std::time(0); - - json res = json { - {"choices", json::array({choice})}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat_stream() { - std::time_t t = std::time(0); - std::string finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = oaicompat_msg.tool_calls.empty() ? 
"stop" : "tool_calls"; - } - - json deltas = json::array(); - for (const auto & diff : oaicompat_msg_diffs) { - deltas.push_back({ - {"choices", json::array({ - json { - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", common_chat_msg_diff_to_json_oaicompat(diff)}, - }, - })}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - }); - } - - deltas.push_back({ - {"choices", json::array({ - json { - {"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}, - }, - })}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - }); - - if (include_usage) { - // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage - // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices - deltas.push_back({ - {"choices", json::array()}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}, - }); - } - - if (timings.prompt_n >= 0) { - deltas.back().push_back({"timings", timings.to_json()}); - } - - // extra fields for debugging purposes - if (verbose && !deltas.empty()) { - deltas.front()["__verbose"] = to_json_non_oaicompat(); - } - - return deltas; - } -}; - -struct server_task_result_cmpl_partial : server_task_result { - int index = 0; - - std::string content; - llama_tokens tokens; - - int32_t n_decoded; - int32_t n_prompt_tokens; - - bool post_sampling_probs; - bool is_progress = false; - completion_token_output prob_output; - result_timings timings; - result_prompt_progress progress; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - std::vector oaicompat_msg_diffs; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return false; // in stream mode, partial responses are not considered stop - } - - virtual json to_json() override { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: - return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: - return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: - return to_json_oaicompat_chat(); - default: - GGML_ASSERT(false && "Invalid oaicompat_type"); - } - } - - json to_json_non_oaicompat() { - // non-OAI-compat JSON - json res = json { - {"index", index}, - {"content", content}, - {"tokens", tokens}, - {"stop", false}, - {"id_slot", id_slot}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - }; - // populate the timings object when needed (usually for the last response or with timings_per_token enabled) - if (timings.prompt_n > 0) { - res.push_back({"timings", timings.to_json()}); - } - if (is_progress) { - res.push_back({"prompt_progress", progress.to_json()}); - } - if (!prob_output.probs.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); - } - return res; - } - - json to_json_oaicompat() { - std::time_t t = std::time(0); - json logprobs = json(nullptr); // OAI default to null - if (prob_output.probs.size() > 
0) { - logprobs = json{ - {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, - }; - } - json res = json { - {"choices", json::array({ - json{ - {"text", content}, - {"index", index}, - {"logprobs", logprobs}, - {"finish_reason", nullptr}, - } - })}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "text_completion"}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - if (is_progress) { - res.push_back({"prompt_progress", progress.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat() { - bool first = n_decoded == 1; - std::time_t t = std::time(0); - json choices; - - std::vector deltas; - auto add_delta = [&](const json & delta) { - deltas.push_back({ - {"choices", json::array({ - json { - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", delta}, - }, - })}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - }); - }; - // We have to send an initial update to conform to openai behavior - if (first || is_progress) { - add_delta({ - {"role", "assistant"}, - {"content", nullptr}, - }); - } - - for (const auto & diff : oaicompat_msg_diffs) { - add_delta(common_chat_msg_diff_to_json_oaicompat(diff)); - } - - if (!deltas.empty()) { - auto & last_json = deltas[deltas.size() - 1]; - GGML_ASSERT(last_json.at("choices").size() >= 1); - - if (prob_output.probs.size() > 0) { - last_json.at("choices").at(0)["logprobs"] = json { - {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, - }; - } - - if (timings.prompt_n >= 0) { - last_json.push_back({"timings", timings.to_json()}); - } - if (is_progress) { - last_json.push_back({"prompt_progress", progress.to_json()}); - } - } - - return deltas; - } -}; - -struct server_task_result_embd : server_task_result { - int index = 0; - std::vector> embedding; - - int32_t n_tokens; - - // OAI-compat fields - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? 
to_json_oaicompat() - : to_json_non_oaicompat(); - } - - json to_json_non_oaicompat() { - return json { - {"index", index}, - {"embedding", embedding}, - }; - } - - json to_json_oaicompat() { - return json { - {"index", index}, - {"embedding", embedding[0]}, - {"tokens_evaluated", n_tokens}, - }; - } -}; - -struct server_task_result_rerank : server_task_result { - int index = 0; - float score = -1e6; - - int32_t n_tokens; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return json { - {"index", index}, - {"score", score}, - {"tokens_evaluated", n_tokens}, - }; - } -}; - -// this function maybe used outside of server_task_result_error -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - case ERROR_TYPE_EXCEED_CONTEXT_SIZE: - type_str = "exceed_context_size_error"; - code = 400; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} - -struct server_task_result_error : server_task_result { - int index = 0; - error_type err_type = ERROR_TYPE_SERVER; - std::string err_msg; - - // for ERROR_TYPE_EXCEED_CONTEXT_SIZE - int32_t n_prompt_tokens = 0; - int32_t n_ctx = 0; - - virtual bool is_error() override { - return true; - } - - virtual json to_json() override { - json res = format_error_response(err_msg, err_type); - if (err_type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { - res["n_prompt_tokens"] = n_prompt_tokens; - res["n_ctx"] = n_ctx; - } - return res; - } -}; - -struct server_task_result_metrics : server_task_result { - int n_idle_slots; - int n_processing_slots; - int n_tasks_deferred; - int64_t t_start; - - // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_tokens_max = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - - // while we can also use std::vector this requires copying the slot object which can be quite messy - // therefore, we use json to temporarily store the slot.to_json() result - json slots_data = json::array(); - - virtual json to_json() override { - return json { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", n_tasks_deferred }, - { "t_start", t_start }, - - { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, - { "t_tokens_generation_total", t_tokens_generation_total }, - { "n_tokens_predicted_total", n_tokens_predicted_total }, - { "t_prompt_processing_total", t_prompt_processing_total }, - - { "n_tokens_max", n_tokens_max }, 
- - { "n_prompt_tokens_processed", n_prompt_tokens_processed }, - { "t_prompt_processing", t_prompt_processing }, - { "n_tokens_predicted", n_tokens_predicted }, - { "t_tokens_generation", t_tokens_generation }, - - { "n_decode_total", n_decode_total }, - { "n_busy_slots_total", n_busy_slots_total }, - - { "slots", slots_data }, - }; - } -}; - -struct server_task_result_slot_save_load : server_task_result { - std::string filename; - bool is_save; // true = save, false = load - - size_t n_tokens; - size_t n_bytes; - double t_ms; - - virtual json to_json() override { - if (is_save) { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", n_tokens }, - { "n_written", n_bytes }, - { "timings", { - { "save_ms", t_ms } - }}, - }; - } - - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", n_tokens }, - { "n_read", n_bytes }, - { "timings", { - { "restore_ms", t_ms } - }}, - }; - } -}; - -struct server_task_result_slot_erase : server_task_result { - size_t n_erased; - - virtual json to_json() override { - return json { - { "id_slot", id_slot }, - { "n_erased", n_erased }, - }; - } -}; - -struct server_task_result_apply_lora : server_task_result { - virtual json to_json() override { - return json {{ "success", true }}; - } -}; - -struct server_prompt_checkpoint { - llama_pos pos_min; - llama_pos pos_max; - - std::vector data; - - size_t size() const { - return data.size(); - } -}; - -struct server_prompt { - server_tokens tokens; - - std::vector data; - - std::list checkpoints; - - size_t size() const { - size_t res = data.size(); - - for (const auto & checkpoint : checkpoints) { - res += checkpoint.size(); - } - - return res; - } - - int n_tokens() const { - return tokens.size(); - } -}; - -struct server_prompt_cache { - server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) { - this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 
0 : limit_size_mib); - this->limit_tokens = limit_tokens; - } - - std::list states; - - // in bytes, 0 = no limit - size_t limit_size = 0; - - // in tokens, 0 = no limit - size_t limit_tokens = 0; - - size_t size() const { - size_t res = 0; - - for (const auto & state : states) { - res += state.size(); - } - - return res; - } - - size_t n_tokens() const { - size_t res = 0; - - for (const auto & state : states) { - res += state.n_tokens(); - } - - return res; - } - - server_prompt * alloc(const server_prompt & prompt, size_t state_size) { - // first check if the current state is contained fully in the cache - for (auto it = states.begin(); it != states.end(); ++it) { - const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens); - - if (cur_lcp_len == (int) prompt.tokens.size()) { - SRV_WRN("%s", " - prompt is already in the cache, skipping\n"); - return nullptr; - } - } - - // next, remove any cached prompts that are fully contained in the current prompt - for (auto it = states.begin(); it != states.end();) { - const int len = it->tokens.get_common_prefix(prompt.tokens); - - if (len == (int) it->tokens.size()) { - SRV_WRN(" - removing obsolete cached prompt with length %d\n", len); - - it = states.erase(it); - } else { - ++it; - } - } - - std::vector state_data; - - // check if we can allocate enough memory for the new state - try { - state_data.resize(state_size); - } catch (const std::bad_alloc & e) { - SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what()); - - limit_size = std::max(1, 0.4*size()); - - SRV_WRN(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0)); - - update(); - - return nullptr; - } - - // TODO: for some reason we can't copy server_tokens, so we have to do this workaround - auto & cur = states.emplace_back(); - cur = { - /*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false), - /*.data =*/ std::move(state_data), - /*.checkpoints =*/ prompt.checkpoints, - }; - - return &cur; - } - - bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) { - const int lcp_best = prompt.tokens.get_common_prefix(tokens_new); - - float f_keep_best = float(lcp_best) / prompt.tokens.size(); - float sim_best = float(lcp_best) / tokens_new.size(); - - SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); - - auto it_best = states.end(); - - // find the most similar cached prompt, that would also preserve the most context - for (auto it = states.begin(); it != states.end(); ++it) { - const int lcp_cur = it->tokens.get_common_prefix(tokens_new); - - const float f_keep_cur = float(lcp_cur) / it->tokens.size(); - const float sim_cur = float(lcp_cur) / tokens_new.size(); - - // don't trash large prompts - if (f_keep_cur < 0.25f) { - continue; - } - - if (f_keep_best < f_keep_cur && sim_best < sim_cur) { - f_keep_best = f_keep_cur; - sim_best = sim_cur; - - it_best = it; - } - } - - if (it_best != states.end()) { - SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); - - const size_t size = it_best->data.size(); - const size_t n = llama_state_seq_set_data_ext(ctx, it_best->data.data(), size, id_slot, 0); - if (n != size) { - SRV_WRN("failed to restore state with size %zu\n", size); - - return false; - } - - it_best->data.clear(); - it_best->data.shrink_to_fit(); - - prompt = std::move(*it_best); - - states.erase(it_best); - } - - return true; - } - - void update() { - if (limit_size > 0) { - // always 
keep at least one state, regardless of the limits - while (states.size() > 1 && size() > limit_size) { - if (states.empty()) { - break; - } - - SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0)); - - states.pop_front(); - } - } - - // average size per token - const float size_per_token = std::max(1.0f, float(size()) / (std::max(1, n_tokens()))); - - // dynamically increase the token limit if it can fit in the memory limit - const size_t limit_tokens_cur = limit_size > 0 ? std::max(limit_tokens, limit_size/size_per_token) : limit_tokens; - - if (limit_tokens > 0) { - while (states.size() > 1 && n_tokens() > limit_tokens_cur) { - if (states.empty()) { - break; - } - - SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n", - limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0)); - - states.pop_front(); - } - } - - SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n", - states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur); - - for (const auto & state : states) { - SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", - (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); - } - } -}; - -struct server_slot { - int id; - - llama_batch batch_spec = {}; - - // TODO: change to unique_ptrs for consistency: - llama_context * ctx = nullptr; - llama_context * ctx_dft = nullptr; - - // multimodal - mtmd_context * mctx = nullptr; - - common_speculative * spec = nullptr; - - std::unique_ptr task; - std::unique_ptr task_prev; // used for debugging - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_keep = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - - int32_t n_prompt_tokens_cache = 0; - int32_t n_prompt_tokens_processed = 0; - - size_t last_nl_pos = 0; - - std::string generated_text; - llama_tokens generated_tokens; - - common_chat_msg chat_msg; - - std::vector generated_token_probs; - - bool has_next_token = true; - bool has_new_line = false; - bool truncated = false; - - stop_type stop; - - std::string stopping_word; - - // state - slot_state state = SLOT_STATE_IDLE; - - server_prompt prompt; - - void prompt_save(server_prompt_cache & prompt_cache) const { - assert(prompt.data.size() == 0); - - const size_t cur_size = llama_state_seq_get_size_ext(ctx, id, 0); - - SRV_WRN(" - saving prompt with length %d, total state size = %.3f MiB\n", - (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0)); - - auto * cur = prompt_cache.alloc(prompt, cur_size); - if (cur == nullptr) { - return; - } - - llama_state_seq_get_data_ext(ctx, cur->data.data(), cur_size, id, 0); - } - - void prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) { - bool res = prompt_cache.load(prompt, tokens, ctx, id); - if (!res) { - SLT_WRN(*this, "%s", "failed to load prompt from cache\n"); - - llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1); - prompt.tokens.clear(); - } - } - - std::vector lora; - int32_t alora_invocation_start = -1; - - // sampling - json json_schema; - - struct common_sampler * smpl = nullptr; - - llama_token sampled; - - common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - std::vector generated_tool_call_ids; - - // stats - 
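// The server_prompt_cache::update() logic above enforces two soft limits on
// the cache, in bytes and in tokens, evicting oldest-first but always keeping
// at least one entry (it also scales the token limit by the estimated bytes
// per token, which this condensed sketch omits). Stand-in types only, not the
// project's actual server_prompt:

#include <cstddef>
#include <list>

struct cached_state_sketch {
    size_t bytes;
    size_t tokens;
};

static void evict_sketch(std::list<cached_state_sketch> & states,
                         size_t limit_bytes, size_t limit_tokens) {
    auto total_bytes = [&] {
        size_t sum = 0;
        for (const auto & s : states) { sum += s.bytes; }
        return sum;
    };
    auto total_tokens = [&] {
        size_t sum = 0;
        for (const auto & s : states) { sum += s.tokens; }
        return sum;
    };
    // oldest entries live at the front, mirroring states.pop_front() above
    while (states.size() > 1 && limit_bytes > 0 && total_bytes() > limit_bytes) {
        states.pop_front();
    }
    while (states.size() > 1 && limit_tokens > 0 && total_tokens() > limit_tokens) {
        states.pop_front();
    }
}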
size_t n_sent_text = 0; // number of sent text character - - int64_t t_start_process_prompt; - int64_t t_start_generation; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - std::function callback_on_release; - - // Speculative decoding stats - int32_t n_draft_total = 0; // Total draft tokens generated - int32_t n_draft_accepted = 0; // Draft tokens actually accepted - - void reset() { - SLT_DBG(*this, "%s", "\n"); - - n_prompt_tokens_cache = 0; - - last_nl_pos = 0; - generated_text = ""; - has_new_line = false; - truncated = false; - stop = STOP_TYPE_NONE; - stopping_word = ""; - n_sent_text = 0; - chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - - generated_tokens.clear(); - generated_token_probs.clear(); - chat_msg = {}; - json_schema = json(); - generated_tool_call_ids.clear(); - - // clear speculative decoding stats - n_draft_total = 0; - n_draft_accepted = 0; - - task.reset(); - task_prev.reset(); - - // clear alora start - alora_invocation_start = -1; - } - - bool need_embd() const { - GGML_ASSERT(task); - - return server_task_type_need_embd(task->type); - } - - bool need_logits() const { - GGML_ASSERT(task); - - return server_task_type_need_logits(task->type); - } - - // if the context does not have a memory module then all embeddings have to be computed within a single ubatch - // also we cannot split if the pooling would require any past tokens - bool can_split() const { - return - !need_embd() || - (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST); - } - - bool can_batch_with(server_slot & other_slot) const { - GGML_ASSERT(task); - - return task->type == other_slot.task->type && are_lora_equal(lora, other_slot.lora); - } - - bool has_budget(const common_params & global_params) { - GGML_ASSERT(task); - - if (task->params.n_predict == -1 && global_params.n_predict == -1) { - return true; // limitless - } - - n_remaining = -1; - - if (task->params.n_predict != -1) { - n_remaining = task->params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; - } - - return n_remaining > 0; // no budget - } - - bool is_processing() const { - return state != SLOT_STATE_IDLE; - } - - bool can_speculate() const { - return ctx_dft; - } - - void add_token(const completion_token_output & token) { - if (!is_processing()) { - SLT_WRN(*this, "%s", "slot is not processing\n"); - return; - } - generated_token_probs.push_back(token); - } - - void release() { - if (is_processing()) { - GGML_ASSERT(task); - - SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated); - - t_last_used = ggml_time_us(); - t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; - state = SLOT_STATE_IDLE; - - task_prev = std::move(task); - task.reset(); - - callback_on_release(id); - } - } - - result_timings get_timings() const { - result_timings timings; - timings.cache_n = n_prompt_tokens_cache; - - timings.prompt_n = n_prompt_tokens_processed; - timings.prompt_ms = t_prompt_processing; - timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; - timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - - timings.predicted_n = n_decoded; - timings.predicted_ms = t_token_generation; - timings.predicted_per_token_ms = t_token_generation / n_decoded; - timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; - - // Add speculative metrics - if (n_draft_total > 0) { - timings.draft_n = n_draft_total; - 
timings.draft_n_accepted = n_draft_accepted; - } - - return timings; - } - - const common_chat_msg & update_chat_msg(std::vector & diffs) { - GGML_ASSERT(task); - - auto previous_msg = chat_msg; - SRV_DBG("Parsing chat message: %s\n", generated_text.c_str()); - auto new_msg = common_chat_parse( - generated_text, - /* is_partial= */ stop != STOP_TYPE_EOS, - task->params.oaicompat_chat_syntax); - if (!new_msg.empty()) { - new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id); - chat_msg = new_msg; - diffs = common_chat_msg_diff::compute_diffs(previous_msg, new_msg.empty() ? previous_msg : new_msg); - } - return chat_msg; - } - - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) { - GGML_ASSERT(task); - - size_t stop_pos = std::string::npos; - - for (const std::string & word : task->params.antiprompt) { - size_t pos; - - if (is_full_stop) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - - pos = text.find(word, from_pos); - } else { - // otherwise, partial stop - pos = string_find_partial_stop(text, word); - } - - if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (is_full_stop) { - stop = STOP_TYPE_WORD; - stopping_word = word; - has_next_token = false; - } - stop_pos = pos; - } - } - - return stop_pos; - } - - void print_timings() const { - const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; - const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - - const double t_gen = t_token_generation / n_decoded; - const double n_gen_second = 1e3 / t_token_generation * n_decoded; - - SLT_INF(*this, - "\n" - "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, - t_token_generation, n_decoded, t_gen, n_gen_second, - t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); - - if (n_draft_total > 0) { - const float draft_ratio = (float) n_draft_accepted / n_draft_total; - SLT_INF(*this, - "\n" - "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", - draft_ratio, n_draft_accepted, n_draft_total - ); - } - } - - json to_json(bool only_metrics = false) const { - json res; - - res = { - {"id", id}, - {"n_ctx", n_ctx}, - {"speculative", can_speculate()}, - {"is_processing", is_processing()}, - }; - - const auto & ptask = task ? 
task : task_prev; - - if (ptask) { - res["id_task"] = ptask->id; - res["params"] = ptask->params.to_json(only_metrics); - res["next_token"] = { - { - {"has_next_token", has_next_token}, - {"has_new_line", has_new_line}, - {"n_remain", n_remaining}, - {"n_decoded", n_decoded}, - } - }; - - if (!only_metrics) { - res["prompt"] = ptask->tokens.detokenize(ctx, true); - res["generated"] = generated_text; - } - } - - return res; - } -}; - -struct server_metrics { - int64_t t_start = 0; - - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_tokens_max = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - - void init() { - t_start = ggml_time_us(); - } - - void on_prompt_eval(const server_slot & slot) { - n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed; - n_prompt_tokens_processed += slot.n_prompt_tokens_processed; - t_prompt_processing += slot.t_prompt_processing; - t_prompt_processing_total += slot.t_prompt_processing; - - n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens()); - } - - void on_prediction(const server_slot & slot) { - n_tokens_predicted_total += slot.n_decoded; - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; - t_tokens_generation_total += slot.t_token_generation; - } - - void on_decoded(const std::vector & slots) { - n_decode_total++; - for (const auto & slot : slots) { - if (slot.is_processing()) { - n_busy_slots_total++; - } - n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens()); - } - } - - void reset_bucket() { - n_prompt_tokens_processed = 0; - t_prompt_processing = 0; - n_tokens_predicted = 0; - t_tokens_generation = 0; - } -}; - -struct server_queue { - int id = 0; - bool running; - - // queues - std::deque queue_tasks; - std::deque queue_tasks_deferred; - - std::mutex mutex_tasks; - std::condition_variable condition_tasks; - - // callback functions - std::function callback_new_task; - std::function callback_update_slots; - - // Add a new task to the end of the queue - int post(server_task && task, bool front = false) { - std::unique_lock lock(mutex_tasks); - GGML_ASSERT(task.id != -1); - // if this is cancel task make sure to clean up pending tasks - if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - const int task_id = task.id; - QUE_DBG("new task, id = %d, front = %d\n", task_id, front); - if (front) { - queue_tasks.push_front(std::move(task)); - } else { - queue_tasks.push_back(std::move(task)); - } - condition_tasks.notify_one(); - return task_id; - } - - // multi-task version of post() - int post(std::vector && tasks, bool front = false) { - std::unique_lock lock(mutex_tasks); - for (auto & task : tasks) { - if (task.id == -1) { - task.id = id++; - } - // if this is cancel task make sure to clean up pending tasks - if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); - if (front) { - queue_tasks.push_front(std::move(task)); - } else { - queue_tasks.push_back(std::move(task)); - } - } - condition_tasks.notify_one(); - return 0; - } - - // Add a new task, but defer until one slot is available - void 
defer(server_task && task) { - std::unique_lock lock(mutex_tasks); - QUE_DBG("defer task, id = %d\n", task.id); - queue_tasks_deferred.push_back(std::move(task)); - condition_tasks.notify_one(); - } - - // Get the next id for creating a new task - int get_new_id() { - std::unique_lock lock(mutex_tasks); - int new_id = id++; - return new_id; - } - - // Register function to process a new task - void on_new_task(std::function callback) { - callback_new_task = std::move(callback); - } - - // Register the function to be called when all slots data is ready to be processed - void on_update_slots(std::function callback) { - callback_update_slots = std::move(callback); - } - - // Call when the state of one slot is changed, it will move one task from deferred to main queue - void pop_deferred_task() { - std::unique_lock lock(mutex_tasks); - if (!queue_tasks_deferred.empty()) { - queue_tasks.emplace_front(std::move(queue_tasks_deferred.front())); - queue_tasks_deferred.pop_front(); - } - condition_tasks.notify_one(); - } - - // end the start_loop routine - void terminate() { - std::unique_lock lock(mutex_tasks); - running = false; - condition_tasks.notify_all(); - } - - /** - * Main loop consists of these steps: - * - Wait until a new task arrives - * - Process the task (i.e. maybe copy data into slot) - * - Check if multitask is finished - * - Update all slots - */ - void start_loop() { - running = true; - - while (true) { - QUE_DBG("%s", "processing new tasks\n"); - - while (true) { - std::unique_lock lock(mutex_tasks); - if (!running) { - QUE_DBG("%s", "terminate\n"); - return; - } - if (queue_tasks.empty()) { - lock.unlock(); - break; - } - server_task task = std::move(queue_tasks.front()); - queue_tasks.pop_front(); - lock.unlock(); - - QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(std::move(task)); - } - - // all tasks in the current loop is processed, slots data is now ready - QUE_DBG("%s", "update slots\n"); - - callback_update_slots(); - - QUE_DBG("%s", "waiting for new tasks\n"); - { - std::unique_lock lock(mutex_tasks); - if (!running) { - QUE_DBG("%s", "terminate\n"); - return; - } - if (queue_tasks.empty()) { - condition_tasks.wait(lock, [&]{ - return (!queue_tasks.empty() || !running); - }); - } - } - } - } - -private: - void cleanup_pending_task(int id_target) { - // no need lock because this is called exclusively by post() - auto rm_func = [id_target](const server_task & task) { - return task.id == id_target; - }; - queue_tasks.erase( - std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), - queue_tasks.end()); - queue_tasks_deferred.erase( - std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func), - queue_tasks_deferred.end()); - } -}; - -struct server_response { - bool running = true; - - // for keeping track of all tasks waiting for the result - std::unordered_set waiting_task_ids; - - // the main result queue (using ptr for polymorphism) - std::vector queue_results; - - std::mutex mutex_results; - std::condition_variable condition_results; - - // add the id_task to the list of tasks waiting for response - void add_waiting_task_id(int id_task) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); - - std::unique_lock lock(mutex_results); - waiting_task_ids.insert(id_task); - } - - void add_waiting_tasks(const std::vector & tasks) { - std::unique_lock lock(mutex_results); - - for (const auto & task : tasks) { - SRV_DBG("add task %d to waiting list. 
current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); - waiting_task_ids.insert(task.id); - } - } - - // when the request is finished, we can remove task associated with it - void remove_waiting_task_id(int id_task) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - - std::unique_lock lock(mutex_results); - waiting_task_ids.erase(id_task); - // make sure to clean up all pending results - queue_results.erase( - std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { - return res->id == id_task; - }), - queue_results.end()); - } - - void remove_waiting_task_ids(const std::unordered_set & id_tasks) { - std::unique_lock lock(mutex_results); - - for (const auto & id_task : id_tasks) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - waiting_task_ids.erase(id_task); - } - } - - // This function blocks the thread until there is a response for one of the id_tasks - server_task_result_ptr recv(const std::unordered_set & id_tasks) { - while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - if (!running) { - SRV_DBG("%s : queue result stop\n", __func__); - std::terminate(); // we cannot return here since the caller is HTTP code - } - return !queue_results.empty(); - }); - - for (size_t i = 0; i < queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // should never reach here - } - - // same as recv(), but have timeout in seconds - // if timeout is reached, nullptr is returned - server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { - while (true) { - std::unique_lock lock(mutex_results); - - for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); - return res; - } - } - - std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); - if (!running) { - SRV_DBG("%s : queue result stop\n", __func__); - std::terminate(); // we cannot return here since the caller is HTTP code - } - if (cr_res == std::cv_status::timeout) { - return nullptr; - } - } - - // should never reach here - } - - // single-task version of recv() - server_task_result_ptr recv(int id_task) { - std::unordered_set id_tasks = {id_task}; - return recv(id_tasks); - } - - // Send a new result to a waiting id_task - void send(server_task_result_ptr && result) { - SRV_DBG("sending result for task id = %d\n", result->id); - - std::unique_lock lock(mutex_results); - for (const auto & id_task : waiting_task_ids) { - if (result->id == id_task) { - SRV_DBG("task id = %d pushed to result queue\n", result->id); - - queue_results.emplace_back(std::move(result)); - condition_results.notify_all(); - return; - } - } - } - - // terminate the waiting loop - void terminate() { - running = false; - condition_results.notify_all(); - } -}; - -struct server_context { - common_params params_base; - - // note: keep these alive - they determine the lifetime of the model, context, etc. 
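    // annotation (not part of the upstream patch): common_init_result owns the
    // model and context through smart pointers, while the raw pointers cached
    // below are plain observers. A minimal sketch of the pattern, assuming
    // llama.cpp's common_init_from_params():
    //
    //     common_init_result init = common_init_from_params(params);
    //     llama_model   * model = init.model.get();   // observer, owned by init
    //     llama_context * ctx   = init.context.get(); // observer, owned by init
    //     // destroying init frees both; an unneeded context can be dropped
    //     // early with init.context.reset(), as load_model() does for the
    //     // draft model below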
- common_init_result llama_init; - common_init_result llama_init_dft; - - llama_model * model = nullptr; - llama_context * ctx = nullptr; - - // multimodal - mtmd_context * mctx = nullptr; - - const llama_vocab * vocab = nullptr; - bool vocab_dft_compatible = true; - - llama_model * model_dft = nullptr; - - llama_context_params cparams_dft; - - llama_batch batch {}; - - bool clean_kv_cache = true; - bool add_bos_token = true; - - int32_t n_ctx; // total context for all clients / slots - - // slots / clients - std::vector slots; - - int slots_debug = 0; - - server_queue queue_tasks; - server_response queue_results; - - std::unique_ptr prompt_cache; - - server_metrics metrics; - - // Necessary similarity of prompt for slot selection - float slot_prompt_similarity = 0.0f; - - common_chat_templates_ptr chat_templates; - oaicompat_parser_options oai_parser_opt; - - ~server_context() { - mtmd_free(mctx); - - // Clear any sampling context - for (server_slot & slot : slots) { - common_sampler_free(slot.smpl); - slot.smpl = nullptr; - - llama_free(slot.ctx_dft); - slot.ctx_dft = nullptr; - - common_speculative_free(slot.spec); - slot.spec = nullptr; - - llama_batch_free(slot.batch_spec); - } - - llama_batch_free(batch); - } - - bool load_model(const common_params & params) { - SRV_INF("loading model '%s'\n", params.model.path.c_str()); - - params_base = params; - - llama_init = common_init_from_params(params_base); - - model = llama_init.model.get(); - ctx = llama_init.context.get(); - - if (model == nullptr) { - SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); - return false; - } - - vocab = llama_model_get_vocab(model); - - n_ctx = llama_n_ctx(ctx); - - add_bos_token = llama_vocab_get_add_bos(vocab); - - if (params_base.has_speculative()) { - SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str()); - - auto params_dft = params_base; - - params_dft.devices = params_base.speculative.devices; - params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx; - params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; - params_dft.n_parallel = 1; - params_dft.cache_type_k = params_base.speculative.cache_type_k; - params_dft.cache_type_v = params_base.speculative.cache_type_v; - - params_dft.cpuparams.n_threads = params_base.speculative.cpuparams.n_threads; - params_dft.cpuparams_batch.n_threads = params_base.speculative.cpuparams_batch.n_threads; - params_dft.tensor_buft_overrides = params_base.speculative.tensor_buft_overrides; - - llama_init_dft = common_init_from_params(params_dft); - - model_dft = llama_init_dft.model.get(); - - if (model_dft == nullptr) { - SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); - return false; - } - - vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); - if (!vocab_dft_compatible) { - SRV_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); - } - - const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); - - cparams_dft = common_context_params_to_llama(params_dft); - cparams_dft.n_batch = n_ctx_dft; - - // the context is not needed - we will create one for each slot - llama_init_dft.context.reset(); - } - - chat_templates = common_chat_templates_init(model, params_base.chat_template); - try { - common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs); - } catch (const std::exception & e) { - SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); - SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); - chat_templates = common_chat_templates_init(model, "chatml"); - } - - std::string & mmproj_path = params_base.mmproj.path; - if (!mmproj_path.empty()) { - mtmd_context_params mparams = mtmd_context_params_default(); - mparams.use_gpu = params_base.mmproj_use_gpu; - mparams.print_timings = false; - mparams.n_threads = params_base.cpuparams.n_threads; - mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO; - mparams.flash_attn_type = params_base.flash_attn_type; - mparams.image_min_tokens = params_base.image_min_tokens; - mparams.image_max_tokens = params_base.image_max_tokens; - mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); - if (mctx == nullptr) { - SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); - return false; - } - SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str()); - - if (params_base.ctx_shift) { - params_base.ctx_shift = false; - SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); - } - - if (params_base.n_cache_reuse) { - params_base.n_cache_reuse = 0; - SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); - } - - if (params_base.has_speculative()) { - SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal"); - return false; - } - } - - if (!llama_memory_can_shift(llama_get_memory(ctx))) { - if (params_base.ctx_shift) { - params_base.ctx_shift = false; - SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled"); - } - - if (params_base.n_cache_reuse) { - params_base.n_cache_reuse = 0; - SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled"); - } - } - - return true; - } - - void init() { - SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); - - const int n_ctx_train = llama_model_n_ctx_train(model); - - int n_ctx_slot = llama_n_ctx_seq(ctx); - if (n_ctx_slot > n_ctx_train) { - SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); - n_ctx_slot = n_ctx_train; - } - - for (int i = 0; i < params_base.n_parallel; i++) { - server_slot slot; - - slot.id = i; - slot.ctx = ctx; - slot.n_ctx = n_ctx_slot; - slot.mctx = mctx; - slot.prompt.tokens.has_mtmd = mctx != nullptr; - - if (model_dft) { - slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); - - // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] - slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); - if (slot.ctx_dft == nullptr) { - SRV_ERR("%s", "failed to create draft context\n"); - return; - } - - 
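            // annotation (not part of the upstream patch): each slot gets its own
            // draft context from the shared draft model, sized by the cparams_dft
            // prepared in load_model(). A small sanity log one could add here,
            // using existing llama.cpp C API calls:
            {
                const int32_t n_ctx_dft_slot   = llama_n_ctx  (slot.ctx_dft);
                const int32_t n_batch_dft_slot = llama_n_batch(slot.ctx_dft);
                SLT_DBG(slot, "draft context ready, n_ctx_dft = %d, n_batch_dft = %d\n", n_ctx_dft_slot, n_batch_dft_slot);
            }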
slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft); - if (slot.spec == nullptr) { - SRV_ERR("%s", "failed to create speculator\n"); - return; - } - for (auto & pair : params_base.speculative.replacements) { - common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); - } - } - - SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); - - slot.callback_on_release = [this](int) { - queue_tasks.pop_deferred_task(); - }; - - slot.reset(); - - slots.push_back(std::move(slot)); - } - - { - const char * LLAMA_SERVER_SLOTS_DEBUG = getenv("LLAMA_SERVER_SLOTS_DEBUG"); - slots_debug = LLAMA_SERVER_SLOTS_DEBUG ? atoi(LLAMA_SERVER_SLOTS_DEBUG) : 0; - - if (slots_debug) { - SRV_WRN("slots debug = %d\n", slots_debug); - } - } - - // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens - // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) - { - const int32_t n_batch = llama_n_batch(ctx); - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); - } - - metrics.init(); - - if (params_base.cache_ram_mib != 0) { - if (params_base.cache_ram_mib < 0) { - SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit"); - } else { - SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib); - } - SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n"); - - prompt_cache = std::make_unique(params_base.cache_ram_mib, n_ctx); - } else { - SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n"); - } - SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); - - // thinking is enabled if: - // 1. It's not explicitly disabled (reasoning_budget == 0) - // 2. The chat template supports it - const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); - SRV_INF("thinking = %d\n", enable_thinking); - - oai_parser_opt = { - /* use_jinja */ params_base.use_jinja, - /* prefill_assistant */ params_base.prefill_assistant, - /* reasoning_format */ params_base.reasoning_format, - /* chat_template_kwargs */ params_base.default_template_kwargs, - /* common_chat_templates */ chat_templates.get(), - /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, - /* allow_audio */ mctx ? 
mtmd_support_audio (mctx) : false, - /* enable_thinking */ enable_thinking, - }; - } - - server_slot * get_slot_by_id(int id) { - for (server_slot & slot : slots) { - if (slot.id == id) { - return &slot; - } - } - - return nullptr; - } - - server_slot * get_available_slot(const server_task & task) { - server_slot * ret = nullptr; - - bool update_cache = false; - - // find the slot that has at least n% prompt similarity - if (ret == nullptr && slot_prompt_similarity != 0.0f) { - float sim_best = 0; - - for (server_slot & slot : slots) { - // skip the slot if it is not available - if (slot.is_processing()) { - continue; - } - - const auto & tokens = slot.prompt.tokens; - - // skip the slot if it does not contains cached tokens - if (tokens.empty()) { - continue; - } - - // fraction of the Longest Common Prefix length with respect to the input prompt length - const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size(); - - // select the current slot if the criteria match - if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) { - sim_best = sim_cur; - - ret = &slot; - } - } - - if (ret != nullptr) { - const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size(); - - SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n", - sim_best, slot_prompt_similarity, f_keep); - - // if we are about to lose a large portion of the existing context - save it in the prompt cache - if (f_keep < 0.5f) { - update_cache = true; - } - } - } - - // find the slot that has been least recently used - if (ret == nullptr) { - int64_t t_last = -1; - - for (server_slot & slot : slots) { - // skip the slot if it is not available - if (slot.is_processing()) { - continue; - } - - // select the current slot if the criteria match - if (!ret || slot.t_last_used <= t_last) { - t_last = slot.t_last_used; - ret = &slot; - } - } - - if (ret != nullptr) { - SLT_INF(*ret, "selected slot by LRU, t_last = %" PRId64 "\n", t_last); - - update_cache = true; - } - } - - if (ret) { - const auto & tokens = ret->prompt.tokens; - - update_cache = update_cache && prompt_cache; - - // cache prompts only for completion tasks - update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION; - - // don't update the cache if the slot's context is empty - update_cache = update_cache && tokens.size() > 0; - - // TODO: mtmd does not support prompt cache - update_cache = update_cache && (ret->mctx == nullptr); - - if (update_cache) { - SRV_WRN("%s", "updating prompt cache\n"); - - const int64_t t_start = ggml_time_us(); - - ret->prompt_save(*prompt_cache); - ret->prompt_load(*prompt_cache, task.tokens); - - prompt_cache->update(); - - SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0); - } - } - - return ret; - } - - // return true if at least one slot has been purged - // TODO: improve logic - // - smarter decision which slot to purge (LRU or longest prompt?) - // - move slot to level 2 cache instead of removing? - // - instead of purging, try to store and resume later? 
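    // annotation (not part of the upstream patch): illustrative sketch of the
    // LCP-similarity metric that get_available_slot() computes above, written
    // for plain llama_token vectors (the real code goes through server_tokens
    // helpers such as get_common_prefix()):
    static float lcp_similarity_sketch(const std::vector<llama_token> & cached,
                                       const std::vector<llama_token> & prompt) {
        size_t n = 0;
        const size_t lim = std::min(cached.size(), prompt.size());
        while (n < lim && cached[n] == prompt[n]) {
            n++;
        }
        // fraction of the new prompt covered by the longest common prefix
        return prompt.empty() ? 0.0f : (float) n / prompt.size();
    }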
-    bool try_purge_idle_slots() {
-        bool res = false;
-
-        if (!params_base.kv_unified) {
-            return res;
-        }
-
-        for (auto & slot : slots) {
-            if (slot.is_processing()) {
-                continue;
-            }
-
-            if (slot.prompt.n_tokens() > 0) {
-                SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
-
-                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
-                slot.prompt.tokens.clear();
-
-                res = true;
-
-                // purge slots one by one
-                break;
-            }
-        }
-
-        return res;
-    }
-
-    bool launch_slot_with_task(server_slot & slot, server_task && task) {
-        slot.reset();
-
-        if (!are_lora_equal(task.params.lora, slot.lora)) {
-            // if lora has changed, check to see if the cache should be cleared
-            if (lora_should_clear_cache(slot.lora, task.params.lora)) {
-                SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
-                slot.prompt.tokens.clear();
-            } else {
-                SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task.params.lora.size());
-            }
-            slot.lora = task.params.lora;
-        }
-
-        // if using alora, make sure it's only a single one requested and active
-        size_t alora_invocation_start = task.tokens.size();
-        if (lora_all_alora(slot.lora)) {
-            const auto & enabled_ids = lora_get_enabled_ids(slot.lora);
-            // TODO: This will error out if a user requests two aloras, but only
-            //       provides the activation string for one. We could instead search
-            //       for all requested alora activation strings and then either keep
-            //       only the last one, or reject if multiple are found.
-            if (enabled_ids.size() != 1) {
-                send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-            const auto & lora = slot.lora[enabled_ids[0]].ptr;
-
-            // get the pointer and count for the invocation tokens
-            const uint64_t      n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora);
-            const llama_token * invocation_tokens   = llama_adapter_get_alora_invocation_tokens  (lora);
-
-            // scan backwards through the prompt tokens to find the last
-            // occurrence of the invocation sequence
-            int match_idx = static_cast<int>(n_invocation_tokens) - 1;
-            for (int i = task.tokens.size() - 1; i >= 0; --i) {
-                // the token in this position matches the next token to find in
-                // the invocation sequence
-                if (task.tokens[i] == invocation_tokens[match_idx]) {
-                    // if it's a full match, we've found the start
-                    if (match_idx == 0) {
-                        alora_invocation_start = i;
-                        break;
-                    }
-                    // otherwise, check the next token in the sequence
-                    --match_idx;
-                } else {
-                    // no match in this position, so start looking over again
-                    match_idx = static_cast<int>(n_invocation_tokens) - 1;
-                }
-            }
-
-            // if the activation string is not found, disable the alora
-            if (alora_invocation_start == task.tokens.size()) {
-                SLT_DBG(slot, "alora %zu requested, but not found. 
deactivating\n", enabled_ids[0]); - slot.lora[enabled_ids[0]].scale = 0.0f; - } else { - SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start); - slot.alora_invocation_start = alora_invocation_start; - } - } - - if (!task.tokens.validate(ctx)) { - send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); - - // initialize samplers - { - if (slot.smpl != nullptr) { - common_sampler_free(slot.smpl); - } - - slot.smpl = common_sampler_init(model, task.params.sampling); - if (slot.smpl == nullptr) { - // for now, the only error that may happen here is invalid grammar - send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str()); - } - - // initialize draft batch - // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] - if (slot.ctx_dft) { - llama_batch_free(slot.batch_spec); - - slot.batch_spec = llama_batch_init(task.params.speculative.n_max + 1, 0, 1); - } - - slot.task = std::make_unique(std::move(task)); - - slot.state = SLOT_STATE_STARTED; - - SLT_INF(slot, "%s", "processing task\n"); - - return true; - } - - void kv_cache_clear() { - SRV_DBG("%s", "clearing KV cache\n"); - - // clear the entire KV cache - llama_memory_clear(llama_get_memory(ctx), true); - clean_kv_cache = false; - } - - bool process_token(completion_token_output & result, server_slot & slot) { - // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = result.text_to_send; - slot.sampled = result.tok; - - slot.generated_text += token_str; - if (slot.task->params.return_tokens) { - slot.generated_tokens.push_back(result.tok); - } - slot.has_next_token = true; - - // check if there is incomplete UTF-8 character at the end - bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size(); - - // search stop word and delete it - if (!incomplete) { - size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); - - const std::string str_test = slot.generated_text.substr(pos); - bool send_text = true; - - size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); - if (stop_pos != std::string::npos) { - slot.generated_text.erase( - slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.n_sent_text, slot.generated_text.size()); - } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok) ) { - stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); - send_text = stop_pos == std::string::npos; - } - - // check if there is any token to predict - if (send_text) { - // no send the stop word in the response - result.text_to_send = slot.generated_text.substr(pos, std::string::npos); - slot.n_sent_text += result.text_to_send.size(); - // add the token to slot queue and cache - } else { - result.text_to_send = ""; - } - - slot.add_token(result); - if (slot.task->params.stream) { - send_partial_response(slot, result, false); - } - } - - if (incomplete) { - slot.has_next_token = true; - } - - // if context shifting is disabled, make sure that we don't run out of context - if (!params_base.ctx_shift && slot.prompt.n_tokens() + 1 >= slot.n_ctx) { - slot.truncated = true; - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped due to running 
out of context capacity, prompt.n_tokens() = %d, task.n_tokens = %d, n_decoded = %d, n_ctx = %d\n", - slot.prompt.n_tokens(), slot.task->n_tokens(), slot.n_decoded, slot.n_ctx); - } - - // check the limits - if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.task->params.n_predict); - } - - if (slot.has_new_line) { - // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent - if (slot.task->params.n_indent > 0) { - // check the current indentation - // TODO: improve by not doing it more than once for each new line - if (slot.last_nl_pos > 0) { - size_t pos = slot.last_nl_pos; - - int n_indent = 0; - while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) { - n_indent++; - pos++; - } - - if (pos < slot.generated_text.size() && n_indent < slot.task->params.n_indent) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - // cut the last line - slot.generated_text.erase(pos, std::string::npos); - - SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent); - } - } - - // find the next new line - { - const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos); - - if (pos != std::string::npos) { - slot.last_nl_pos = pos + 1; - } - } - } - } - - // check if there is a new line in the generated text - if (result.text_to_send.find('\n') != std::string::npos) { - slot.has_new_line = true; - - // if we have seen a new line, we stop after a certain time limit, but only upon another new line - if (slot.task->params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.task->params.t_max_predict_ms)) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.task->params.t_max_predict_ms); - } - } - - if (llama_vocab_is_eog(vocab, result.tok)) { - slot.stop = STOP_TYPE_EOS; - slot.has_next_token = false; - - SLT_DBG(slot, "%s", "stopped by EOS\n"); - } - - SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); - - return slot.has_next_token; // continue - } - - void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const { - size_t n_probs = slot.task->params.sampling.n_probs; - size_t n_vocab = llama_vocab_n_tokens(vocab); - - if (post_sampling) { - const auto * cur_p = common_sampler_get_candidates(slot.smpl, true); - const size_t max_probs = cur_p->size; - - // set probability for sampled token - for (size_t i = 0; i < max_probs; i++) { - if (cur_p->data[i].id == result.tok) { - result.prob = cur_p->data[i].p; - break; - } - } - - // set probability for top n_probs tokens - result.probs.reserve(max_probs); - for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { - result.probs.push_back({ - cur_p->data[i].id, - common_token_to_piece(ctx, cur_p->data[i].id, special), - cur_p->data[i].p - }); - } - } else { - // TODO: optimize this with min-p optimization - std::vector cur = get_token_probabilities(ctx, idx); - - // set probability for sampled token - for (size_t i = 0; i < n_vocab; i++) { - // set probability for sampled token - if (cur[i].id 
== result.tok) { - result.prob = cur[i].p; - break; - } - } - - // set probability for top n_probs tokens - result.probs.reserve(n_probs); - for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { - result.probs.push_back({ - cur[i].id, - common_token_to_piece(ctx, cur[i].id, special), - cur[i].p - }); - } - } - } - - void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(task.id, error, type); - } - - void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(slot.task->id, error, type, slot.task->n_tokens(), slot.n_ctx); - } - - void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER, const int32_t n_prompt_tokens = 0, const int32_t n_ctx = 0) { - SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - - if (type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { - GGML_ASSERT(n_ctx > 0 && n_prompt_tokens > 0); - } - - auto res = std::make_unique(); - res->id = id_task; - res->err_type = type; - res->err_msg = error; - res->n_prompt_tokens = n_prompt_tokens; - res->n_ctx = n_ctx; - - queue_results.send(std::move(res)); - } - - // if multimodal is enabled, send an error and return false - bool check_no_mtmd(const int id_task) { - if (mctx) { - send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); - return false; - } - return true; - } - - void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) { - auto res = std::make_unique(); - - res->id = slot.task->id; - res->index = slot.task->index; - - if (is_progress) { - res->is_progress = true; - res->progress.total = slot.task->n_tokens(); - res->progress.cache = slot.n_prompt_tokens_cache; - res->progress.processed = slot.prompt.tokens.size(); - res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; - } else { - res->content = tkn.text_to_send; - res->tokens = { tkn.tok }; - - slot.update_chat_msg(res->oaicompat_msg_diffs); - } - - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.task->n_tokens(); - res->post_sampling_probs = slot.task->params.post_sampling_probs; - - res->verbose = slot.task->params.verbose; - res->oaicompat = slot.task->params.oaicompat; - res->oaicompat_model = slot.task->params.oaicompat_model; - res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; - - // populate res.probs_output - if (slot.task->params.sampling.n_probs > 0) { - res->prob_output = tkn; // copy the token probs - } - - // populate timings if this is final response or timings_per_token is enabled - if (slot.stop != STOP_TYPE_NONE || slot.task->params.timings_per_token) { - res->timings = slot.get_timings(); - } - - queue_results.send(std::move(res)); - } - - void send_final_response(server_slot & slot) { - auto res = std::make_unique(); - - res->id = slot.task->id; - res->id_slot = slot.id; - - res->index = slot.task->index; - res->content = slot.generated_text; - res->tokens = std::move(slot.generated_tokens); - res->timings = slot.get_timings(); - res->prompt = slot.task->tokens.detokenize(ctx, true); - res->response_fields = std::move(slot.task->params.response_fields); - - res->truncated = slot.truncated; - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.task->n_tokens(); - res->n_tokens_cached = slot.prompt.n_tokens(); - res->has_new_line = slot.has_new_line; - res->stopping_word = slot.stopping_word; - res->stop 
= slot.stop; - res->post_sampling_probs = slot.task->params.post_sampling_probs; - - res->verbose = slot.task->params.verbose; - res->stream = slot.task->params.stream; - res->include_usage = slot.task->params.include_usage; - res->oaicompat = slot.task->params.oaicompat; - res->oaicompat_model = slot.task->params.oaicompat_model; - res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; - res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs); - - // populate res.probs_output - if (slot.task->params.sampling.n_probs > 0) { - if (!slot.task->params.stream && slot.stop == STOP_TYPE_WORD) { - const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); - - size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - res->probs_output = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - safe_offset); - } else { - res->probs_output = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end()); - } - } - - res->generation_params = slot.task->params; // copy the parameters - - queue_results.send(std::move(res)); - } - - void send_embedding(const server_slot & slot, const llama_batch & batch) { - auto res = std::make_unique(); - res->id = slot.task->id; - res->index = slot.task->index; - res->n_tokens = slot.task->n_tokens(); - res->oaicompat = slot.task->params.oaicompat; - - const int n_embd = llama_model_n_embd(model); - - std::vector embd_res(n_embd, 0.0f); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float * embd = nullptr; - if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) { - embd = llama_get_embeddings_ith(ctx, i); - } else { - embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - } - - if (embd == nullptr) { - SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - - res->embedding.push_back(std::vector(n_embd, 0.0f)); - continue; - } - - // normalize only when there is pooling - if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { - common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize); - res->embedding.push_back(embd_res); - break; - } - - res->embedding.emplace_back(embd, embd + n_embd); - } - - SLT_DBG(slot, "%s", "sending embeddings\n"); - - queue_results.send(std::move(res)); - } - - void send_rerank(const server_slot & slot, const llama_batch & batch) { - auto res = std::make_unique(); - res->id = slot.task->id; - res->index = slot.task->index; - res->n_tokens = slot.task->n_tokens(); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - - res->score = -1e6; - continue; - } - - res->score = embd[0]; - } - - SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); - - queue_results.send(std::move(res)); - } - - // - // Functions to process the task - // - - void process_single_task(server_task && task) { - switch (task.type) { - case SERVER_TASK_TYPE_COMPLETION: - case SERVER_TASK_TYPE_INFILL: - case SERVER_TASK_TYPE_EMBEDDING: - case SERVER_TASK_TYPE_RERANK: - { - const int id_slot = task.id_slot; 
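                    // annotation (not part of the upstream patch): a request may pin a
                    // specific slot via the "id_slot" field; the default of -1 lets the
                    // server pick one by prompt similarity / LRU, e.g.:
                    //
                    //     curl http://localhost:8080/completion \
                    //          -d '{"prompt": "hello", "id_slot": 0}'
                    //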
- - server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); - - if (slot == nullptr) { - // if no slot is available, we defer this task for processing later - SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - if (!launch_slot_with_task(*slot, std::move(task))) { - SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); - break; - } - } break; - case SERVER_TASK_TYPE_CANCEL: - { - // release slot linked with the task id - for (auto & slot : slots) { - if (slot.task && slot.task->id == task.id_target) { - slot.release(); - break; - } - } - } break; - case SERVER_TASK_TYPE_NEXT_RESPONSE: - { - // do nothing - } break; - case SERVER_TASK_TYPE_METRICS: - { - json slots_data = json::array(); - - int n_idle_slots = 0; - int n_processing_slots = 0; - - for (server_slot & slot : slots) { - json slot_data = slot.to_json(slots_debug == 0); - - if (slot.is_processing()) { - n_processing_slots++; - } else { - n_idle_slots++; - } - - slots_data.push_back(slot_data); - } - SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - - auto res = std::make_unique(); - res->id = task.id; - res->slots_data = std::move(slots_data); - res->n_idle_slots = n_idle_slots; - res->n_processing_slots = n_processing_slots; - res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); - res->t_start = metrics.t_start; - - res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; - res->t_prompt_processing_total = metrics.t_prompt_processing_total; - res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; - res->t_tokens_generation_total = metrics.t_tokens_generation_total; - - res->n_tokens_max = metrics.n_tokens_max; - - res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; - res->t_prompt_processing = metrics.t_prompt_processing; - res->n_tokens_predicted = metrics.n_tokens_predicted; - res->t_tokens_generation = metrics.t_tokens_generation; - - res->n_decode_total = metrics.n_decode_total; - res->n_busy_slots_total = metrics.n_busy_slots_total; - - if (task.metrics_reset_bucket) { - metrics.reset_bucket(); - } - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SLOT_SAVE: - { - if (!check_no_mtmd(task.id)) { - break; - } - - int id_slot = task.slot_action.slot_id; - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - const size_t token_count = slot->prompt.tokens.size(); - const int64_t t_start = ggml_time_us(); - - std::string filename = task.slot_action.filename; - std::string filepath = task.slot_action.filepath; - - const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens(); - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); - - const int64_t t_end = ggml_time_us(); - const double t_save_ms = (t_end - t_start) / 1000.0; - - auto res = 
std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->filename = filename; - res->is_save = true; - res->n_tokens = token_count; - res->n_bytes = nwrite; - res->t_ms = t_save_ms; - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SLOT_RESTORE: - { - if (!check_no_mtmd(task.id)) break; - int id_slot = task.slot_action.slot_id; - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - const int64_t t_start = ggml_time_us(); - - std::string filename = task.slot_action.filename; - std::string filepath = task.slot_action.filepath; - - llama_tokens tokens; - tokens.resize(slot->n_ctx); - size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count); - if (nread == 0) { - slot->prompt.tokens.clear(); // KV may already been invalidated? - send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); - break; - } - tokens.resize(token_count); - slot->prompt.tokens.clear(); - slot->prompt.tokens.insert(tokens); - - const int64_t t_end = ggml_time_us(); - const double t_restore_ms = (t_end - t_start) / 1000.0; - - auto res = std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->filename = filename; - res->is_save = false; - res->n_tokens = token_count; - res->n_bytes = nread; - res->t_ms = t_restore_ms; - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SLOT_ERASE: - { - if (!check_no_mtmd(task.id)) { - break; - } - int id_slot = task.slot_action.slot_id; - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - // Erase token cache - const size_t n_erased = slot->prompt.tokens.size(); - llama_memory_seq_rm(llama_get_memory(ctx), slot->id, -1, -1); - slot->prompt.tokens.clear(); - - auto res = std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->n_erased = n_erased; - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SET_LORA: - { - params_base.lora_adapters = std::move(task.set_lora); - auto res = std::make_unique(); - res->id = task.id; - queue_results.send(std::move(res)); - } break; - - } - } - - void update_slots() { - // check if all slots are idle - { - bool all_idle = true; - - for (auto & slot : slots) { - if (slot.is_processing()) { - all_idle = false; - break; - } - } - - if (all_idle) { - SRV_INF("%s", "all slots are idle\n"); - if (clean_kv_cache) { - kv_cache_clear(); - } - - return; - } - } - - { - SRV_DBG("%s", "posting NEXT_RESPONSE\n"); - - server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); - task.id = queue_tasks.get_new_id(); - queue_tasks.post(std::move(task)); - } - - // apply context-shift if needed - // TODO: simplify and improve - for (server_slot & slot : slots) { - if (slot.state == SLOT_STATE_GENERATING && slot.prompt.n_tokens() + 1 
>= slot.n_ctx) { - if (!params_base.ctx_shift) { - // this check is redundant (for good) - // we should never get here, because generation should already stopped in process_token() - send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); - slot.release(); - continue; - } - - if (mctx) { - // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded - // we don't support ctx_shift because an image chunk may contains multiple tokens - GGML_ABORT("not supported by multimodal"); - } - - // Shift context - int n_keep = slot.task->params.n_keep < 0 ? slot.task->n_tokens() : slot.task->params.n_keep; - - if (add_bos_token) { - n_keep += 1; - } - - n_keep = std::min(slot.n_ctx - 4, n_keep); - - const int n_left = slot.prompt.n_tokens() - n_keep; - const int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); - - SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - - llama_memory_seq_rm (llama_get_memory(ctx), slot.id, n_keep , n_keep + n_discard); - llama_memory_seq_add(llama_get_memory(ctx), slot.id, n_keep + n_discard, slot.prompt.n_tokens(), -n_discard); - - // add generated tokens to cache - // ref: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2473269481 - { - GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - - llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy - for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) { - new_tokens[i - n_discard] = new_tokens[i]; - } - - new_tokens.resize(slot.prompt.tokens.size() - n_discard); - - slot.prompt.tokens.clear(); - slot.prompt.tokens.insert(new_tokens); - } - - slot.truncated = true; - } - } - - // start populating the batch for this iteration - common_batch_clear(batch); - - // track if given slot can be batched with slots already in the batch - server_slot * slot_batched = nullptr; - - auto accept_special_token = [&](server_slot & slot, llama_token token) { - return params_base.special || - slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end(); - }; - - // first, add sampled tokens from any ongoing sequences - for (auto & slot : slots) { - if (slot.state != SLOT_STATE_GENERATING) { - continue; - } - - // check if we can batch this slot with the previous one - if (!slot_batched) { - slot_batched = &slot; - } else if (!slot_batched->can_batch_with(slot)) { - continue; - } - - slot.i_batch = batch.n_tokens; - - common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); - - slot.prompt.tokens.push_back(slot.sampled); - - SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.prompt.n_tokens(), slot.truncated); - } - - // process in chunks of params.n_batch - int32_t n_batch = llama_n_batch(ctx); - int32_t n_ubatch = llama_n_ubatch(ctx); - - float alora_scale = -1.0f; - size_t alora_disabled_id = 0; - - // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens == 0) { - for (auto & slot : slots) { - // check if we can batch this slot with the previous one - if (slot.is_processing()) { - if (!slot_batched) { - slot_batched = &slot; - } else if (!slot_batched->can_batch_with(slot)) { - continue; - } - } - - // this slot still has a prompt to be processed - if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { - const auto & input_tokens = slot.task->tokens; 
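                    // annotation (not part of the upstream patch): slot.task->tokens is a
                    // server_tokens sequence; with multimodal inputs, image/audio chunks
                    // occupy positions whose text token is LLAMA_TOKEN_NULL, which is why
                    // the code below special-cases input_tokens[i] == LLAMA_TOKEN_NULL,
                    // conceptually:
                    //
                    //     [ <bos>, t0, t1, NULL, NULL, ..., tN ]   // NULL = mtmd chunk position
                    //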
-
-                    // TODO: maybe move branch to outside of this loop in the future
-                    if (slot.state == SLOT_STATE_STARTED) {
-                        slot.t_start_process_prompt = ggml_time_us();
-                        slot.t_start_generation = 0;
-
-                        slot.state = SLOT_STATE_PROCESSING_PROMPT;
-
-                        SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
-                                slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens());
-
-                        // print prompt tokens (for debugging)
-                        /*if (1) {
-                            // first 16 tokens (avoid flooding logs)
-                            for (int i = 0; i < std::min<int>(16, input_tokens.size()); i++) {
-                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str());
-                            }
-                        } else {
-                            // all
-                            for (int i = 0; i < (int) input_tokens.size(); i++) {
-                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str());
-                            }
-                        }*/
-
-                        // keep track how many tokens we can reuse from the previous state
-                        int n_past = 0;
-
-                        // empty prompt passed -> release the slot and send empty response
-                        if (input_tokens.empty()) {
-                            SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
-
-                            slot.print_timings();
-                            send_final_response(slot);
-                            slot.release();
-
-                            continue;
-                        }
-
-                        // TODO: support memory-less logits computation
-                        if (slot.need_logits() && !llama_get_memory(ctx)) {
-                            send_error(slot, "the current context does not support logits computation, skipping", ERROR_TYPE_SERVER);
-                            slot.release();
-                            continue;
-                        }
-
-                        if (!slot.can_split()) {
-                            if (slot.task->n_tokens() > n_ubatch) {
-                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
-                                slot.release();
-                                continue;
-                            }
-
-                            if (slot.task->n_tokens() > slot.n_ctx) {
-                                send_error(slot, "input is larger than the max context size. 
skipping", ERROR_TYPE_EXCEED_CONTEXT_SIZE); - slot.release(); - continue; - } - } else { - if (slot.task->n_tokens() >= slot.n_ctx) { - send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE); - slot.release(); - continue; - } - - if (slot.task->params.cache_prompt) { - // reuse any previously computed tokens that are common with the new prompt - n_past = slot.prompt.tokens.get_common_prefix(input_tokens); - - // if there is an alora invoked, don't cache after the invocation start - if (slot.alora_invocation_start > 0) { - SLT_DBG(slot, "only caching to alora invocation start (n_past = %d, alora_invocation_start = %d)\n", n_past, slot.alora_invocation_start); - n_past = std::min(n_past, slot.alora_invocation_start - 1); - } - - // reuse chunks from the cached prompt by shifting their KV cache in the new position - if (params_base.n_cache_reuse > 0) { - GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - - size_t head_c = n_past; // cache - size_t head_p = n_past; // current prompt - - if (mctx) { - // we should never reach this - GGML_ABORT("not supported by multimodal"); - } - - SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", params_base.n_cache_reuse, n_past); - - while (head_c < slot.prompt.tokens.size() && - head_p < input_tokens.size()) { - - size_t n_match = 0; - while (head_c + n_match < slot.prompt.tokens.size() && - head_p + n_match < input_tokens.size() && - slot.prompt.tokens[head_c + n_match] == input_tokens[head_p + n_match]) { - - n_match++; - } - - if (n_match >= (size_t) params_base.n_cache_reuse) { - SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); - //for (size_t i = head_p; i < head_p + n_match; i++) { - // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - //} - - const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - - llama_memory_seq_rm (llama_get_memory(ctx), slot.id, head_p, head_c); - llama_memory_seq_add(llama_get_memory(ctx), slot.id, head_c, head_c + n_match, kv_shift); - - for (size_t i = 0; i < n_match; i++) { - slot.prompt.tokens.set_token(head_p + i, slot.prompt.tokens[head_c + i]); - n_past++; - } - - head_c += n_match; - head_p += n_match; - } else { - head_c += 1; - } - } - - SLT_DBG(slot, "after context reuse, new n_past = %d\n", n_past); - } - } else { - // if we don't cache the prompt, we have to remove all previous tokens - n_past = 0; - } - - // note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1 - const auto n_swa = std::max(1, llama_model_n_swa(model)); - - // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, n_past - n_swa); - - // note: disallow with mtmd contexts for now - // https://github.com/ggml-org/llama.cpp/issues/17043 - if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) { - const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); - if (pos_min == -1) { - SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); - GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237"); - } - - // when the prompt prefix does not match, print the tokens around the mismatch - // this is useful for debugging prompt 
caching - if (slots_debug) { - const int np0 = std::max(n_past - 4, 0); - const int np1 = std::min(n_past + 6, std::min(slot.prompt.tokens.size(), slot.task->tokens.size())); - - std::stringstream ss0; - std::stringstream ss1; - - std::stringstream st0; - std::stringstream st1; - - ss0 << "old: ... "; - ss1 << "new: ... "; - - for (int i = np0; i < np1; i++) { - if (i == n_past) { - ss0 << " | "; - ss1 << " | "; - } - - { - const auto token = slot.prompt.tokens[i]; - const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; - ss0 << piece; - st0 << std::setw(8) << token; - } - - { - const auto token = slot.task->tokens[i]; - const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; - ss1 << piece; - st1 << std::setw(8) << token; - } - } - - SLT_WRN(slot, "%s\n", ss0.str().c_str()); - SLT_WRN(slot, "%s\n", ss1.str().c_str()); - - SLT_WRN(slot, "%s\n", st0.str().c_str()); - SLT_WRN(slot, "%s\n", st1.str().c_str()); - } - - if (pos_min > pos_min_thold) { - // TODO: support can be added in the future when corresponding vision models get released - GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - - SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa); - - // search for a context checkpoint - const auto it = std::find_if( - slot.prompt.checkpoints.rbegin(), - slot.prompt.checkpoints.rend(), - [&](const auto & cur) { - // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS] - return cur.pos_min < pos_min_thold; - } - ); - - bool do_reset = it == slot.prompt.checkpoints.rend(); - - if (!do_reset) { - // restore the context checkpoint - const size_t checkpoint_size = it->data.size(); - const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - - if (n != checkpoint_size) { - SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); - do_reset = true; - //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint"); - } else { - n_past = std::min(n_past, std::max(it->pos_min + 1, it->pos_max)); - SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); - } - } - - if (do_reset) { - SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n", - "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); - n_past = 0; - } - } - } - - { - // erase any checkpoints with pos_min > pos_min_thold - for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) { - const auto & cur = *it; - if (cur.pos_min > pos_min_thold) { - SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024); - it = slot.prompt.checkpoints.erase(it); - } else { - ++it; - } - } - } - } - - // [TAG_PROMPT_LOGITS] - if (n_past == slot.task->n_tokens() && n_past > 0) { - SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, task.n_tokens() = %d)\n", n_past, slot.task->n_tokens()); - n_past--; - SLT_WRN(slot, "n_past was set to %d\n", n_past); - 
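                            // annotation (not part of the upstream patch): the decrement
                            // keeps the batch non-empty for this slot - sampling needs
                            // logits, and logits only come from tokens that are actually
                            // decoded, so a fully cached prompt re-evaluates its last
                            // token. For example, with n_past = task.n_tokens() = 5, we
                            // set n_past = 4 and decode token[4] again to get its logits.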
} - - slot.n_prompt_tokens_cache = n_past; - slot.n_prompt_tokens_processed = 0; - - slot.prompt.tokens.keep_first(n_past); - } - - if (!slot.can_split()) { - // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens + slot.task->n_tokens() > n_batch) { - continue; - } - } - - // truncate any tokens that are beyond n_past for this slot - const llama_pos p0 = slot.prompt.tokens.pos_next(); - - SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); - - if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { - SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); - llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); - - // there is no common part left - slot.n_prompt_tokens_cache = 0; - - slot.prompt.tokens.clear(); - } - - // check if we should process the image - if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { - // process the image - size_t n_tokens_out = 0; - int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); - if (res != 0) { - SLT_ERR(slot, "failed to process image, res = %d\n", res); - send_error(slot, "failed to process image", ERROR_TYPE_SERVER); - slot.release(); - continue; - } - - slot.n_prompt_tokens_processed += n_tokens_out; - - // add the image chunk to cache - { - const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); - slot.prompt.tokens.push_back(chunk.get()); // copy - } - } - - // If using an alora, there may be uncached tokens that come - // before the invocation sequence. When this happens, the - // tokens before the invocation sequence need to be - // processed without the adapter in a separate batch, then - // the adapter needs to be enabled for the remaining tokens. - if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.prompt.n_tokens()) { - SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); - const auto & enabled_loras = lora_get_enabled_ids(slot.lora); - GGML_ASSERT(enabled_loras.size() == 1); - alora_scale = slot.lora[enabled_loras[0]].scale; - slot.lora[enabled_loras[0]].scale = 0.0f; - alora_disabled_id = enabled_loras[0]; - } - - bool do_checkpoint = params_base.n_ctx_checkpoints > 0; - - // make checkpoints only for completion tasks - do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION; - - // make a checkpoint of the parts of the memory that cannot be rolled back. 
- // checkpoints are created only if: - // - the model uses SWA and we are not using `swa_full` - // - the model architecture is marked as recurrent or hybrid - // - // TODO: try to make this conditional on the context or the memory module, instead of the model type - do_checkpoint = do_checkpoint && ( - llama_model_is_recurrent(model) || - llama_model_is_hybrid(model) || - (llama_model_n_swa(model) > 0 && !params_base.swa_full) - ); - - // add prompt tokens for processing in the current batch - while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) { - // get next token to process - llama_token cur_tok = input_tokens[slot.prompt.n_tokens()]; - if (cur_tok == LLAMA_TOKEN_NULL) { - break; // end of text chunk - } - - // if this is an alora request with pre-invocation - // tokens that are not cached, we need to stop filling - // this batch at those pre-invocation tokens. - if (alora_scale > 0 && slot.prompt.n_tokens() == slot.alora_invocation_start - 1) { - SLT_DBG(slot, "stop prompt batch filling at (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); - break; - } - - // embedding requires all tokens in the batch to be output - common_batch_add(batch, - cur_tok, - slot.prompt.tokens.pos_next(), - { slot.id }, - slot.need_embd()); - slot.prompt.tokens.push_back(cur_tok); - - slot.n_prompt_tokens_processed++; - - // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. - if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) { - break; - } - } - - // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.slot.prompt.tokens.str().c_str()); - - SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); - - // entire prompt has been processed - if (slot.prompt.n_tokens() == slot.task->n_tokens()) { - slot.state = SLOT_STATE_DONE_PROMPT; - - GGML_ASSERT(batch.n_tokens > 0); - - common_sampler_reset(slot.smpl); - - // Process all prompt tokens through sampler system - for (int i = 0; i < slot.task->n_tokens(); ++i) { - llama_token id = input_tokens[i]; - if (id != LLAMA_TOKEN_NULL) { - common_sampler_accept(slot.smpl, id, false); - } - } - - // extract the logits only for the last token - batch.logits[batch.n_tokens - 1] = true; - - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - - SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens); - - const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); - const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id); - - // no need for empty or small checkpoints - do_checkpoint = do_checkpoint && (pos_min >= 0 && pos_max >= 64); - - // no need to create checkpoints that are too close together - do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || pos_max > slot.prompt.checkpoints.back().pos_max + 64); - - if (do_checkpoint) { - while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) { - // make room for the new checkpoint, if needed - const auto & cur = slot.prompt.checkpoints.front(); - - SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", - cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); - - slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin()); - } - - const size_t 
checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - - auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{ - /*.pos_min = */ pos_min, - /*.pos_max = */ pos_max, - /*.data = */ std::vector(checkpoint_size), - }); - - llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - - SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", - (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); - } - } - } - - if (batch.n_tokens >= n_batch) { - break; - } - } - } - - if (batch.n_tokens == 0) { - SRV_WRN("%s", "no tokens to decode\n"); - return; - } - - SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); - - if (slot_batched) { - // apply lora, only need to do it once per batch - common_set_adapter_lora(ctx, slot_batched->lora); - - // if the lora is temporarily disabled for an alora, re-enable it - // for next time - if (alora_scale > 0.0f) { - SRV_DBG("re-enabling alora with scale %f\n", alora_scale); - slot_batched->lora[alora_disabled_id].scale = alora_scale; - } - - llama_set_embeddings(ctx, slot_batched->need_embd()); - } - - int32_t i_next = 0; - - // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i = i_next) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - }; - - const int ret = llama_decode(ctx, batch_view); - - metrics.on_decoded(slots); - - if (ret != 0) { - { - std::string err; - - if (n_batch == 1 && ret == 1) { - // TODO: try to terminate only the largest active slot/sequence and continue with the rest - // need to remove the tokens from the current batch too - err = "Context size has been exceeded."; - } - - if (ret == -1) { - err = "Invalid input batch."; - } - - if (ret < -1) { - // TODO: update slot state based on llama_memory_seq_pos_min() and llama_memory_seq_pos_max() - err = "Compute error."; - } - - // TODO: handle ret == 2 (abort) when we start aborting - - if (!err.empty()) { - SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); - - for (auto & slot : slots) { - if (slot.is_processing()) { - send_error(slot, err); - slot.release(); - } - } - - break; - } - } - - // retry with half the batch size to try to find a free slot in the KV cache - if (!try_purge_idle_slots()) { - n_batch /= 2; - } - - SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); - - continue; // continue loop of n_batch - } - - // move the head of the batch forward with the number of tokens we just processed - i_next = i + n_tokens; - - // on successful decode, restore the original batch size - n_batch = llama_n_batch(ctx); - - for (auto & slot : slots) { - // optionally send prompt processing progress - if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.task->params.stream && slot.task->params.return_progress) { - send_partial_response(slot, {}, true); - } - } - - if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { - continue; // continue loop of slots - } - - if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.task->type == SERVER_TASK_TYPE_EMBEDDING) { - // prompt 
evaluated for embedding - send_embedding(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } - - if (slot.task->type == SERVER_TASK_TYPE_RERANK) { - send_rerank(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } - - // prompt evaluated for next-token prediction - slot.state = SLOT_STATE_GENERATING; - } else if (slot.state != SLOT_STATE_GENERATING) { - continue; // continue loop of slots - } - - const int tok_idx = slot.i_batch - i; - - llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); - - slot.i_batch = -1; - - common_sampler_accept(slot.smpl, id, true); - - slot.n_decoded += 1; - - const int64_t t_current = ggml_time_us(); - - if (slot.n_decoded == 1) { - slot.t_start_generation = t_current; - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); - } - - slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; - - completion_token_output result; - result.tok = id; - result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs - - if (slot.task->params.sampling.n_probs > 0) { - populate_token_probs(slot, result, slot.task->params.post_sampling_probs, params_base.special, tok_idx); - } - - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - slot.release(); - - continue; - } - } - - // do speculative decoding - // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] - // perform the speculative drafting for all sequences at the same time in a single batch - for (auto & slot : slots) { - if (!slot.is_processing() || !slot.can_speculate()) { - continue; - } - - if (slot.state != SLOT_STATE_GENERATING) { - continue; - } - - if (mctx) { - // we should never reach this, as speculative is automatically disabled if mmproj is loaded - GGML_ABORT("not supported by multimodal"); - } - - // determine the max draft that fits the current slot state - int n_draft_max = slot.task->params.speculative.n_max; - - // note: slot.prompt is not yet expanded with the `id` token sampled above - // also, need to leave space for 1 extra token to allow context shifts - n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.prompt.n_tokens() - 2); - - if (slot.n_remaining > 0) { - n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); - } - - SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); - - if (n_draft_max < slot.task->params.speculative.n_min) { - SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.task->params.speculative.n_min); - - continue; - } - - llama_token id = slot.sampled; - - struct common_speculative_params params_spec; - params_spec.n_draft = n_draft_max; - params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max; - params_spec.p_min = slot.task->params.speculative.p_min; - - const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens(); - llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id); - - // ignore small drafts - if (slot.task->params.speculative.n_min > (int) draft.size()) { - SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), 
slot.task->params.speculative.n_min); - - continue; - } - - // keep track of total number of drafted tokens tested - slot.n_draft_total += draft.size(); - - // construct the speculation batch - common_batch_clear(slot.batch_spec); - common_batch_add (slot.batch_spec, id, slot.prompt.tokens.pos_next(), { slot.id }, true); - - for (size_t i = 0; i < draft.size(); ++i) { - common_batch_add(slot.batch_spec, draft[i], slot.prompt.tokens.pos_next() + 1 + i, { slot.id }, true); - } - - SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); - - llama_decode(ctx, slot.batch_spec); - - // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); - - slot.n_decoded += ids.size(); - - // update how many tokens out of those tested were accepted - slot.n_draft_accepted += ids.size() - 1; - - slot.prompt.tokens.push_back(id); - slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); - - llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1); - - for (size_t i = 0; i < ids.size(); ++i) { - completion_token_output result; - - result.tok = ids[i]; - result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // set later - - // TODO: set result.probs - - if (!process_token(result, slot)) { - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - slot.release(); - - break; - } - } - - SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.prompt.n_tokens()); - } - } - - SRV_DBG("%s", "run slots completed\n"); - } - - json model_meta() const { - return json { - {"vocab_type", llama_vocab_type (vocab)}, - {"n_vocab", llama_vocab_n_tokens (vocab)}, - {"n_ctx_train", llama_model_n_ctx_train(model)}, - {"n_embd", llama_model_n_embd (model)}, - {"n_params", llama_model_n_params (model)}, - {"size", llama_model_size (model)}, - }; - } -}; - -// generator-like API for server responses, supports polling connection state and aggregating results -struct server_response_reader { - std::unordered_set id_tasks; - server_context & ctx_server; - size_t received_count = 0; - bool cancelled = false; - - server_response_reader(server_context & ctx_server) : ctx_server(ctx_server) {} - ~server_response_reader() { - stop(); - } - - void post_tasks(std::vector && tasks) { - id_tasks = server_task::get_list_id(tasks); - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(std::move(tasks)); - } - - bool has_next() { - return !cancelled && received_count < id_tasks.size(); - } - - // return nullptr if should_stop() is true before receiving a result - // note: if one error is received, it will stop further processing and return the error result - server_task_result_ptr next(const std::function & should_stop) { - while (true) { - server_task_result_ptr result = ctx_server.queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS); - if (result == nullptr) { - // timeout, check stop condition - if (should_stop()) { - SRV_DBG("%s", "stopping wait for next result due to should_stop condition\n"); - return nullptr; - } - } else { - if (result->is_error()) { - stop(); // cancel remaining tasks - SRV_DBG("%s", "received error result, stopping further processing\n"); - return result; - } - if (result->is_stop()) { - received_count++; - } - return result; - } - } - - // should not reach here - } - - struct batch_response { - bool is_terminated = false;
// if true, indicates that processing was stopped before all results were received - std::vector results; - server_task_result_ptr error; // nullptr if no error - }; - - batch_response wait_for_all(const std::function & should_stop) { - batch_response batch_res; - batch_res.results.resize(id_tasks.size()); - while (has_next()) { - auto res = next(should_stop); - if (res == nullptr) { - batch_res.is_terminated = true; - return batch_res; - } - if (res->is_error()) { - batch_res.error = std::move(res); - return batch_res; - } - const size_t idx = res->get_index(); - GGML_ASSERT(idx < batch_res.results.size() && "index out of range"); - GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received"); - batch_res.results[idx] = std::move(res); - } - return batch_res; - } - - void stop() { - ctx_server.queue_results.remove_waiting_task_ids(id_tasks); - if (has_next() && !cancelled) { - // if tasks are not finished yet, cancel them - cancelled = true; - std::vector cancel_tasks; - cancel_tasks.reserve(id_tasks.size()); - for (const auto & id_task : id_tasks) { - SRV_WRN("cancel task, id_task = %d\n", id_task); - server_task task(SERVER_TASK_TYPE_CANCEL); - task.id_target = id_task; - ctx_server.queue_results.remove_waiting_task_id(id_task); - cancel_tasks.push_back(std::move(task)); - } - // push to beginning of the queue, so it has highest priority - ctx_server.queue_tasks.post(std::move(cancel_tasks), true); - } else { - SRV_DBG("%s", "all tasks already finished, no need to cancel\n"); - } - } -}; - -static void log_server_request(const httplib::Request & req, const httplib::Response & res) { - // skip GH copilot requests when using default port - if (req.path == "/v1/health") { - return; - } - - // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch - - SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - - SRV_DBG("request: %s\n", req.body.c_str()); - SRV_DBG("response: %s\n", res.body.c_str()); -} - -static void res_error(httplib::Response & res, const json & error_data) { - json final_response {{"error", error_data}}; - res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON); - res.status = json_value(error_data, "code", 500); -} - -static void res_ok(httplib::Response & res, const json & data) { - res.set_content(safe_json_to_str(data), MIMETYPE_JSON); - res.status = 200; -} - -std::function shutdown_handler; -std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - -inline void signal_handler(int signal) { - if (is_terminating.test_and_set()) { - // in case it hangs, we can force terminate the server by hitting Ctrl+C twice - // this is for better developer experience, we can remove when the server is stable enough - fprintf(stderr, "Received second interrupt, terminating immediately.\n"); - exit(1); - } - - shutdown_handler(signal); -} - -int main(int argc, char ** argv) { - // own arguments required by this example - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { - return 1; - } - - // TODO: should we have a separate n_parallel parameter for the server?
- // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 - // TODO: this is a common configuration that is suitable for most local use cases - // however, overriding the parameters is a bit confusing - figure out something more intuitive - if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { - LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - - params.n_parallel = 4; - params.kv_unified = true; - } - - common_init(); - - // struct that contains llama context and inference - server_context ctx_server; - - llama_backend_init(); - llama_numa_init(params.numa); - - LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); - - std::unique_ptr svr; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); - svr.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) - ); - } else { - LOG_INF("Running without SSL\n"); - svr.reset(new httplib::Server()); - } -#else - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); - return 1; - } - svr.reset(new httplib::Server()); -#endif - - std::atomic state{SERVER_STATE_LOADING_MODEL}; - - svr->set_default_headers({{"Server", "llama.cpp"}}); - svr->set_logger(log_server_request); - svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { - std::string message; - try { - std::rethrow_exception(ep); - } catch (const std::exception & e) { - message = e.what(); - } catch (...) 
{ - message = "Unknown Exception"; - } - - try { - json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - res_error(res, formatted_error); - } catch (const std::exception & e) { - LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str()); - } - }); - - svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { - if (res.status == 404) { - res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - } - // for other error codes, we skip processing here because it's already done by res_error() - }); - - // set timeouts and change hostname and port - svr->set_read_timeout (params.timeout_read); - svr->set_write_timeout(params.timeout_write); - - std::unordered_map log_data; - - log_data["hostname"] = params.hostname; - log_data["port"] = std::to_string(params.port); - - if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); - } else if (params.api_keys.size() > 1) { - log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; - } - - // Necessary similarity of prompt for slot selection - ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - - // - // Middlewares - // - - auto middleware_validate_api_key = [&params](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { - "/health", - "/v1/health", - "/models", - "/v1/models", - "/api/tags" - }; - - // If API key is not set, skip validation - if (params.api_keys.empty()) { - return true; - } - - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { - return true; - } - - // Check for API key in the header - auto auth_header = req.get_header_value("Authorization"); - - std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { - std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) { - return true; // API key is valid - } - } - - // API key is invalid or not provided - res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - - LOG_WRN("Unauthorized: Invalid API Key\n"); - - return false; - }; - - auto middleware_server_state = [&state](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - if (current_state == SERVER_STATE_LOADING_MODEL) { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - return false; - } - return true; - }; - - // register server middlewares - svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - // If this is an OPTIONS request, skip validation because browsers don't include Authorization header - if (req.method == "OPTIONS") { - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "GET, POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - res.set_content("", "text/html"); // blank response, no data - return httplib::Server::HandlerResponse::Handled; // skip further processing - } -
if (!middleware_server_state(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - if (!middleware_validate_api_key(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - return httplib::Server::HandlerResponse::Unhandled; - }); - - // - // Route handlers (or controllers) - // - - const auto handle_health = [&](const httplib::Request &, httplib::Response & res) { - // error and loading states are handled by middleware - json health = {{"status", "ok"}}; - res_ok(res, health); - }; - - const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { - if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // optionally return "fail_on_no_slot" error - if (req.has_param("fail_on_no_slot")) { - if (res_task->n_idle_slots == 0) { - res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); - return; - } - } - - res_ok(res, res_task->slots_data); - }; - - const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { - if (!params.endpoint_metrics) { - res_error(res, format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names - json all_metrics_def = json { - {"counter", {{ - {"name", "prompt_tokens_total"}, - {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} - }, { - {"name", "prompt_seconds_total"}, - {"help", "Prompt process time"}, - {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} - }, { - {"name", "tokens_predicted_total"}, - {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) res_task->n_tokens_predicted_total} - }, { - {"name", "tokens_predicted_seconds_total"}, - {"help", "Predict process time"}, - {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} - }, { - {"name", "n_decode_total"}, - {"help", "Total number of llama_decode() calls"}, - {"value", res_task->n_decode_total} - }, { - {"name", "n_tokens_max"}, - {"help", "Largest observed n_tokens."}, - {"value", res_task->n_tokens_max} - }, { - {"name", "n_busy_slots_per_decode"}, - {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} - }}}, - {"gauge", {{ - {"name", "prompt_tokens_seconds"}, - {"help", "Average prompt throughput in tokens/s."}, - {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} - },{ - {"name", "predicted_tokens_seconds"}, - {"help", "Average generation throughput in tokens/s."}, - {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} - },{ - {"name", "requests_processing"}, - {"help", "Number of requests processing."}, - {"value", (uint64_t) res_task->n_processing_slots} - },{ - {"name", "requests_deferred"}, - {"help", "Number of requests deferred."}, - {"value", (uint64_t) res_task->n_tasks_deferred} - }}} - }; - - std::stringstream prometheus; - - for (const auto & el : all_metrics_def.items()) { - const auto & type = el.key(); - const auto & metrics_def = el.value(); - - for (const auto & metric_def : metrics_def) { - const std::string name = metric_def.at("name"); - const std::string help = metric_def.at("help"); - - auto value = json_value(metric_def, "value", 0.); - prometheus << "# HELP llamacpp:" << name << " " << help << "\n" - << "# TYPE llamacpp:" << name << " " << type << "\n" - << "llamacpp:" << name << " " << value << "\n"; - } - } - - res.set_header("Process-Start-Time-Unix", std::to_string(res_task->t_start)); - - res.set_content(prometheus.str(), "text/plain; version=0.0.4"); - res.status = 200; // HTTP OK - }; - - const auto handle_slots_save = [&ctx_server, &params](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_SAVE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - res_ok(res, result->to_json()); - }; - - const auto handle_slots_restore = [&ctx_server, &params](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; - - const auto handle_slots_erase = [&ctx_server](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_ERASE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - -
ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); +#include +#include +#include // for std::thread::hardware_concurrency - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } +#if defined(_WIN32) +#include +#endif - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; +static std::function shutdown_handler; +static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - const auto handle_slots_action = [&params, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { - if (params.slot_save_path.empty()) { - res_error(res, format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } +static inline void signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice + // this is for better developer experience, we can remove when the server is stable enough + fprintf(stderr, "Received second interrupt, terminating immediately.\n"); + exit(1); + } - std::string id_slot_str = req.path_params.at("id_slot"); - int id_slot; + shutdown_handler(signal); +} +// wrapper function that handles exceptions and logs errors +// this is to make sure handler_t never throws exceptions; instead, it returns an error response +static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { + return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { + std::string message; + error_type error; try { - id_slot = std::stoi(id_slot_str); - } catch (const std::exception &) { - res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - std::string action = req.get_param_value("action"); - - if (action == "save") { - handle_slots_save(req, res, id_slot); - } else if (action == "restore") { - handle_slots_restore(req, res, id_slot); - } else if (action == "erase") { - handle_slots_erase(req, res, id_slot); - } else { - res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); - } - }; - - const auto handle_props = [&params, &ctx_server](const httplib::Request &, httplib::Response & res) { - json default_generation_settings_for_props; - - { - slot_params params; - - params.sampling = ctx_server.params_base.sampling; - - default_generation_settings_for_props = json { - {"params", params.to_json(true)}, - {"n_ctx", ctx_server.slots[0].n_ctx}, - }; - } - - // this endpoint is publicly available, please only return what is safe to be exposed - json data = { - { "default_generation_settings", default_generation_settings_for_props }, - { "total_slots", ctx_server.params_base.n_parallel }, - { "model_alias", ctx_server.params_base.model_alias }, - { "model_path", ctx_server.params_base.model.path }, - { "modalities", json { - {"vision", ctx_server.oai_parser_opt.allow_image}, - {"audio", ctx_server.oai_parser_opt.allow_audio}, - } }, - { "endpoint_slots", params.endpoint_slots }, - { "endpoint_props", params.endpoint_props }, - { "endpoint_metrics", params.endpoint_metrics }, - { "webui", params.webui }, - { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, - { "bos_token",
common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, - { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, - { "build_info", build_info }, - }; - if (ctx_server.params_base.use_jinja) { - if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { - data["chat_template_tool_use"] = tool_use_src; - } - } - - res_ok(res, data); - }; - - const auto handle_props_change = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.endpoint_props) { - res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); - return; + return func(req); + } catch (const std::invalid_argument & e) { + // treat invalid_argument as invalid request (400) + error = ERROR_TYPE_INVALID_REQUEST; + message = e.what(); + } catch (const std::exception & e) { + // treat other exceptions as server error (500) + error = ERROR_TYPE_SERVER; + message = e.what(); + } catch (...) { + error = ERROR_TYPE_SERVER; + message = "unknown error"; } - json data = json::parse(req.body); - - // update any props here - - res_ok(res, {{ "success", true }}); - }; - - const auto handle_api_show = [&ctx_server](const httplib::Request &, httplib::Response & res) { - bool has_mtmd = ctx_server.mctx != nullptr; - json data = { - { - "template", common_chat_templates_source(ctx_server.chat_templates.get()), - }, - { - "model_info", { - { "llama.context_length", ctx_server.slots.back().n_ctx, }, - } - }, - {"modelfile", ""}, - {"parameters", ""}, - {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }}, - {"model_info", ""}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})} - }; - - res_ok(res, data); - }; - - // handle completion-like requests (completion, chat, infill) - // we can optionally provide a custom format for partial results and final results - const auto handle_completions_impl = [&ctx_server]( - server_task_type type, - json & data, - const std::vector & files, - const std::function & is_connection_closed, - httplib::Response & res, - oaicompat_type oaicompat) -> void { - GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); - - auto completion_id = gen_chatcmplid(); - // need to store the reader as a pointer, so that it won't be destroyed when the handle returns - // use shared_ptr as it's shared between the chunked_content_provider() and on_complete() - const auto rd = std::make_shared(ctx_server); - + auto res = std::make_unique(); + res->status = 500; try { - std::vector tasks; - - const auto & prompt = data.at("prompt"); - // TODO: this log can become very long, put it behind a flag or think about a more compact format - //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); - - // process prompt - std::vector inputs; - - if (oaicompat && ctx_server.mctx != nullptr) { - // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below. - inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); - } else { - // Everything else, including multimodal completions. 
- inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); - } - tasks.reserve(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) { - server_task task = server_task(type); - - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - - task.tokens = std::move(inputs[i]); - task.params = server_task::params_from_json_cmpl( - ctx_server.ctx, - ctx_server.params_base, - data); - task.id_slot = json_value(data, "id_slot", -1); - - // OAI-compat - task.params.oaicompat = oaicompat; - task.params.oaicompat_cmpl_id = completion_id; - // oaicompat_model is already populated by params_from_json_cmpl - - tasks.push_back(std::move(task)); - } - - rd->post_tasks(std::move(tasks)); + json error_data = format_error_response(message, error); + res->status = json_value(error_data, "code", 500); + res->data = safe_json_to_str({{ "error", error_data }}); + SRV_WRN("got exception: %s\n", res->data.c_str()); } catch (const std::exception & e) { - res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); - return; - } - - bool stream = json_value(data, "stream", false); - - if (!stream) { - // non-stream, wait for the results - auto all_results = rd->wait_for_all(is_connection_closed); - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - json arr = json::array(); - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - arr.push_back(res->to_json()); - } - // if single request, return single object instead of array - res_ok(res, arr.size() == 1 ? arr[0] : arr); - } - - } else { - // in streaming mode, the first error must be treated as non-stream response - // this is to match the OAI API behavior - // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309 - server_task_result_ptr first_result = rd->next(is_connection_closed); - if (first_result == nullptr) { - return; // connection is closed - } else if (first_result->is_error()) { - res_error(res, first_result->to_json()); - return; - } else { - GGML_ASSERT( - dynamic_cast(first_result.get()) != nullptr - || dynamic_cast(first_result.get()) != nullptr - ); - } - - // next responses are streamed - json first_result_json = first_result->to_json(); - const auto chunked_content_provider = [first_result_json, rd, oaicompat](size_t, httplib::DataSink & sink) mutable -> bool { - // flush the first result as it's not an error - if (!first_result_json.empty()) { - if (!server_sent_event(sink, first_result_json)) { - sink.done(); - return false; // sending failed, go to on_complete() - } - first_result_json.clear(); // mark as sent - } - - // receive subsequent results - auto result = rd->next([&sink]{ return !sink.is_writable(); }); - if (result == nullptr) { - sink.done(); - return false; // connection is closed, go to on_complete() - } - - // send the results - json res_json = result->to_json(); - bool ok = false; - if (result->is_error()) { - ok = server_sent_event(sink, json {{ "error", result->to_json() }}); - sink.done(); - return false; // go to on_complete() - } else { - GGML_ASSERT( - dynamic_cast(result.get()) != nullptr - || dynamic_cast(result.get()) != nullptr - ); - ok = server_sent_event(sink, res_json); - } - - if (!ok) { - sink.done(); - return false; // sending failed, go to on_complete() - } - - // check if there is more data - if (!rd->has_next()) { - if (oaicompat != OAICOMPAT_TYPE_NONE) { - static 
const std::string ev_done = "data: [DONE]\n\n"; - sink.write(ev_done.data(), ev_done.size()); - } - sink.done(); - return false; // no more data, go to on_complete() - } - - // has next data, continue - return true; - }; - - auto on_complete = [rd](bool) { - rd->stop(); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } - }; - - const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = json::parse(req.body); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); - }; - - const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = oaicompat_completion_params_parse(json::parse(req.body)); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_COMPLETION); - }; - - const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - // check model compatibility - std::string err; - if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "prefix token is missing. "; - } - if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "suffix token is missing. "; - } - if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "middle token is missing. "; - } - if (!err.empty()) { - res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // validate input - if (data.contains("prompt") && !data.at("prompt").is_string()) { - // prompt is optional - res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_prefix")) { - res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_suffix")) { - res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (data.contains("input_extra") && !data.at("input_extra").is_array()) { - // input_extra is optional - res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - json input_extra = json_value(data, "input_extra", json::array()); - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - if (!chunk.contains("text") || !chunk.at("text").is_string()) { - res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); - return; - } - // filename is optional - if (chunk.contains("filename") && !chunk.at("filename").is_string()) { - res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - data["input_extra"] = input_extra; // default to empty array if it does not exist - - std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); - SRV_DBG("creating infill tasks, n_prompts = %d\n", (int)
tokenized_prompts.size()); - data["prompt"] = format_infill( - ctx_server.vocab, - data.at("input_prefix"), - data.at("input_suffix"), - data.at("input_extra"), - ctx_server.params_base.n_batch, - ctx_server.params_base.n_predict, - ctx_server.slots[0].n_ctx, // TODO: there should be a better way - ctx_server.params_base.spm_infill, - tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. - ); - - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_INFILL, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); // infill is not OAI compatible - }; - - const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - LOG_DBG("request: %s\n", req.body.c_str()); - - auto body = json::parse(req.body); - std::vector files; - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_CHAT); - }; - - // same as handle_chat_completions, but without the inference part - const auto handle_apply_template = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - auto body = json::parse(req.body); - std::vector files; // dummy, unused - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); - }; - - const auto handle_models = [&params, &ctx_server, &state](const httplib::Request &, httplib::Response & res) { - server_state current_state = state.load(); - json model_meta = nullptr; - if (current_state == SERVER_STATE_READY) { - model_meta = ctx_server.model_meta(); - } - bool has_mtmd = ctx_server.mctx != nullptr; - json models = { - {"models", { - { - {"name", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"model", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"modified_at", ""}, - {"size", ""}, - {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash - {"type", "model"}, - {"description", ""}, - {"tags", {""}}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}, - {"parameters", ""}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }} - } - }}, - {"object", "list"}, - {"data", { - { - {"id", params.model_alias.empty() ?
params.model.path : params.model_alias}, - {"object", "model"}, - {"created", std::time(0)}, - {"owned_by", "llamacpp"}, - {"meta", model_meta}, - }, - }} - }; - - res_ok(res, models); - }; - - const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - json tokens_response = json::array(); - if (body.count("content") != 0) { - const bool add_special = json_value(body, "add_special", false); - const bool parse_special = json_value(body, "parse_special", true); - const bool with_pieces = json_value(body, "with_pieces", false); - - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); - - if (with_pieces) { - for (const auto& token : tokens) { - std::string piece = common_token_to_piece(ctx_server.ctx, token); - json piece_json; - - // Check if the piece is valid UTF-8 - if (is_valid_utf8(piece)) { - piece_json = piece; - } else { - // If not valid UTF-8, store as array of byte values - piece_json = json::array(); - for (unsigned char c : piece) { - piece_json.push_back(static_cast(c)); - } - } - - tokens_response.push_back({ - {"id", token}, - {"piece", piece_json} - }); - } - } else { - tokens_response = tokens; - } - } - - const json data = format_tokenizer_response(tokens_response); - res_ok(res, data); - }; - - const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - std::string content; - if (body.count("tokens") != 0) { - const llama_tokens tokens = body.at("tokens"); - content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); - } - - const json data = format_detokenized_response(content); - res_ok(res, data); - }; - - const auto handle_embeddings_impl = [&ctx_server](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) { - if (!ctx_server.params_base.embedding) { - res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - const json body = json::parse(req.body); - - // for the shape of input/content, see tokenize_input_prompts() - json prompt; - if (body.count("input") != 0) { - prompt = body.at("input"); - } else if (body.contains("content")) { - oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible - prompt = body.at("content"); - } else { - res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - bool use_base64 = false; - if (body.count("encoding_format") != 0) { - const std::string& format = body.at("encoding_format"); - if (format == "base64") { - use_base64 = true; - } else if (format != "float") { - res_error(res, format_error_response("The format to return the embeddings in. 
Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); - for (const auto & tokens : tokenized_prompts) { - // this check is necessary for models that do not add BOS token to the input - if (tokens.empty()) { - res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - int embd_normalize = 2; // default to Euclidean/L2 norm - if (body.count("embd_normalize") != 0) { - embd_normalize = body.at("embd_normalize"); - if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); - } - } - - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); - - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tokenized_prompts[i]); - - // OAI-compat - task.params.oaicompat = oaicompat; - task.params.embd_normalize = embd_normalize; - - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } - - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); - - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } + SRV_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str()); + res->data = "Internal Server Error"; } - - // write JSON response - json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? format_embeddings_response_oaicompat(body, responses, use_base64) - : json(responses); - res_ok(res, root); - }; - - const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE); - }; - - const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING); + return res; }; +} - const auto handle_rerank = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { - res_error(res, format_error_response("This server does not support reranking. 
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - const json body = json::parse(req.body); - - // if true, use TEI API format, otherwise use Jina API format - // Jina: https://jina.ai/reranker/ - // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank - bool is_tei_format = body.contains("texts"); - - json query; - if (body.count("query") == 1) { - query = body.at("query"); - if (!query.is_string()) { - res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } else { - res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - std::vector documents = json_value(body, "documents", - json_value(body, "texts", std::vector())); - if (documents.empty()) { - res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - int top_n = json_value(body, "top_n", (int)documents.size()); - - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - tasks.reserve(documents.size()); - for (size_t i = 0; i < documents.size(); i++) { - auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); - server_task task = server_task(SERVER_TASK_TYPE_RERANK); - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tmp); - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } - - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); +int main(int argc, char ** argv, char ** envp) { + // own arguments required by this example + common_params params; - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } - } + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + return 1; + } - // write JSON response - json root = format_response_rerank( - body, - responses, - is_tei_format, - documents, - top_n); + // TODO: should we have a separate n_parallel parameter for the server? 
+ // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 + // TODO: this is a common configuration that is suitable for most local use cases + // however, overriding the parameters is a bit confusing - figure out something more intuitive + if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { + LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - res_ok(res, root); - }; + params.n_parallel = 4; + params.kv_unified = true; + } - const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { - json result = json::array(); - const auto & loras = ctx_server.params_base.lora_adapters; - for (size_t i = 0; i < loras.size(); ++i) { - auto & lora = loras[i]; - json entry = { - {"id", i}, - {"path", lora.path}, - {"scale", lora.scale}, - {"task_name", lora.task_name}, - {"prompt_prefix", lora.prompt_prefix}, - }; - std::string alora_invocation_string = ""; - const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); - std::vector alora_invocation_tokens; - if (n_alora_tokens) { - const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); - for (uint64_t i = 0; i < n_alora_tokens; ++i) { - alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); - alora_invocation_tokens.push_back(alora_tokens[i]); - } - entry["alora_invocation_string"] = alora_invocation_string; - entry["alora_invocation_tokens"] = alora_invocation_tokens; - } - result.push_back(std::move(entry)); - } - res_ok(res, result); - res.status = 200; // HTTP OK - }; + // for consistency between server router mode and single-model mode, we set the same model name as alias + if (params.model_alias.empty() && !params.model.name.empty()) { + params.model_alias = params.model.name; + } - const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - if (!body.is_array()) { - res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); - return; - } + common_init(); - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SET_LORA); - task.id = task_id; - task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } + // struct that contains llama context and inference + server_context ctx_server; - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); + llama_backend_init(); + llama_numa_init(params.numa); - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } + LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; + server_http_context ctx_http; + if (!ctx_http.init(params)) { + LOG_ERR("%s: failed to initialize HTTP server\n", __func__); + return 1; + } // // Router // - if (!params.webui) { - LOG_INF("Web UI is disabled\n"); - } else { - // register static assets routes - if (!params.public_path.empty()) { - 
// Set the base directory for serving static files - bool is_found = svr->set_mount_point(params.api_prefix + "/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } - } else { - // using embedded static index.html - svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { - if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { - res.set_content("Error: gzip is not supported by this browser", "text/plain"); - } else { - res.set_header("Content-Encoding", "gzip"); - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - } - return false; - }); - } - } - // register API routes - svr->Get (params.api_prefix + "/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/metrics", handle_metrics); - svr->Get (params.api_prefix + "/props", handle_props); - svr->Post(params.api_prefix + "/props", handle_props_change); - svr->Post(params.api_prefix + "/api/show", handle_api_show); - svr->Get (params.api_prefix + "/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/api/tags", handle_models); // ollama specific endpoint. public endpoint (no API key check) - svr->Post(params.api_prefix + "/completion", handle_completions); // legacy - svr->Post(params.api_prefix + "/completions", handle_completions); - svr->Post(params.api_prefix + "/v1/completions", handle_completions_oai); - svr->Post(params.api_prefix + "/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/v1/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/api/chat", handle_chat_completions); // ollama specific endpoint - svr->Post(params.api_prefix + "/infill", handle_infill); - svr->Post(params.api_prefix + "/embedding", handle_embeddings); // legacy - svr->Post(params.api_prefix + "/embeddings", handle_embeddings); - svr->Post(params.api_prefix + "/v1/embeddings", handle_embeddings_oai); - svr->Post(params.api_prefix + "/rerank", handle_rerank); - svr->Post(params.api_prefix + "/reranking", handle_rerank); - svr->Post(params.api_prefix + "/v1/rerank", handle_rerank); - svr->Post(params.api_prefix + "/v1/reranking", handle_rerank); - svr->Post(params.api_prefix + "/tokenize", handle_tokenize); - svr->Post(params.api_prefix + "/detokenize", handle_detokenize); - svr->Post(params.api_prefix + "/apply-template", handle_apply_template); + server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); }); + + bool is_router_server = params.model.path.empty(); + std::optional models_routes{}; + if (is_router_server) { + // setup server instances manager + models_routes.emplace(params, argc, argv, envp); + + // proxy handlers + // note: routes.get_health stays the same + routes.get_metrics = models_routes->proxy_get; + routes.post_props = models_routes->proxy_post; + routes.get_api_show = models_routes->proxy_get; + routes.post_completions = models_routes->proxy_post; + routes.post_completions_oai = models_routes->proxy_post; + routes.post_chat_completions = models_routes->proxy_post; + 
+    bool is_router_server = params.model.path.empty();
+    std::optional<server_models_routes> models_routes{};
+    if (is_router_server) {
+        // setup server instances manager
+        models_routes.emplace(params, argc, argv, envp);
+
+        // proxy handlers
+        // note: routes.get_health stays the same
+        routes.get_metrics = models_routes->proxy_get;
+        routes.post_props = models_routes->proxy_post;
+        routes.get_api_show = models_routes->proxy_get;
+        routes.post_completions = models_routes->proxy_post;
+        routes.post_completions_oai = models_routes->proxy_post;
+        routes.post_chat_completions = models_routes->proxy_post;
+        routes.post_anthropic_messages = models_routes->proxy_post;
+        routes.post_anthropic_count_tokens = models_routes->proxy_post;
+        routes.post_infill = models_routes->proxy_post;
+        routes.post_embeddings = models_routes->proxy_post;
+        routes.post_embeddings_oai = models_routes->proxy_post;
+        routes.post_rerank = models_routes->proxy_post;
+        routes.post_tokenize = models_routes->proxy_post;
+        routes.post_detokenize = models_routes->proxy_post;
+        routes.post_apply_template = models_routes->proxy_post;
+        routes.get_lora_adapters = models_routes->proxy_get;
+        routes.post_lora_adapters = models_routes->proxy_post;
+        routes.get_slots = models_routes->proxy_get;
+        routes.post_slots = models_routes->proxy_post;
+
+        // custom routes for router
+        routes.get_props = models_routes->get_router_props;
+        routes.get_models = models_routes->get_router_models;
+        ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
+        ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
+        ctx_http.post("/models/status", ex_wrapper(models_routes->post_router_models_status));
+    }
+
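+    // route registration is shared between router mode and single-model mode;
+    // in router mode most handlers have been swapped for proxies above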
+    ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+    ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+    ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
+    ctx_http.get ("/props", ex_wrapper(routes.get_props));
+    ctx_http.post("/props", ex_wrapper(routes.post_props));
+    ctx_http.post("/api/show", ex_wrapper(routes.get_api_show));
+    ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
+    ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
+    ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check)
+    ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy
+    ctx_http.post("/completions", ex_wrapper(routes.post_completions));
+    ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai));
+    ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
+    ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
+    ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
+    ctx_http.post("/infill", ex_wrapper(routes.post_infill));
+    ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy
+    ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings));
+    ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai));
+    ctx_http.post("/rerank", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/reranking", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize));
+    ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize));
+    ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template));

     // LoRA adapters hotswap
-    svr->Get (params.api_prefix + "/lora-adapters", handle_lora_adapters_list);
-    svr->Post(params.api_prefix + "/lora-adapters", handle_lora_adapters_apply);
+    ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters));
+    ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters));

     // Save & load slots
-    svr->Get (params.api_prefix + "/slots", handle_slots);
-    svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action);
+    ctx_http.get ("/slots", ex_wrapper(routes.get_slots));
+    ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots));

     //
     // Start the server
     //

-    if (params.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] = std::to_string(params.n_threads_http);
-    svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
-
-    // clean up function, to be called before exit
-    auto clean_up = [&svr, &ctx_server]() {
-        SRV_INF("%s: cleaning up before exit...\n", __func__);
-        svr->stop();
-        ctx_server.queue_results.terminate();
-        llama_backend_free();
-    };
-    bool was_bound = false;
-    bool is_sock = false;
-    if (string_ends_with(std::string(params.hostname), ".sock")) {
-        is_sock = true;
-        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
-        svr->set_address_family(AF_UNIX);
-        // bind_to_port requires a second arg, any value other than 0 should
-        // simply get ignored
-        was_bound = svr->bind_to_port(params.hostname, 8080);
-    } else {
-        LOG_INF("%s: binding port with default address family\n", __func__);
-        // bind HTTP listen port
-        if (params.port == 0) {
-            int bound_port = svr->bind_to_any_port(params.hostname);
-            if ((was_bound = (bound_port >= 0))) {
-                params.port = bound_port;
-            }
-        } else {
-            was_bound = svr->bind_to_port(params.hostname, params.port);
-        }
-    }

+    std::function<void()> clean_up;
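+    // note: clean_up is assigned per mode below, since teardown differs:
+    // the router unloads its child instances, while single-model mode stops
+    // the HTTP server and terminates the server context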
%d\n", __func__, params.hostname.c_str(), params.port); - clean_up(); - return 1; - } + if (is_router_server) { + LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__); - // run the HTTP server in a thread - std::thread t([&]() { svr->listen_after_bind(); }); - svr->wait_until_ready(); + clean_up = [&models_routes]() { + SRV_INF("%s: cleaning up before exit...\n", __func__); + if (models_routes.has_value()) { + models_routes->models.unload_all(); + } + llama_backend_free(); + }; - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } + ctx_http.is_ready.store(true); - // load the model - LOG_INF("%s: loading model\n", __func__); + shutdown_handler = [&](int) { + ctx_http.stop(); + }; - if (!ctx_server.load_model(params)) { - clean_up(); - t.join(); - LOG_ERR("%s: exiting due to model loading error\n", __func__); - return 1; - } + } else { + // setup clean up function, to be called before exit + clean_up = [&ctx_http, &ctx_server]() { + SRV_INF("%s: cleaning up before exit...\n", __func__); + ctx_http.stop(); + ctx_server.terminate(); + llama_backend_free(); + }; - ctx_server.init(); - state.store(SERVER_STATE_READY); + // start the HTTP server before loading the model to be able to serve /health requests + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } - LOG_INF("%s: model loaded\n", __func__); + // load the model + LOG_INF("%s: loading model\n", __func__); - // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, - common_chat_templates_source(ctx_server.chat_templates.get()), - common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja, ctx_server.params_base.default_template_kwargs).c_str()); + if (!ctx_server.load_model(params)) { + clean_up(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } + LOG_ERR("%s: exiting due to model loading error\n", __func__); + return 1; + } - ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) { - ctx_server.process_single_task(std::move(task)); - }); + ctx_server.init(); + ctx_http.is_ready.store(true); - ctx_server.queue_tasks.on_update_slots([&ctx_server]() { - ctx_server.update_slots(); - }); + LOG_INF("%s: model loaded\n", __func__); - shutdown_handler = [&](int) { - // this will unblock start_loop() - ctx_server.queue_tasks.terminate(); - }; + shutdown_handler = [&](int) { + // this will unblock start_loop() + ctx_server.terminate(); + }; + } + // TODO: refactor in common/console #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; @@ -5771,16 +268,39 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__, - is_sock ? 
string_format("unix://%s", params.hostname.c_str()).c_str() : - string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str()); + if (is_router_server) { + LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); + LOG_INF("%s: NOTE: router mode is experimental\n", __func__); + LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); // keep the main thread alive + } + + // when the HTTP server stops, clean up and exit + clean_up(); + } else { + LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); + LOG_INF("%s: starting the main loop...\n", __func__); + + // optionally, notify router server that this instance is ready + const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); + std::thread monitor_thread; + if (router_port != nullptr) { + monitor_thread = server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler); + } - // this call blocks the main thread until queue_tasks.terminate() is called - ctx_server.queue_tasks.start_loop(); + // this call blocks the main thread until queue_tasks.terminate() is called + ctx_server.start_loop(); - clean_up(); - t.join(); - llama_memory_breakdown_print(ctx_server.ctx); + clean_up(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } + if (monitor_thread.joinable()) { + monitor_thread.join(); + } + llama_memory_breakdown_print(ctx_server.get_llama_context()); + } return 0; } diff --git a/llamacpp/native/src/server/server.patch b/llamacpp/native/src/server/server.patch deleted file mode 100644 index 1988b6b5c..000000000 --- a/llamacpp/native/src/server/server.patch +++ /dev/null @@ -1,20 +0,0 @@ -16,19d15 -< -< // auto generated files (see README.md for details) -< #include "index.html.gz.hpp" -< #include "loading.html.hpp" -4224,4233c4220 -< auto tmp = string_split(req.path, '.'); -< if (req.path == "/" || tmp.back() == "html") { -< res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); -< res.status = 503; -< } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { -< // allow the models endpoint to be accessed during loading -< return true; -< } else { -< res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); -< } ---- -> res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); -5226d5212 -< res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); diff --git a/llamacpp/native/vendor/llama.cpp b/llamacpp/native/vendor/llama.cpp index 97d511721..37adc9c6b 160000 --- a/llamacpp/native/vendor/llama.cpp +++ b/llamacpp/native/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 97d5117217e4ad904493345e2f71dfe441a08e25 +Subproject commit 37adc9c6ba6057bfe7c036c201abe85471d854a1