From d8be8af24a514227ea9f116c98ec2e47cd2299d3 Mon Sep 17 00:00:00 2001
From: Dorin Geman
Date: Wed, 3 Dec 2025 12:33:38 +0200
Subject: [PATCH] llamacpp: bump llama.cpp (b7245)

See https://github.com/ggml-org/llama.cpp/tree/b7245.

The vendored httplib.h is dropped in favor of linking the cpp-httplib
package, and the copied server sources are split into separate modules
(server-common, server-context, server-http, server-models, server-queue,
server-task); server.patch is replaced by server-http.patch.

Signed-off-by: Dorin Geman
---
 llamacpp/native/src/server/CMakeLists.txt     |     8 +-
 llamacpp/native/src/server/Makefile           |    18 +-
 llamacpp/native/src/server/httplib.h          | 10506 ----------------
 .../server/{utils.hpp => server-common.cpp}   |  2043 +--
 llamacpp/native/src/server/server-common.h    |   359 +
 llamacpp/native/src/server/server-context.cpp |  3637 ++++++
 llamacpp/native/src/server/server-context.h   |    83 +
 llamacpp/native/src/server/server-http.cpp    |   380 +
 llamacpp/native/src/server/server-http.h      |    78 +
 llamacpp/native/src/server/server-http.patch  |    61 +
 llamacpp/native/src/server/server-models.cpp  |   975 ++
 llamacpp/native/src/server/server-models.h    |   174 +
 llamacpp/native/src/server/server-queue.cpp   |   351 +
 llamacpp/native/src/server/server-queue.h     |   146 +
 llamacpp/native/src/server/server-task.cpp    |  1471 +++
 llamacpp/native/src/server/server-task.h      |   460 +
 llamacpp/native/src/server/server.cpp         |  5948 +--------
 llamacpp/native/src/server/server.patch       |    20 -
 llamacpp/native/vendor/llama.cpp              |     2 +-
 19 files changed, 9503 insertions(+), 17217 deletions(-)
 delete mode 100644 llamacpp/native/src/server/httplib.h
 rename llamacpp/native/src/server/{utils.hpp => server-common.cpp} (59%)
 create mode 100644 llamacpp/native/src/server/server-common.h
 create mode 100644 llamacpp/native/src/server/server-context.cpp
 create mode 100644 llamacpp/native/src/server/server-context.h
 create mode 100644 llamacpp/native/src/server/server-http.cpp
 create mode 100644 llamacpp/native/src/server/server-http.h
 create mode 100644 llamacpp/native/src/server/server-http.patch
 create mode 100644 llamacpp/native/src/server/server-models.cpp
 create mode 100644 llamacpp/native/src/server/server-models.h
 create mode 100644 llamacpp/native/src/server/server-queue.cpp
 create mode 100644 llamacpp/native/src/server/server-queue.h
 create mode 100644 llamacpp/native/src/server/server-task.cpp
 create mode 100644 llamacpp/native/src/server/server-task.h
 delete mode 100644 llamacpp/native/src/server/server.patch

diff --git a/llamacpp/native/src/server/CMakeLists.txt b/llamacpp/native/src/server/CMakeLists.txt
index 8d995f069..95d89b5cb 100644
--- a/llamacpp/native/src/server/CMakeLists.txt
+++ b/llamacpp/native/src/server/CMakeLists.txt
@@ -15,18 +15,14 @@ if (MINGW)
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 
-set(TARGET_SRCS
-    server.cpp
-    utils.hpp
-    httplib.h
-)
+file(GLOB TARGET_SRCS "*.cpp")
 
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 
 target_include_directories(${TARGET} PRIVATE ../../vendor/llama.cpp/tools/mtmd)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT} cpp-httplib)
 
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
diff --git a/llamacpp/native/src/server/Makefile b/llamacpp/native/src/server/Makefile
index cd1b03ef3..aad5b71b0 100644
--- a/llamacpp/native/src/server/Makefile
+++ b/llamacpp/native/src/server/Makefile
@@ -1,16 +1,18 @@
 LLAMA_SERVER_DIR = ../../vendor/llama.cpp/tools/server/
+SERVER_FILES = server-common server-context server-http server-models server-queue server-task server
+HEADERS = $(addsuffix .h, $(filter-out server, $(SERVER_FILES)))
+SOURCES = $(addsuffix .cpp, $(SERVER_FILES))
$(addsuffix .cpp, $(SERVER_FILES)) .PHONY: clean all -all: utils.hpp server.cpp +all: $(HEADERS) $(SOURCES) -utils.hpp: $(LLAMA_SERVER_DIR)/utils.hpp - cp $(LLAMA_SERVER_DIR)/utils.hpp . +%.h: $(LLAMA_SERVER_DIR)/%.h + cp $< $@ -server.cpp: $(LLAMA_SERVER_DIR)/server.cpp - cp $(LLAMA_SERVER_DIR)/server.cpp . - patch server.cpp < server.patch +%.cpp: $(LLAMA_SERVER_DIR)/%.cpp + cp $< $@ + @if [ "$@" = "server-http.cpp" ]; then patch $@ < server-http.patch; fi clean: - rm *.cpp - rm *.hpp + rm -f $(HEADERS) $(SOURCES) diff --git a/llamacpp/native/src/server/httplib.h b/llamacpp/native/src/server/httplib.h deleted file mode 100644 index 0f981dc89..000000000 --- a/llamacpp/native/src/server/httplib.h +++ /dev/null @@ -1,10506 +0,0 @@ -// -// httplib.h -// -// Copyright (c) 2025 Yuji Hirose. All rights reserved. -// MIT License -// - -#ifndef CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_HTTPLIB_H - -#define CPPHTTPLIB_VERSION "0.20.0" - -/* - * Configuration - */ - -#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND -#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND -#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND 10000 -#endif - -#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT -#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 100 -#endif - -#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND -#define CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND 300 -#endif - -#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND -#define CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND -#define CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND -#define CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND -#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND -#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND -#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND 300 -#endif - -#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND -#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND -#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND 5 -#endif - -#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND -#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND 0 -#endif - -#ifndef CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND -#define CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND 0 -#endif - -#ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND -#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0 -#endif - -#ifndef CPPHTTPLIB_IDLE_INTERVAL_USECOND -#ifdef _WIN32 -#define CPPHTTPLIB_IDLE_INTERVAL_USECOND 10000 -#else -#define CPPHTTPLIB_IDLE_INTERVAL_USECOND 0 -#endif -#endif - -#ifndef CPPHTTPLIB_REQUEST_URI_MAX_LENGTH -#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 8192 -#endif - -#ifndef CPPHTTPLIB_HEADER_MAX_LENGTH -#define CPPHTTPLIB_HEADER_MAX_LENGTH 8192 -#endif - -#ifndef CPPHTTPLIB_REDIRECT_MAX_COUNT -#define CPPHTTPLIB_REDIRECT_MAX_COUNT 20 -#endif - -#ifndef CPPHTTPLIB_MULTIPART_FORM_DATA_FILE_MAX_COUNT -#define CPPHTTPLIB_MULTIPART_FORM_DATA_FILE_MAX_COUNT 1024 -#endif - -#ifndef CPPHTTPLIB_PAYLOAD_MAX_LENGTH -#define CPPHTTPLIB_PAYLOAD_MAX_LENGTH ((std::numeric_limits::max)()) -#endif - -#ifndef CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH -#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192 -#endif - -#ifndef CPPHTTPLIB_RANGE_MAX_COUNT -#define CPPHTTPLIB_RANGE_MAX_COUNT 1024 -#endif - -#ifndef CPPHTTPLIB_TCP_NODELAY -#define 
CPPHTTPLIB_TCP_NODELAY false -#endif - -#ifndef CPPHTTPLIB_IPV6_V6ONLY -#define CPPHTTPLIB_IPV6_V6ONLY false -#endif - -#ifndef CPPHTTPLIB_RECV_BUFSIZ -#define CPPHTTPLIB_RECV_BUFSIZ size_t(16384u) -#endif - -#ifndef CPPHTTPLIB_COMPRESSION_BUFSIZ -#define CPPHTTPLIB_COMPRESSION_BUFSIZ size_t(16384u) -#endif - -#ifndef CPPHTTPLIB_THREAD_POOL_COUNT -#define CPPHTTPLIB_THREAD_POOL_COUNT \ - ((std::max)(8u, std::thread::hardware_concurrency() > 0 \ - ? std::thread::hardware_concurrency() - 1 \ - : 0)) -#endif - -#ifndef CPPHTTPLIB_RECV_FLAGS -#define CPPHTTPLIB_RECV_FLAGS 0 -#endif - -#ifndef CPPHTTPLIB_SEND_FLAGS -#define CPPHTTPLIB_SEND_FLAGS 0 -#endif - -#ifndef CPPHTTPLIB_LISTEN_BACKLOG -#define CPPHTTPLIB_LISTEN_BACKLOG 5 -#endif - -/* - * Headers - */ - -#ifdef _WIN32 -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif //_CRT_SECURE_NO_WARNINGS - -#ifndef _CRT_NONSTDC_NO_DEPRECATE -#define _CRT_NONSTDC_NO_DEPRECATE -#endif //_CRT_NONSTDC_NO_DEPRECATE - -#if defined(_MSC_VER) -#if _MSC_VER < 1900 -#error Sorry, Visual Studio versions prior to 2015 are not supported -#endif - -#pragma comment(lib, "ws2_32.lib") - -#ifdef _WIN64 -using ssize_t = __int64; -#else -using ssize_t = long; -#endif -#endif // _MSC_VER - -#ifndef S_ISREG -#define S_ISREG(m) (((m) & S_IFREG) == S_IFREG) -#endif // S_ISREG - -#ifndef S_ISDIR -#define S_ISDIR(m) (((m) & S_IFDIR) == S_IFDIR) -#endif // S_ISDIR - -#ifndef NOMINMAX -#define NOMINMAX -#endif // NOMINMAX - -#include -#include -#include - -// afunix.h uses types declared in winsock2.h, so has to be included after it. -#include - -#ifndef WSA_FLAG_NO_HANDLE_INHERIT -#define WSA_FLAG_NO_HANDLE_INHERIT 0x80 -#endif - -using nfds_t = unsigned long; -using socket_t = SOCKET; -using socklen_t = int; - -#else // not _WIN32 - -#include -#if !defined(_AIX) && !defined(__MVS__) -#include -#endif -#ifdef __MVS__ -#include -#ifndef NI_MAXHOST -#define NI_MAXHOST 1025 -#endif -#endif -#include -#include -#include -#ifdef __linux__ -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -using socket_t = int; -#ifndef INVALID_SOCKET -#define INVALID_SOCKET (-1) -#endif -#endif //_WIN32 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -#ifdef _WIN32 -#include - -// these are defined in wincrypt.h and it breaks compilation if BoringSSL is -// used -#undef X509_NAME -#undef X509_CERT_PAIR -#undef X509_EXTENSIONS -#undef PKCS7_SIGNER_INFO - -#ifdef _MSC_VER -#pragma comment(lib, "crypt32.lib") -#endif -#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__) -#include -#if TARGET_OS_OSX -#include -#include -#endif // TARGET_OS_OSX -#endif // _WIN32 - -#include -#include -#include -#include - -#if defined(_WIN32) && defined(OPENSSL_USE_APPLINK) -#include -#endif - -#include -#include - -#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER) -#if OPENSSL_VERSION_NUMBER < 0x1010107f -#error Please use OpenSSL or a current version of BoringSSL -#endif -#define SSL_get1_peer_certificate SSL_get_peer_certificate -#elif OPENSSL_VERSION_NUMBER < 0x30000000L -#error Sorry, OpenSSL versions prior to 3.0.0 are not supported -#endif - -#endif - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT -#include -#endif - -#ifdef 
CPPHTTPLIB_BROTLI_SUPPORT -#include -#include -#endif - -#ifdef CPPHTTPLIB_ZSTD_SUPPORT -#include -#endif - -/* - * Declaration - */ -namespace httplib { - -namespace detail { - -/* - * Backport std::make_unique from C++14. - * - * NOTE: This code came up with the following stackoverflow post: - * https://stackoverflow.com/questions/10149840/c-arrays-and-make-unique - * - */ - -template -typename std::enable_if::value, std::unique_ptr>::type -make_unique(Args &&...args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - -template -typename std::enable_if::value, std::unique_ptr>::type -make_unique(std::size_t n) { - typedef typename std::remove_extent::type RT; - return std::unique_ptr(new RT[n]); -} - -namespace case_ignore { - -inline unsigned char to_lower(int c) { - const static unsigned char table[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, - 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, - 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, - 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, - 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 224, 225, 226, - 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, - 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, 224, - 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, - 255, - }; - return table[(unsigned char)(char)c]; -} - -inline bool equal(const std::string &a, const std::string &b) { - return a.size() == b.size() && - std::equal(a.begin(), a.end(), b.begin(), [](char ca, char cb) { - return to_lower(ca) == to_lower(cb); - }); -} - -struct equal_to { - bool operator()(const std::string &a, const std::string &b) const { - return equal(a, b); - } -}; - -struct hash { - size_t operator()(const std::string &key) const { - return hash_core(key.data(), key.size(), 0); - } - - size_t hash_core(const char *s, size_t l, size_t h) const { - return (l == 0) ? h - : hash_core(s + 1, l - 1, - // Unsets the 6 high bits of h, therefore no - // overflow happens - (((std::numeric_limits::max)() >> 6) & - h * 33) ^ - static_cast(to_lower(*s))); - } -}; - -} // namespace case_ignore - -// This is based on -// "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189". 
- -struct scope_exit { - explicit scope_exit(std::function &&f) - : exit_function(std::move(f)), execute_on_destruction{true} {} - - scope_exit(scope_exit &&rhs) noexcept - : exit_function(std::move(rhs.exit_function)), - execute_on_destruction{rhs.execute_on_destruction} { - rhs.release(); - } - - ~scope_exit() { - if (execute_on_destruction) { this->exit_function(); } - } - - void release() { this->execute_on_destruction = false; } - -private: - scope_exit(const scope_exit &) = delete; - void operator=(const scope_exit &) = delete; - scope_exit &operator=(scope_exit &&) = delete; - - std::function exit_function; - bool execute_on_destruction; -}; - -} // namespace detail - -enum SSLVerifierResponse { - // no decision has been made, use the built-in certificate verifier - NoDecisionMade, - // connection certificate is verified and accepted - CertificateAccepted, - // connection certificate was processed but is rejected - CertificateRejected -}; - -enum StatusCode { - // Information responses - Continue_100 = 100, - SwitchingProtocol_101 = 101, - Processing_102 = 102, - EarlyHints_103 = 103, - - // Successful responses - OK_200 = 200, - Created_201 = 201, - Accepted_202 = 202, - NonAuthoritativeInformation_203 = 203, - NoContent_204 = 204, - ResetContent_205 = 205, - PartialContent_206 = 206, - MultiStatus_207 = 207, - AlreadyReported_208 = 208, - IMUsed_226 = 226, - - // Redirection messages - MultipleChoices_300 = 300, - MovedPermanently_301 = 301, - Found_302 = 302, - SeeOther_303 = 303, - NotModified_304 = 304, - UseProxy_305 = 305, - unused_306 = 306, - TemporaryRedirect_307 = 307, - PermanentRedirect_308 = 308, - - // Client error responses - BadRequest_400 = 400, - Unauthorized_401 = 401, - PaymentRequired_402 = 402, - Forbidden_403 = 403, - NotFound_404 = 404, - MethodNotAllowed_405 = 405, - NotAcceptable_406 = 406, - ProxyAuthenticationRequired_407 = 407, - RequestTimeout_408 = 408, - Conflict_409 = 409, - Gone_410 = 410, - LengthRequired_411 = 411, - PreconditionFailed_412 = 412, - PayloadTooLarge_413 = 413, - UriTooLong_414 = 414, - UnsupportedMediaType_415 = 415, - RangeNotSatisfiable_416 = 416, - ExpectationFailed_417 = 417, - ImATeapot_418 = 418, - MisdirectedRequest_421 = 421, - UnprocessableContent_422 = 422, - Locked_423 = 423, - FailedDependency_424 = 424, - TooEarly_425 = 425, - UpgradeRequired_426 = 426, - PreconditionRequired_428 = 428, - TooManyRequests_429 = 429, - RequestHeaderFieldsTooLarge_431 = 431, - UnavailableForLegalReasons_451 = 451, - - // Server error responses - InternalServerError_500 = 500, - NotImplemented_501 = 501, - BadGateway_502 = 502, - ServiceUnavailable_503 = 503, - GatewayTimeout_504 = 504, - HttpVersionNotSupported_505 = 505, - VariantAlsoNegotiates_506 = 506, - InsufficientStorage_507 = 507, - LoopDetected_508 = 508, - NotExtended_510 = 510, - NetworkAuthenticationRequired_511 = 511, -}; - -using Headers = - std::unordered_multimap; - -using Params = std::multimap; -using Match = std::smatch; - -using Progress = std::function; - -struct Response; -using ResponseHandler = std::function; - -struct MultipartFormData { - std::string name; - std::string content; - std::string filename; - std::string content_type; -}; -using MultipartFormDataItems = std::vector; -using MultipartFormDataMap = std::multimap; - -class DataSink { -public: - DataSink() : os(&sb_), sb_(*this) {} - - DataSink(const DataSink &) = delete; - DataSink &operator=(const DataSink &) = delete; - DataSink(DataSink &&) = delete; - DataSink &operator=(DataSink &&) = delete; - - 
std::function write; - std::function is_writable; - std::function done; - std::function done_with_trailer; - std::ostream os; - -private: - class data_sink_streambuf final : public std::streambuf { - public: - explicit data_sink_streambuf(DataSink &sink) : sink_(sink) {} - - protected: - std::streamsize xsputn(const char *s, std::streamsize n) override { - sink_.write(s, static_cast(n)); - return n; - } - - private: - DataSink &sink_; - }; - - data_sink_streambuf sb_; -}; - -using ContentProvider = - std::function; - -using ContentProviderWithoutLength = - std::function; - -using ContentProviderResourceReleaser = std::function; - -struct MultipartFormDataProvider { - std::string name; - ContentProviderWithoutLength provider; - std::string filename; - std::string content_type; -}; -using MultipartFormDataProviderItems = std::vector; - -using ContentReceiverWithProgress = - std::function; - -using ContentReceiver = - std::function; - -using MultipartContentHeader = - std::function; - -class ContentReader { -public: - using Reader = std::function; - using MultipartReader = std::function; - - ContentReader(Reader reader, MultipartReader multipart_reader) - : reader_(std::move(reader)), - multipart_reader_(std::move(multipart_reader)) {} - - bool operator()(MultipartContentHeader header, - ContentReceiver receiver) const { - return multipart_reader_(std::move(header), std::move(receiver)); - } - - bool operator()(ContentReceiver receiver) const { - return reader_(std::move(receiver)); - } - - Reader reader_; - MultipartReader multipart_reader_; -}; - -using Range = std::pair; -using Ranges = std::vector; - -struct Request { - std::string method; - std::string path; - Params params; - Headers headers; - std::string body; - - std::string remote_addr; - int remote_port = -1; - std::string local_addr; - int local_port = -1; - - // for server - std::string version; - std::string target; - MultipartFormDataMap files; - Ranges ranges; - Match matches; - std::unordered_map path_params; - std::function is_connection_closed = []() { return true; }; - - // for client - ResponseHandler response_handler; - ContentReceiverWithProgress content_receiver; - Progress progress; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - const SSL *ssl = nullptr; -#endif - - bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, const char *def = "", - size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, - size_t id = 0) const; - size_t get_header_value_count(const std::string &key) const; - void set_header(const std::string &key, const std::string &val); - - bool has_param(const std::string &key) const; - std::string get_param_value(const std::string &key, size_t id = 0) const; - size_t get_param_value_count(const std::string &key) const; - - bool is_multipart_form_data() const; - - bool has_file(const std::string &key) const; - MultipartFormData get_file_value(const std::string &key) const; - std::vector get_file_values(const std::string &key) const; - - // private members... 
- size_t redirect_count_ = CPPHTTPLIB_REDIRECT_MAX_COUNT; - size_t content_length_ = 0; - ContentProvider content_provider_; - bool is_chunked_content_provider_ = false; - size_t authorization_count_ = 0; - std::chrono::time_point start_time_ = - (std::chrono::steady_clock::time_point::min)(); -}; - -struct Response { - std::string version; - int status = -1; - std::string reason; - Headers headers; - std::string body; - std::string location; // Redirect location - - bool has_header(const std::string &key) const; - std::string get_header_value(const std::string &key, const char *def = "", - size_t id = 0) const; - uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0, - size_t id = 0) const; - size_t get_header_value_count(const std::string &key) const; - void set_header(const std::string &key, const std::string &val); - - void set_redirect(const std::string &url, int status = StatusCode::Found_302); - void set_content(const char *s, size_t n, const std::string &content_type); - void set_content(const std::string &s, const std::string &content_type); - void set_content(std::string &&s, const std::string &content_type); - - void set_content_provider( - size_t length, const std::string &content_type, ContentProvider provider, - ContentProviderResourceReleaser resource_releaser = nullptr); - - void set_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser = nullptr); - - void set_chunked_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser = nullptr); - - void set_file_content(const std::string &path, - const std::string &content_type); - void set_file_content(const std::string &path); - - Response() = default; - Response(const Response &) = default; - Response &operator=(const Response &) = default; - Response(Response &&) = default; - Response &operator=(Response &&) = default; - ~Response() { - if (content_provider_resource_releaser_) { - content_provider_resource_releaser_(content_provider_success_); - } - } - - // private members... 
- size_t content_length_ = 0; - ContentProvider content_provider_; - ContentProviderResourceReleaser content_provider_resource_releaser_; - bool is_chunked_content_provider_ = false; - bool content_provider_success_ = false; - std::string file_content_path_; - std::string file_content_content_type_; -}; - -class Stream { -public: - virtual ~Stream() = default; - - virtual bool is_readable() const = 0; - virtual bool wait_readable() const = 0; - virtual bool wait_writable() const = 0; - - virtual ssize_t read(char *ptr, size_t size) = 0; - virtual ssize_t write(const char *ptr, size_t size) = 0; - virtual void get_remote_ip_and_port(std::string &ip, int &port) const = 0; - virtual void get_local_ip_and_port(std::string &ip, int &port) const = 0; - virtual socket_t socket() const = 0; - - virtual time_t duration() const = 0; - - ssize_t write(const char *ptr); - ssize_t write(const std::string &s); -}; - -class TaskQueue { -public: - TaskQueue() = default; - virtual ~TaskQueue() = default; - - virtual bool enqueue(std::function fn) = 0; - virtual void shutdown() = 0; - - virtual void on_idle() {} -}; - -class ThreadPool final : public TaskQueue { -public: - explicit ThreadPool(size_t n, size_t mqr = 0) - : shutdown_(false), max_queued_requests_(mqr) { - while (n) { - threads_.emplace_back(worker(*this)); - n--; - } - } - - ThreadPool(const ThreadPool &) = delete; - ~ThreadPool() override = default; - - bool enqueue(std::function fn) override { - { - std::unique_lock lock(mutex_); - if (max_queued_requests_ > 0 && jobs_.size() >= max_queued_requests_) { - return false; - } - jobs_.push_back(std::move(fn)); - } - - cond_.notify_one(); - return true; - } - - void shutdown() override { - // Stop all worker threads... - { - std::unique_lock lock(mutex_); - shutdown_ = true; - } - - cond_.notify_all(); - - // Join... 
- for (auto &t : threads_) { - t.join(); - } - } - -private: - struct worker { - explicit worker(ThreadPool &pool) : pool_(pool) {} - - void operator()() { - for (;;) { - std::function fn; - { - std::unique_lock lock(pool_.mutex_); - - pool_.cond_.wait( - lock, [&] { return !pool_.jobs_.empty() || pool_.shutdown_; }); - - if (pool_.shutdown_ && pool_.jobs_.empty()) { break; } - - fn = pool_.jobs_.front(); - pool_.jobs_.pop_front(); - } - - assert(true == static_cast(fn)); - fn(); - } - -#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) && \ - !defined(LIBRESSL_VERSION_NUMBER) - OPENSSL_thread_stop(); -#endif - } - - ThreadPool &pool_; - }; - friend struct worker; - - std::vector threads_; - std::list> jobs_; - - bool shutdown_; - size_t max_queued_requests_ = 0; - - std::condition_variable cond_; - std::mutex mutex_; -}; - -using Logger = std::function; - -using SocketOptions = std::function; - -namespace detail { - -bool set_socket_opt_impl(socket_t sock, int level, int optname, - const void *optval, socklen_t optlen); -bool set_socket_opt(socket_t sock, int level, int optname, int opt); -bool set_socket_opt_time(socket_t sock, int level, int optname, time_t sec, - time_t usec); - -} // namespace detail - -void default_socket_options(socket_t sock); - -const char *status_message(int status); - -std::string get_bearer_token_auth(const Request &req); - -namespace detail { - -class MatcherBase { -public: - virtual ~MatcherBase() = default; - - // Match request path and populate its matches and - virtual bool match(Request &request) const = 0; -}; - -/** - * Captures parameters in request path and stores them in Request::path_params - * - * Capture name is a substring of a pattern from : to /. - * The rest of the pattern is matched against the request path directly - * Parameters are captured starting from the next character after - * the end of the last matched static pattern fragment until the next /. - * - * Example pattern: - * "/path/fragments/:capture/more/fragments/:second_capture" - * Static fragments: - * "/path/fragments/", "more/fragments/" - * - * Given the following request path: - * "/path/fragments/:1/more/fragments/:2" - * the resulting capture will be - * {{"capture", "1"}, {"second_capture", "2"}} - */ -class PathParamsMatcher final : public MatcherBase { -public: - PathParamsMatcher(const std::string &pattern); - - bool match(Request &request) const override; - -private: - // Treat segment separators as the end of path parameter capture - // Does not need to handle query parameters as they are parsed before path - // matching - static constexpr char separator = '/'; - - // Contains static path fragments to match against, excluding the '/' after - // path params - // Fragments are separated by path params - std::vector static_fragments_; - // Stores the names of the path parameters to be used as keys in the - // Request::path_params map - std::vector param_names_; -}; - -/** - * Performs std::regex_match on request path - * and stores the result in Request::matches - * - * Note that regex match is performed directly on the whole request. - * This means that wildcard patterns may match multiple path segments with /: - * "/begin/(.*)/end" will match both "/begin/middle/end" and "/begin/1/2/end". 
- */ -class RegexMatcher final : public MatcherBase { -public: - RegexMatcher(const std::string &pattern) : regex_(pattern) {} - - bool match(Request &request) const override; - -private: - std::regex regex_; -}; - -ssize_t write_headers(Stream &strm, const Headers &headers); - -} // namespace detail - -class Server { -public: - using Handler = std::function; - - using ExceptionHandler = - std::function; - - enum class HandlerResponse { - Handled, - Unhandled, - }; - using HandlerWithResponse = - std::function; - - using HandlerWithContentReader = std::function; - - using Expect100ContinueHandler = - std::function; - - Server(); - - virtual ~Server(); - - virtual bool is_valid() const; - - Server &Get(const std::string &pattern, Handler handler); - Server &Post(const std::string &pattern, Handler handler); - Server &Post(const std::string &pattern, HandlerWithContentReader handler); - Server &Put(const std::string &pattern, Handler handler); - Server &Put(const std::string &pattern, HandlerWithContentReader handler); - Server &Patch(const std::string &pattern, Handler handler); - Server &Patch(const std::string &pattern, HandlerWithContentReader handler); - Server &Delete(const std::string &pattern, Handler handler); - Server &Delete(const std::string &pattern, HandlerWithContentReader handler); - Server &Options(const std::string &pattern, Handler handler); - - bool set_base_dir(const std::string &dir, - const std::string &mount_point = std::string()); - bool set_mount_point(const std::string &mount_point, const std::string &dir, - Headers headers = Headers()); - bool remove_mount_point(const std::string &mount_point); - Server &set_file_extension_and_mimetype_mapping(const std::string &ext, - const std::string &mime); - Server &set_default_file_mimetype(const std::string &mime); - Server &set_file_request_handler(Handler handler); - - template - Server &set_error_handler(ErrorHandlerFunc &&handler) { - return set_error_handler_core( - std::forward(handler), - std::is_convertible{}); - } - - Server &set_exception_handler(ExceptionHandler handler); - Server &set_pre_routing_handler(HandlerWithResponse handler); - Server &set_post_routing_handler(Handler handler); - - Server &set_expect_100_continue_handler(Expect100ContinueHandler handler); - Server &set_logger(Logger logger); - - Server &set_address_family(int family); - Server &set_tcp_nodelay(bool on); - Server &set_ipv6_v6only(bool on); - Server &set_socket_options(SocketOptions socket_options); - - Server &set_default_headers(Headers headers); - Server & - set_header_writer(std::function const &writer); - - Server &set_keep_alive_max_count(size_t count); - Server &set_keep_alive_timeout(time_t sec); - - Server &set_read_timeout(time_t sec, time_t usec = 0); - template - Server &set_read_timeout(const std::chrono::duration &duration); - - Server &set_write_timeout(time_t sec, time_t usec = 0); - template - Server &set_write_timeout(const std::chrono::duration &duration); - - Server &set_idle_interval(time_t sec, time_t usec = 0); - template - Server &set_idle_interval(const std::chrono::duration &duration); - - Server &set_payload_max_length(size_t length); - - bool bind_to_port(const std::string &host, int port, int socket_flags = 0); - int bind_to_any_port(const std::string &host, int socket_flags = 0); - bool listen_after_bind(); - - bool listen(const std::string &host, int port, int socket_flags = 0); - - bool is_running() const; - void wait_until_ready() const; - void stop(); - void decommission(); - - std::function 
new_task_queue; - -protected: - bool process_request(Stream &strm, const std::string &remote_addr, - int remote_port, const std::string &local_addr, - int local_port, bool close_connection, - bool &connection_closed, - const std::function &setup_request); - - std::atomic svr_sock_{INVALID_SOCKET}; - size_t keep_alive_max_count_ = CPPHTTPLIB_KEEPALIVE_MAX_COUNT; - time_t keep_alive_timeout_sec_ = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND; - time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND; - time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND; - size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH; - -private: - using Handlers = - std::vector, Handler>>; - using HandlersForContentReader = - std::vector, - HandlerWithContentReader>>; - - static std::unique_ptr - make_matcher(const std::string &pattern); - - Server &set_error_handler_core(HandlerWithResponse handler, std::true_type); - Server &set_error_handler_core(Handler handler, std::false_type); - - socket_t create_server_socket(const std::string &host, int port, - int socket_flags, - SocketOptions socket_options) const; - int bind_internal(const std::string &host, int port, int socket_flags); - bool listen_internal(); - - bool routing(Request &req, Response &res, Stream &strm); - bool handle_file_request(const Request &req, Response &res, - bool head = false); - bool dispatch_request(Request &req, Response &res, - const Handlers &handlers) const; - bool dispatch_request_for_content_reader( - Request &req, Response &res, ContentReader content_reader, - const HandlersForContentReader &handlers) const; - - bool parse_request_line(const char *s, Request &req) const; - void apply_ranges(const Request &req, Response &res, - std::string &content_type, std::string &boundary) const; - bool write_response(Stream &strm, bool close_connection, Request &req, - Response &res); - bool write_response_with_content(Stream &strm, bool close_connection, - const Request &req, Response &res); - bool write_response_core(Stream &strm, bool close_connection, - const Request &req, Response &res, - bool need_apply_ranges); - bool write_content_with_provider(Stream &strm, const Request &req, - Response &res, const std::string &boundary, - const std::string &content_type); - bool read_content(Stream &strm, Request &req, Response &res); - bool - read_content_with_content_receiver(Stream &strm, Request &req, Response &res, - ContentReceiver receiver, - MultipartContentHeader multipart_header, - ContentReceiver multipart_receiver); - bool read_content_core(Stream &strm, Request &req, Response &res, - ContentReceiver receiver, - MultipartContentHeader multipart_header, - ContentReceiver multipart_receiver) const; - - virtual bool process_and_close_socket(socket_t sock); - - std::atomic is_running_{false}; - std::atomic is_decommissioned{false}; - - struct MountPointEntry { - std::string mount_point; - std::string base_dir; - Headers headers; - }; - std::vector base_dirs_; - std::map file_extension_and_mimetype_map_; - std::string default_file_mimetype_ = "application/octet-stream"; - Handler file_request_handler_; - - Handlers get_handlers_; - Handlers post_handlers_; - HandlersForContentReader post_handlers_for_content_reader_; - Handlers put_handlers_; - 
HandlersForContentReader put_handlers_for_content_reader_; - Handlers patch_handlers_; - HandlersForContentReader patch_handlers_for_content_reader_; - Handlers delete_handlers_; - HandlersForContentReader delete_handlers_for_content_reader_; - Handlers options_handlers_; - - HandlerWithResponse error_handler_; - ExceptionHandler exception_handler_; - HandlerWithResponse pre_routing_handler_; - Handler post_routing_handler_; - Expect100ContinueHandler expect_100_continue_handler_; - - Logger logger_; - - int address_family_ = AF_UNSPEC; - bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; - bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; - SocketOptions socket_options_ = default_socket_options; - - Headers default_headers_; - std::function header_writer_ = - detail::write_headers; -}; - -enum class Error { - Success = 0, - Unknown, - Connection, - BindIPAddress, - Read, - Write, - ExceedRedirectCount, - Canceled, - SSLConnection, - SSLLoadingCerts, - SSLServerVerification, - SSLServerHostnameVerification, - UnsupportedMultipartBoundaryChars, - Compression, - ConnectionTimeout, - ProxyConnection, - - // For internal use only - SSLPeerCouldBeClosed_, -}; - -std::string to_string(Error error); - -std::ostream &operator<<(std::ostream &os, const Error &obj); - -class Result { -public: - Result() = default; - Result(std::unique_ptr &&res, Error err, - Headers &&request_headers = Headers{}) - : res_(std::move(res)), err_(err), - request_headers_(std::move(request_headers)) {} - // Response - operator bool() const { return res_ != nullptr; } - bool operator==(std::nullptr_t) const { return res_ == nullptr; } - bool operator!=(std::nullptr_t) const { return res_ != nullptr; } - const Response &value() const { return *res_; } - Response &value() { return *res_; } - const Response &operator*() const { return *res_; } - Response &operator*() { return *res_; } - const Response *operator->() const { return res_.get(); } - Response *operator->() { return res_.get(); } - - // Error - Error error() const { return err_; } - - // Request Headers - bool has_request_header(const std::string &key) const; - std::string get_request_header_value(const std::string &key, - const char *def = "", - size_t id = 0) const; - uint64_t get_request_header_value_u64(const std::string &key, - uint64_t def = 0, size_t id = 0) const; - size_t get_request_header_value_count(const std::string &key) const; - -private: - std::unique_ptr res_; - Error err_ = Error::Unknown; - Headers request_headers_; -}; - -class ClientImpl { -public: - explicit ClientImpl(const std::string &host); - - explicit ClientImpl(const std::string &host, int port); - - explicit ClientImpl(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path); - - virtual ~ClientImpl(); - - virtual bool is_valid() const; - - Result Get(const std::string &path); - Result Get(const std::string &path, const Headers &headers); - Result Get(const std::string &path, Progress progress); - Result Get(const std::string &path, const Headers &headers, - Progress progress); - Result Get(const std::string &path, ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver); - Result Get(const std::string &path, ContentReceiver content_receiver, - Progress progress); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, Progress progress); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver 
content_receiver); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, ContentReceiver content_receiver, - Progress progress); - - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ContentReceiver content_receiver, - Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress = nullptr); - - Result Head(const std::string &path); - Result Head(const std::string &path, const Headers &headers); - - Result Post(const std::string &path); - Result Post(const std::string &path, const Headers &headers); - Result Post(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); - Result Post(const std::string &path, const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, const std::string &boundary); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items); - - Result Put(const std::string &path); - Result Put(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const 
std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Put(const std::string &path, size_t content_length, - ContentProvider content_provider, const std::string &content_type); - Result Put(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Put(const std::string &path, const Params ¶ms); - Result Put(const std::string &path, const Headers &headers, - const Params ¶ms); - Result Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); - Result Put(const std::string &path, const MultipartFormDataItems &items); - Result Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items); - Result Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, const std::string &boundary); - Result Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items); - - Result Patch(const std::string &path); - Result Patch(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Patch(const std::string &path, const char *body, size_t content_length, - const std::string &content_type, Progress progress); - Result Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); - Result Patch(const std::string &path, const std::string &body, - const std::string &content_type); - Result Patch(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Patch(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Patch(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type); - Result Patch(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Patch(const std::string &path, const Headers &headers, - ContentProviderWithoutLength 
content_provider, - const std::string &content_type); - - Result Delete(const std::string &path); - Result Delete(const std::string &path, const Headers &headers); - Result Delete(const std::string &path, const char *body, - size_t content_length, const std::string &content_type); - Result Delete(const std::string &path, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress); - Result Delete(const std::string &path, const std::string &body, - const std::string &content_type); - Result Delete(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Delete(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Delete(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - - Result Options(const std::string &path); - Result Options(const std::string &path, const Headers &headers); - - bool send(Request &req, Response &res, Error &error); - Result send(const Request &req); - - void stop(); - - std::string host() const; - int port() const; - - size_t is_socket_open() const; - socket_t socket() const; - - void set_hostname_addr_map(std::map addr_map); - - void set_default_headers(Headers headers); - - void - set_header_writer(std::function const &writer); - - void set_address_family(int family); - void set_tcp_nodelay(bool on); - void set_ipv6_v6only(bool on); - void set_socket_options(SocketOptions socket_options); - - void set_connection_timeout(time_t sec, time_t usec = 0); - template - void - set_connection_timeout(const std::chrono::duration &duration); - - void set_read_timeout(time_t sec, time_t usec = 0); - template - void set_read_timeout(const std::chrono::duration &duration); - - void set_write_timeout(time_t sec, time_t usec = 0); - template - void set_write_timeout(const std::chrono::duration &duration); - - void set_max_timeout(time_t msec); - template - void set_max_timeout(const std::chrono::duration &duration); - - void set_basic_auth(const std::string &username, const std::string &password); - void set_bearer_token_auth(const std::string &token); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - void set_digest_auth(const std::string &username, - const std::string &password); -#endif - - void set_keep_alive(bool on); - void set_follow_location(bool on); - - void set_url_encode(bool on); - - void set_compress(bool on); - - void set_decompress(bool on); - - void set_interface(const std::string &intf); - - void set_proxy(const std::string &host, int port); - void set_proxy_basic_auth(const std::string &username, - const std::string &password); - void set_proxy_bearer_token_auth(const std::string &token); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - void set_proxy_digest_auth(const std::string &username, - const std::string &password); -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - void set_ca_cert_path(const std::string &ca_cert_file_path, - const std::string &ca_cert_dir_path = std::string()); - void set_ca_cert_store(X509_STORE *ca_cert_store); - X509_STORE *create_ca_cert_store(const char *ca_cert, std::size_t size) const; -#endif - -#ifdef 
CPPHTTPLIB_OPENSSL_SUPPORT - void enable_server_certificate_verification(bool enabled); - void enable_server_hostname_verification(bool enabled); - void set_server_certificate_verifier( - std::function verifier); -#endif - - void set_logger(Logger logger); - -protected: - struct Socket { - socket_t sock = INVALID_SOCKET; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - SSL *ssl = nullptr; -#endif - - bool is_open() const { return sock != INVALID_SOCKET; } - }; - - virtual bool create_and_connect_socket(Socket &socket, Error &error); - - // All of: - // shutdown_ssl - // shutdown_socket - // close_socket - // should ONLY be called when socket_mutex_ is locked. - // Also, shutdown_ssl and close_socket should also NOT be called concurrently - // with a DIFFERENT thread sending requests using that socket. - virtual void shutdown_ssl(Socket &socket, bool shutdown_gracefully); - void shutdown_socket(Socket &socket) const; - void close_socket(Socket &socket); - - bool process_request(Stream &strm, Request &req, Response &res, - bool close_connection, Error &error); - - bool write_content_with_provider(Stream &strm, const Request &req, - Error &error) const; - - void copy_settings(const ClientImpl &rhs); - - // Socket endpoint information - const std::string host_; - const int port_; - const std::string host_and_port_; - - // Current open socket - Socket socket_; - mutable std::mutex socket_mutex_; - std::recursive_mutex request_mutex_; - - // These are all protected under socket_mutex - size_t socket_requests_in_flight_ = 0; - std::thread::id socket_requests_are_from_thread_ = std::thread::id(); - bool socket_should_be_closed_when_request_is_done_ = false; - - // Hostname-IP map - std::map addr_map_; - - // Default headers - Headers default_headers_; - - // Header writer - std::function header_writer_ = - detail::write_headers; - - // Settings - std::string client_cert_path_; - std::string client_key_path_; - - time_t connection_timeout_sec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND; - time_t connection_timeout_usec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND; - time_t read_timeout_sec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND; - time_t read_timeout_usec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND; - time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND; - time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND; - time_t max_timeout_msec_ = CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND; - - std::string basic_auth_username_; - std::string basic_auth_password_; - std::string bearer_token_auth_token_; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string digest_auth_username_; - std::string digest_auth_password_; -#endif - - bool keep_alive_ = false; - bool follow_location_ = false; - - bool url_encode_ = true; - - int address_family_ = AF_UNSPEC; - bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY; - bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY; - SocketOptions socket_options_ = nullptr; - - bool compress_ = false; - bool decompress_ = true; - - std::string interface_; - - std::string proxy_host_; - int proxy_port_ = -1; - - std::string proxy_basic_auth_username_; - std::string proxy_basic_auth_password_; - std::string proxy_bearer_token_auth_token_; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string proxy_digest_auth_username_; - std::string proxy_digest_auth_password_; -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string ca_cert_file_path_; - std::string ca_cert_dir_path_; - - X509_STORE *ca_cert_store_ = nullptr; -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - bool server_certificate_verification_ = true; - 
bool server_hostname_verification_ = true; - std::function server_certificate_verifier_; -#endif - - Logger logger_; - -private: - bool send_(Request &req, Response &res, Error &error); - Result send_(Request &&req); - - socket_t create_client_socket(Error &error) const; - bool read_response_line(Stream &strm, const Request &req, - Response &res) const; - bool write_request(Stream &strm, Request &req, bool close_connection, - Error &error); - bool redirect(Request &req, Response &res, Error &error); - bool handle_request(Stream &strm, Request &req, Response &res, - bool close_connection, Error &error); - std::unique_ptr send_with_content_provider( - Request &req, const char *body, size_t content_length, - ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Error &error); - Result send_with_content_provider( - const std::string &method, const std::string &path, - const Headers &headers, const char *body, size_t content_length, - ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Progress progress); - ContentProviderWithoutLength get_multipart_content_provider( - const std::string &boundary, const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) const; - - std::string adjust_host_string(const std::string &host) const; - - virtual bool - process_socket(const Socket &socket, - std::chrono::time_point start_time, - std::function callback); - virtual bool is_ssl() const; -}; - -class Client { -public: - // Universal interface - explicit Client(const std::string &scheme_host_port); - - explicit Client(const std::string &scheme_host_port, - const std::string &client_cert_path, - const std::string &client_key_path); - - // HTTP only interface - explicit Client(const std::string &host, int port); - - explicit Client(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path); - - Client(Client &&) = default; - Client &operator=(Client &&) = default; - - ~Client(); - - bool is_valid() const; - - Result Get(const std::string &path); - Result Get(const std::string &path, const Headers &headers); - Result Get(const std::string &path, Progress progress); - Result Get(const std::string &path, const Headers &headers, - Progress progress); - Result Get(const std::string &path, ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver); - Result Get(const std::string &path, ContentReceiver content_receiver, - Progress progress); - Result Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, Progress progress); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver); - Result Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, ContentReceiver content_receiver, - Progress progress); - Result Get(const std::string &path, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress); - - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ContentReceiver content_receiver, - 
Progress progress = nullptr); - Result Get(const std::string &path, const Params ¶ms, - const Headers &headers, ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress = nullptr); - - Result Head(const std::string &path); - Result Head(const std::string &path, const Headers &headers); - - Result Post(const std::string &path); - Result Post(const std::string &path, const Headers &headers); - Result Post(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type); - Result Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - Result Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - size_t content_length, ContentProvider content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type); - Result Post(const std::string &path, const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms); - Result Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress); - Result Post(const std::string &path, const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, const std::string &boundary); - Result Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items); - - Result Put(const std::string &path); - Result Put(const std::string &path, const char *body, size_t content_length, - const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, const char *body, - size_t content_length, const std::string &content_type, - Progress progress); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type); - Result Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type); - Result Put(const std::string &path, const Headers &headers, - const std::string &body, const std::string &content_type, - Progress progress); - 
-  Result Put(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Put(const std::string &path, ContentProviderWithoutLength content_provider, const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type);
-  Result Put(const std::string &path, const Params &params);
-  Result Put(const std::string &path, const Headers &headers, const Params &params);
-  Result Put(const std::string &path, const Headers &headers, const Params &params, Progress progress);
-  Result Put(const std::string &path, const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items, const std::string &boundary);
-  Result Put(const std::string &path, const Headers &headers, const MultipartFormDataItems &items, const MultipartFormDataProviderItems &provider_items);
-
-  Result Patch(const std::string &path);
-  Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type);
-  Result Patch(const std::string &path, const char *body, size_t content_length, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, const std::string &body, const std::string &content_type);
-  Result Patch(const std::string &path, const std::string &body, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type, Progress progress);
-  Result Patch(const std::string &path, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Patch(const std::string &path, ContentProviderWithoutLength content_provider, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, size_t content_length, ContentProvider content_provider, const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers, ContentProviderWithoutLength content_provider, const std::string &content_type);
-
-  Result Delete(const std::string &path);
-  Result Delete(const std::string &path, const Headers &headers);
-  Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type);
-  Result Delete(const std::string &path, const char *body, size_t content_length, const std::string &content_type, Progress progress);
-  Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers, const char *body, size_t content_length, const
-                std::string &content_type, Progress progress);
-  Result Delete(const std::string &path, const std::string &body, const std::string &content_type);
-  Result Delete(const std::string &path, const std::string &body, const std::string &content_type, Progress progress);
-  Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers, const std::string &body, const std::string &content_type, Progress progress);
-
-  Result Options(const std::string &path);
-  Result Options(const std::string &path, const Headers &headers);
-
-  bool send(Request &req, Response &res, Error &error);
-  Result send(const Request &req);
-
-  void stop();
-
-  std::string host() const;
-  int port() const;
-
-  size_t is_socket_open() const;
-  socket_t socket() const;
-
-  void set_hostname_addr_map(std::map<std::string, std::string> addr_map);
-
-  void set_default_headers(Headers headers);
-
-  void set_header_writer(std::function<ssize_t(Stream &, Headers &)> const &writer);
-
-  void set_address_family(int family);
-  void set_tcp_nodelay(bool on);
-  void set_socket_options(SocketOptions socket_options);
-
-  void set_connection_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_connection_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_read_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_read_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_write_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_write_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_max_timeout(time_t msec);
-  template <class Rep, class Period>
-  void set_max_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_basic_auth(const std::string &username, const std::string &password);
-  void set_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_digest_auth(const std::string &username, const std::string &password);
-#endif
-
-  void set_keep_alive(bool on);
-  void set_follow_location(bool on);
-
-  void set_url_encode(bool on);
-
-  void set_compress(bool on);
-
-  void set_decompress(bool on);
-
-  void set_interface(const std::string &intf);
-
-  void set_proxy(const std::string &host, int port);
-  void set_proxy_basic_auth(const std::string &username, const std::string &password);
-  void set_proxy_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_proxy_digest_auth(const std::string &username, const std::string &password);
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void enable_server_certificate_verification(bool enabled);
-  void enable_server_hostname_verification(bool enabled);
-  void set_server_certificate_verifier(std::function<SSLVerifierResponse(SSL *ssl)> verifier);
-#endif
-
-  void set_logger(Logger logger);
-
-  // SSL
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_ca_cert_path(const std::string &ca_cert_file_path, const std::string &ca_cert_dir_path = std::string());
-
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-  void load_ca_cert_store(const char *ca_cert, std::size_t size);
-
-  long get_openssl_verify_result() const;
-
-  SSL_CTX *ssl_context() const;
-#endif
-
-private:
-  std::unique_ptr<ClientImpl> cli_;
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  bool is_ssl_ = false;
-#endif
-};
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLServer : public Server {
-public:
-  SSLServer(const char *cert_path, const char *private_key_path, const char *client_ca_cert_file_path = nullptr, const char *client_ca_cert_dir_path = nullptr, const char *private_key_password = nullptr);
-
-  SSLServer(X509 *cert, EVP_PKEY *private_key, X509_STORE *client_ca_cert_store = nullptr);
-
-  SSLServer(const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback);
-
-  ~SSLServer() override;
-
-  bool is_valid() const override;
-
-  SSL_CTX *ssl_context() const;
-
-  void update_certs(X509 *cert, EVP_PKEY *private_key, X509_STORE *client_ca_cert_store = nullptr);
-
-private:
-  bool process_and_close_socket(socket_t sock) override;
-
-  SSL_CTX *ctx_;
-  std::mutex ctx_mutex_;
-};
-
-class SSLClient final : public ClientImpl {
-public:
-  explicit SSLClient(const std::string &host);
-
-  explicit SSLClient(const std::string &host, int port);
-
-  explicit SSLClient(const std::string &host, int port, const std::string &client_cert_path, const std::string &client_key_path, const std::string &private_key_password = std::string());
-
-  explicit SSLClient(const std::string &host, int port, X509 *client_cert, EVP_PKEY *client_key, const std::string &private_key_password = std::string());
-
-  ~SSLClient() override;
-
-  bool is_valid() const override;
-
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-  void load_ca_cert_store(const char *ca_cert, std::size_t size);
-
-  long get_openssl_verify_result() const;
-
-  SSL_CTX *ssl_context() const;
-
-private:
-  bool create_and_connect_socket(Socket &socket, Error &error) override;
-  void shutdown_ssl(Socket &socket, bool shutdown_gracefully) override;
-  void shutdown_ssl_impl(Socket &socket, bool shutdown_gracefully);
-
-  bool process_socket(const Socket &socket, std::chrono::time_point<std::chrono::steady_clock> start_time, std::function<bool(Stream &strm)> callback) override;
-  bool is_ssl() const override;
-
-  bool connect_with_proxy(Socket &sock, std::chrono::time_point<std::chrono::steady_clock> start_time, Response &res, bool &success, Error &error);
-  bool initialize_ssl(Socket &socket, Error &error);
-
-  bool load_certs();
-
-  bool verify_host(X509 *server_cert) const;
-  bool verify_host_with_subject_alt_name(X509 *server_cert) const;
-  bool verify_host_with_common_name(X509 *server_cert) const;
-  bool check_host_name(const char *pattern, size_t pattern_len) const;
-
-  SSL_CTX *ctx_;
-  std::mutex ctx_mutex_;
-  std::once_flag initialize_cert_;
-
-  std::vector<std::string> host_components_;
-
-  long verify_result_ = 0;
-
-  friend class ClientImpl;
-};
-#endif
-
-/*
- * Implementation of template methods.
- */
-
-namespace detail {
-
-template <typename T, typename U>
-inline void duration_to_sec_and_usec(const T &duration, U callback) {
-  auto sec = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
-  auto usec = std::chrono::duration_cast<std::chrono::microseconds>(duration - std::chrono::seconds(sec)).count();
-  callback(static_cast<time_t>(sec), static_cast<time_t>(usec));
-}
-
-template <size_t N> inline constexpr size_t str_len(const char (&)[N]) {
-  return N - 1;
-}
-
-inline bool is_numeric(const std::string &str) {
-  return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit);
-}
-
-inline uint64_t get_header_value_u64(const Headers &headers, const std::string &key, uint64_t def, size_t id, bool &is_invalid_value) {
-  is_invalid_value = false;
-  auto rng = headers.equal_range(key);
-  auto it = rng.first;
-  std::advance(it, static_cast<ssize_t>(id));
-  if (it != rng.second) {
-    if (is_numeric(it->second)) {
-      return std::strtoull(it->second.data(), nullptr, 10);
-    } else {
-      is_invalid_value = true;
-    }
-  }
-  return def;
-}
-
-inline uint64_t get_header_value_u64(const Headers &headers, const std::string &key, uint64_t def, size_t id) {
-  bool dummy = false;
-  return get_header_value_u64(headers, key, def, id, dummy);
-}
-
-} // namespace detail
-
-inline uint64_t Request::get_header_value_u64(const std::string &key, uint64_t def, size_t id) const {
-  return detail::get_header_value_u64(headers, key, def, id);
-}
-
-inline uint64_t Response::get_header_value_u64(const std::string &key, uint64_t def, size_t id) const {
-  return detail::get_header_value_u64(headers, key, def, id);
-}
-
-namespace detail {
-
-inline bool set_socket_opt_impl(socket_t sock, int level, int optname, const void *optval, socklen_t optlen) {
-  return setsockopt(sock, level, optname,
-#ifdef _WIN32
-                    reinterpret_cast<const char *>(optval),
-#else
-                    optval,
-#endif
-                    optlen) == 0;
-}
-
-inline bool set_socket_opt(socket_t sock, int level, int optname, int optval) {
-  return set_socket_opt_impl(sock, level, optname, &optval, sizeof(optval));
-}
-
-inline bool set_socket_opt_time(socket_t sock, int level, int optname, time_t sec, time_t usec) {
-#ifdef _WIN32
-  auto timeout = static_cast<uint32_t>(sec * 1000 + usec / 1000);
-#else
-  timeval timeout;
-  timeout.tv_sec = static_cast<long>(sec);
-  timeout.tv_usec = static_cast<decltype(timeout.tv_usec)>(usec);
-#endif
-  return set_socket_opt_impl(sock, level, optname, &timeout, sizeof(timeout));
-}
-
-} // namespace detail
-
-inline void default_socket_options(socket_t sock) {
-  detail::set_socket_opt(sock, SOL_SOCKET,
-#ifdef SO_REUSEPORT
-                         SO_REUSEPORT,
-#else
-                         SO_REUSEADDR,
-#endif
-                         1);
-}
-
-inline const char *status_message(int status) {
-  switch (status) {
-  case StatusCode::Continue_100: return "Continue";
-  case StatusCode::SwitchingProtocol_101: return "Switching Protocol";
-  case StatusCode::Processing_102: return "Processing";
-  case StatusCode::EarlyHints_103: return "Early Hints";
-  case StatusCode::OK_200: return "OK";
-  case StatusCode::Created_201: return "Created";
-  case StatusCode::Accepted_202: return "Accepted";
-  case StatusCode::NonAuthoritativeInformation_203: return "Non-Authoritative Information";
-  case StatusCode::NoContent_204: return "No Content";
-  case StatusCode::ResetContent_205: return "Reset Content";
-  case StatusCode::PartialContent_206: return "Partial Content";
-  case StatusCode::MultiStatus_207: return "Multi-Status";
-  case StatusCode::AlreadyReported_208: return "Already Reported";
-  case StatusCode::IMUsed_226: return "IM Used";
-  case StatusCode::MultipleChoices_300: return "Multiple Choices";
-  case StatusCode::MovedPermanently_301:
return "Moved Permanently"; - case StatusCode::Found_302: return "Found"; - case StatusCode::SeeOther_303: return "See Other"; - case StatusCode::NotModified_304: return "Not Modified"; - case StatusCode::UseProxy_305: return "Use Proxy"; - case StatusCode::unused_306: return "unused"; - case StatusCode::TemporaryRedirect_307: return "Temporary Redirect"; - case StatusCode::PermanentRedirect_308: return "Permanent Redirect"; - case StatusCode::BadRequest_400: return "Bad Request"; - case StatusCode::Unauthorized_401: return "Unauthorized"; - case StatusCode::PaymentRequired_402: return "Payment Required"; - case StatusCode::Forbidden_403: return "Forbidden"; - case StatusCode::NotFound_404: return "Not Found"; - case StatusCode::MethodNotAllowed_405: return "Method Not Allowed"; - case StatusCode::NotAcceptable_406: return "Not Acceptable"; - case StatusCode::ProxyAuthenticationRequired_407: - return "Proxy Authentication Required"; - case StatusCode::RequestTimeout_408: return "Request Timeout"; - case StatusCode::Conflict_409: return "Conflict"; - case StatusCode::Gone_410: return "Gone"; - case StatusCode::LengthRequired_411: return "Length Required"; - case StatusCode::PreconditionFailed_412: return "Precondition Failed"; - case StatusCode::PayloadTooLarge_413: return "Payload Too Large"; - case StatusCode::UriTooLong_414: return "URI Too Long"; - case StatusCode::UnsupportedMediaType_415: return "Unsupported Media Type"; - case StatusCode::RangeNotSatisfiable_416: return "Range Not Satisfiable"; - case StatusCode::ExpectationFailed_417: return "Expectation Failed"; - case StatusCode::ImATeapot_418: return "I'm a teapot"; - case StatusCode::MisdirectedRequest_421: return "Misdirected Request"; - case StatusCode::UnprocessableContent_422: return "Unprocessable Content"; - case StatusCode::Locked_423: return "Locked"; - case StatusCode::FailedDependency_424: return "Failed Dependency"; - case StatusCode::TooEarly_425: return "Too Early"; - case StatusCode::UpgradeRequired_426: return "Upgrade Required"; - case StatusCode::PreconditionRequired_428: return "Precondition Required"; - case StatusCode::TooManyRequests_429: return "Too Many Requests"; - case StatusCode::RequestHeaderFieldsTooLarge_431: - return "Request Header Fields Too Large"; - case StatusCode::UnavailableForLegalReasons_451: - return "Unavailable For Legal Reasons"; - case StatusCode::NotImplemented_501: return "Not Implemented"; - case StatusCode::BadGateway_502: return "Bad Gateway"; - case StatusCode::ServiceUnavailable_503: return "Service Unavailable"; - case StatusCode::GatewayTimeout_504: return "Gateway Timeout"; - case StatusCode::HttpVersionNotSupported_505: - return "HTTP Version Not Supported"; - case StatusCode::VariantAlsoNegotiates_506: return "Variant Also Negotiates"; - case StatusCode::InsufficientStorage_507: return "Insufficient Storage"; - case StatusCode::LoopDetected_508: return "Loop Detected"; - case StatusCode::NotExtended_510: return "Not Extended"; - case StatusCode::NetworkAuthenticationRequired_511: - return "Network Authentication Required"; - - default: - case StatusCode::InternalServerError_500: return "Internal Server Error"; - } -} - -inline std::string get_bearer_token_auth(const Request &req) { - if (req.has_header("Authorization")) { - constexpr auto bearer_header_prefix_len = detail::str_len("Bearer "); - return req.get_header_value("Authorization") - .substr(bearer_header_prefix_len); - } - return ""; -} - -template -inline Server & -Server::set_read_timeout(const 
-    std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_read_timeout(sec, usec); });
-  return *this;
-}
-
-template <class Rep, class Period>
-inline Server &
-Server::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_write_timeout(sec, usec); });
-  return *this;
-}
-
-template <class Rep, class Period>
-inline Server &
-Server::set_idle_interval(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_idle_interval(sec, usec); });
-  return *this;
-}
-
-inline std::string to_string(const Error error) {
-  switch (error) {
-  case Error::Success: return "Success (no error)";
-  case Error::Connection: return "Could not establish connection";
-  case Error::BindIPAddress: return "Failed to bind IP address";
-  case Error::Read: return "Failed to read connection";
-  case Error::Write: return "Failed to write connection";
-  case Error::ExceedRedirectCount: return "Maximum redirect count exceeded";
-  case Error::Canceled: return "Connection handling canceled";
-  case Error::SSLConnection: return "SSL connection failed";
-  case Error::SSLLoadingCerts: return "SSL certificate loading failed";
-  case Error::SSLServerVerification: return "SSL server verification failed";
-  case Error::SSLServerHostnameVerification: return "SSL server hostname verification failed";
-  case Error::UnsupportedMultipartBoundaryChars: return "Unsupported HTTP multipart boundary characters";
-  case Error::Compression: return "Compression failed";
-  case Error::ConnectionTimeout: return "Connection timed out";
-  case Error::ProxyConnection: return "Proxy connection failed";
-  case Error::Unknown: return "Unknown";
-  default: break;
-  }
-
-  return "Invalid";
-}
-
-inline std::ostream &operator<<(std::ostream &os, const Error &obj) {
-  os << to_string(obj);
-  os << " (" << static_cast<std::underlying_type<Error>::type>(obj) << ')';
-  return os;
-}
-
-inline uint64_t Result::get_request_header_value_u64(const std::string &key, uint64_t def, size_t id) const {
-  return detail::get_header_value_u64(request_headers_, key, def, id);
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_connection_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_connection_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_read_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) { set_write_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_max_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
-  set_max_timeout(msec);
-}
-
-template <class Rep, class Period>
-inline void Client::set_connection_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_connection_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void Client::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_read_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void Client::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_write_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void Client::set_max_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_max_timeout(duration);
-}
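// Editor's note: illustrative sketch only, not part of the deleted header or
// of this patch. It shows how the chrono-based timeout setters above forward
// through detail::duration_to_sec_and_usec(), which splits a duration into
// whole seconds plus leftover microseconds. Assumes the cpp-httplib 0.20.0
// API as vendored here; host, port, and path are placeholders.
#include <chrono>
#include "httplib.h"

int main() {
  httplib::Client cli("localhost", 8080);
  cli.set_connection_timeout(std::chrono::seconds(2));      // 2 s + 0 us
  cli.set_read_timeout(std::chrono::milliseconds(500));     // 0 s + 500000 us
  auto res = cli.Get("/health");
  return res && res->status == httplib::StatusCode::OK_200 ? 0 : 1;
}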
-
-/*
- * Forward declarations and types that will be part of the .h file if split into
- * .h + .cc.
- */
-
-std::string hosted_at(const std::string &hostname);
-
-void hosted_at(const std::string &hostname, std::vector<std::string> &addrs);
-
-std::string append_query_params(const std::string &path, const Params &params);
-
-std::pair<std::string, std::string> make_range_header(const Ranges &ranges);
-
-std::pair<std::string, std::string> make_basic_authentication_header(const std::string &username, const std::string &password, bool is_proxy = false);
-
-namespace detail {
-
-#if defined(_WIN32)
-inline std::wstring u8string_to_wstring(const char *s) {
-  std::wstring ws;
-  auto len = static_cast<int>(strlen(s));
-  auto wlen = ::MultiByteToWideChar(CP_UTF8, 0, s, len, nullptr, 0);
-  if (wlen > 0) {
-    ws.resize(wlen);
-    wlen = ::MultiByteToWideChar(CP_UTF8, 0, s, len, const_cast<LPWSTR>(reinterpret_cast<LPCWSTR>(ws.data())), wlen);
-    if (wlen != static_cast<int>(ws.size())) { ws.clear(); }
-  }
-  return ws;
-}
-#endif
-
-struct FileStat {
-  FileStat(const std::string &path);
-  bool is_file() const;
-  bool is_dir() const;
-
-private:
-#if defined(_WIN32)
-  struct _stat st_;
-#else
-  struct stat st_;
-#endif
-  int ret_ = -1;
-};
-
-std::string encode_query_param(const std::string &value);
-
-std::string decode_url(const std::string &s, bool convert_plus_to_space);
-
-std::string trim_copy(const std::string &s);
-
-void divide(const char *data, std::size_t size, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn);
-
-void divide(const std::string &str, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn);
-
-void split(const char *b, const char *e, char d, std::function<void(const char *, const char *)> fn);
-
-void split(const char *b, const char *e, char d, size_t m, std::function<void(const char *, const char *)> fn);
-
-bool process_client_socket(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec, std::chrono::time_point<std::chrono::steady_clock> start_time, std::function<bool(Stream &strm)> callback);
-
-socket_t create_client_socket(const std::string &host, const std::string &ip, int port, int address_family, bool tcp_nodelay, bool ipv6_v6only, SocketOptions socket_options, time_t connection_timeout_sec, time_t connection_timeout_usec, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, const std::string &intf, Error &error);
-
-const char *get_header_value(const Headers &headers, const std::string &key, const char *def, size_t id);
-
-std::string params_to_query_str(const Params &params);
-
-void parse_query_text(const char *data, std::size_t size, Params &params);
-
-void parse_query_text(const std::string &s, Params &params);
-
-bool parse_multipart_boundary(const std::string &content_type, std::string &boundary);
-
-bool parse_range_header(const std::string &s, Ranges &ranges);
-
-int close_socket(socket_t sock);
-
-ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags);
-
-ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags);
-
-enum class EncodingType { None = 0, Gzip, Brotli, Zstd };
-
-EncodingType encoding_type(const Request &req, const Response &res);
-
-class BufferStream final : public Stream {
-public:
-  BufferStream() = default;
-  ~BufferStream() override = default;
-
-  bool is_readable() const override;
-  bool wait_readable() const override;
-  bool wait_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int
-      &port) const override;
-  socket_t socket() const override;
-  time_t duration() const override;
-
-  const std::string &get_buffer() const;
-
-private:
-  std::string buffer;
-  size_t position = 0;
-};
-
-class compressor {
-public:
-  virtual ~compressor() = default;
-
-  typedef std::function<bool(const char *data, size_t data_len)> Callback;
-  virtual bool compress(const char *data, size_t data_length, bool last, Callback callback) = 0;
-};
-
-class decompressor {
-public:
-  virtual ~decompressor() = default;
-
-  virtual bool is_valid() const = 0;
-
-  typedef std::function<bool(const char *data, size_t data_len)> Callback;
-  virtual bool decompress(const char *data, size_t data_length, Callback callback) = 0;
-};
-
-class nocompressor final : public compressor {
-public:
-  ~nocompressor() override = default;
-
-  bool compress(const char *data, size_t data_length, bool /*last*/, Callback callback) override;
-};
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-class gzip_compressor final : public compressor {
-public:
-  gzip_compressor();
-  ~gzip_compressor() override;
-
-  bool compress(const char *data, size_t data_length, bool last, Callback callback) override;
-
-private:
-  bool is_valid_ = false;
-  z_stream strm_;
-};
-
-class gzip_decompressor final : public decompressor {
-public:
-  gzip_decompressor();
-  ~gzip_decompressor() override;
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length, Callback callback) override;
-
-private:
-  bool is_valid_ = false;
-  z_stream strm_;
-};
-#endif
-
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-class brotli_compressor final : public compressor {
-public:
-  brotli_compressor();
-  ~brotli_compressor();
-
-  bool compress(const char *data, size_t data_length, bool last, Callback callback) override;
-
-private:
-  BrotliEncoderState *state_ = nullptr;
-};
-
-class brotli_decompressor final : public decompressor {
-public:
-  brotli_decompressor();
-  ~brotli_decompressor();
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length, Callback callback) override;
-
-private:
-  BrotliDecoderResult decoder_r;
-  BrotliDecoderState *decoder_s = nullptr;
-};
-#endif
-
-#ifdef CPPHTTPLIB_ZSTD_SUPPORT
-class zstd_compressor : public compressor {
-public:
-  zstd_compressor();
-  ~zstd_compressor();
-
-  bool compress(const char *data, size_t data_length, bool last, Callback callback) override;
-
-private:
-  ZSTD_CCtx *ctx_ = nullptr;
-};
-
-class zstd_decompressor : public decompressor {
-public:
-  zstd_decompressor();
-  ~zstd_decompressor();
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length, Callback callback) override;
-
-private:
-  ZSTD_DCtx *ctx_ = nullptr;
-};
-#endif
-
-// NOTE: until the read size reaches `fixed_buffer_size`, use `fixed_buffer`
-// to store data. The call can set memory on stack for performance.
-class stream_line_reader {
-public:
-  stream_line_reader(Stream &strm, char *fixed_buffer, size_t fixed_buffer_size);
-  const char *ptr() const;
-  size_t size() const;
-  bool end_with_crlf() const;
-  bool getline();
-
-private:
-  void append(char c);
-
-  Stream &strm_;
-  char *fixed_buffer_;
-  const size_t fixed_buffer_size_;
-  size_t fixed_buffer_used_size_ = 0;
-  std::string growable_buffer_;
-};
-
-class mmap {
-public:
-  mmap(const char *path);
-  ~mmap();
-
-  bool open(const char *path);
-  void close();
-
-  bool is_open() const;
-  size_t size() const;
-  const char *data() const;
-
-private:
-#if defined(_WIN32)
-  HANDLE hFile_ = NULL;
-  HANDLE hMapping_ = NULL;
-#else
-  int fd_ = -1;
-#endif
-  size_t size_ = 0;
-  void *addr_ = nullptr;
-  bool is_open_empty_file = false;
-};
-
-// NOTE: https://www.rfc-editor.org/rfc/rfc9110#section-5
-namespace fields {
-
-inline bool is_token_char(char c) {
-  return std::isalnum(c) || c == '!' || c == '#' || c == '$' || c == '%' || c == '&' || c == '\'' || c == '*' || c == '+' || c == '-' || c == '.' || c == '^' || c == '_' || c == '`' || c == '|' || c == '~';
-}
-
-inline bool is_token(const std::string &s) {
-  if (s.empty()) { return false; }
-  for (auto c : s) {
-    if (!is_token_char(c)) { return false; }
-  }
-  return true;
-}
-
-inline bool is_field_name(const std::string &s) { return is_token(s); }
-
-inline bool is_vchar(char c) { return c >= 33 && c <= 126; }
-
-inline bool is_obs_text(char c) { return 128 <= static_cast<unsigned char>(c); }
-
-inline bool is_field_vchar(char c) { return is_vchar(c) || is_obs_text(c); }
-
-inline bool is_field_content(const std::string &s) {
-  if (s.empty()) { return true; }
-
-  if (s.size() == 1) {
-    return is_field_vchar(s[0]);
-  } else if (s.size() == 2) {
-    return is_field_vchar(s[0]) && is_field_vchar(s[1]);
-  } else {
-    size_t i = 0;
-
-    if (!is_field_vchar(s[i])) { return false; }
-    i++;
-
-    while (i < s.size() - 1) {
-      auto c = s[i++];
-      if (c == ' ' || c == '\t' || is_field_vchar(c)) {
-      } else {
-        return false;
-      }
-    }
-
-    return is_field_vchar(s[i]);
-  }
-}
-
-inline bool is_field_value(const std::string &s) { return is_field_content(s); }
-
-} // namespace fields
-
-} // namespace detail
-
-// ----------------------------------------------------------------------------
-
-/*
- * Implementation that will be part of the .cc file if split into .h + .cc.
- */
-
-namespace detail {
-
-inline bool is_hex(char c, int &v) {
-  if (0x20 <= c && isdigit(c)) {
-    v = c - '0';
-    return true;
-  } else if ('A' <= c && c <= 'F') {
-    v = c - 'A' + 10;
-    return true;
-  } else if ('a' <= c && c <= 'f') {
-    v = c - 'a' + 10;
-    return true;
-  }
-  return false;
-}
-
-inline bool from_hex_to_i(const std::string &s, size_t i, size_t cnt, int &val) {
-  if (i >= s.size()) { return false; }
-
-  val = 0;
-  for (; cnt; i++, cnt--) {
-    if (!s[i]) { return false; }
-    auto v = 0;
-    if (is_hex(s[i], v)) {
-      val = val * 16 + v;
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-inline std::string from_i_to_hex(size_t n) {
-  static const auto charset = "0123456789abcdef";
-  std::string ret;
-  do {
-    ret = charset[n & 15] + ret;
-    n >>= 4;
-  } while (n > 0);
-  return ret;
-}
-
-inline size_t to_utf8(int code, char *buff) {
-  if (code < 0x0080) {
-    buff[0] = static_cast<char>(code & 0x7F);
-    return 1;
-  } else if (code < 0x0800) {
-    buff[0] = static_cast<char>(0xC0 | ((code >> 6) & 0x1F));
-    buff[1] = static_cast<char>(0x80 | (code & 0x3F));
-    return 2;
-  } else if (code < 0xD800) {
-    buff[0] = static_cast<char>(0xE0 | ((code >> 12) & 0xF));
-    buff[1] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | (code & 0x3F));
-    return 3;
-  } else if (code < 0xE000) { // D800 - DFFF is invalid...
-    return 0;
-  } else if (code < 0x10000) {
-    buff[0] = static_cast<char>(0xE0 | ((code >> 12) & 0xF));
-    buff[1] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | (code & 0x3F));
-    return 3;
-  } else if (code < 0x110000) {
-    buff[0] = static_cast<char>(0xF0 | ((code >> 18) & 0x7));
-    buff[1] = static_cast<char>(0x80 | ((code >> 12) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[3] = static_cast<char>(0x80 | (code & 0x3F));
-    return 4;
-  }
-
-  // NOTREACHED
-  return 0;
-}
-
-// NOTE: This code came up with the following stackoverflow post:
-// https://stackoverflow.com/questions/180947/base64-decode-snippet-in-c
-inline std::string base64_encode(const std::string &in) {
-  static const auto lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-  std::string out;
-  out.reserve(in.size());
-
-  auto val = 0;
-  auto valb = -6;
-
-  for (auto c : in) {
-    val = (val << 8) + static_cast<uint8_t>(c);
-    valb += 8;
-    while (valb >= 0) {
-      out.push_back(lookup[(val >> valb) & 0x3F]);
-      valb -= 6;
-    }
-  }
-
-  if (valb > -6) { out.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]); }
-
-  while (out.size() % 4) {
-    out.push_back('=');
-  }
-
-  return out;
-}
-
-inline bool is_valid_path(const std::string &path) {
-  size_t level = 0;
-  size_t i = 0;
-
-  // Skip slash
-  while (i < path.size() && path[i] == '/') {
-    i++;
-  }
-
-  while (i < path.size()) {
-    // Read component
-    auto beg = i;
-    while (i < path.size() && path[i] != '/') {
-      if (path[i] == '\0') {
-        return false;
-      } else if (path[i] == '\\') {
-        return false;
-      }
-      i++;
-    }
-
-    auto len = i - beg;
-    assert(len > 0);
-
-    if (!path.compare(beg, len, ".")) {
-      ;
-    } else if (!path.compare(beg, len, "..")) {
-      if (level == 0) { return false; }
-      level--;
-    } else {
-      level++;
-    }
-
-    // Skip slash
-    while (i < path.size() && path[i] == '/') {
-      i++;
-    }
-  }
-
-  return true;
-}
-
-inline FileStat::FileStat(const std::string &path) {
-#if defined(_WIN32)
-  auto wpath = u8string_to_wstring(path.c_str());
-  ret_ = _wstat(wpath.c_str(), &st_);
-#else
-  ret_ = stat(path.c_str(), &st_);
-#endif
-}
-inline bool FileStat::is_file() const {
-  return ret_ >= 0 && S_ISREG(st_.st_mode);
-}
-inline bool
-FileStat::is_dir() const {
-  return ret_ >= 0 && S_ISDIR(st_.st_mode);
-}
-
-inline std::string encode_query_param(const std::string &value) {
-  std::ostringstream escaped;
-  escaped.fill('0');
-  escaped << std::hex;
-
-  for (auto c : value) {
-    if (std::isalnum(static_cast<uint8_t>(c)) || c == '-' || c == '_' || c == '.' || c == '!' || c == '~' || c == '*' || c == '\'' || c == '(' || c == ')') {
-      escaped << c;
-    } else {
-      escaped << std::uppercase;
-      escaped << '%' << std::setw(2) << static_cast<int>(static_cast<unsigned char>(c));
-      escaped << std::nouppercase;
-    }
-  }
-
-  return escaped.str();
-}
-
-inline std::string encode_url(const std::string &s) {
-  std::string result;
-  result.reserve(s.size());
-
-  for (size_t i = 0; s[i]; i++) {
-    switch (s[i]) {
-    case ' ': result += "%20"; break;
-    case '+': result += "%2B"; break;
-    case '\r': result += "%0D"; break;
-    case '\n': result += "%0A"; break;
-    case '\'': result += "%27"; break;
-    case ',': result += "%2C"; break;
-    // case ':': result += "%3A"; break; // ok? probably...
-    case ';': result += "%3B"; break;
-    default:
-      auto c = static_cast<uint8_t>(s[i]);
-      if (c >= 0x80) {
-        result += '%';
-        char hex[4];
-        auto len = snprintf(hex, sizeof(hex) - 1, "%02X", c);
-        assert(len == 2);
-        result.append(hex, static_cast<size_t>(len));
-      } else {
-        result += s[i];
-      }
-      break;
-    }
-  }
-
-  return result;
-}
-
-inline std::string decode_url(const std::string &s, bool convert_plus_to_space) {
-  std::string result;
-
-  for (size_t i = 0; i < s.size(); i++) {
-    if (s[i] == '%' && i + 1 < s.size()) {
-      if (s[i + 1] == 'u') {
-        auto val = 0;
-        if (from_hex_to_i(s, i + 2, 4, val)) {
-          // 4 digits Unicode codes
-          char buff[4];
-          size_t len = to_utf8(val, buff);
-          if (len > 0) { result.append(buff, len); }
-          i += 5; // 'u0000'
-        } else {
-          result += s[i];
-        }
-      } else {
-        auto val = 0;
-        if (from_hex_to_i(s, i + 1, 2, val)) {
-          // 2 digits hex codes
-          result += static_cast<char>(val);
-          i += 2; // '00'
-        } else {
-          result += s[i];
-        }
-      }
-    } else if (convert_plus_to_space && s[i] == '+') {
-      result += ' ';
-    } else {
-      result += s[i];
-    }
-  }
-
-  return result;
-}
-
-inline std::string file_extension(const std::string &path) {
-  std::smatch m;
-  thread_local auto re = std::regex("\\.([a-zA-Z0-9]+)$");
-  if (std::regex_search(path, m, re)) { return m[1].str(); }
-  return std::string();
-}
-
-inline bool is_space_or_tab(char c) { return c == ' ' || c == '\t'; }
-
-inline std::pair<size_t, size_t> trim(const char *b, const char *e, size_t left, size_t right) {
-  while (b + left < e && is_space_or_tab(b[left])) {
-    left++;
-  }
-  while (right > 0 && is_space_or_tab(b[right - 1])) {
-    right--;
-  }
-  return std::make_pair(left, right);
-}
-
-inline std::string trim_copy(const std::string &s) {
-  auto r = trim(s.data(), s.data() + s.size(), 0, s.size());
-  return s.substr(r.first, r.second - r.first);
-}
-
-inline std::string trim_double_quotes_copy(const std::string &s) {
-  if (s.length() >= 2 && s.front() == '"' && s.back() == '"') {
-    return s.substr(1, s.size() - 2);
-  }
-  return s;
-}
-
-inline void divide(const char *data, std::size_t size, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn) {
-  const auto it = std::find(data, data + size, d);
-  const auto found = static_cast<std::size_t>(it != data + size);
-  const auto lhs_data = data;
-  const auto lhs_size = static_cast<std::size_t>(it - data);
-  const auto rhs_data = it + found;
-  const auto rhs_size = size - lhs_size - found;
-
-  fn(lhs_data, lhs_size, rhs_data, rhs_size);
-}
-
-inline void divide(const std::string &str, char d, std::function<void(const char *, std::size_t, const char *, std::size_t)> fn) {
-  divide(str.data(), str.size(), d,
-         std::move(fn));
-}
-
-inline void split(const char *b, const char *e, char d, std::function<void(const char *, const char *)> fn) {
-  return split(b, e, d, (std::numeric_limits<size_t>::max)(), std::move(fn));
-}
-
-inline void split(const char *b, const char *e, char d, size_t m, std::function<void(const char *, const char *)> fn) {
-  size_t i = 0;
-  size_t beg = 0;
-  size_t count = 1;
-
-  while (e ? (b + i < e) : (b[i] != '\0')) {
-    if (b[i] == d && count < m) {
-      auto r = trim(b, e, beg, i);
-      if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
-      beg = i + 1;
-      count++;
-    }
-    i++;
-  }
-
-  if (i) {
-    auto r = trim(b, e, beg, i);
-    if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
-  }
-}
-
-inline stream_line_reader::stream_line_reader(Stream &strm, char *fixed_buffer, size_t fixed_buffer_size)
-    : strm_(strm), fixed_buffer_(fixed_buffer), fixed_buffer_size_(fixed_buffer_size) {}
-
-inline const char *stream_line_reader::ptr() const {
-  if (growable_buffer_.empty()) {
-    return fixed_buffer_;
-  } else {
-    return growable_buffer_.data();
-  }
-}
-
-inline size_t stream_line_reader::size() const {
-  if (growable_buffer_.empty()) {
-    return fixed_buffer_used_size_;
-  } else {
-    return growable_buffer_.size();
-  }
-}
-
-inline bool stream_line_reader::end_with_crlf() const {
-  auto end = ptr() + size();
-  return size() >= 2 && end[-2] == '\r' && end[-1] == '\n';
-}
-
-inline bool stream_line_reader::getline() {
-  fixed_buffer_used_size_ = 0;
-  growable_buffer_.clear();
-
-#ifndef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
-  char prev_byte = 0;
-#endif
-
-  for (size_t i = 0;; i++) {
-    char byte;
-    auto n = strm_.read(&byte, 1);
-
-    if (n < 0) {
-      return false;
-    } else if (n == 0) {
-      if (i == 0) {
-        return false;
-      } else {
-        break;
-      }
-    }
-
-    append(byte);
-
-#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
-    if (byte == '\n') { break; }
-#else
-    if (prev_byte == '\r' && byte == '\n') { break; }
-    prev_byte = byte;
-#endif
-  }
-
-  return true;
-}
-
-inline void stream_line_reader::append(char c) {
-  if (fixed_buffer_used_size_ < fixed_buffer_size_ - 1) {
-    fixed_buffer_[fixed_buffer_used_size_++] = c;
-    fixed_buffer_[fixed_buffer_used_size_] = '\0';
-  } else {
-    if (growable_buffer_.empty()) {
-      assert(fixed_buffer_[fixed_buffer_used_size_] == '\0');
-      growable_buffer_.assign(fixed_buffer_, fixed_buffer_used_size_);
-    }
-    growable_buffer_ += c;
-  }
-}
-
-inline mmap::mmap(const char *path) { open(path); }
-
-inline mmap::~mmap() { close(); }
-
-inline bool mmap::open(const char *path) {
-  close();
-
-#if defined(_WIN32)
-  auto wpath = u8string_to_wstring(path);
-  if (wpath.empty()) { return false; }
-
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-  hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL);
-#else
-  hFile_ = ::CreateFileW(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
-#endif
-
-  if (hFile_ == INVALID_HANDLE_VALUE) { return false; }
-
-  LARGE_INTEGER size{};
-  if (!::GetFileSizeEx(hFile_, &size)) { return false; }
-  // If the following line doesn't compile due to QuadPart, update Windows SDK.
-  // See:
-  // https://github.com/yhirose/cpp-httplib/issues/1903#issuecomment-2316520721
-  if (static_cast<ULONGLONG>(size.QuadPart) > (std::numeric_limits<size_t>::max)()) {
-    // `size_t` might be 32-bits, on 32-bits Windows.
-    return false;
-  }
-  size_ = static_cast<size_t>(size.QuadPart);
-
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-  hMapping_ = ::CreateFileMappingFromApp(hFile_, NULL, PAGE_READONLY, size_, NULL);
-#else
-  hMapping_ = ::CreateFileMappingW(hFile_, NULL, PAGE_READONLY, 0, 0, NULL);
-#endif
-
-  // Special treatment for an empty file...
-  if (hMapping_ == NULL && size_ == 0) {
-    close();
-    is_open_empty_file = true;
-    return true;
-  }
-
-  if (hMapping_ == NULL) {
-    close();
-    return false;
-  }
-
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-  addr_ = ::MapViewOfFileFromApp(hMapping_, FILE_MAP_READ, 0, 0);
-#else
-  addr_ = ::MapViewOfFile(hMapping_, FILE_MAP_READ, 0, 0, 0);
-#endif
-
-  if (addr_ == nullptr) {
-    close();
-    return false;
-  }
-#else
-  fd_ = ::open(path, O_RDONLY);
-  if (fd_ == -1) { return false; }
-
-  struct stat sb;
-  if (fstat(fd_, &sb) == -1) {
-    close();
-    return false;
-  }
-  size_ = static_cast<size_t>(sb.st_size);
-
-  addr_ = ::mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0);
-
-  // Special treatment for an empty file...
-  if (addr_ == MAP_FAILED && size_ == 0) {
-    close();
-    is_open_empty_file = true;
-    return false;
-  }
-#endif
-
-  return true;
-}
-
-inline bool mmap::is_open() const {
-  return is_open_empty_file ? true : addr_ != nullptr;
-}
-
-inline size_t mmap::size() const { return size_; }
-
-inline const char *mmap::data() const {
-  return is_open_empty_file ? "" : static_cast<const char *>(addr_);
-}
-
-inline void mmap::close() {
-#if defined(_WIN32)
-  if (addr_) {
-    ::UnmapViewOfFile(addr_);
-    addr_ = nullptr;
-  }
-
-  if (hMapping_) {
-    ::CloseHandle(hMapping_);
-    hMapping_ = NULL;
-  }
-
-  if (hFile_ != INVALID_HANDLE_VALUE) {
-    ::CloseHandle(hFile_);
-    hFile_ = INVALID_HANDLE_VALUE;
-  }
-
-  is_open_empty_file = false;
-#else
-  if (addr_ != nullptr) {
-    munmap(addr_, size_);
-    addr_ = nullptr;
-  }
-
-  if (fd_ != -1) {
-    ::close(fd_);
-    fd_ = -1;
-  }
-#endif
-  size_ = 0;
-}
-inline int close_socket(socket_t sock) {
-#ifdef _WIN32
-  return closesocket(sock);
-#else
-  return close(sock);
-#endif
-}
-
-template <typename T> inline ssize_t handle_EINTR(T fn) {
-  ssize_t res = 0;
-  while (true) {
-    res = fn();
-    if (res < 0 && errno == EINTR) {
-      std::this_thread::sleep_for(std::chrono::microseconds{1});
-      continue;
-    }
-    break;
-  }
-  return res;
-}
-
-inline ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags) {
-  return handle_EINTR([&]() {
-    return recv(sock,
-#ifdef _WIN32
-                static_cast<char *>(ptr), static_cast<int>(size),
-#else
-                ptr, size,
-#endif
-                flags);
-  });
-}
-
-inline ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags) {
-  return handle_EINTR([&]() {
-    return send(sock,
-#ifdef _WIN32
-                static_cast<const char *>(ptr), static_cast<int>(size),
-#else
-                ptr, size,
-#endif
-                flags);
-  });
-}
-
-inline int poll_wrapper(struct pollfd *fds, nfds_t nfds, int timeout) {
-#ifdef _WIN32
-  return ::WSAPoll(fds, nfds, timeout);
-#else
-  return ::poll(fds, nfds, timeout);
-#endif
-}
-
-template <bool Read>
-inline ssize_t select_impl(socket_t sock, time_t sec, time_t usec) {
-  struct pollfd pfd;
-  pfd.fd = sock;
-  pfd.events = (Read ?
-                POLLIN : POLLOUT);
-
-  auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
-
-  return handle_EINTR([&]() { return poll_wrapper(&pfd, 1, timeout); });
-}
-
-inline ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
-  return select_impl<true>(sock, sec, usec);
-}
-
-inline ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
-  return select_impl<false>(sock, sec, usec);
-}
-
-inline Error wait_until_socket_is_ready(socket_t sock, time_t sec, time_t usec) {
-  struct pollfd pfd_read;
-  pfd_read.fd = sock;
-  pfd_read.events = POLLIN | POLLOUT;
-
-  auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
-
-  auto poll_res = handle_EINTR([&]() { return poll_wrapper(&pfd_read, 1, timeout); });
-
-  if (poll_res == 0) { return Error::ConnectionTimeout; }
-
-  if (poll_res > 0 && pfd_read.revents & (POLLIN | POLLOUT)) {
-    auto error = 0;
-    socklen_t len = sizeof(error);
-    auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&error), &len);
-    auto successful = res >= 0 && !error;
-    return successful ? Error::Success : Error::Connection;
-  }
-
-  return Error::Connection;
-}
-
-inline bool is_socket_alive(socket_t sock) {
-  const auto val = detail::select_read(sock, 0, 0);
-  if (val == 0) {
-    return true;
-  } else if (val < 0 && errno == EBADF) {
-    return false;
-  }
-  char buf[1];
-  return detail::read_socket(sock, &buf[0], sizeof(buf), MSG_PEEK) > 0;
-}
-
-class SocketStream final : public Stream {
-public:
-  SocketStream(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec = 0, std::chrono::time_point<std::chrono::steady_clock> start_time = (std::chrono::steady_clock::time_point::min)());
-  ~SocketStream() override;
-
-  bool is_readable() const override;
-  bool wait_readable() const override;
-  bool wait_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-  time_t duration() const override;
-
-private:
-  socket_t sock_;
-  time_t read_timeout_sec_;
-  time_t read_timeout_usec_;
-  time_t write_timeout_sec_;
-  time_t write_timeout_usec_;
-  time_t max_timeout_msec_;
-  const std::chrono::time_point<std::chrono::steady_clock> start_time_;
-
-  std::vector<char> read_buff_;
-  size_t read_buff_off_ = 0;
-  size_t read_buff_content_size_ = 0;
-
-  static const size_t read_buff_size_ = 1024l * 4;
-};
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLSocketStream final : public Stream {
-public:
-  SSLSocketStream(socket_t sock, SSL *ssl, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec = 0, std::chrono::time_point<std::chrono::steady_clock> start_time = (std::chrono::steady_clock::time_point::min)());
-  ~SSLSocketStream() override;
-
-  bool is_readable() const override;
-  bool wait_readable() const override;
-  bool wait_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-  time_t duration() const override;
-
-private:
-  socket_t sock_;
-  SSL *ssl_;
-  time_t read_timeout_sec_;
-  time_t read_timeout_usec_;
-  time_t write_timeout_sec_;
-  time_t write_timeout_usec_;
-  time_t
-      max_timeout_msec_;
-  const std::chrono::time_point<std::chrono::steady_clock> start_time_;
-};
-#endif
-
-inline bool keep_alive(const std::atomic<socket_t> &svr_sock, socket_t sock, time_t keep_alive_timeout_sec) {
-  using namespace std::chrono;
-
-  const auto interval_usec = CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND;
-
-  // Avoid expensive `steady_clock::now()` call for the first time
-  if (select_read(sock, 0, interval_usec) > 0) { return true; }
-
-  const auto start = steady_clock::now() - microseconds{interval_usec};
-  const auto timeout = seconds{keep_alive_timeout_sec};
-
-  while (true) {
-    if (svr_sock == INVALID_SOCKET) {
-      break; // Server socket is closed
-    }
-
-    auto val = select_read(sock, 0, interval_usec);
-    if (val < 0) {
-      break; // Ssocket error
-    } else if (val == 0) {
-      if (steady_clock::now() - start > timeout) {
-        break; // Timeout
-      }
-    } else {
-      return true; // Ready for read
-    }
-  }
-
-  return false;
-}
-
-template <typename T>
-inline bool process_server_socket_core(const std::atomic<socket_t> &svr_sock, socket_t sock, size_t keep_alive_max_count, time_t keep_alive_timeout_sec, T callback) {
-  assert(keep_alive_max_count > 0);
-  auto ret = false;
-  auto count = keep_alive_max_count;
-  while (count > 0 && keep_alive(svr_sock, sock, keep_alive_timeout_sec)) {
-    auto close_connection = count == 1;
-    auto connection_closed = false;
-    ret = callback(close_connection, connection_closed);
-    if (!ret || connection_closed) { break; }
-    count--;
-  }
-  return ret;
-}
-
-template <typename T>
-inline bool process_server_socket(const std::atomic<socket_t> &svr_sock, socket_t sock, size_t keep_alive_max_count, time_t keep_alive_timeout_sec, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, T callback) {
-  return process_server_socket_core(
-      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
-      [&](bool close_connection, bool &connection_closed) {
-        SocketStream strm(sock, read_timeout_sec, read_timeout_usec, write_timeout_sec, write_timeout_usec);
-        return callback(strm, close_connection, connection_closed);
-      });
-}
-
-inline bool process_client_socket(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, time_t max_timeout_msec, std::chrono::time_point<std::chrono::steady_clock> start_time, std::function<bool(Stream &strm)> callback) {
-  SocketStream strm(sock, read_timeout_sec, read_timeout_usec, write_timeout_sec, write_timeout_usec, max_timeout_msec, start_time);
-  return callback(strm);
-}
-
-inline int shutdown_socket(socket_t sock) {
-#ifdef _WIN32
-  return shutdown(sock, SD_BOTH);
-#else
-  return shutdown(sock, SHUT_RDWR);
-#endif
-}
-
-inline std::string escape_abstract_namespace_unix_domain(const std::string &s) {
-  if (s.size() > 1 && s[0] == '\0') {
-    auto ret = s;
-    ret[0] = '@';
-    return ret;
-  }
-  return s;
-}
-
-inline std::string unescape_abstract_namespace_unix_domain(const std::string &s) {
-  if (s.size() > 1 && s[0] == '@') {
-    auto ret = s;
-    ret[0] = '\0';
-    return ret;
-  }
-  return s;
-}
-
-template <typename BindOrConnect>
-socket_t create_socket(const std::string &host, const std::string &ip, int port, int address_family, int socket_flags, bool tcp_nodelay, bool ipv6_v6only, SocketOptions socket_options, BindOrConnect bind_or_connect) {
-  // Get address info
-  const char *node = nullptr;
-  struct addrinfo hints;
-  struct addrinfo *result;
-
-  memset(&hints, 0, sizeof(struct addrinfo));
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = IPPROTO_IP;
-
-  if (!ip.empty()) {
-    node = ip.c_str();
-    // Ask getaddrinfo to
-    // convert IP in c-string to address
-    hints.ai_family = AF_UNSPEC;
-    hints.ai_flags = AI_NUMERICHOST;
-  } else {
-    if (!host.empty()) { node = host.c_str(); }
-    hints.ai_family = address_family;
-    hints.ai_flags = socket_flags;
-  }
-
-  if (hints.ai_family == AF_UNIX) {
-    const auto addrlen = host.length();
-    if (addrlen > sizeof(sockaddr_un::sun_path)) { return INVALID_SOCKET; }
-
-#ifdef SOCK_CLOEXEC
-    auto sock = socket(hints.ai_family, hints.ai_socktype | SOCK_CLOEXEC, hints.ai_protocol);
-#else
-    auto sock = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol);
-#endif
-
-    if (sock != INVALID_SOCKET) {
-      sockaddr_un addr{};
-      addr.sun_family = AF_UNIX;
-
-      auto unescaped_host = unescape_abstract_namespace_unix_domain(host);
-      std::copy(unescaped_host.begin(), unescaped_host.end(), addr.sun_path);
-
-      hints.ai_addr = reinterpret_cast<sockaddr *>(&addr);
-      hints.ai_addrlen = static_cast<socklen_t>(sizeof(addr) - sizeof(addr.sun_path) + addrlen);
-
-#ifndef SOCK_CLOEXEC
-#ifndef _WIN32
-      fcntl(sock, F_SETFD, FD_CLOEXEC);
-#endif
-#endif
-
-      if (socket_options) { socket_options(sock); }
-
-#ifdef _WIN32
-      // Setting SO_REUSEADDR seems not to work well with AF_UNIX on windows, so
-      // remove the option.
-      detail::set_socket_opt(sock, SOL_SOCKET, SO_REUSEADDR, 0);
-#endif
-
-      bool dummy;
-      if (!bind_or_connect(sock, hints, dummy)) {
-        close_socket(sock);
-        sock = INVALID_SOCKET;
-      }
-    }
-    return sock;
-  }
-
-  auto service = std::to_string(port);
-
-  if (getaddrinfo(node, service.c_str(), &hints, &result)) {
-#if defined __linux__ && !defined __ANDROID__
-    res_init();
-#endif
-    return INVALID_SOCKET;
-  }
-  auto se = detail::scope_exit([&] { freeaddrinfo(result); });
-
-  for (auto rp = result; rp; rp = rp->ai_next) {
-    // Create a socket
-#ifdef _WIN32
-    auto sock = WSASocketW(rp->ai_family, rp->ai_socktype, rp->ai_protocol, nullptr, 0, WSA_FLAG_NO_HANDLE_INHERIT | WSA_FLAG_OVERLAPPED);
-    /**
-     * Since the WSA_FLAG_NO_HANDLE_INHERIT is only supported on Windows 7 SP1
-     * and above the socket creation fails on older Windows Systems.
-     *
-     * Let's try to create a socket the old way in this case.
-     *
-     * Reference:
-     * https://docs.microsoft.com/en-us/windows/win32/api/winsock2/nf-winsock2-wsasocketa
-     *
-     * WSA_FLAG_NO_HANDLE_INHERIT:
-     * This flag is supported on Windows 7 with SP1, Windows Server 2008 R2 with
-     * SP1, and later
-     *
-     */
-    if (sock == INVALID_SOCKET) {
-      sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
-    }
-#else
-
-#ifdef SOCK_CLOEXEC
-    auto sock = socket(rp->ai_family, rp->ai_socktype | SOCK_CLOEXEC, rp->ai_protocol);
-#else
-    auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
-#endif
-
-#endif
-    if (sock == INVALID_SOCKET) { continue; }
-
-#if !defined _WIN32 && !defined SOCK_CLOEXEC
-    if (fcntl(sock, F_SETFD, FD_CLOEXEC) == -1) {
-      close_socket(sock);
-      continue;
-    }
-#endif
-
-    if (tcp_nodelay) { set_socket_opt(sock, IPPROTO_TCP, TCP_NODELAY, 1); }
-
-    if (rp->ai_family == AF_INET6) {
-      set_socket_opt(sock, IPPROTO_IPV6, IPV6_V6ONLY, ipv6_v6only ? 1 : 0);
-    }
-
-    if (socket_options) { socket_options(sock); }
-
-    // bind or connect
-    auto quit = false;
-    if (bind_or_connect(sock, *rp, quit)) { return sock; }
-
-    close_socket(sock);
-
-    if (quit) { break; }
-  }
-
-  return INVALID_SOCKET;
-}
-
-inline void set_nonblocking(socket_t sock, bool nonblocking) {
-#ifdef _WIN32
-  auto flags = nonblocking ? 1UL : 0UL;
-  ioctlsocket(sock, FIONBIO, &flags);
-#else
-  auto flags = fcntl(sock, F_GETFL, 0);
-  fcntl(sock, F_SETFL,
-        nonblocking ?
-            (flags | O_NONBLOCK) : (flags & (~O_NONBLOCK)));
-#endif
-}
-
-inline bool is_connection_error() {
-#ifdef _WIN32
-  return WSAGetLastError() != WSAEWOULDBLOCK;
-#else
-  return errno != EINPROGRESS;
-#endif
-}
-
-inline bool bind_ip_address(socket_t sock, const std::string &host) {
-  struct addrinfo hints;
-  struct addrinfo *result;
-
-  memset(&hints, 0, sizeof(struct addrinfo));
-  hints.ai_family = AF_UNSPEC;
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = 0;
-
-  if (getaddrinfo(host.c_str(), "0", &hints, &result)) { return false; }
-  auto se = detail::scope_exit([&] { freeaddrinfo(result); });
-
-  auto ret = false;
-  for (auto rp = result; rp; rp = rp->ai_next) {
-    const auto &ai = *rp;
-    if (!::bind(sock, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen))) {
-      ret = true;
-      break;
-    }
-  }
-
-  return ret;
-}
-
-#if !defined _WIN32 && !defined ANDROID && !defined _AIX && !defined __MVS__
-#define USE_IF2IP
-#endif
-
-#ifdef USE_IF2IP
-inline std::string if2ip(int address_family, const std::string &ifn) {
-  struct ifaddrs *ifap;
-  getifaddrs(&ifap);
-  auto se = detail::scope_exit([&] { freeifaddrs(ifap); });
-
-  std::string addr_candidate;
-  for (auto ifa = ifap; ifa; ifa = ifa->ifa_next) {
-    if (ifa->ifa_addr && ifn == ifa->ifa_name &&
-        (AF_UNSPEC == address_family || ifa->ifa_addr->sa_family == address_family)) {
-      if (ifa->ifa_addr->sa_family == AF_INET) {
-        auto sa = reinterpret_cast<struct sockaddr_in *>(ifa->ifa_addr);
-        char buf[INET_ADDRSTRLEN];
-        if (inet_ntop(AF_INET, &sa->sin_addr, buf, INET_ADDRSTRLEN)) {
-          return std::string(buf, INET_ADDRSTRLEN);
-        }
-      } else if (ifa->ifa_addr->sa_family == AF_INET6) {
-        auto sa = reinterpret_cast<struct sockaddr_in6 *>(ifa->ifa_addr);
-        if (!IN6_IS_ADDR_LINKLOCAL(&sa->sin6_addr)) {
-          char buf[INET6_ADDRSTRLEN] = {};
-          if (inet_ntop(AF_INET6, &sa->sin6_addr, buf, INET6_ADDRSTRLEN)) {
-            // equivalent to mac's IN6_IS_ADDR_UNIQUE_LOCAL
-            auto s6_addr_head = sa->sin6_addr.s6_addr[0];
-            if (s6_addr_head == 0xfc || s6_addr_head == 0xfd) {
-              addr_candidate = std::string(buf, INET6_ADDRSTRLEN);
-            } else {
-              return std::string(buf, INET6_ADDRSTRLEN);
-            }
-          }
-        }
-      }
-    }
-  }
-  return addr_candidate;
-}
-#endif
-
-inline socket_t create_client_socket(const std::string &host, const std::string &ip, int port, int address_family, bool tcp_nodelay, bool ipv6_v6only, SocketOptions socket_options, time_t connection_timeout_sec, time_t connection_timeout_usec, time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec, time_t write_timeout_usec, const std::string &intf, Error &error) {
-  auto sock = create_socket(
-      host, ip, port, address_family, 0, tcp_nodelay, ipv6_v6only, std::move(socket_options),
-      [&](socket_t sock2, struct addrinfo &ai, bool &quit) -> bool {
-        if (!intf.empty()) {
-#ifdef USE_IF2IP
-          auto ip_from_if = if2ip(address_family, intf);
-          if (ip_from_if.empty()) { ip_from_if = intf; }
-          if (!bind_ip_address(sock2, ip_from_if)) {
-            error = Error::BindIPAddress;
-            return false;
-          }
-#endif
-        }
-
-        set_nonblocking(sock2, true);
-
-        auto ret = ::connect(sock2, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen));
-
-        if (ret < 0) {
-          if (is_connection_error()) {
-            error = Error::Connection;
-            return false;
-          }
-          error = wait_until_socket_is_ready(sock2, connection_timeout_sec, connection_timeout_usec);
-          if (error != Error::Success) {
-            if (error == Error::ConnectionTimeout) { quit = true; }
-            return false;
-          }
-        }
-
-        set_nonblocking(sock2, false);
-        set_socket_opt_time(sock2, SOL_SOCKET, SO_RCVTIMEO, read_timeout_sec, read_timeout_usec);
-        set_socket_opt_time(sock2, SOL_SOCKET, SO_SNDTIMEO, write_timeout_sec, write_timeout_usec);
-
-        error = Error::Success;
-        return true;
-      });
-
-  if (sock != INVALID_SOCKET) {
-    error = Error::Success;
-  } else {
-    if (error == Error::Success) { error = Error::Connection; }
-  }
-
-  return sock;
-}
-
-inline bool get_ip_and_port(const struct sockaddr_storage &addr, socklen_t addr_len, std::string &ip, int &port) {
-  if (addr.ss_family == AF_INET) {
-    port = ntohs(reinterpret_cast<const struct sockaddr_in *>(&addr)->sin_port);
-  } else if (addr.ss_family == AF_INET6) {
-    port = ntohs(reinterpret_cast<const struct sockaddr_in6 *>(&addr)->sin6_port);
-  } else {
-    return false;
-  }
-
-  std::array<char, NI_MAXHOST> ipstr{};
-  if (getnameinfo(reinterpret_cast<const struct sockaddr *>(&addr), addr_len, ipstr.data(), static_cast<socklen_t>(ipstr.size()), nullptr, 0, NI_NUMERICHOST)) {
-    return false;
-  }
-
-  ip = ipstr.data();
-  return true;
-}
-
-inline void get_local_ip_and_port(socket_t sock, std::string &ip, int &port) {
-  struct sockaddr_storage addr;
-  socklen_t addr_len = sizeof(addr);
-  if (!getsockname(sock, reinterpret_cast<struct sockaddr *>(&addr), &addr_len)) {
-    get_ip_and_port(addr, addr_len, ip, port);
-  }
-}
-
-inline void get_remote_ip_and_port(socket_t sock, std::string &ip, int &port) {
-  struct sockaddr_storage addr;
-  socklen_t addr_len = sizeof(addr);
-
-  if (!getpeername(sock, reinterpret_cast<struct sockaddr *>(&addr), &addr_len)) {
-#ifndef _WIN32
-    if (addr.ss_family == AF_UNIX) {
-#if defined(__linux__)
-      struct ucred ucred;
-      socklen_t len = sizeof(ucred);
-      if (getsockopt(sock, SOL_SOCKET, SO_PEERCRED, &ucred, &len) == 0) {
-        port = ucred.pid;
-      }
-#elif defined(SOL_LOCAL) && defined(SO_PEERPID) // __APPLE__
-      pid_t pid;
-      socklen_t len = sizeof(pid);
-      if (getsockopt(sock, SOL_LOCAL, SO_PEERPID, &pid, &len) == 0) {
-        port = pid;
-      }
-#endif
-      return;
-    }
-#endif
-    get_ip_and_port(addr, addr_len, ip, port);
-  }
-}
-
-inline constexpr unsigned int str2tag_core(const char *s, size_t l, unsigned int h) {
-  return (l == 0)
-             ?
h - : str2tag_core( - s + 1, l - 1, - // Unsets the 6 high bits of h, therefore no overflow happens - (((std::numeric_limits::max)() >> 6) & - h * 33) ^ - static_cast(*s)); -} - -inline unsigned int str2tag(const std::string &s) { - return str2tag_core(s.data(), s.size(), 0); -} - -namespace udl { - -inline constexpr unsigned int operator""_t(const char *s, size_t l) { - return str2tag_core(s, l, 0); -} - -} // namespace udl - -inline std::string -find_content_type(const std::string &path, - const std::map &user_data, - const std::string &default_content_type) { - auto ext = file_extension(path); - - auto it = user_data.find(ext); - if (it != user_data.end()) { return it->second; } - - using udl::operator""_t; - - switch (str2tag(ext)) { - default: return default_content_type; - - case "css"_t: return "text/css"; - case "csv"_t: return "text/csv"; - case "htm"_t: - case "html"_t: return "text/html"; - case "js"_t: - case "mjs"_t: return "text/javascript"; - case "txt"_t: return "text/plain"; - case "vtt"_t: return "text/vtt"; - - case "apng"_t: return "image/apng"; - case "avif"_t: return "image/avif"; - case "bmp"_t: return "image/bmp"; - case "gif"_t: return "image/gif"; - case "png"_t: return "image/png"; - case "svg"_t: return "image/svg+xml"; - case "webp"_t: return "image/webp"; - case "ico"_t: return "image/x-icon"; - case "tif"_t: return "image/tiff"; - case "tiff"_t: return "image/tiff"; - case "jpg"_t: - case "jpeg"_t: return "image/jpeg"; - - case "mp4"_t: return "video/mp4"; - case "mpeg"_t: return "video/mpeg"; - case "webm"_t: return "video/webm"; - - case "mp3"_t: return "audio/mp3"; - case "mpga"_t: return "audio/mpeg"; - case "weba"_t: return "audio/webm"; - case "wav"_t: return "audio/wave"; - - case "otf"_t: return "font/otf"; - case "ttf"_t: return "font/ttf"; - case "woff"_t: return "font/woff"; - case "woff2"_t: return "font/woff2"; - - case "7z"_t: return "application/x-7z-compressed"; - case "atom"_t: return "application/atom+xml"; - case "pdf"_t: return "application/pdf"; - case "json"_t: return "application/json"; - case "rss"_t: return "application/rss+xml"; - case "tar"_t: return "application/x-tar"; - case "xht"_t: - case "xhtml"_t: return "application/xhtml+xml"; - case "xslt"_t: return "application/xslt+xml"; - case "xml"_t: return "application/xml"; - case "gz"_t: return "application/gzip"; - case "zip"_t: return "application/zip"; - case "wasm"_t: return "application/wasm"; - } -} - -inline bool can_compress_content_type(const std::string &content_type) { - using udl::operator""_t; - - auto tag = str2tag(content_type); - - switch (tag) { - case "image/svg+xml"_t: - case "application/javascript"_t: - case "application/json"_t: - case "application/xml"_t: - case "application/protobuf"_t: - case "application/xhtml+xml"_t: return true; - - case "text/event-stream"_t: return false; - - default: return !content_type.rfind("text/", 0); - } -} - -inline EncodingType encoding_type(const Request &req, const Response &res) { - auto ret = - detail::can_compress_content_type(res.get_header_value("Content-Type")); - if (!ret) { return EncodingType::None; } - - const auto &s = req.get_header_value("Accept-Encoding"); - (void)(s); - -#ifdef CPPHTTPLIB_BROTLI_SUPPORT - // TODO: 'Accept-Encoding' has br, not br;q=0 - ret = s.find("br") != std::string::npos; - if (ret) { return EncodingType::Brotli; } -#endif - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - // TODO: 'Accept-Encoding' has gzip, not gzip;q=0 - ret = s.find("gzip") != std::string::npos; - if (ret) { return 
EncodingType::Gzip; } -#endif - -#ifdef CPPHTTPLIB_ZSTD_SUPPORT - // TODO: 'Accept-Encoding' has zstd, not zstd;q=0 - ret = s.find("zstd") != std::string::npos; - if (ret) { return EncodingType::Zstd; } -#endif - - return EncodingType::None; -} - -inline bool nocompressor::compress(const char *data, size_t data_length, - bool /*last*/, Callback callback) { - if (!data_length) { return true; } - return callback(data, data_length); -} - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT -inline gzip_compressor::gzip_compressor() { - std::memset(&strm_, 0, sizeof(strm_)); - strm_.zalloc = Z_NULL; - strm_.zfree = Z_NULL; - strm_.opaque = Z_NULL; - - is_valid_ = deflateInit2(&strm_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, - Z_DEFAULT_STRATEGY) == Z_OK; -} - -inline gzip_compressor::~gzip_compressor() { deflateEnd(&strm_); } - -inline bool gzip_compressor::compress(const char *data, size_t data_length, - bool last, Callback callback) { - assert(is_valid_); - - do { - constexpr size_t max_avail_in = - (std::numeric_limits::max)(); - - strm_.avail_in = static_cast( - (std::min)(data_length, max_avail_in)); - strm_.next_in = const_cast(reinterpret_cast(data)); - - data_length -= strm_.avail_in; - data += strm_.avail_in; - - auto flush = (last && data_length == 0) ? Z_FINISH : Z_NO_FLUSH; - auto ret = Z_OK; - - std::array buff{}; - do { - strm_.avail_out = static_cast(buff.size()); - strm_.next_out = reinterpret_cast(buff.data()); - - ret = deflate(&strm_, flush); - if (ret == Z_STREAM_ERROR) { return false; } - - if (!callback(buff.data(), buff.size() - strm_.avail_out)) { - return false; - } - } while (strm_.avail_out == 0); - - assert((flush == Z_FINISH && ret == Z_STREAM_END) || - (flush == Z_NO_FLUSH && ret == Z_OK)); - assert(strm_.avail_in == 0); - } while (data_length > 0); - - return true; -} - -inline gzip_decompressor::gzip_decompressor() { - std::memset(&strm_, 0, sizeof(strm_)); - strm_.zalloc = Z_NULL; - strm_.zfree = Z_NULL; - strm_.opaque = Z_NULL; - - // 15 is the value of wbits, which should be at the maximum possible value - // to ensure that any gzip stream can be decoded. The offset of 32 specifies - // that the stream type should be automatically detected either gzip or - // deflate. 
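// [editor's note] Per the zlib manual: windowBits 8..15 sets the inflate
// window size, adding 32 enables automatic zlib/gzip header detection, and
// adding 16 would accept gzip only, hence the 32 + 15 argument below.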
- is_valid_ = inflateInit2(&strm_, 32 + 15) == Z_OK; -} - -inline gzip_decompressor::~gzip_decompressor() { inflateEnd(&strm_); } - -inline bool gzip_decompressor::is_valid() const { return is_valid_; } - -inline bool gzip_decompressor::decompress(const char *data, size_t data_length, - Callback callback) { - assert(is_valid_); - - auto ret = Z_OK; - - do { - constexpr size_t max_avail_in = - (std::numeric_limits::max)(); - - strm_.avail_in = static_cast( - (std::min)(data_length, max_avail_in)); - strm_.next_in = const_cast(reinterpret_cast(data)); - - data_length -= strm_.avail_in; - data += strm_.avail_in; - - std::array buff{}; - while (strm_.avail_in > 0 && ret == Z_OK) { - strm_.avail_out = static_cast(buff.size()); - strm_.next_out = reinterpret_cast(buff.data()); - - ret = inflate(&strm_, Z_NO_FLUSH); - - assert(ret != Z_STREAM_ERROR); - switch (ret) { - case Z_NEED_DICT: - case Z_DATA_ERROR: - case Z_MEM_ERROR: inflateEnd(&strm_); return false; - } - - if (!callback(buff.data(), buff.size() - strm_.avail_out)) { - return false; - } - } - - if (ret != Z_OK && ret != Z_STREAM_END) { return false; } - - } while (data_length > 0); - - return true; -} -#endif - -#ifdef CPPHTTPLIB_BROTLI_SUPPORT -inline brotli_compressor::brotli_compressor() { - state_ = BrotliEncoderCreateInstance(nullptr, nullptr, nullptr); -} - -inline brotli_compressor::~brotli_compressor() { - BrotliEncoderDestroyInstance(state_); -} - -inline bool brotli_compressor::compress(const char *data, size_t data_length, - bool last, Callback callback) { - std::array buff{}; - - auto operation = last ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS; - auto available_in = data_length; - auto next_in = reinterpret_cast(data); - - for (;;) { - if (last) { - if (BrotliEncoderIsFinished(state_)) { break; } - } else { - if (!available_in) { break; } - } - - auto available_out = buff.size(); - auto next_out = buff.data(); - - if (!BrotliEncoderCompressStream(state_, operation, &available_in, &next_in, - &available_out, &next_out, nullptr)) { - return false; - } - - auto output_bytes = buff.size() - available_out; - if (output_bytes) { - callback(reinterpret_cast(buff.data()), output_bytes); - } - } - - return true; -} - -inline brotli_decompressor::brotli_decompressor() { - decoder_s = BrotliDecoderCreateInstance(0, 0, 0); - decoder_r = decoder_s ? 
BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT - : BROTLI_DECODER_RESULT_ERROR; -} - -inline brotli_decompressor::~brotli_decompressor() { - if (decoder_s) { BrotliDecoderDestroyInstance(decoder_s); } -} - -inline bool brotli_decompressor::is_valid() const { return decoder_s; } - -inline bool brotli_decompressor::decompress(const char *data, - size_t data_length, - Callback callback) { - if (decoder_r == BROTLI_DECODER_RESULT_SUCCESS || - decoder_r == BROTLI_DECODER_RESULT_ERROR) { - return 0; - } - - auto next_in = reinterpret_cast(data); - size_t avail_in = data_length; - size_t total_out; - - decoder_r = BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT; - - std::array buff{}; - while (decoder_r == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) { - char *next_out = buff.data(); - size_t avail_out = buff.size(); - - decoder_r = BrotliDecoderDecompressStream( - decoder_s, &avail_in, &next_in, &avail_out, - reinterpret_cast(&next_out), &total_out); - - if (decoder_r == BROTLI_DECODER_RESULT_ERROR) { return false; } - - if (!callback(buff.data(), buff.size() - avail_out)) { return false; } - } - - return decoder_r == BROTLI_DECODER_RESULT_SUCCESS || - decoder_r == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT; -} -#endif - -#ifdef CPPHTTPLIB_ZSTD_SUPPORT -inline zstd_compressor::zstd_compressor() { - ctx_ = ZSTD_createCCtx(); - ZSTD_CCtx_setParameter(ctx_, ZSTD_c_compressionLevel, ZSTD_fast); -} - -inline zstd_compressor::~zstd_compressor() { ZSTD_freeCCtx(ctx_); } - -inline bool zstd_compressor::compress(const char *data, size_t data_length, - bool last, Callback callback) { - std::array buff{}; - - ZSTD_EndDirective mode = last ? ZSTD_e_end : ZSTD_e_continue; - ZSTD_inBuffer input = {data, data_length, 0}; - - bool finished; - do { - ZSTD_outBuffer output = {buff.data(), CPPHTTPLIB_COMPRESSION_BUFSIZ, 0}; - size_t const remaining = ZSTD_compressStream2(ctx_, &output, &input, mode); - - if (ZSTD_isError(remaining)) { return false; } - - if (!callback(buff.data(), output.pos)) { return false; } - - finished = last ? (remaining == 0) : (input.pos == input.size); - - } while (!finished); - - return true; -} - -inline zstd_decompressor::zstd_decompressor() { ctx_ = ZSTD_createDCtx(); } - -inline zstd_decompressor::~zstd_decompressor() { ZSTD_freeDCtx(ctx_); } - -inline bool zstd_decompressor::is_valid() const { return ctx_ != nullptr; } - -inline bool zstd_decompressor::decompress(const char *data, size_t data_length, - Callback callback) { - std::array buff{}; - ZSTD_inBuffer input = {data, data_length, 0}; - - while (input.pos < input.size) { - ZSTD_outBuffer output = {buff.data(), CPPHTTPLIB_COMPRESSION_BUFSIZ, 0}; - size_t const remaining = ZSTD_decompressStream(ctx_, &output, &input); - - if (ZSTD_isError(remaining)) { return false; } - - if (!callback(buff.data(), output.pos)) { return false; } - } - - return true; -} -#endif - -inline bool has_header(const Headers &headers, const std::string &key) { - return headers.find(key) != headers.end(); -} - -inline const char *get_header_value(const Headers &headers, - const std::string &key, const char *def, - size_t id) { - auto rng = headers.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second.c_str(); } - return def; -} - -template -inline bool parse_header(const char *beg, const char *end, T fn) { - // Skip trailing spaces and tabs. 
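// [editor's note] A field line has the shape 'field-name ":" OWS field-value
// OWS' (RFC 9112, Section 5). The loop below strips the trailing OWS; the
// parser then splits at the first ':' and skips the OWS that follows it.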
- while (beg < end && is_space_or_tab(end[-1])) { - end--; - } - - auto p = beg; - while (p < end && *p != ':') { - p++; - } - - auto name = std::string(beg, p); - if (!detail::fields::is_field_name(name)) { return false; } - - if (p == end) { return false; } - - auto key_end = p; - - if (*p++ != ':') { return false; } - - while (p < end && is_space_or_tab(*p)) { - p++; - } - - if (p <= end) { - auto key_len = key_end - beg; - if (!key_len) { return false; } - - auto key = std::string(beg, key_end); - auto val = std::string(p, end); - - if (!detail::fields::is_field_value(val)) { return false; } - - if (case_ignore::equal(key, "Location") || - case_ignore::equal(key, "Referer")) { - fn(key, val); - } else { - fn(key, decode_url(val, false)); - } - - return true; - } - - return false; -} - -inline bool read_headers(Stream &strm, Headers &headers) { - const auto bufsiz = 2048; - char buf[bufsiz]; - stream_line_reader line_reader(strm, buf, bufsiz); - - for (;;) { - if (!line_reader.getline()) { return false; } - - // Check if the line ends with CRLF. - auto line_terminator_len = 2; - if (line_reader.end_with_crlf()) { - // Blank line indicates end of headers. - if (line_reader.size() == 2) { break; } - } else { -#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR - // Blank line indicates end of headers. - if (line_reader.size() == 1) { break; } - line_terminator_len = 1; -#else - continue; // Skip invalid line. -#endif - } - - if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } - - // Exclude line terminator - auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; - - if (!parse_header(line_reader.ptr(), end, - [&](const std::string &key, const std::string &val) { - headers.emplace(key, val); - })) { - return false; - } - } - - return true; -} - -inline bool read_content_with_length(Stream &strm, uint64_t len, - Progress progress, - ContentReceiverWithProgress out) { - char buf[CPPHTTPLIB_RECV_BUFSIZ]; - - uint64_t r = 0; - while (r < len) { - auto read_len = static_cast<size_t>(len - r); - auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ)); - if (n <= 0) { return false; } - - if (!out(buf, static_cast<size_t>(n), r, len)) { return false; } - r += static_cast<uint64_t>(n); - - if (progress) { - if (!progress(r, len)) { return false; } - } - } - - return true; -} - -inline void skip_content_with_length(Stream &strm, uint64_t len) { - char buf[CPPHTTPLIB_RECV_BUFSIZ]; - uint64_t r = 0; - while (r < len) { - auto read_len = static_cast<size_t>(len - r); - auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ)); - if (n <= 0) { return; } - r += static_cast<uint64_t>(n); - } -} - -inline bool read_content_without_length(Stream &strm, - ContentReceiverWithProgress out) { - char buf[CPPHTTPLIB_RECV_BUFSIZ]; - uint64_t r = 0; - for (;;) { - auto n = strm.read(buf, CPPHTTPLIB_RECV_BUFSIZ); - if (n == 0) { return true; } - if (n < 0) { return false; } - - if (!out(buf, static_cast<size_t>(n), r, 0)) { return false; } - r += static_cast<uint64_t>(n); - } - - return true; -} - -template <typename T> -inline bool read_content_chunked(Stream &strm, T &x, - ContentReceiverWithProgress out) { - const auto bufsiz = 16; - char buf[bufsiz]; - - stream_line_reader line_reader(strm, buf, bufsiz); - - if (!line_reader.getline()) { return false; } - - unsigned long chunk_len; - while (true) { - char *end_ptr; - - chunk_len = std::strtoul(line_reader.ptr(), &end_ptr, 16); - - if (end_ptr == line_reader.ptr()) { return false; } - if (chunk_len == ULONG_MAX) { return false; } - - if (chunk_len == 0) { break; } - - if 
(!read_content_with_length(strm, chunk_len, nullptr, out)) { - return false; - } - - if (!line_reader.getline()) { return false; } - - if (strcmp(line_reader.ptr(), "\r\n") != 0) { return false; } - - if (!line_reader.getline()) { return false; } - } - - assert(chunk_len == 0); - - // NOTE: In RFC 9112, '7.1 Chunked Transfer Coding' mentions "The chunked - // transfer coding is complete when a chunk with a chunk-size of zero is - // received, possibly followed by a trailer section, and finally terminated by - // an empty line". https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1 - // - // In '7.1.3. Decoding Chunked', however, the pseudo-code in the section - // does't care for the existence of the final CRLF. In other words, it seems - // to be ok whether the final CRLF exists or not in the chunked data. - // https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1.3 - // - // According to the reference code in RFC 9112, cpp-httplib now allows - // chunked transfer coding data without the final CRLF. - if (!line_reader.getline()) { return true; } - - while (strcmp(line_reader.ptr(), "\r\n") != 0) { - if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } - - // Exclude line terminator - constexpr auto line_terminator_len = 2; - auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; - - parse_header(line_reader.ptr(), end, - [&](const std::string &key, const std::string &val) { - x.headers.emplace(key, val); - }); - - if (!line_reader.getline()) { return false; } - } - - return true; -} - -inline bool is_chunked_transfer_encoding(const Headers &headers) { - return case_ignore::equal( - get_header_value(headers, "Transfer-Encoding", "", 0), "chunked"); -} - -template -bool prepare_content_receiver(T &x, int &status, - ContentReceiverWithProgress receiver, - bool decompress, U callback) { - if (decompress) { - std::string encoding = x.get_header_value("Content-Encoding"); - std::unique_ptr decompressor; - - if (encoding == "gzip" || encoding == "deflate") { -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - decompressor = detail::make_unique(); -#else - status = StatusCode::UnsupportedMediaType_415; - return false; -#endif - } else if (encoding.find("br") != std::string::npos) { -#ifdef CPPHTTPLIB_BROTLI_SUPPORT - decompressor = detail::make_unique(); -#else - status = StatusCode::UnsupportedMediaType_415; - return false; -#endif - } else if (encoding == "zstd") { -#ifdef CPPHTTPLIB_ZSTD_SUPPORT - decompressor = detail::make_unique(); -#else - status = StatusCode::UnsupportedMediaType_415; - return false; -#endif - } - - if (decompressor) { - if (decompressor->is_valid()) { - ContentReceiverWithProgress out = [&](const char *buf, size_t n, - uint64_t off, uint64_t len) { - return decompressor->decompress(buf, n, - [&](const char *buf2, size_t n2) { - return receiver(buf2, n2, off, len); - }); - }; - return callback(std::move(out)); - } else { - status = StatusCode::InternalServerError_500; - return false; - } - } - } - - ContentReceiverWithProgress out = [&](const char *buf, size_t n, uint64_t off, - uint64_t len) { - return receiver(buf, n, off, len); - }; - return callback(std::move(out)); -} - -template -bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, - Progress progress, ContentReceiverWithProgress receiver, - bool decompress) { - return prepare_content_receiver( - x, status, std::move(receiver), decompress, - [&](const ContentReceiverWithProgress &out) { - auto ret = true; - auto exceed_payload_max_length = false; - - if 
(is_chunked_transfer_encoding(x.headers)) { - ret = read_content_chunked(strm, x, out); - } else if (!has_header(x.headers, "Content-Length")) { - ret = read_content_without_length(strm, out); - } else { - auto is_invalid_value = false; - auto len = get_header_value_u64( - x.headers, "Content-Length", - (std::numeric_limits::max)(), 0, is_invalid_value); - - if (is_invalid_value) { - ret = false; - } else if (len > payload_max_length) { - exceed_payload_max_length = true; - skip_content_with_length(strm, len); - ret = false; - } else if (len > 0) { - ret = read_content_with_length(strm, len, std::move(progress), out); - } - } - - if (!ret) { - status = exceed_payload_max_length ? StatusCode::PayloadTooLarge_413 - : StatusCode::BadRequest_400; - } - return ret; - }); -} - -inline ssize_t write_request_line(Stream &strm, const std::string &method, - const std::string &path) { - std::string s = method; - s += " "; - s += path; - s += " HTTP/1.1\r\n"; - return strm.write(s.data(), s.size()); -} - -inline ssize_t write_response_line(Stream &strm, int status) { - std::string s = "HTTP/1.1 "; - s += std::to_string(status); - s += " "; - s += httplib::status_message(status); - s += "\r\n"; - return strm.write(s.data(), s.size()); -} - -inline ssize_t write_headers(Stream &strm, const Headers &headers) { - ssize_t write_len = 0; - for (const auto &x : headers) { - std::string s; - s = x.first; - s += ": "; - s += x.second; - s += "\r\n"; - - auto len = strm.write(s.data(), s.size()); - if (len < 0) { return len; } - write_len += len; - } - auto len = strm.write("\r\n"); - if (len < 0) { return len; } - write_len += len; - return write_len; -} - -inline bool write_data(Stream &strm, const char *d, size_t l) { - size_t offset = 0; - while (offset < l) { - auto length = strm.write(d + offset, l - offset); - if (length < 0) { return false; } - offset += static_cast(length); - } - return true; -} - -template -inline bool write_content(Stream &strm, const ContentProvider &content_provider, - size_t offset, size_t length, T is_shutting_down, - Error &error) { - size_t end_offset = offset + length; - auto ok = true; - DataSink data_sink; - - data_sink.write = [&](const char *d, size_t l) -> bool { - if (ok) { - if (write_data(strm, d, l)) { - offset += l; - } else { - ok = false; - } - } - return ok; - }; - - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; - - while (offset < end_offset && !is_shutting_down()) { - if (!strm.wait_writable()) { - error = Error::Write; - return false; - } else if (!content_provider(offset, end_offset - offset, data_sink)) { - error = Error::Canceled; - return false; - } else if (!ok) { - error = Error::Write; - return false; - } - } - - error = Error::Success; - return true; -} - -template -inline bool write_content(Stream &strm, const ContentProvider &content_provider, - size_t offset, size_t length, - const T &is_shutting_down) { - auto error = Error::Success; - return write_content(strm, content_provider, offset, length, is_shutting_down, - error); -} - -template -inline bool -write_content_without_length(Stream &strm, - const ContentProvider &content_provider, - const T &is_shutting_down) { - size_t offset = 0; - auto data_available = true; - auto ok = true; - DataSink data_sink; - - data_sink.write = [&](const char *d, size_t l) -> bool { - if (ok) { - offset += l; - if (!write_data(strm, d, l)) { ok = false; } - } - return ok; - }; - - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; - - data_sink.done = [&](void) { 
data_available = false; }; - - while (data_available && !is_shutting_down()) { - if (!strm.wait_writable()) { - return false; - } else if (!content_provider(offset, 0, data_sink)) { - return false; - } else if (!ok) { - return false; - } - } - return true; -} - -template -inline bool -write_content_chunked(Stream &strm, const ContentProvider &content_provider, - const T &is_shutting_down, U &compressor, Error &error) { - size_t offset = 0; - auto data_available = true; - auto ok = true; - DataSink data_sink; - - data_sink.write = [&](const char *d, size_t l) -> bool { - if (ok) { - data_available = l > 0; - offset += l; - - std::string payload; - if (compressor.compress(d, l, false, - [&](const char *data, size_t data_len) { - payload.append(data, data_len); - return true; - })) { - if (!payload.empty()) { - // Emit chunked response header and footer for each chunk - auto chunk = - from_i_to_hex(payload.size()) + "\r\n" + payload + "\r\n"; - if (!write_data(strm, chunk.data(), chunk.size())) { ok = false; } - } - } else { - ok = false; - } - } - return ok; - }; - - data_sink.is_writable = [&]() -> bool { return strm.wait_writable(); }; - - auto done_with_trailer = [&](const Headers *trailer) { - if (!ok) { return; } - - data_available = false; - - std::string payload; - if (!compressor.compress(nullptr, 0, true, - [&](const char *data, size_t data_len) { - payload.append(data, data_len); - return true; - })) { - ok = false; - return; - } - - if (!payload.empty()) { - // Emit chunked response header and footer for each chunk - auto chunk = from_i_to_hex(payload.size()) + "\r\n" + payload + "\r\n"; - if (!write_data(strm, chunk.data(), chunk.size())) { - ok = false; - return; - } - } - - constexpr const char done_marker[] = "0\r\n"; - if (!write_data(strm, done_marker, str_len(done_marker))) { ok = false; } - - // Trailer - if (trailer) { - for (const auto &kv : *trailer) { - std::string field_line = kv.first + ": " + kv.second + "\r\n"; - if (!write_data(strm, field_line.data(), field_line.size())) { - ok = false; - } - } - } - - constexpr const char crlf[] = "\r\n"; - if (!write_data(strm, crlf, str_len(crlf))) { ok = false; } - }; - - data_sink.done = [&](void) { done_with_trailer(nullptr); }; - - data_sink.done_with_trailer = [&](const Headers &trailer) { - done_with_trailer(&trailer); - }; - - while (data_available && !is_shutting_down()) { - if (!strm.wait_writable()) { - error = Error::Write; - return false; - } else if (!content_provider(offset, 0, data_sink)) { - error = Error::Canceled; - return false; - } else if (!ok) { - error = Error::Write; - return false; - } - } - - error = Error::Success; - return true; -} - -template -inline bool write_content_chunked(Stream &strm, - const ContentProvider &content_provider, - const T &is_shutting_down, U &compressor) { - auto error = Error::Success; - return write_content_chunked(strm, content_provider, is_shutting_down, - compressor, error); -} - -template -inline bool redirect(T &cli, Request &req, Response &res, - const std::string &path, const std::string &location, - Error &error) { - Request new_req = req; - new_req.path = path; - new_req.redirect_count_ -= 1; - - if (res.status == StatusCode::SeeOther_303 && - (req.method != "GET" && req.method != "HEAD")) { - new_req.method = "GET"; - new_req.body.clear(); - new_req.headers.clear(); - } - - Response new_res; - - auto ret = cli.send(new_req, new_res, error); - if (ret) { - req = new_req; - res = new_res; - - if (res.location.empty()) { res.location = location; } - } - return ret; -} 
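// [editor's note] A minimal usage sketch of the redirect path above. The host
// and route are illustrative only; assume the server answers POST /submit
// with 303 See Other:
//
//   httplib::Client cli("http://example.com");
//   cli.set_follow_location(true);             // opt in to redirect()
//   auto res = cli.Post("/submit", "payload", "text/plain");
//   // The 303 retry is issued as a GET with an empty body, per the branch
//   // above, and each hop consumes one unit of redirect_count_.
//   if (res) { printf("%d\n", res->status); }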
- -inline std::string params_to_query_str(const Params &params) { - std::string query; - - for (auto it = params.begin(); it != params.end(); ++it) { - if (it != params.begin()) { query += "&"; } - query += it->first; - query += "="; - query += encode_query_param(it->second); - } - return query; -} - -inline void parse_query_text(const char *data, std::size_t size, - Params &params) { - std::set<std::string> cache; - split(data, data + size, '&', [&](const char *b, const char *e) { - std::string kv(b, e); - if (cache.find(kv) != cache.end()) { return; } - cache.insert(std::move(kv)); - - std::string key; - std::string val; - divide(b, static_cast<std::size_t>(e - b), '=', - [&](const char *lhs_data, std::size_t lhs_size, const char *rhs_data, - std::size_t rhs_size) { - key.assign(lhs_data, lhs_size); - val.assign(rhs_data, rhs_size); - }); - - if (!key.empty()) { - params.emplace(decode_url(key, true), decode_url(val, true)); - } - }); -} - -inline void parse_query_text(const std::string &s, Params &params) { - parse_query_text(s.data(), s.size(), params); -} - -inline bool parse_multipart_boundary(const std::string &content_type, - std::string &boundary) { - auto boundary_keyword = "boundary="; - auto pos = content_type.find(boundary_keyword); - if (pos == std::string::npos) { return false; } - auto end = content_type.find(';', pos); - auto beg = pos + strlen(boundary_keyword); - boundary = trim_double_quotes_copy(content_type.substr(beg, end - beg)); - return !boundary.empty(); -} - -inline void parse_disposition_params(const std::string &s, Params &params) { - std::set<std::string> cache; - split(s.data(), s.data() + s.size(), ';', [&](const char *b, const char *e) { - std::string kv(b, e); - if (cache.find(kv) != cache.end()) { return; } - cache.insert(kv); - - std::string key; - std::string val; - split(b, e, '=', [&](const char *b2, const char *e2) { - if (key.empty()) { - key.assign(b2, e2); - } else { - val.assign(b2, e2); - } - }); - - if (!key.empty()) { - params.emplace(trim_double_quotes_copy((key)), - trim_double_quotes_copy((val))); - } - }); -} - -#ifdef CPPHTTPLIB_NO_EXCEPTIONS -inline bool parse_range_header(const std::string &s, Ranges &ranges) { -#else -inline bool parse_range_header(const std::string &s, Ranges &ranges) try { -#endif - auto is_valid = [](const std::string &str) { - return std::all_of(str.cbegin(), str.cend(), - [](unsigned char c) { return std::isdigit(c); }); - }; - - if (s.size() > 7 && s.compare(0, 6, "bytes=") == 0) { - const auto pos = static_cast<size_t>(6); - const auto len = static_cast<size_t>(s.size() - 6); - auto all_valid_ranges = true; - split(&s[pos], &s[pos + len], ',', [&](const char *b, const char *e) { - if (!all_valid_ranges) { return; } - - const auto it = std::find(b, e, '-'); - if (it == e) { - all_valid_ranges = false; - return; - } - - const auto lhs = std::string(b, it); - const auto rhs = std::string(it + 1, e); - if (!is_valid(lhs) || !is_valid(rhs)) { - all_valid_ranges = false; - return; - } - - const auto first = - static_cast<ssize_t>(lhs.empty() ? -1 : std::stoll(lhs)); - const auto last = - static_cast<ssize_t>(rhs.empty() ? -1 : std::stoll(rhs)); - if ((first == -1 && last == -1) || - (first != -1 && last != -1 && first > last)) { - all_valid_ranges = false; - return; - } - - ranges.emplace_back(first, last); - }); - return all_valid_ranges && !ranges.empty(); - } - return false; -#ifdef CPPHTTPLIB_NO_EXCEPTIONS -} -#else -} catch (...) 
{ return false; } -#endif - -class MultipartFormDataParser { -public: - MultipartFormDataParser() = default; - - void set_boundary(std::string &&boundary) { - boundary_ = boundary; - dash_boundary_crlf_ = dash_ + boundary_ + crlf_; - crlf_dash_boundary_ = crlf_ + dash_ + boundary_; - } - - bool is_valid() const { return is_valid_; } - - bool parse(const char *buf, size_t n, const ContentReceiver &content_callback, - const MultipartContentHeader &header_callback) { - - buf_append(buf, n); - - while (buf_size() > 0) { - switch (state_) { - case 0: { // Initial boundary - buf_erase(buf_find(dash_boundary_crlf_)); - if (dash_boundary_crlf_.size() > buf_size()) { return true; } - if (!buf_start_with(dash_boundary_crlf_)) { return false; } - buf_erase(dash_boundary_crlf_.size()); - state_ = 1; - break; - } - case 1: { // New entry - clear_file_info(); - state_ = 2; - break; - } - case 2: { // Headers - auto pos = buf_find(crlf_); - if (pos > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } - while (pos < buf_size()) { - // Empty line - if (pos == 0) { - if (!header_callback(file_)) { - is_valid_ = false; - return false; - } - buf_erase(crlf_.size()); - state_ = 3; - break; - } - - const auto header = buf_head(pos); - - if (!parse_header(header.data(), header.data() + header.size(), - [&](const std::string &, const std::string &) {})) { - is_valid_ = false; - return false; - } - - constexpr const char header_content_type[] = "Content-Type:"; - - if (start_with_case_ignore(header, header_content_type)) { - file_.content_type = - trim_copy(header.substr(str_len(header_content_type))); - } else { - thread_local const std::regex re_content_disposition( - R"~(^Content-Disposition:\s*form-data;\s*(.*)$)~", - std::regex_constants::icase); - - std::smatch m; - if (std::regex_match(header, m, re_content_disposition)) { - Params params; - parse_disposition_params(m[1], params); - - auto it = params.find("name"); - if (it != params.end()) { - file_.name = it->second; - } else { - is_valid_ = false; - return false; - } - - it = params.find("filename"); - if (it != params.end()) { file_.filename = it->second; } - - it = params.find("filename*"); - if (it != params.end()) { - // Only allow UTF-8 encoding... - thread_local const std::regex re_rfc5987_encoding( - R"~(^UTF-8''(.+?)$)~", std::regex_constants::icase); - - std::smatch m2; - if (std::regex_match(it->second, m2, re_rfc5987_encoding)) { - file_.filename = decode_url(m2[1], false); // override... 
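// [editor's note] An RFC 5987 ext-value has the shape
// charset'language'percent-encoded-value, e.g. filename*=UTF-8''na%C3%AFve.txt;
// only the UTF-8 charset is accepted here, and the decoded name replaces any
// plain 'filename' parameter captured above.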
- } else { - is_valid_ = false; - return false; - } - } - } - } - buf_erase(pos + crlf_.size()); - pos = buf_find(crlf_); - } - if (state_ != 3) { return true; } - break; - } - case 3: { // Body - if (crlf_dash_boundary_.size() > buf_size()) { return true; } - auto pos = buf_find(crlf_dash_boundary_); - if (pos < buf_size()) { - if (!content_callback(buf_data(), pos)) { - is_valid_ = false; - return false; - } - buf_erase(pos + crlf_dash_boundary_.size()); - state_ = 4; - } else { - auto len = buf_size() - crlf_dash_boundary_.size(); - if (len > 0) { - if (!content_callback(buf_data(), len)) { - is_valid_ = false; - return false; - } - buf_erase(len); - } - return true; - } - break; - } - case 4: { // Boundary - if (crlf_.size() > buf_size()) { return true; } - if (buf_start_with(crlf_)) { - buf_erase(crlf_.size()); - state_ = 1; - } else { - if (dash_.size() > buf_size()) { return true; } - if (buf_start_with(dash_)) { - buf_erase(dash_.size()); - is_valid_ = true; - buf_erase(buf_size()); // Remove epilogue - } else { - return true; - } - } - break; - } - } - } - - return true; - } - -private: - void clear_file_info() { - file_.name.clear(); - file_.filename.clear(); - file_.content_type.clear(); - } - - bool start_with_case_ignore(const std::string &a, const char *b) const { - const auto b_len = strlen(b); - if (a.size() < b_len) { return false; } - for (size_t i = 0; i < b_len; i++) { - if (case_ignore::to_lower(a[i]) != case_ignore::to_lower(b[i])) { - return false; - } - } - return true; - } - - const std::string dash_ = "--"; - const std::string crlf_ = "\r\n"; - std::string boundary_; - std::string dash_boundary_crlf_; - std::string crlf_dash_boundary_; - - size_t state_ = 0; - bool is_valid_ = false; - MultipartFormData file_; - - // Buffer - bool start_with(const std::string &a, size_t spos, size_t epos, - const std::string &b) const { - if (epos - spos < b.size()) { return false; } - for (size_t i = 0; i < b.size(); i++) { - if (a[i + spos] != b[i]) { return false; } - } - return true; - } - - size_t buf_size() const { return buf_epos_ - buf_spos_; } - - const char *buf_data() const { return &buf_[buf_spos_]; } - - std::string buf_head(size_t l) const { return buf_.substr(buf_spos_, l); } - - bool buf_start_with(const std::string &s) const { - return start_with(buf_, buf_spos_, buf_epos_, s); - } - - size_t buf_find(const std::string &s) const { - auto c = s.front(); - - size_t off = buf_spos_; - while (off < buf_epos_) { - auto pos = off; - while (true) { - if (pos == buf_epos_) { return buf_size(); } - if (buf_[pos] == c) { break; } - pos++; - } - - auto remaining_size = buf_epos_ - pos; - if (s.size() > remaining_size) { return buf_size(); } - - if (start_with(buf_, pos, buf_epos_, s)) { return pos - buf_spos_; } - - off = pos + 1; - } - - return buf_size(); - } - - void buf_append(const char *data, size_t n) { - auto remaining_size = buf_size(); - if (remaining_size > 0 && buf_spos_ > 0) { - for (size_t i = 0; i < remaining_size; i++) { - buf_[i] = buf_[buf_spos_ + i]; - } - } - buf_spos_ = 0; - buf_epos_ = remaining_size; - - if (remaining_size + n > buf_.size()) { buf_.resize(remaining_size + n); } - - for (size_t i = 0; i < n; i++) { - buf_[buf_epos_ + i] = data[i]; - } - buf_epos_ += n; - } - - void buf_erase(size_t size) { buf_spos_ += size; } - - std::string buf_; - size_t buf_spos_ = 0; - size_t buf_epos_ = 0; -}; - -inline std::string random_string(size_t length) { - constexpr const char data[] = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - - 
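// [editor's note] 62 alphanumeric characters above; 'engine() % 62' below
// carries a slight modulo bias, harmless for multipart boundaries but not
// something to reuse for key material.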
thread_local auto engine([]() { - // std::random_device might actually be deterministic on some - // platforms, but due to lack of support in the c++ standard library, - // doing better requires either some ugly hacks or breaking portability. - std::random_device seed_gen; - // Request 128 bits of entropy for initialization - std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(), seed_gen()}; - return std::mt19937(seed_sequence); - }()); - - std::string result; - for (size_t i = 0; i < length; i++) { - result += data[engine() % (sizeof(data) - 1)]; - } - return result; -} - -inline std::string make_multipart_data_boundary() { - return "--cpp-httplib-multipart-data-" + detail::random_string(16); -} - -inline bool is_multipart_boundary_chars_valid(const std::string &boundary) { - auto valid = true; - for (size_t i = 0; i < boundary.size(); i++) { - auto c = boundary[i]; - if (!std::isalnum(c) && c != '-' && c != '_') { - valid = false; - break; - } - } - return valid; -} - -template -inline std::string -serialize_multipart_formdata_item_begin(const T &item, - const std::string &boundary) { - std::string body = "--" + boundary + "\r\n"; - body += "Content-Disposition: form-data; name=\"" + item.name + "\""; - if (!item.filename.empty()) { - body += "; filename=\"" + item.filename + "\""; - } - body += "\r\n"; - if (!item.content_type.empty()) { - body += "Content-Type: " + item.content_type + "\r\n"; - } - body += "\r\n"; - - return body; -} - -inline std::string serialize_multipart_formdata_item_end() { return "\r\n"; } - -inline std::string -serialize_multipart_formdata_finish(const std::string &boundary) { - return "--" + boundary + "--\r\n"; -} - -inline std::string -serialize_multipart_formdata_get_content_type(const std::string &boundary) { - return "multipart/form-data; boundary=" + boundary; -} - -inline std::string -serialize_multipart_formdata(const MultipartFormDataItems &items, - const std::string &boundary, bool finish = true) { - std::string body; - - for (const auto &item : items) { - body += serialize_multipart_formdata_item_begin(item, boundary); - body += item.content + serialize_multipart_formdata_item_end(); - } - - if (finish) { body += serialize_multipart_formdata_finish(boundary); } - - return body; -} - -inline bool range_error(Request &req, Response &res) { - if (!req.ranges.empty() && 200 <= res.status && res.status < 300) { - ssize_t content_len = static_cast( - res.content_length_ ? res.content_length_ : res.body.size()); - - ssize_t prev_first_pos = -1; - ssize_t prev_last_pos = -1; - size_t overwrapping_count = 0; - - // NOTE: The following Range check is based on '14.2. Range' in RFC 9110 - // 'HTTP Semantics' to avoid potential denial-of-service attacks. - // https://www.rfc-editor.org/rfc/rfc9110#section-14.2 - - // Too many ranges - if (req.ranges.size() > CPPHTTPLIB_RANGE_MAX_COUNT) { return true; } - - for (auto &r : req.ranges) { - auto &first_pos = r.first; - auto &last_pos = r.second; - - if (first_pos == -1 && last_pos == -1) { - first_pos = 0; - last_pos = content_len; - } - - if (first_pos == -1) { - first_pos = content_len - last_pos; - last_pos = content_len - 1; - } - - // NOTE: RFC-9110 '14.1.2. Byte Ranges': - // A client can limit the number of bytes requested without knowing the - // size of the selected representation. 
If the last-pos value is absent, - // or if the value is greater than or equal to the current length of the - // representation data, the byte range is interpreted as the remainder of - // the representation (i.e., the server replaces the value of last-pos - // with a value that is one less than the current length of the selected - // representation). - // https://www.rfc-editor.org/rfc/rfc9110.html#section-14.1.2-6 - if (last_pos == -1 || last_pos >= content_len) { - last_pos = content_len - 1; - } - - // Range must be within content length - if (!(0 <= first_pos && first_pos <= last_pos && - last_pos <= content_len - 1)) { - return true; - } - - // Ranges must be in ascending order - if (first_pos <= prev_first_pos) { return true; } - - // Request must not have more than two overlapping ranges - if (first_pos <= prev_last_pos) { - overwrapping_count++; - if (overwrapping_count > 2) { return true; } - } - - prev_first_pos = (std::max)(prev_first_pos, first_pos); - prev_last_pos = (std::max)(prev_last_pos, last_pos); - } - } - - return false; -} - -inline std::pair -get_range_offset_and_length(Range r, size_t content_length) { - assert(r.first != -1 && r.second != -1); - assert(0 <= r.first && r.first < static_cast(content_length)); - assert(r.first <= r.second && - r.second < static_cast(content_length)); - (void)(content_length); - return std::make_pair(r.first, static_cast(r.second - r.first) + 1); -} - -inline std::string make_content_range_header_field( - const std::pair &offset_and_length, size_t content_length) { - auto st = offset_and_length.first; - auto ed = st + offset_and_length.second - 1; - - std::string field = "bytes "; - field += std::to_string(st); - field += "-"; - field += std::to_string(ed); - field += "/"; - field += std::to_string(content_length); - return field; -} - -template -bool process_multipart_ranges_data(const Request &req, - const std::string &boundary, - const std::string &content_type, - size_t content_length, SToken stoken, - CToken ctoken, Content content) { - for (size_t i = 0; i < req.ranges.size(); i++) { - ctoken("--"); - stoken(boundary); - ctoken("\r\n"); - if (!content_type.empty()) { - ctoken("Content-Type: "); - stoken(content_type); - ctoken("\r\n"); - } - - auto offset_and_length = - get_range_offset_and_length(req.ranges[i], content_length); - - ctoken("Content-Range: "); - stoken(make_content_range_header_field(offset_and_length, content_length)); - ctoken("\r\n"); - ctoken("\r\n"); - - if (!content(offset_and_length.first, offset_and_length.second)) { - return false; - } - ctoken("\r\n"); - } - - ctoken("--"); - stoken(boundary); - ctoken("--"); - - return true; -} - -inline void make_multipart_ranges_data(const Request &req, Response &res, - const std::string &boundary, - const std::string &content_type, - size_t content_length, - std::string &data) { - process_multipart_ranges_data( - req, boundary, content_type, content_length, - [&](const std::string &token) { data += token; }, - [&](const std::string &token) { data += token; }, - [&](size_t offset, size_t length) { - assert(offset + length <= content_length); - data += res.body.substr(offset, length); - return true; - }); -} - -inline size_t get_multipart_ranges_data_length(const Request &req, - const std::string &boundary, - const std::string &content_type, - size_t content_length) { - size_t data_length = 0; - - process_multipart_ranges_data( - req, boundary, content_type, content_length, - [&](const std::string &token) { data_length += token.size(); }, - [&](const std::string 
&token) { data_length += token.size(); }, - [&](size_t /*offset*/, size_t length) { - data_length += length; - return true; - }); - - return data_length; -} - -template -inline bool -write_multipart_ranges_data(Stream &strm, const Request &req, Response &res, - const std::string &boundary, - const std::string &content_type, - size_t content_length, const T &is_shutting_down) { - return process_multipart_ranges_data( - req, boundary, content_type, content_length, - [&](const std::string &token) { strm.write(token); }, - [&](const std::string &token) { strm.write(token); }, - [&](size_t offset, size_t length) { - return write_content(strm, res.content_provider_, offset, length, - is_shutting_down); - }); -} - -inline bool expect_content(const Request &req) { - if (req.method == "POST" || req.method == "PUT" || req.method == "PATCH" || - req.method == "DELETE") { - return true; - } - if (req.has_header("Content-Length") && - req.get_header_value_u64("Content-Length") > 0) { - return true; - } - if (is_chunked_transfer_encoding(req.headers)) { return true; } - return false; -} - -inline bool has_crlf(const std::string &s) { - auto p = s.c_str(); - while (*p) { - if (*p == '\r' || *p == '\n') { return true; } - p++; - } - return false; -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline std::string message_digest(const std::string &s, const EVP_MD *algo) { - auto context = std::unique_ptr( - EVP_MD_CTX_new(), EVP_MD_CTX_free); - - unsigned int hash_length = 0; - unsigned char hash[EVP_MAX_MD_SIZE]; - - EVP_DigestInit_ex(context.get(), algo, nullptr); - EVP_DigestUpdate(context.get(), s.c_str(), s.size()); - EVP_DigestFinal_ex(context.get(), hash, &hash_length); - - std::stringstream ss; - for (auto i = 0u; i < hash_length; ++i) { - ss << std::hex << std::setw(2) << std::setfill('0') - << static_cast(hash[i]); - } - - return ss.str(); -} - -inline std::string MD5(const std::string &s) { - return message_digest(s, EVP_md5()); -} - -inline std::string SHA_256(const std::string &s) { - return message_digest(s, EVP_sha256()); -} - -inline std::string SHA_512(const std::string &s) { - return message_digest(s, EVP_sha512()); -} - -inline std::pair make_digest_authentication_header( - const Request &req, const std::map &auth, - size_t cnonce_count, const std::string &cnonce, const std::string &username, - const std::string &password, bool is_proxy = false) { - std::string nc; - { - std::stringstream ss; - ss << std::setfill('0') << std::setw(8) << std::hex << cnonce_count; - nc = ss.str(); - } - - std::string qop; - if (auth.find("qop") != auth.end()) { - qop = auth.at("qop"); - if (qop.find("auth-int") != std::string::npos) { - qop = "auth-int"; - } else if (qop.find("auth") != std::string::npos) { - qop = "auth"; - } else { - qop.clear(); - } - } - - std::string algo = "MD5"; - if (auth.find("algorithm") != auth.end()) { algo = auth.at("algorithm"); } - - std::string response; - { - auto H = algo == "SHA-256" ? detail::SHA_256 - : algo == "SHA-512" ? detail::SHA_512 - : detail::MD5; - - auto A1 = username + ":" + auth.at("realm") + ":" + password; - - auto A2 = req.method + ":" + req.path; - if (qop == "auth-int") { A2 += ":" + H(req.body); } - - if (qop.empty()) { - response = H(H(A1) + ":" + auth.at("nonce") + ":" + H(A2)); - } else { - response = H(H(A1) + ":" + auth.at("nonce") + ":" + nc + ":" + cnonce + - ":" + qop + ":" + H(A2)); - } - } - - auto opaque = (auth.find("opaque") != auth.end()) ? 
auth.at("opaque") : ""; - - auto field = "Digest username=\"" + username + "\", realm=\"" + - auth.at("realm") + "\", nonce=\"" + auth.at("nonce") + - "\", uri=\"" + req.path + "\", algorithm=" + algo + - (qop.empty() ? ", response=\"" - : ", qop=" + qop + ", nc=" + nc + ", cnonce=\"" + - cnonce + "\", response=\"") + - response + "\"" + - (opaque.empty() ? "" : ", opaque=\"" + opaque + "\""); - - auto key = is_proxy ? "Proxy-Authorization" : "Authorization"; - return std::make_pair(key, field); -} - -inline bool is_ssl_peer_could_be_closed(SSL *ssl, socket_t sock) { - detail::set_nonblocking(sock, true); - auto se = detail::scope_exit([&]() { detail::set_nonblocking(sock, false); }); - - char buf[1]; - return !SSL_peek(ssl, buf, 1) && - SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN; -} - -#ifdef _WIN32 -// NOTE: This code came up with the following stackoverflow post: -// https://stackoverflow.com/questions/9507184/can-openssl-on-windows-use-the-system-certificate-store -inline bool load_system_certs_on_windows(X509_STORE *store) { - auto hStore = CertOpenSystemStoreW((HCRYPTPROV_LEGACY)NULL, L"ROOT"); - if (!hStore) { return false; } - - auto result = false; - PCCERT_CONTEXT pContext = NULL; - while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) != - nullptr) { - auto encoded_cert = - static_cast(pContext->pbCertEncoded); - - auto x509 = d2i_X509(NULL, &encoded_cert, pContext->cbCertEncoded); - if (x509) { - X509_STORE_add_cert(store, x509); - X509_free(x509); - result = true; - } - } - - CertFreeCertificateContext(pContext); - CertCloseStore(hStore, 0); - - return result; -} -#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__) -#if TARGET_OS_OSX -template -using CFObjectPtr = - std::unique_ptr::type, void (*)(CFTypeRef)>; - -inline void cf_object_ptr_deleter(CFTypeRef obj) { - if (obj) { CFRelease(obj); } -} - -inline bool retrieve_certs_from_keychain(CFObjectPtr &certs) { - CFStringRef keys[] = {kSecClass, kSecMatchLimit, kSecReturnRef}; - CFTypeRef values[] = {kSecClassCertificate, kSecMatchLimitAll, - kCFBooleanTrue}; - - CFObjectPtr query( - CFDictionaryCreate(nullptr, reinterpret_cast(keys), values, - sizeof(keys) / sizeof(keys[0]), - &kCFTypeDictionaryKeyCallBacks, - &kCFTypeDictionaryValueCallBacks), - cf_object_ptr_deleter); - - if (!query) { return false; } - - CFTypeRef security_items = nullptr; - if (SecItemCopyMatching(query.get(), &security_items) != errSecSuccess || - CFArrayGetTypeID() != CFGetTypeID(security_items)) { - return false; - } - - certs.reset(reinterpret_cast(security_items)); - return true; -} - -inline bool retrieve_root_certs_from_keychain(CFObjectPtr &certs) { - CFArrayRef root_security_items = nullptr; - if (SecTrustCopyAnchorCertificates(&root_security_items) != errSecSuccess) { - return false; - } - - certs.reset(root_security_items); - return true; -} - -inline bool add_certs_to_x509_store(CFArrayRef certs, X509_STORE *store) { - auto result = false; - for (auto i = 0; i < CFArrayGetCount(certs); ++i) { - const auto cert = reinterpret_cast( - CFArrayGetValueAtIndex(certs, i)); - - if (SecCertificateGetTypeID() != CFGetTypeID(cert)) { continue; } - - CFDataRef cert_data = nullptr; - if (SecItemExport(cert, kSecFormatX509Cert, 0, nullptr, &cert_data) != - errSecSuccess) { - continue; - } - - CFObjectPtr cert_data_ptr(cert_data, cf_object_ptr_deleter); - - auto encoded_cert = static_cast( - CFDataGetBytePtr(cert_data_ptr.get())); - - auto x509 = - d2i_X509(NULL, &encoded_cert, CFDataGetLength(cert_data_ptr.get())); 
- - if (x509) { - X509_STORE_add_cert(store, x509); - X509_free(x509); - result = true; - } - } - - return result; -} - -inline bool load_system_certs_on_macos(X509_STORE *store) { - auto result = false; - CFObjectPtr certs(nullptr, cf_object_ptr_deleter); - if (retrieve_certs_from_keychain(certs) && certs) { - result = add_certs_to_x509_store(certs.get(), store); - } - - if (retrieve_root_certs_from_keychain(certs) && certs) { - result = add_certs_to_x509_store(certs.get(), store) || result; - } - - return result; -} -#endif // TARGET_OS_OSX -#endif // _WIN32 -#endif // CPPHTTPLIB_OPENSSL_SUPPORT - -#ifdef _WIN32 -class WSInit { -public: - WSInit() { - WSADATA wsaData; - if (WSAStartup(0x0002, &wsaData) == 0) is_valid_ = true; - } - - ~WSInit() { - if (is_valid_) WSACleanup(); - } - - bool is_valid_ = false; -}; - -static WSInit wsinit_; -#endif - -inline bool parse_www_authenticate(const Response &res, - std::map &auth, - bool is_proxy) { - auto auth_key = is_proxy ? "Proxy-Authenticate" : "WWW-Authenticate"; - if (res.has_header(auth_key)) { - thread_local auto re = - std::regex(R"~((?:(?:,\s*)?(.+?)=(?:"(.*?)"|([^,]*))))~"); - auto s = res.get_header_value(auth_key); - auto pos = s.find(' '); - if (pos != std::string::npos) { - auto type = s.substr(0, pos); - if (type == "Basic") { - return false; - } else if (type == "Digest") { - s = s.substr(pos + 1); - auto beg = std::sregex_iterator(s.begin(), s.end(), re); - for (auto i = beg; i != std::sregex_iterator(); ++i) { - const auto &m = *i; - auto key = s.substr(static_cast(m.position(1)), - static_cast(m.length(1))); - auto val = m.length(2) > 0 - ? s.substr(static_cast(m.position(2)), - static_cast(m.length(2))) - : s.substr(static_cast(m.position(3)), - static_cast(m.length(3))); - auth[key] = val; - } - return true; - } - } - } - return false; -} - -class ContentProviderAdapter { -public: - explicit ContentProviderAdapter( - ContentProviderWithoutLength &&content_provider) - : content_provider_(content_provider) {} - - bool operator()(size_t offset, size_t, DataSink &sink) { - return content_provider_(offset, sink); - } - -private: - ContentProviderWithoutLength content_provider_; -}; - -} // namespace detail - -inline std::string hosted_at(const std::string &hostname) { - std::vector addrs; - hosted_at(hostname, addrs); - if (addrs.empty()) { return std::string(); } - return addrs[0]; -} - -inline void hosted_at(const std::string &hostname, - std::vector &addrs) { - struct addrinfo hints; - struct addrinfo *result; - - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = 0; - - if (getaddrinfo(hostname.c_str(), nullptr, &hints, &result)) { -#if defined __linux__ && !defined __ANDROID__ - res_init(); -#endif - return; - } - auto se = detail::scope_exit([&] { freeaddrinfo(result); }); - - for (auto rp = result; rp; rp = rp->ai_next) { - const auto &addr = - *reinterpret_cast(rp->ai_addr); - std::string ip; - auto dummy = -1; - if (detail::get_ip_and_port(addr, sizeof(struct sockaddr_storage), ip, - dummy)) { - addrs.push_back(ip); - } - } -} - -inline std::string append_query_params(const std::string &path, - const Params ¶ms) { - std::string path_with_query = path; - thread_local const std::regex re("[^?]+\\?.*"); - auto delm = std::regex_match(path, re) ? 
'&' : '?'; - path_with_query += delm + detail::params_to_query_str(params); - return path_with_query; -} - -// Header utilities -inline std::pair -make_range_header(const Ranges &ranges) { - std::string field = "bytes="; - auto i = 0; - for (const auto &r : ranges) { - if (i != 0) { field += ", "; } - if (r.first != -1) { field += std::to_string(r.first); } - field += '-'; - if (r.second != -1) { field += std::to_string(r.second); } - i++; - } - return std::make_pair("Range", std::move(field)); -} - -inline std::pair -make_basic_authentication_header(const std::string &username, - const std::string &password, bool is_proxy) { - auto field = "Basic " + detail::base64_encode(username + ":" + password); - auto key = is_proxy ? "Proxy-Authorization" : "Authorization"; - return std::make_pair(key, std::move(field)); -} - -inline std::pair -make_bearer_token_authentication_header(const std::string &token, - bool is_proxy = false) { - auto field = "Bearer " + token; - auto key = is_proxy ? "Proxy-Authorization" : "Authorization"; - return std::make_pair(key, std::move(field)); -} - -// Request implementation -inline bool Request::has_header(const std::string &key) const { - return detail::has_header(headers, key); -} - -inline std::string Request::get_header_value(const std::string &key, - const char *def, size_t id) const { - return detail::get_header_value(headers, key, def, id); -} - -inline size_t Request::get_header_value_count(const std::string &key) const { - auto r = headers.equal_range(key); - return static_cast(std::distance(r.first, r.second)); -} - -inline void Request::set_header(const std::string &key, - const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { - headers.emplace(key, val); - } -} - -inline bool Request::has_param(const std::string &key) const { - return params.find(key) != params.end(); -} - -inline std::string Request::get_param_value(const std::string &key, - size_t id) const { - auto rng = params.equal_range(key); - auto it = rng.first; - std::advance(it, static_cast(id)); - if (it != rng.second) { return it->second; } - return std::string(); -} - -inline size_t Request::get_param_value_count(const std::string &key) const { - auto r = params.equal_range(key); - return static_cast(std::distance(r.first, r.second)); -} - -inline bool Request::is_multipart_form_data() const { - const auto &content_type = get_header_value("Content-Type"); - return !content_type.rfind("multipart/form-data", 0); -} - -inline bool Request::has_file(const std::string &key) const { - return files.find(key) != files.end(); -} - -inline MultipartFormData Request::get_file_value(const std::string &key) const { - auto it = files.find(key); - if (it != files.end()) { return it->second; } - return MultipartFormData(); -} - -inline std::vector -Request::get_file_values(const std::string &key) const { - std::vector values; - auto rng = files.equal_range(key); - for (auto it = rng.first; it != rng.second; it++) { - values.push_back(it->second); - } - return values; -} - -// Response implementation -inline bool Response::has_header(const std::string &key) const { - return headers.find(key) != headers.end(); -} - -inline std::string Response::get_header_value(const std::string &key, - const char *def, - size_t id) const { - return detail::get_header_value(headers, key, def, id); -} - -inline size_t Response::get_header_value_count(const std::string &key) const { - auto r = headers.equal_range(key); - return static_cast(std::distance(r.first, 
r.second)); -} - -inline void Response::set_header(const std::string &key, - const std::string &val) { - if (detail::fields::is_field_name(key) && - detail::fields::is_field_value(val)) { - headers.emplace(key, val); - } -} - -inline void Response::set_redirect(const std::string &url, int stat) { - if (detail::fields::is_field_value(url)) { - set_header("Location", url); - if (300 <= stat && stat < 400) { - this->status = stat; - } else { - this->status = StatusCode::Found_302; - } - } -} - -inline void Response::set_content(const char *s, size_t n, - const std::string &content_type) { - body.assign(s, n); - - auto rng = headers.equal_range("Content-Type"); - headers.erase(rng.first, rng.second); - set_header("Content-Type", content_type); -} - -inline void Response::set_content(const std::string &s, - const std::string &content_type) { - set_content(s.data(), s.size(), content_type); -} - -inline void Response::set_content(std::string &&s, - const std::string &content_type) { - body = std::move(s); - - auto rng = headers.equal_range("Content-Type"); - headers.erase(rng.first, rng.second); - set_header("Content-Type", content_type); -} - -inline void Response::set_content_provider( - size_t in_length, const std::string &content_type, ContentProvider provider, - ContentProviderResourceReleaser resource_releaser) { - set_header("Content-Type", content_type); - content_length_ = in_length; - if (in_length > 0) { content_provider_ = std::move(provider); } - content_provider_resource_releaser_ = std::move(resource_releaser); - is_chunked_content_provider_ = false; -} - -inline void Response::set_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser) { - set_header("Content-Type", content_type); - content_length_ = 0; - content_provider_ = detail::ContentProviderAdapter(std::move(provider)); - content_provider_resource_releaser_ = std::move(resource_releaser); - is_chunked_content_provider_ = false; -} - -inline void Response::set_chunked_content_provider( - const std::string &content_type, ContentProviderWithoutLength provider, - ContentProviderResourceReleaser resource_releaser) { - set_header("Content-Type", content_type); - content_length_ = 0; - content_provider_ = detail::ContentProviderAdapter(std::move(provider)); - content_provider_resource_releaser_ = std::move(resource_releaser); - is_chunked_content_provider_ = true; -} - -inline void Response::set_file_content(const std::string &path, - const std::string &content_type) { - file_content_path_ = path; - file_content_content_type_ = content_type; -} - -inline void Response::set_file_content(const std::string &path) { - file_content_path_ = path; -} - -// Result implementation -inline bool Result::has_request_header(const std::string &key) const { - return request_headers_.find(key) != request_headers_.end(); -} - -inline std::string Result::get_request_header_value(const std::string &key, - const char *def, - size_t id) const { - return detail::get_header_value(request_headers_, key, def, id); -} - -inline size_t -Result::get_request_header_value_count(const std::string &key) const { - auto r = request_headers_.equal_range(key); - return static_cast(std::distance(r.first, r.second)); -} - -// Stream implementation -inline ssize_t Stream::write(const char *ptr) { - return write(ptr, strlen(ptr)); -} - -inline ssize_t Stream::write(const std::string &s) { - return write(s.data(), s.size()); -} - -namespace detail { - -inline void 
[elided: detail::calc_actual_timeout and the SocketStream implementation (timeout-aware buffered read/write over a socket), the BufferStream implementation, and the start of the PathParamsMatcher constructor, all deleted with the file]
[elided: the remainder of the PathParamsMatcher constructor and PathParamsMatcher::match (static-fragment and ":param" segment matching), deleted with the file]
-
-inline bool RegexMatcher::match(Request &request) const {
-  request.path_params.clear();
-  return std::regex_match(request.path, request.matches, regex_);
-}
-
-} // namespace detail
-
-// HTTP server implementation
-inline Server::Server()
-    : new_task_queue(
-          [] { return new ThreadPool(CPPHTTPLIB_THREAD_POOL_COUNT); }) {
-#ifndef _WIN32
-  signal(SIGPIPE, SIG_IGN);
-#endif
-}
-
-inline Server::~Server() = default;
-
-inline std::unique_ptr<detail::MatcherBase>
-Server::make_matcher(const std::string &pattern) {
-  if (pattern.find("/:") != std::string::npos) {
-    return detail::make_unique<detail::PathParamsMatcher>(pattern);
-  } else {
-    return detail::make_unique<detail::RegexMatcher>(pattern);
-  }
-}
-
-inline Server &Server::Get(const std::string &pattern, Handler handler) {
-  get_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
-  return *this;
-}
[elided: the Server::Post registration overloads, deleted with the file]
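The matcher split deleted above is what makes ":param" routes work; continuing the illustrative svr, a route that relies on PathParamsMatcher:

    svr.Get("/users/:id", [](const httplib::Request &req, httplib::Response &res) {
      // Segments declared as ":id" are captured into req.path_params.
      res.set_content("user " + req.path_params.at("id"), "text/plain");
    });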
[elided: the Server::Put/Patch/Delete/Options registration overloads and Server::set_base_dir, deleted with the file]
-
-inline bool Server::set_mount_point(const std::string &mount_point,
-                                    const std::string &dir, Headers headers) {
-  detail::FileStat stat(dir);
-  if (stat.is_dir()) {
-    std::string mnt = !mount_point.empty() ? mount_point : "/";
-    if (!mnt.empty() && mnt[0] == '/') {
-      base_dirs_.push_back({mnt, dir, std::move(headers)});
-      return true;
-    }
-  }
-  return false;
-}
[elided: remove_mount_point, the MIME-type mapping setters, set_file_request_handler, the set_error_handler_core overloads, set_exception_handler, the pre-/post-routing handler setters, and set_logger, deleted with the file]
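A sketch of the mount-point and error-handler API whose implementation is deleted above (the directory and error body are illustrative):

    svr.set_mount_point("/public", "./www");  // serve static files from ./www
    svr.set_error_handler([](const httplib::Request &, httplib::Response &res) {
      res.set_content("<p>something went wrong</p>", "text/html");
    });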
[elided: set_expect_100_continue_handler and the remaining Server setters (address family, TCP_NODELAY, IPv6-only, socket options, default headers, header writer, keep-alive count and timeout, read/write timeouts, idle interval, payload limit), deleted with the file]
-
-inline bool Server::bind_to_port(const std::string &host, int port,
-                                 int socket_flags) {
-  auto ret = bind_internal(host, port, socket_flags);
-  if (ret == -1) { is_decommissioned = true; }
-  return ret >= 0;
-}
-
-inline bool Server::listen(const std::string &host, int port,
-                           int socket_flags) {
-  return bind_to_port(host, port, socket_flags) && listen_internal();
-}
[elided: bind_to_any_port, listen_after_bind, is_running, wait_until_ready, stop, decommission, and the start of Server::parse_request_line, deleted with the file]
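A sketch of the server tuning and startup calls deleted above (all values illustrative):

    svr.set_read_timeout(5, 0);                   // seconds, microseconds
    svr.set_keep_alive_max_count(100);            // requests per connection
    svr.set_payload_max_length(8 * 1024 * 1024);  // 8 MiB request bodies
    svr.listen("0.0.0.0", 8080);                  // blocks until stop()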
[elided: the remainder of Server::parse_request_line (method and HTTP-version validation, URL-fragment stripping, query parsing), Server::write_response, Server::write_response_with_content, Server::write_response_core (error-handler dispatch, Connection/Keep-Alive headers, Content-Type and Content-Length defaults, logging), and Server::write_content_with_provider (single-range, multipart/byteranges, chunked, and compressed content paths), all deleted with the file]
[elided: the remainder of Server::read_content (urlencoded-form and multipart file-count limits), read_content_with_content_receiver, read_content_core (multipart boundary parsing), handle_file_request (mount-point resolution and mmap-backed content providers), and the start of Server::create_server_socket, all deleted with the file]
[elided: the bind/listen callback of create_server_socket, Server::bind_internal (ephemeral-port lookup via getsockname), Server::listen_internal (the accept loop with EMFILE/EINTR handling and per-connection socket timeouts), and Server::routing (file handler, content-reader dispatch, per-method handler dispatch), deleted with the file]
-
-inline bool Server::dispatch_request(Request &req, Response &res,
-                                     const Handlers &handlers) const {
-  for (const auto &x : handlers) {
-    const auto &matcher = x.first;
-    const auto &handler = x.second;
-
-    if (matcher->match(req)) {
-      handler(req, res);
-      return true;
-    }
-  }
-  return false;
-}
[elided: the start of Server::apply_ranges (multipart/byteranges header setup), deleted with the file]
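Server::routing, deleted above, is also where handler exceptions are caught; a sketch of the corresponding public hook (the error bodies are illustrative):

    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res,
                                 std::exception_ptr ep) {
      try {
        std::rethrow_exception(ep);
      } catch (const std::exception &e) {
        res.set_content(e.what(), "text/plain");
      } catch (...) {
        res.set_content("unknown error", "text/plain");
      }
      res.status = httplib::StatusCode::InternalServerError_500;
    });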
[elided: the remainder of Server::apply_ranges (Content-Range and Content-Length computation, response-body compression), dispatch_request_for_content_reader, Server::process_request (request-line and header parsing, Expect: 100-continue handling, routing with exception capture, file-content providers, range-error checks), Server::is_valid, and Server::process_and_close_socket, deleted with the file]
-
-// HTTP client implementation
-inline ClientImpl::ClientImpl(const std::string &host)
-    : ClientImpl(host, 80, std::string(), std::string()) {}
-
-inline ClientImpl::ClientImpl(const std::string &host, int port)
-    : ClientImpl(host, port, std::string(), std::string()) {}
-
-inline ClientImpl::ClientImpl(const std::string &host, int port,
-                              const std::string &client_cert_path,
-                              const std::string &client_key_path)
-    : host_(detail::escape_abstract_namespace_unix_domain(host)), port_(port),
-      host_and_port_(adjust_host_string(host_) + ":" + std::to_string(port)),
-      client_cert_path_(client_cert_path), client_key_path_(client_key_path) {}
[elided: the start of the ClientImpl destructor (waiting for in-flight requests), deleted with the file]
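The ClientImpl constructors restored above are the blocking-client entry point; a minimal client sketch (host, port, and path are illustrative):

    httplib::Client cli("localhost", 8080);
    if (auto res = cli.Get("/users/42")) {
      // res->status, res->body, and res->get_header_value(...) are usable here
    } else {
      auto err = res.error();  // httplib::Error, printable via httplib::to_string(err)
    }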
[elided: the remainder of the ClientImpl destructor, copy_settings, create_client_socket and create_and_connect_socket, shutdown_ssl / shutdown_socket / close_socket (with their in-flight-request assertions), read_response_line (status-line parsing and 100-continue skipping), and the send / send_ machinery that guards the shared socket across threads and reconnects dead keep-alive connections, all deleted with the file]
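The send_ socket bookkeeping deleted above is what makes connection reuse safe; continuing the client sketch:

    cli.set_keep_alive(true);       // reuse one connection across requests
    cli.set_connection_timeout(2);  // seconds
    auto a = cli.Get("/a");
    auto b = cli.Get("/b");         // reuses the socket when it is still alive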
[elided: ClientImpl::handle_request (proxy path rewriting, connection-close handling, digest-auth retry), ClientImpl::redirect (Location parsing and scheme/host/port resolution), the client-side write_content_with_provider, and most of write_request (Host/Accept/Accept-Encoding/User-Agent defaults and Content-Length handling), all deleted with the file]
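ClientImpl::redirect, deleted above, implements Location following; it is enabled per client like this:

    cli.set_follow_location(true);  // follow 3xx responses, bounded by the redirect limit
    auto moved = cli.Get("/old-path");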
(!req.has_header("Authorization")) { - req.headers.insert(make_basic_authentication_header( - basic_auth_username_, basic_auth_password_, false)); - } - } - - if (!proxy_basic_auth_username_.empty() && - !proxy_basic_auth_password_.empty()) { - if (!req.has_header("Proxy-Authorization")) { - req.headers.insert(make_basic_authentication_header( - proxy_basic_auth_username_, proxy_basic_auth_password_, true)); - } - } - - if (!bearer_token_auth_token_.empty()) { - if (!req.has_header("Authorization")) { - req.headers.insert(make_bearer_token_authentication_header( - bearer_token_auth_token_, false)); - } - } - - if (!proxy_bearer_token_auth_token_.empty()) { - if (!req.has_header("Proxy-Authorization")) { - req.headers.insert(make_bearer_token_authentication_header( - proxy_bearer_token_auth_token_, true)); - } - } - - // Request line and headers - { - detail::BufferStream bstrm; - - const auto &path_with_query = - req.params.empty() ? req.path - : append_query_params(req.path, req.params); - - const auto &path = - url_encode_ ? detail::encode_url(path_with_query) : path_with_query; - - detail::write_request_line(bstrm, req.method, path); - - header_writer_(bstrm, req.headers); - - // Flush buffer - auto &data = bstrm.get_buffer(); - if (!detail::write_data(strm, data.data(), data.size())) { - error = Error::Write; - return false; - } - } - - // Body - if (req.body.empty()) { - return write_content_with_provider(strm, req, error); - } - - if (!detail::write_data(strm, req.body.data(), req.body.size())) { - error = Error::Write; - return false; - } - - return true; -} - -inline std::unique_ptr ClientImpl::send_with_content_provider( - Request &req, const char *body, size_t content_length, - ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Error &error) { - if (!content_type.empty()) { req.set_header("Content-Type", content_type); } - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - if (compress_) { req.set_header("Content-Encoding", "gzip"); } -#endif - -#ifdef CPPHTTPLIB_ZLIB_SUPPORT - if (compress_ && !content_provider_without_length) { - // TODO: Brotli support - detail::gzip_compressor compressor; - - if (content_provider) { - auto ok = true; - size_t offset = 0; - DataSink data_sink; - - data_sink.write = [&](const char *data, size_t data_len) -> bool { - if (ok) { - auto last = offset + data_len == content_length; - - auto ret = compressor.compress( - data, data_len, last, - [&](const char *compressed_data, size_t compressed_data_len) { - req.body.append(compressed_data, compressed_data_len); - return true; - }); - - if (ret) { - offset += data_len; - } else { - ok = false; - } - } - return ok; - }; - - while (ok && offset < content_length) { - if (!content_provider(offset, content_length - offset, data_sink)) { - error = Error::Canceled; - return nullptr; - } - } - } else { - if (!compressor.compress(body, content_length, true, - [&](const char *data, size_t data_len) { - req.body.append(data, data_len); - return true; - })) { - error = Error::Compression; - return nullptr; - } - } - } else -#endif - { - if (content_provider) { - req.content_length_ = content_length; - req.content_provider_ = std::move(content_provider); - req.is_chunked_content_provider_ = false; - } else if (content_provider_without_length) { - req.content_length_ = 0; - req.content_provider_ = detail::ContentProviderAdapter( - std::move(content_provider_without_length)); - req.is_chunked_content_provider_ = true; - 
req.set_header("Transfer-Encoding", "chunked"); - } else { - req.body.assign(body, content_length); - } - } - - auto res = detail::make_unique(); - return send(req, *res, error) ? std::move(res) : nullptr; -} - -inline Result ClientImpl::send_with_content_provider( - const std::string &method, const std::string &path, const Headers &headers, - const char *body, size_t content_length, ContentProvider content_provider, - ContentProviderWithoutLength content_provider_without_length, - const std::string &content_type, Progress progress) { - Request req; - req.method = method; - req.headers = headers; - req.path = path; - req.progress = progress; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - auto error = Error::Success; - - auto res = send_with_content_provider( - req, body, content_length, std::move(content_provider), - std::move(content_provider_without_length), content_type, error); - - return Result{std::move(res), error, std::move(req.headers)}; -} - -inline std::string -ClientImpl::adjust_host_string(const std::string &host) const { - if (host.find(':') != std::string::npos) { return "[" + host + "]"; } - return host; -} - -inline bool ClientImpl::process_request(Stream &strm, Request &req, - Response &res, bool close_connection, - Error &error) { - // Send request - if (!write_request(strm, req, close_connection, error)) { return false; } - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (is_ssl()) { - auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1; - if (!is_proxy_enabled) { - if (detail::is_ssl_peer_could_be_closed(socket_.ssl, socket_.sock)) { - error = Error::SSLPeerCouldBeClosed_; - return false; - } - } - } -#endif - - // Receive response and headers - if (!read_response_line(strm, req, res) || - !detail::read_headers(strm, res.headers)) { - error = Error::Read; - return false; - } - - // Body - if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" && - req.method != "CONNECT") { - auto redirect = 300 < res.status && res.status < 400 && - res.status != StatusCode::NotModified_304 && - follow_location_; - - if (req.response_handler && !redirect) { - if (!req.response_handler(res)) { - error = Error::Canceled; - return false; - } - } - - auto out = - req.content_receiver - ? 
static_cast( - [&](const char *buf, size_t n, uint64_t off, uint64_t len) { - if (redirect) { return true; } - auto ret = req.content_receiver(buf, n, off, len); - if (!ret) { error = Error::Canceled; } - return ret; - }) - : static_cast( - [&](const char *buf, size_t n, uint64_t /*off*/, - uint64_t /*len*/) { - assert(res.body.size() + n <= res.body.max_size()); - res.body.append(buf, n); - return true; - }); - - auto progress = [&](uint64_t current, uint64_t total) { - if (!req.progress || redirect) { return true; } - auto ret = req.progress(current, total); - if (!ret) { error = Error::Canceled; } - return ret; - }; - - if (res.has_header("Content-Length")) { - if (!req.content_receiver) { - auto len = res.get_header_value_u64("Content-Length"); - if (len > res.body.max_size()) { - error = Error::Read; - return false; - } - res.body.reserve(static_cast(len)); - } - } - - if (res.status != StatusCode::NotModified_304) { - int dummy_status; - if (!detail::read_content(strm, res, (std::numeric_limits::max)(), - dummy_status, std::move(progress), - std::move(out), decompress_)) { - if (error != Error::Canceled) { error = Error::Read; } - return false; - } - } - } - - // Log - if (logger_) { logger_(req, res); } - - return true; -} - -inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider( - const std::string &boundary, const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) const { - size_t cur_item = 0; - size_t cur_start = 0; - // cur_item and cur_start are copied to within the std::function and maintain - // state between successive calls - return [&, cur_item, cur_start](size_t offset, - DataSink &sink) mutable -> bool { - if (!offset && !items.empty()) { - sink.os << detail::serialize_multipart_formdata(items, boundary, false); - return true; - } else if (cur_item < provider_items.size()) { - if (!cur_start) { - const auto &begin = detail::serialize_multipart_formdata_item_begin( - provider_items[cur_item], boundary); - offset += begin.size(); - cur_start = offset; - sink.os << begin; - } - - DataSink cur_sink; - auto has_data = true; - cur_sink.write = sink.write; - cur_sink.done = [&]() { has_data = false; }; - - if (!provider_items[cur_item].provider(offset - cur_start, cur_sink)) { - return false; - } - - if (!has_data) { - sink.os << detail::serialize_multipart_formdata_item_end(); - cur_item++; - cur_start = 0; - } - return true; - } else { - sink.os << detail::serialize_multipart_formdata_finish(boundary); - sink.done(); - return true; - } - }; -} - -inline bool ClientImpl::process_socket( - const Socket &socket, - std::chrono::time_point start_time, - std::function callback) { - return detail::process_client_socket( - socket.sock, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, - write_timeout_usec_, max_timeout_msec_, start_time, std::move(callback)); -} - -inline bool ClientImpl::is_ssl() const { return false; } - -inline Result ClientImpl::Get(const std::string &path) { - return Get(path, Headers(), Progress()); -} - -inline Result ClientImpl::Get(const std::string &path, Progress progress) { - return Get(path, Headers(), std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers) { - return Get(path, headers, Progress()); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - Progress progress) { - Request req; - req.method = "GET"; - req.path = path; - req.headers = headers; - req.progress = std::move(progress); - if 
(max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline Result ClientImpl::Get(const std::string &path, - ContentReceiver content_receiver) { - return Get(path, Headers(), nullptr, std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, Headers(), nullptr, std::move(content_receiver), - std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver) { - return Get(path, headers, nullptr, std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, headers, nullptr, std::move(content_receiver), - std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return Get(path, Headers(), std::move(response_handler), - std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return Get(path, headers, std::move(response_handler), - std::move(content_receiver), nullptr); -} - -inline Result ClientImpl::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, Headers(), std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, - Progress progress) { - Request req; - req.method = "GET"; - req.path = path; - req.headers = headers; - req.response_handler = std::move(response_handler); - req.content_receiver = - [content_receiver](const char *data, size_t data_length, - uint64_t /*offset*/, uint64_t /*total_length*/) { - return content_receiver(data, data_length); - }; - req.progress = std::move(progress); - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline Result ClientImpl::Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress) { - if (params.empty()) { return Get(path, headers); } - - std::string path_with_query = append_query_params(path, params); - return Get(path_with_query, headers, std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ContentReceiver content_receiver, - Progress progress) { - return Get(path, params, headers, nullptr, std::move(content_receiver), - std::move(progress)); -} - -inline Result ClientImpl::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, - Progress progress) { - if (params.empty()) { - return Get(path, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); - } - - std::string path_with_query = append_query_params(path, params); - return Get(path_with_query, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} - -inline Result ClientImpl::Head(const std::string &path) { - return Head(path, 
Headers()); -} - -inline Result ClientImpl::Head(const std::string &path, - const Headers &headers) { - Request req; - req.method = "HEAD"; - req.headers = headers; - req.path = path; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline Result ClientImpl::Post(const std::string &path) { - return Post(path, std::string(), std::string()); -} - -inline Result ClientImpl::Post(const std::string &path, - const Headers &headers) { - return Post(path, headers, nullptr, 0, std::string()); -} - -inline Result ClientImpl::Post(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Post(path, Headers(), body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, body, content_length, - nullptr, nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("POST", path, headers, body, content_length, - nullptr, nullptr, content_type, progress); -} - -inline Result ClientImpl::Post(const std::string &path, const std::string &body, - const std::string &content_type) { - return Post(path, Headers(), body, content_type); -} - -inline Result ClientImpl::Post(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return Post(path, Headers(), body, content_type, progress); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("POST", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); -} - -inline Result ClientImpl::Post(const std::string &path, const Params ¶ms) { - return Post(path, Headers(), params); -} - -inline Result ClientImpl::Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return Post(path, Headers(), content_length, std::move(content_provider), - content_type); -} - -inline Result ClientImpl::Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return Post(path, Headers(), std::move(content_provider), content_type); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, nullptr, - content_length, std::move(content_provider), - nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return send_with_content_provider("POST", path, headers, nullptr, 0, nullptr, - 
std::move(content_provider), content_type, - nullptr); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const Params ¶ms) { - auto query = detail::params_to_query_str(params); - return Post(path, headers, query, "application/x-www-form-urlencoded"); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - auto query = detail::params_to_query_str(params); - return Post(path, headers, query, "application/x-www-form-urlencoded", - progress); -} - -inline Result ClientImpl::Post(const std::string &path, - const MultipartFormDataItems &items) { - return Post(path, Headers(), items); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type); -} - -inline Result ClientImpl::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - if (!detail::is_multipart_boundary_chars_valid(boundary)) { - return Result{nullptr, Error::UnsupportedMultipartBoundaryChars}; - } - - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type); -} - -inline Result -ClientImpl::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - return send_with_content_provider( - "POST", path, headers, nullptr, 0, nullptr, - get_multipart_content_provider(boundary, items, provider_items), - content_type, nullptr); -} - -inline Result ClientImpl::Put(const std::string &path) { - return Put(path, std::string(), std::string()); -} - -inline Result ClientImpl::Put(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Put(path, Headers(), body, content_length, content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, body, content_length, - nullptr, nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PUT", path, headers, body, content_length, - nullptr, nullptr, content_type, progress); -} - -inline Result ClientImpl::Put(const std::string &path, const std::string &body, - const std::string &content_type) { - return Put(path, Headers(), body, content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return Put(path, Headers(), body, content_type, progress); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const std::string 
&body, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PUT", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); -} - -inline Result ClientImpl::Put(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return Put(path, Headers(), content_length, std::move(content_provider), - content_type); -} - -inline Result ClientImpl::Put(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return Put(path, Headers(), std::move(content_provider), content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, nullptr, - content_length, std::move(content_provider), - nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return send_with_content_provider("PUT", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type, - nullptr); -} - -inline Result ClientImpl::Put(const std::string &path, const Params ¶ms) { - return Put(path, Headers(), params); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const Params ¶ms) { - auto query = detail::params_to_query_str(params); - return Put(path, headers, query, "application/x-www-form-urlencoded"); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - auto query = detail::params_to_query_str(params); - return Put(path, headers, query, "application/x-www-form-urlencoded", - progress); -} - -inline Result ClientImpl::Put(const std::string &path, - const MultipartFormDataItems &items) { - return Put(path, Headers(), items); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Put(path, headers, body, content_type); -} - -inline Result ClientImpl::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - if (!detail::is_multipart_boundary_chars_valid(boundary)) { - return Result{nullptr, Error::UnsupportedMultipartBoundaryChars}; - } - - const auto &content_type = - detail::serialize_multipart_formdata_get_content_type(boundary); - const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Put(path, headers, body, content_type); -} - -inline Result -ClientImpl::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - const auto &boundary = detail::make_multipart_data_boundary(); - const auto &content_type = - 
detail::serialize_multipart_formdata_get_content_type(boundary); - return send_with_content_provider( - "PUT", path, headers, nullptr, 0, nullptr, - get_multipart_content_provider(boundary, items, provider_items), - content_type, nullptr); -} -inline Result ClientImpl::Patch(const std::string &path) { - return Patch(path, std::string(), std::string()); -} - -inline Result ClientImpl::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Patch(path, Headers(), body, content_length, content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return Patch(path, Headers(), body, content_length, content_type, progress); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return Patch(path, headers, body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PATCH", path, headers, body, - content_length, nullptr, nullptr, - content_type, progress); -} - -inline Result ClientImpl::Patch(const std::string &path, - const std::string &body, - const std::string &content_type) { - return Patch(path, Headers(), body, content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Patch(path, Headers(), body, content_type, progress); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return Patch(path, headers, body, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return send_with_content_provider("PATCH", path, headers, body.data(), - body.size(), nullptr, nullptr, content_type, - progress); -} - -inline Result ClientImpl::Patch(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return Patch(path, Headers(), content_length, std::move(content_provider), - content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return Patch(path, Headers(), std::move(content_provider), content_type); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return send_with_content_provider("PATCH", path, headers, nullptr, - content_length, std::move(content_provider), - nullptr, content_type, nullptr); -} - -inline Result ClientImpl::Patch(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return send_with_content_provider("PATCH", path, headers, nullptr, 0, nullptr, - std::move(content_provider), content_type, - nullptr); -} - -inline Result ClientImpl::Delete(const std::string &path) { - return Delete(path, Headers(), std::string(), std::string()); -} - -inline Result 
ClientImpl::Delete(const std::string &path, - const Headers &headers) { - return Delete(path, headers, std::string(), std::string()); -} - -inline Result ClientImpl::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return Delete(path, Headers(), body, content_length, content_type); -} - -inline Result ClientImpl::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return Delete(path, Headers(), body, content_length, content_type, progress); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, const char *body, - size_t content_length, - const std::string &content_type) { - return Delete(path, headers, body, content_length, content_type, nullptr); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - Request req; - req.method = "DELETE"; - req.headers = headers; - req.path = path; - req.progress = progress; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - if (!content_type.empty()) { req.set_header("Content-Type", content_type); } - req.body.assign(body, content_length); - - return send_(std::move(req)); -} - -inline Result ClientImpl::Delete(const std::string &path, - const std::string &body, - const std::string &content_type) { - return Delete(path, Headers(), body.data(), body.size(), content_type); -} - -inline Result ClientImpl::Delete(const std::string &path, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Delete(path, Headers(), body.data(), body.size(), content_type, - progress); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, - const std::string &body, - const std::string &content_type) { - return Delete(path, headers, body.data(), body.size(), content_type); -} - -inline Result ClientImpl::Delete(const std::string &path, - const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return Delete(path, headers, body.data(), body.size(), content_type, - progress); -} - -inline Result ClientImpl::Options(const std::string &path) { - return Options(path, Headers()); -} - -inline Result ClientImpl::Options(const std::string &path, - const Headers &headers) { - Request req; - req.method = "OPTIONS"; - req.headers = headers; - req.path = path; - if (max_timeout_msec_ > 0) { - req.start_time_ = std::chrono::steady_clock::now(); - } - - return send_(std::move(req)); -} - -inline void ClientImpl::stop() { - std::lock_guard guard(socket_mutex_); - - // If there is anything ongoing right now, the ONLY thread-safe thing we can - // do is to shutdown_socket, so that threads using this socket suddenly - // discover they can't read/write any more and error out. Everything else - // (closing the socket, shutting ssl down) is unsafe because these actions are - // not thread-safe. - if (socket_requests_in_flight_ > 0) { - shutdown_socket(socket_); - - // Aside from that, we set a flag for the socket to be closed when we're - // done. 
- socket_should_be_closed_when_request_is_done_ = true; - return; - } - - // Otherwise, still holding the mutex, we can shut everything down ourselves - shutdown_ssl(socket_, true); - shutdown_socket(socket_); - close_socket(socket_); -} - -inline std::string ClientImpl::host() const { return host_; } - -inline int ClientImpl::port() const { return port_; } - -inline size_t ClientImpl::is_socket_open() const { - std::lock_guard guard(socket_mutex_); - return socket_.is_open(); -} - -inline socket_t ClientImpl::socket() const { return socket_.sock; } - -inline void ClientImpl::set_connection_timeout(time_t sec, time_t usec) { - connection_timeout_sec_ = sec; - connection_timeout_usec_ = usec; -} - -inline void ClientImpl::set_read_timeout(time_t sec, time_t usec) { - read_timeout_sec_ = sec; - read_timeout_usec_ = usec; -} - -inline void ClientImpl::set_write_timeout(time_t sec, time_t usec) { - write_timeout_sec_ = sec; - write_timeout_usec_ = usec; -} - -inline void ClientImpl::set_max_timeout(time_t msec) { - max_timeout_msec_ = msec; -} - -inline void ClientImpl::set_basic_auth(const std::string &username, - const std::string &password) { - basic_auth_username_ = username; - basic_auth_password_ = password; -} - -inline void ClientImpl::set_bearer_token_auth(const std::string &token) { - bearer_token_auth_token_ = token; -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void ClientImpl::set_digest_auth(const std::string &username, - const std::string &password) { - digest_auth_username_ = username; - digest_auth_password_ = password; -} -#endif - -inline void ClientImpl::set_keep_alive(bool on) { keep_alive_ = on; } - -inline void ClientImpl::set_follow_location(bool on) { follow_location_ = on; } - -inline void ClientImpl::set_url_encode(bool on) { url_encode_ = on; } - -inline void -ClientImpl::set_hostname_addr_map(std::map addr_map) { - addr_map_ = std::move(addr_map); -} - -inline void ClientImpl::set_default_headers(Headers headers) { - default_headers_ = std::move(headers); -} - -inline void ClientImpl::set_header_writer( - std::function const &writer) { - header_writer_ = writer; -} - -inline void ClientImpl::set_address_family(int family) { - address_family_ = family; -} - -inline void ClientImpl::set_tcp_nodelay(bool on) { tcp_nodelay_ = on; } - -inline void ClientImpl::set_ipv6_v6only(bool on) { ipv6_v6only_ = on; } - -inline void ClientImpl::set_socket_options(SocketOptions socket_options) { - socket_options_ = std::move(socket_options); -} - -inline void ClientImpl::set_compress(bool on) { compress_ = on; } - -inline void ClientImpl::set_decompress(bool on) { decompress_ = on; } - -inline void ClientImpl::set_interface(const std::string &intf) { - interface_ = intf; -} - -inline void ClientImpl::set_proxy(const std::string &host, int port) { - proxy_host_ = host; - proxy_port_ = port; -} - -inline void ClientImpl::set_proxy_basic_auth(const std::string &username, - const std::string &password) { - proxy_basic_auth_username_ = username; - proxy_basic_auth_password_ = password; -} - -inline void ClientImpl::set_proxy_bearer_token_auth(const std::string &token) { - proxy_bearer_token_auth_token_ = token; -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void ClientImpl::set_proxy_digest_auth(const std::string &username, - const std::string &password) { - proxy_digest_auth_username_ = username; - proxy_digest_auth_password_ = password; -} - -inline void ClientImpl::set_ca_cert_path(const std::string &ca_cert_file_path, - const std::string &ca_cert_dir_path) { - ca_cert_file_path_ = 
ca_cert_file_path;
-  ca_cert_dir_path_ = ca_cert_dir_path;
-}
-
-inline void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) {
-  if (ca_cert_store && ca_cert_store != ca_cert_store_) {
-    ca_cert_store_ = ca_cert_store;
-  }
-}
-
-inline X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert,
-                                                    std::size_t size) const {
-  auto mem = BIO_new_mem_buf(ca_cert, static_cast<int>(size));
-  auto se = detail::scope_exit([&] { BIO_free_all(mem); });
-  if (!mem) { return nullptr; }
-
-  auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr);
-  if (!inf) { return nullptr; }
-
-  auto cts = X509_STORE_new();
-  if (cts) {
-    for (auto i = 0; i < static_cast<int>(sk_X509_INFO_num(inf)); i++) {
-      auto itmp = sk_X509_INFO_value(inf, i);
-      if (!itmp) { continue; }
-
-      if (itmp->x509) { X509_STORE_add_cert(cts, itmp->x509); }
-      if (itmp->crl) { X509_STORE_add_crl(cts, itmp->crl); }
-    }
-  }
-
-  sk_X509_INFO_pop_free(inf, X509_INFO_free);
-  return cts;
-}
-
-inline void ClientImpl::enable_server_certificate_verification(bool enabled) {
-  server_certificate_verification_ = enabled;
-}
-
-inline void ClientImpl::enable_server_hostname_verification(bool enabled) {
-  server_hostname_verification_ = enabled;
-}
-
-inline void ClientImpl::set_server_certificate_verifier(
-    std::function<SSLVerifierResponse(SSL *ssl)> verifier) {
-  server_certificate_verifier_ = verifier;
-}
-#endif
-
-inline void ClientImpl::set_logger(Logger logger) {
-  logger_ = std::move(logger);
-}
-
-/*
- * SSL Implementation
- */
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-namespace detail {
-
-template <typename U, typename V>
-inline SSL *ssl_new(socket_t sock, SSL_CTX *ctx, std::mutex &ctx_mutex,
-                    U SSL_connect_or_accept, V setup) {
-  SSL *ssl = nullptr;
-  {
-    std::lock_guard<std::mutex> guard(ctx_mutex);
-    ssl = SSL_new(ctx);
-  }
-
-  if (ssl) {
-    set_nonblocking(sock, true);
-    auto bio = BIO_new_socket(static_cast<int>(sock), BIO_NOCLOSE);
-    BIO_set_nbio(bio, 1);
-    SSL_set_bio(ssl, bio, bio);
-
-    if (!setup(ssl) || SSL_connect_or_accept(ssl) != 1) {
-      SSL_shutdown(ssl);
-      {
-        std::lock_guard<std::mutex> guard(ctx_mutex);
-        SSL_free(ssl);
-      }
-      set_nonblocking(sock, false);
-      return nullptr;
-    }
-    BIO_set_nbio(bio, 0);
-    set_nonblocking(sock, false);
-  }
-
-  return ssl;
-}
-
-inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, socket_t sock,
-                       bool shutdown_gracefully) {
-  // sometimes we may want to skip this to try to avoid SIGPIPE if we know
-  // the remote has closed the network connection
-  // Note that it is not always possible to avoid SIGPIPE, this is merely a
-  // best-efforts.
-  if (shutdown_gracefully) {
-    (void)(sock);
-    // SSL_shutdown() returns 0 on first call (indicating close_notify alert
-    // sent) and 1 on subsequent call (indicating close_notify alert received)
-    if (SSL_shutdown(ssl) == 0) {
-      // Expected to return 1, but even if it doesn't, we free ssl
-      SSL_shutdown(ssl);
-    }
-  }
-
-  std::lock_guard<std::mutex> guard(ctx_mutex);
-  SSL_free(ssl);
-}
-
-template <typename U>
-bool ssl_connect_or_accept_nonblocking(socket_t sock, SSL *ssl,
-                                       U ssl_connect_or_accept,
-                                       time_t timeout_sec,
-                                       time_t timeout_usec) {
-  auto res = 0;
-  while ((res = ssl_connect_or_accept(ssl)) != 1) {
-    auto err = SSL_get_error(ssl, res);
-    switch (err) {
-    case SSL_ERROR_WANT_READ:
-      if (select_read(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    case SSL_ERROR_WANT_WRITE:
-      if (select_write(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    default: break;
-    }
-    return false;
-  }
-  return true;
-}
-
-template <typename T>
-inline bool process_server_socket_ssl(
-    const std::atomic<socket_t> &svr_sock, SSL *ssl, socket_t sock,
-    size_t keep_alive_max_count, time_t keep_alive_timeout_sec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
-    time_t write_timeout_usec, T callback) {
-  return process_server_socket_core(
-      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
-      [&](bool close_connection, bool &connection_closed) {
-        SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                             write_timeout_sec, write_timeout_usec);
-        return callback(strm, close_connection, connection_closed);
-      });
-}
-
-template <typename T>
-inline bool process_client_socket_ssl(
-    SSL *ssl, socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec,
-    time_t write_timeout_sec, time_t write_timeout_usec,
-    time_t max_timeout_msec,
-    std::chrono::time_point<std::chrono::steady_clock> start_time, T callback) {
-  SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                       write_timeout_sec, write_timeout_usec, max_timeout_msec,
-                       start_time);
-  return callback(strm);
-}
-
-// SSL socket stream implementation
-inline SSLSocketStream::SSLSocketStream(
-    socket_t sock, SSL *ssl, time_t read_timeout_sec, time_t read_timeout_usec,
-    time_t write_timeout_sec, time_t write_timeout_usec,
-    time_t max_timeout_msec,
-    std::chrono::time_point<std::chrono::steady_clock> start_time)
-    : sock_(sock), ssl_(ssl), read_timeout_sec_(read_timeout_sec),
-      read_timeout_usec_(read_timeout_usec),
-      write_timeout_sec_(write_timeout_sec),
-      write_timeout_usec_(write_timeout_usec),
-      max_timeout_msec_(max_timeout_msec), start_time_(start_time) {
-  SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY);
-}
-
-inline SSLSocketStream::~SSLSocketStream() = default;
-
-inline bool SSLSocketStream::is_readable() const {
-  return SSL_pending(ssl_) > 0;
-}
-
-inline bool SSLSocketStream::wait_readable() const {
-  if (max_timeout_msec_ <= 0) {
-    return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
-  }
-
-  time_t read_timeout_sec;
-  time_t read_timeout_usec;
-  calc_actual_timeout(max_timeout_msec_, duration(), read_timeout_sec_,
-                      read_timeout_usec_, read_timeout_sec, read_timeout_usec);
-
-  return select_read(sock_, read_timeout_sec, read_timeout_usec) > 0;
-}
-
-inline bool SSLSocketStream::wait_writable() const {
-  return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 &&
-         is_socket_alive(sock_) && !is_ssl_peer_could_be_closed(ssl_, sock_);
-}
-
-inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
-  if (SSL_pending(ssl_) > 0) {
-    return SSL_read(ssl_, ptr, static_cast<int>(size));
-  } else if (wait_readable()) {
-    auto ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      auto n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_READ ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_READ) {
-#endif
-        if (SSL_pending(ssl_) > 0) {
-          return SSL_read(ssl_, ptr, static_cast<int>(size));
-        } else if (wait_readable()) {
-          std::this_thread::sleep_for(std::chrono::microseconds{10});
-          ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          return -1;
-        }
-      }
-    }
-    return ret;
-  } else {
-    return -1;
-  }
-}
-
-inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
-  if (wait_writable()) {
-    auto handle_size = static_cast<int>(
-        std::min<size_t>(size, (std::numeric_limits<int>::max)()));
-
-    auto ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      auto n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_WRITE ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_WRITE) {
-#endif
-        if (wait_writable()) {
-          std::this_thread::sleep_for(std::chrono::microseconds{10});
-          ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          return -1;
-        }
-      }
-    }
-    return ret;
-  }
-  return -1;
-}
-
-inline void SSLSocketStream::get_remote_ip_and_port(std::string &ip,
-                                                    int &port) const {
-  detail::get_remote_ip_and_port(sock_, ip, port);
-}
-
-inline void SSLSocketStream::get_local_ip_and_port(std::string &ip,
-                                                   int &port) const {
-  detail::get_local_ip_and_port(sock_, ip, port);
-}
-
-inline socket_t SSLSocketStream::socket() const { return sock_; }
-
-inline time_t SSLSocketStream::duration() const {
-  return std::chrono::duration_cast<std::chrono::milliseconds>(
-             std::chrono::steady_clock::now() - start_time_)
-      .count();
-}
-
-} // namespace detail
-
-// SSL HTTP server implementation
-inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path,
-                            const char *client_ca_cert_file_path,
-                            const char *client_ca_cert_dir_path,
-                            const char *private_key_password) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
-
-    if (private_key_password != nullptr && (private_key_password[0] != '\0')) {
-      SSL_CTX_set_default_passwd_cb_userdata(
-          ctx_,
-          reinterpret_cast<void *>(const_cast<char *>(private_key_password)));
-    }
-
-    if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 ||
-        SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) !=
-            1 ||
-        SSL_CTX_check_private_key(ctx_) != 1) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    } else if (client_ca_cert_file_path || client_ca_cert_dir_path) {
-      SSL_CTX_load_verify_locations(ctx_, client_ca_cert_file_path,
-                                    client_ca_cert_dir_path);
-
-      SSL_CTX_set_verify(
-          ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr);
-    }
-  }
-}
-
-inline SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key,
-                            X509_STORE *client_ca_cert_store) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
-
-    if (SSL_CTX_use_certificate(ctx_, cert) != 1 ||
SSL_CTX_use_PrivateKey(ctx_, private_key) != 1) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } else if (client_ca_cert_store) { - SSL_CTX_set_cert_store(ctx_, client_ca_cert_store); - - SSL_CTX_set_verify( - ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr); - } - } -} - -inline SSLServer::SSLServer( - const std::function &setup_ssl_ctx_callback) { - ctx_ = SSL_CTX_new(TLS_method()); - if (ctx_) { - if (!setup_ssl_ctx_callback(*ctx_)) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } - } -} - -inline SSLServer::~SSLServer() { - if (ctx_) { SSL_CTX_free(ctx_); } -} - -inline bool SSLServer::is_valid() const { return ctx_; } - -inline SSL_CTX *SSLServer::ssl_context() const { return ctx_; } - -inline void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key, - X509_STORE *client_ca_cert_store) { - - std::lock_guard guard(ctx_mutex_); - - SSL_CTX_use_certificate(ctx_, cert); - SSL_CTX_use_PrivateKey(ctx_, private_key); - - if (client_ca_cert_store != nullptr) { - SSL_CTX_set_cert_store(ctx_, client_ca_cert_store); - } -} - -inline bool SSLServer::process_and_close_socket(socket_t sock) { - auto ssl = detail::ssl_new( - sock, ctx_, ctx_mutex_, - [&](SSL *ssl2) { - return detail::ssl_connect_or_accept_nonblocking( - sock, ssl2, SSL_accept, read_timeout_sec_, read_timeout_usec_); - }, - [](SSL * /*ssl2*/) { return true; }); - - auto ret = false; - if (ssl) { - std::string remote_addr; - int remote_port = 0; - detail::get_remote_ip_and_port(sock, remote_addr, remote_port); - - std::string local_addr; - int local_port = 0; - detail::get_local_ip_and_port(sock, local_addr, local_port); - - ret = detail::process_server_socket_ssl( - svr_sock_, ssl, sock, keep_alive_max_count_, keep_alive_timeout_sec_, - read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, - write_timeout_usec_, - [&](Stream &strm, bool close_connection, bool &connection_closed) { - return process_request(strm, remote_addr, remote_port, local_addr, - local_port, close_connection, - connection_closed, - [&](Request &req) { req.ssl = ssl; }); - }); - - // Shutdown gracefully if the result seemed successful, non-gracefully if - // the connection appeared to be closed. 
- const bool shutdown_gracefully = ret; - detail::ssl_delete(ctx_mutex_, ssl, sock, shutdown_gracefully); - } - - detail::shutdown_socket(sock); - detail::close_socket(sock); - return ret; -} - -// SSL HTTP client implementation -inline SSLClient::SSLClient(const std::string &host) - : SSLClient(host, 443, std::string(), std::string()) {} - -inline SSLClient::SSLClient(const std::string &host, int port) - : SSLClient(host, port, std::string(), std::string()) {} - -inline SSLClient::SSLClient(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path, - const std::string &private_key_password) - : ClientImpl(host, port, client_cert_path, client_key_path) { - ctx_ = SSL_CTX_new(TLS_client_method()); - - SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION); - - detail::split(&host_[0], &host_[host_.size()], '.', - [&](const char *b, const char *e) { - host_components_.emplace_back(b, e); - }); - - if (!client_cert_path.empty() && !client_key_path.empty()) { - if (!private_key_password.empty()) { - SSL_CTX_set_default_passwd_cb_userdata( - ctx_, reinterpret_cast( - const_cast(private_key_password.c_str()))); - } - - if (SSL_CTX_use_certificate_file(ctx_, client_cert_path.c_str(), - SSL_FILETYPE_PEM) != 1 || - SSL_CTX_use_PrivateKey_file(ctx_, client_key_path.c_str(), - SSL_FILETYPE_PEM) != 1) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } - } -} - -inline SSLClient::SSLClient(const std::string &host, int port, - X509 *client_cert, EVP_PKEY *client_key, - const std::string &private_key_password) - : ClientImpl(host, port) { - ctx_ = SSL_CTX_new(TLS_client_method()); - - detail::split(&host_[0], &host_[host_.size()], '.', - [&](const char *b, const char *e) { - host_components_.emplace_back(b, e); - }); - - if (client_cert != nullptr && client_key != nullptr) { - if (!private_key_password.empty()) { - SSL_CTX_set_default_passwd_cb_userdata( - ctx_, reinterpret_cast( - const_cast(private_key_password.c_str()))); - } - - if (SSL_CTX_use_certificate(ctx_, client_cert) != 1 || - SSL_CTX_use_PrivateKey(ctx_, client_key) != 1) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; - } - } -} - -inline SSLClient::~SSLClient() { - if (ctx_) { SSL_CTX_free(ctx_); } - // Make sure to shut down SSL since shutdown_ssl will resolve to the - // base function rather than the derived function once we get to the - // base class destructor, and won't free the SSL (causing a leak). 
- shutdown_ssl_impl(socket_, true); -} - -inline bool SSLClient::is_valid() const { return ctx_; } - -inline void SSLClient::set_ca_cert_store(X509_STORE *ca_cert_store) { - if (ca_cert_store) { - if (ctx_) { - if (SSL_CTX_get_cert_store(ctx_) != ca_cert_store) { - // Free memory allocated for old cert and use new store `ca_cert_store` - SSL_CTX_set_cert_store(ctx_, ca_cert_store); - } - } else { - X509_STORE_free(ca_cert_store); - } - } -} - -inline void SSLClient::load_ca_cert_store(const char *ca_cert, - std::size_t size) { - set_ca_cert_store(ClientImpl::create_ca_cert_store(ca_cert, size)); -} - -inline long SSLClient::get_openssl_verify_result() const { - return verify_result_; -} - -inline SSL_CTX *SSLClient::ssl_context() const { return ctx_; } - -inline bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) { - return is_valid() && ClientImpl::create_and_connect_socket(socket, error); -} - -// Assumes that socket_mutex_ is locked and that there are no requests in flight -inline bool SSLClient::connect_with_proxy( - Socket &socket, - std::chrono::time_point start_time, - Response &res, bool &success, Error &error) { - success = true; - Response proxy_res; - if (!detail::process_client_socket( - socket.sock, read_timeout_sec_, read_timeout_usec_, - write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, - start_time, [&](Stream &strm) { - Request req2; - req2.method = "CONNECT"; - req2.path = host_and_port_; - if (max_timeout_msec_ > 0) { - req2.start_time_ = std::chrono::steady_clock::now(); - } - return process_request(strm, req2, proxy_res, false, error); - })) { - // Thread-safe to close everything because we are assuming there are no - // requests in flight - shutdown_ssl(socket, true); - shutdown_socket(socket); - close_socket(socket); - success = false; - return false; - } - - if (proxy_res.status == StatusCode::ProxyAuthenticationRequired_407) { - if (!proxy_digest_auth_username_.empty() && - !proxy_digest_auth_password_.empty()) { - std::map auth; - if (detail::parse_www_authenticate(proxy_res, auth, true)) { - proxy_res = Response(); - if (!detail::process_client_socket( - socket.sock, read_timeout_sec_, read_timeout_usec_, - write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, - start_time, [&](Stream &strm) { - Request req3; - req3.method = "CONNECT"; - req3.path = host_and_port_; - req3.headers.insert(detail::make_digest_authentication_header( - req3, auth, 1, detail::random_string(10), - proxy_digest_auth_username_, proxy_digest_auth_password_, - true)); - if (max_timeout_msec_ > 0) { - req3.start_time_ = std::chrono::steady_clock::now(); - } - return process_request(strm, req3, proxy_res, false, error); - })) { - // Thread-safe to close everything because we are assuming there are - // no requests in flight - shutdown_ssl(socket, true); - shutdown_socket(socket); - close_socket(socket); - success = false; - return false; - } - } - } - } - - // If status code is not 200, proxy request is failed. 
- // Set error to ProxyConnection and return proxy response - // as the response of the request - if (proxy_res.status != StatusCode::OK_200) { - error = Error::ProxyConnection; - res = std::move(proxy_res); - // Thread-safe to close everything because we are assuming there are - // no requests in flight - shutdown_ssl(socket, true); - shutdown_socket(socket); - close_socket(socket); - return false; - } - - return true; -} - -inline bool SSLClient::load_certs() { - auto ret = true; - - std::call_once(initialize_cert_, [&]() { - std::lock_guard guard(ctx_mutex_); - if (!ca_cert_file_path_.empty()) { - if (!SSL_CTX_load_verify_locations(ctx_, ca_cert_file_path_.c_str(), - nullptr)) { - ret = false; - } - } else if (!ca_cert_dir_path_.empty()) { - if (!SSL_CTX_load_verify_locations(ctx_, nullptr, - ca_cert_dir_path_.c_str())) { - ret = false; - } - } else { - auto loaded = false; -#ifdef _WIN32 - loaded = - detail::load_system_certs_on_windows(SSL_CTX_get_cert_store(ctx_)); -#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__) -#if TARGET_OS_OSX - loaded = detail::load_system_certs_on_macos(SSL_CTX_get_cert_store(ctx_)); -#endif // TARGET_OS_OSX -#endif // _WIN32 - if (!loaded) { SSL_CTX_set_default_verify_paths(ctx_); } - } - }); - - return ret; -} - -inline bool SSLClient::initialize_ssl(Socket &socket, Error &error) { - auto ssl = detail::ssl_new( - socket.sock, ctx_, ctx_mutex_, - [&](SSL *ssl2) { - if (server_certificate_verification_) { - if (!load_certs()) { - error = Error::SSLLoadingCerts; - return false; - } - SSL_set_verify(ssl2, SSL_VERIFY_NONE, nullptr); - } - - if (!detail::ssl_connect_or_accept_nonblocking( - socket.sock, ssl2, SSL_connect, connection_timeout_sec_, - connection_timeout_usec_)) { - error = Error::SSLConnection; - return false; - } - - if (server_certificate_verification_) { - auto verification_status = SSLVerifierResponse::NoDecisionMade; - - if (server_certificate_verifier_) { - verification_status = server_certificate_verifier_(ssl2); - } - - if (verification_status == SSLVerifierResponse::CertificateRejected) { - error = Error::SSLServerVerification; - return false; - } - - if (verification_status == SSLVerifierResponse::NoDecisionMade) { - verify_result_ = SSL_get_verify_result(ssl2); - - if (verify_result_ != X509_V_OK) { - error = Error::SSLServerVerification; - return false; - } - - auto server_cert = SSL_get1_peer_certificate(ssl2); - auto se = detail::scope_exit([&] { X509_free(server_cert); }); - - if (server_cert == nullptr) { - error = Error::SSLServerVerification; - return false; - } - - if (server_hostname_verification_) { - if (!verify_host(server_cert)) { - error = Error::SSLServerHostnameVerification; - return false; - } - } - } - } - - return true; - }, - [&](SSL *ssl2) { -#if defined(OPENSSL_IS_BORINGSSL) - SSL_set_tlsext_host_name(ssl2, host_.c_str()); -#else - // NOTE: Direct call instead of using the OpenSSL macro to suppress - // -Wold-style-cast warning - SSL_ctrl(ssl2, SSL_CTRL_SET_TLSEXT_HOSTNAME, TLSEXT_NAMETYPE_host_name, - static_cast(const_cast(host_.c_str()))); -#endif - return true; - }); - - if (ssl) { - socket.ssl = ssl; - return true; - } - - shutdown_socket(socket); - close_socket(socket); - return false; -} - -inline void SSLClient::shutdown_ssl(Socket &socket, bool shutdown_gracefully) { - shutdown_ssl_impl(socket, shutdown_gracefully); -} - -inline void SSLClient::shutdown_ssl_impl(Socket &socket, - bool shutdown_gracefully) { - if (socket.sock == INVALID_SOCKET) { - assert(socket.ssl == nullptr); 
- return; - } - if (socket.ssl) { - detail::ssl_delete(ctx_mutex_, socket.ssl, socket.sock, - shutdown_gracefully); - socket.ssl = nullptr; - } - assert(socket.ssl == nullptr); -} - -inline bool SSLClient::process_socket( - const Socket &socket, - std::chrono::time_point start_time, - std::function callback) { - assert(socket.ssl); - return detail::process_client_socket_ssl( - socket.ssl, socket.sock, read_timeout_sec_, read_timeout_usec_, - write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, start_time, - std::move(callback)); -} - -inline bool SSLClient::is_ssl() const { return true; } - -inline bool SSLClient::verify_host(X509 *server_cert) const { - /* Quote from RFC2818 section 3.1 "Server Identity" - - If a subjectAltName extension of type dNSName is present, that MUST - be used as the identity. Otherwise, the (most specific) Common Name - field in the Subject field of the certificate MUST be used. Although - the use of the Common Name is existing practice, it is deprecated and - Certification Authorities are encouraged to use the dNSName instead. - - Matching is performed using the matching rules specified by - [RFC2459]. If more than one identity of a given type is present in - the certificate (e.g., more than one dNSName name, a match in any one - of the set is considered acceptable.) Names may contain the wildcard - character * which is considered to match any single domain name - component or component fragment. E.g., *.a.com matches foo.a.com but - not bar.foo.a.com. f*.com matches foo.com but not bar.com. - - In some cases, the URI is specified as an IP address rather than a - hostname. In this case, the iPAddress subjectAltName must be present - in the certificate and must exactly match the IP in the URI. - - */ - return verify_host_with_subject_alt_name(server_cert) || - verify_host_with_common_name(server_cert); -} - -inline bool -SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const { - auto ret = false; - - auto type = GEN_DNS; - - struct in6_addr addr6 = {}; - struct in_addr addr = {}; - size_t addr_len = 0; - -#ifndef __MINGW32__ - if (inet_pton(AF_INET6, host_.c_str(), &addr6)) { - type = GEN_IPADD; - addr_len = sizeof(struct in6_addr); - } else if (inet_pton(AF_INET, host_.c_str(), &addr)) { - type = GEN_IPADD; - addr_len = sizeof(struct in_addr); - } -#endif - - auto alt_names = static_cast( - X509_get_ext_d2i(server_cert, NID_subject_alt_name, nullptr, nullptr)); - - if (alt_names) { - auto dsn_matched = false; - auto ip_matched = false; - - auto count = sk_GENERAL_NAME_num(alt_names); - - for (decltype(count) i = 0; i < count && !dsn_matched; i++) { - auto val = sk_GENERAL_NAME_value(alt_names, i); - if (val->type == type) { - auto name = - reinterpret_cast(ASN1_STRING_get0_data(val->d.ia5)); - auto name_len = static_cast(ASN1_STRING_length(val->d.ia5)); - - switch (type) { - case GEN_DNS: dsn_matched = check_host_name(name, name_len); break; - - case GEN_IPADD: - if (!memcmp(&addr6, name, addr_len) || - !memcmp(&addr, name, addr_len)) { - ip_matched = true; - } - break; - } - } - } - - if (dsn_matched || ip_matched) { ret = true; } - } - - GENERAL_NAMES_free(const_cast( - reinterpret_cast(alt_names))); - return ret; -} - -inline bool SSLClient::verify_host_with_common_name(X509 *server_cert) const { - const auto subject_name = X509_get_subject_name(server_cert); - - if (subject_name != nullptr) { - char name[BUFSIZ]; - auto name_len = X509_NAME_get_text_by_NID(subject_name, NID_commonName, - name, sizeof(name)); - - if (name_len != -1) { - 
return check_host_name(name, static_cast(name_len)); - } - } - - return false; -} - -inline bool SSLClient::check_host_name(const char *pattern, - size_t pattern_len) const { - if (host_.size() == pattern_len && host_ == pattern) { return true; } - - // Wildcard match - // https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484 - std::vector pattern_components; - detail::split(&pattern[0], &pattern[pattern_len], '.', - [&](const char *b, const char *e) { - pattern_components.emplace_back(b, e); - }); - - if (host_components_.size() != pattern_components.size()) { return false; } - - auto itr = pattern_components.begin(); - for (const auto &h : host_components_) { - auto &p = *itr; - if (p != h && p != "*") { - auto partial_match = (p.size() > 0 && p[p.size() - 1] == '*' && - !p.compare(0, p.size() - 1, h)); - if (!partial_match) { return false; } - } - ++itr; - } - - return true; -} -#endif - -// Universal client implementation -inline Client::Client(const std::string &scheme_host_port) - : Client(scheme_host_port, std::string(), std::string()) {} - -inline Client::Client(const std::string &scheme_host_port, - const std::string &client_cert_path, - const std::string &client_key_path) { - const static std::regex re( - R"((?:([a-z]+):\/\/)?(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)"); - - std::smatch m; - if (std::regex_match(scheme_host_port, m, re)) { - auto scheme = m[1].str(); - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (!scheme.empty() && (scheme != "http" && scheme != "https")) { -#else - if (!scheme.empty() && scheme != "http") { -#endif -#ifndef CPPHTTPLIB_NO_EXCEPTIONS - std::string msg = "'" + scheme + "' scheme is not supported."; - throw std::invalid_argument(msg); -#endif - return; - } - - auto is_ssl = scheme == "https"; - - auto host = m[2].str(); - if (host.empty()) { host = m[3].str(); } - - auto port_str = m[4].str(); - auto port = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80); - - if (is_ssl) { -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - cli_ = detail::make_unique(host, port, client_cert_path, - client_key_path); - is_ssl_ = is_ssl; -#endif - } else { - cli_ = detail::make_unique(host, port, client_cert_path, - client_key_path); - } - } else { - // NOTE: Update TEST(UniversalClientImplTest, Ipv6LiteralAddress) - // if port param below changes. 
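As the NOTE above hints, inputs that do not match the scheme regex at all, such as an unbracketed IPv6 literal, fall through to the branch below and are treated as a bare host on the default port 80. A minimal sketch of that behavior (the address is illustrative):

    #include "httplib.h"

    int main() {
        // "::1" cannot match the host alternative ([^:/?#]+) because of the
        // leading ':', so regex_match fails and the whole string is passed
        // through as the host with the default port 80.
        httplib::Client cli("::1");  // roughly ClientImpl("::1", 80, ...)
        return cli.is_valid() ? 0 : 1;
    }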
- cli_ = detail::make_unique(scheme_host_port, 80, - client_cert_path, client_key_path); - } -} // namespace detail - -inline Client::Client(const std::string &host, int port) - : cli_(detail::make_unique(host, port)) {} - -inline Client::Client(const std::string &host, int port, - const std::string &client_cert_path, - const std::string &client_key_path) - : cli_(detail::make_unique(host, port, client_cert_path, - client_key_path)) {} - -inline Client::~Client() = default; - -inline bool Client::is_valid() const { - return cli_ != nullptr && cli_->is_valid(); -} - -inline Result Client::Get(const std::string &path) { return cli_->Get(path); } -inline Result Client::Get(const std::string &path, const Headers &headers) { - return cli_->Get(path, headers); -} -inline Result Client::Get(const std::string &path, Progress progress) { - return cli_->Get(path, std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - Progress progress) { - return cli_->Get(path, headers, std::move(progress)); -} -inline Result Client::Get(const std::string &path, - ContentReceiver content_receiver) { - return cli_->Get(path, std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver) { - return cli_->Get(path, headers, std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, std::move(content_receiver), std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, headers, std::move(content_receiver), - std::move(progress)); -} -inline Result Client::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return cli_->Get(path, std::move(response_handler), - std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver) { - return cli_->Get(path, headers, std::move(response_handler), - std::move(content_receiver)); -} -inline Result Client::Get(const std::string &path, - ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Params ¶ms, - const Headers &headers, Progress progress) { - return cli_->Get(path, params, headers, std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, params, headers, std::move(content_receiver), - std::move(progress)); -} -inline Result Client::Get(const std::string &path, const Params ¶ms, - const Headers &headers, - ResponseHandler response_handler, - ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, params, headers, std::move(response_handler), - std::move(content_receiver), std::move(progress)); -} - 
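The thin forwarders in this region delegate every HTTP verb to the underlying ClientImpl or SSLClient selected by the constructor above. A minimal usage sketch with a hypothetical local endpoint (the header is now supplied by the external cpp-httplib target rather than this deleted vendored copy):

    #include "httplib.h"
    #include <cstdio>

    int main() {
        // "http://" selects ClientImpl with default port 80;
        // "https://" would select SSLClient with default port 443.
        httplib::Client cli("http://localhost:8080");
        if (auto res = cli.Get("/health")) {  // forwards to cli_->Get(path)
            std::printf("status: %d\n", res->status);
        }
        return 0;
    }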
-inline Result Client::Head(const std::string &path) { return cli_->Head(path); } -inline Result Client::Head(const std::string &path, const Headers &headers) { - return cli_->Head(path, headers); -} - -inline Result Client::Post(const std::string &path) { return cli_->Post(path); } -inline Result Client::Post(const std::string &path, const Headers &headers) { - return cli_->Post(path, headers); -} -inline Result Client::Post(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Post(path, body, content_length, content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Post(path, headers, body, content_length, content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress) { - return cli_->Post(path, headers, body, content_length, content_type, - progress); -} -inline Result Client::Post(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Post(path, body, content_type); -} -inline Result Client::Post(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Post(path, body, content_type, progress); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Post(path, headers, body, content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Post(path, headers, body, content_type, progress); -} -inline Result Client::Post(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Post(path, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Post(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Post(path, std::move(content_provider), content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Post(path, headers, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Post(path, headers, std::move(content_provider), content_type); -} -inline Result Client::Post(const std::string &path, const Params ¶ms) { - return cli_->Post(path, params); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const Params ¶ms) { - return cli_->Post(path, headers, params); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - return cli_->Post(path, headers, params, progress); -} -inline Result Client::Post(const std::string &path, - const MultipartFormDataItems &items) { - return cli_->Post(path, items); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - return 
cli_->Post(path, headers, items); -} -inline Result Client::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - return cli_->Post(path, headers, items, boundary); -} -inline Result -Client::Post(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - return cli_->Post(path, headers, items, provider_items); -} -inline Result Client::Put(const std::string &path) { return cli_->Put(path); } -inline Result Client::Put(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Put(path, body, content_length, content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Put(path, headers, body, content_length, content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, Progress progress) { - return cli_->Put(path, headers, body, content_length, content_type, progress); -} -inline Result Client::Put(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Put(path, body, content_type); -} -inline Result Client::Put(const std::string &path, const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Put(path, body, content_type, progress); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Put(path, headers, body, content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, Progress progress) { - return cli_->Put(path, headers, body, content_type, progress); -} -inline Result Client::Put(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Put(path, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Put(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Put(path, std::move(content_provider), content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Put(path, headers, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Put(path, headers, std::move(content_provider), content_type); -} -inline Result Client::Put(const std::string &path, const Params ¶ms) { - return cli_->Put(path, params); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const Params ¶ms) { - return cli_->Put(path, headers, params); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const Params ¶ms, Progress progress) { - return cli_->Put(path, headers, params, progress); -} -inline Result Client::Put(const std::string &path, - const MultipartFormDataItems &items) { - return cli_->Put(path, items); -} -inline 
Result Client::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items) { - return cli_->Put(path, headers, items); -} -inline Result Client::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const std::string &boundary) { - return cli_->Put(path, headers, items, boundary); -} -inline Result -Client::Put(const std::string &path, const Headers &headers, - const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - return cli_->Put(path, headers, items, provider_items); -} -inline Result Client::Patch(const std::string &path) { - return cli_->Patch(path); -} -inline Result Client::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Patch(path, body, content_length, content_type); -} -inline Result Client::Patch(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, body, content_length, content_type, progress); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Patch(path, headers, body, content_length, content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, headers, body, content_length, content_type, - progress); -} -inline Result Client::Patch(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Patch(path, body, content_type); -} -inline Result Client::Patch(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, body, content_type, progress); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Patch(path, headers, body, content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Patch(path, headers, body, content_type, progress); -} -inline Result Client::Patch(const std::string &path, size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Patch(path, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Patch(const std::string &path, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Patch(path, std::move(content_provider), content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - size_t content_length, - ContentProvider content_provider, - const std::string &content_type) { - return cli_->Patch(path, headers, content_length, std::move(content_provider), - content_type); -} -inline Result Client::Patch(const std::string &path, const Headers &headers, - ContentProviderWithoutLength content_provider, - const std::string &content_type) { - return cli_->Patch(path, headers, std::move(content_provider), content_type); -} -inline Result Client::Delete(const std::string &path) { - return cli_->Delete(path); -} -inline Result Client::Delete(const std::string &path, const Headers 
&headers) { - return cli_->Delete(path, headers); -} -inline Result Client::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type) { - return cli_->Delete(path, body, content_length, content_type); -} -inline Result Client::Delete(const std::string &path, const char *body, - size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, body, content_length, content_type, progress); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type) { - return cli_->Delete(path, headers, body, content_length, content_type); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const char *body, size_t content_length, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, headers, body, content_length, content_type, - progress); -} -inline Result Client::Delete(const std::string &path, const std::string &body, - const std::string &content_type) { - return cli_->Delete(path, body, content_type); -} -inline Result Client::Delete(const std::string &path, const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, body, content_type, progress); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type) { - return cli_->Delete(path, headers, body, content_type); -} -inline Result Client::Delete(const std::string &path, const Headers &headers, - const std::string &body, - const std::string &content_type, - Progress progress) { - return cli_->Delete(path, headers, body, content_type, progress); -} -inline Result Client::Options(const std::string &path) { - return cli_->Options(path); -} -inline Result Client::Options(const std::string &path, const Headers &headers) { - return cli_->Options(path, headers); -} - -inline bool Client::send(Request &req, Response &res, Error &error) { - return cli_->send(req, res, error); -} - -inline Result Client::send(const Request &req) { return cli_->send(req); } - -inline void Client::stop() { cli_->stop(); } - -inline std::string Client::host() const { return cli_->host(); } - -inline int Client::port() const { return cli_->port(); } - -inline size_t Client::is_socket_open() const { return cli_->is_socket_open(); } - -inline socket_t Client::socket() const { return cli_->socket(); } - -inline void -Client::set_hostname_addr_map(std::map addr_map) { - cli_->set_hostname_addr_map(std::move(addr_map)); -} - -inline void Client::set_default_headers(Headers headers) { - cli_->set_default_headers(std::move(headers)); -} - -inline void Client::set_header_writer( - std::function const &writer) { - cli_->set_header_writer(writer); -} - -inline void Client::set_address_family(int family) { - cli_->set_address_family(family); -} - -inline void Client::set_tcp_nodelay(bool on) { cli_->set_tcp_nodelay(on); } - -inline void Client::set_socket_options(SocketOptions socket_options) { - cli_->set_socket_options(std::move(socket_options)); -} - -inline void Client::set_connection_timeout(time_t sec, time_t usec) { - cli_->set_connection_timeout(sec, usec); -} - -inline void Client::set_read_timeout(time_t sec, time_t usec) { - cli_->set_read_timeout(sec, usec); -} - -inline void Client::set_write_timeout(time_t sec, time_t usec) { - cli_->set_write_timeout(sec, usec); -} - -inline void 
Client::set_basic_auth(const std::string &username, - const std::string &password) { - cli_->set_basic_auth(username, password); -} -inline void Client::set_bearer_token_auth(const std::string &token) { - cli_->set_bearer_token_auth(token); -} -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::set_digest_auth(const std::string &username, - const std::string &password) { - cli_->set_digest_auth(username, password); -} -#endif - -inline void Client::set_keep_alive(bool on) { cli_->set_keep_alive(on); } -inline void Client::set_follow_location(bool on) { - cli_->set_follow_location(on); -} - -inline void Client::set_url_encode(bool on) { cli_->set_url_encode(on); } - -inline void Client::set_compress(bool on) { cli_->set_compress(on); } - -inline void Client::set_decompress(bool on) { cli_->set_decompress(on); } - -inline void Client::set_interface(const std::string &intf) { - cli_->set_interface(intf); -} - -inline void Client::set_proxy(const std::string &host, int port) { - cli_->set_proxy(host, port); -} -inline void Client::set_proxy_basic_auth(const std::string &username, - const std::string &password) { - cli_->set_proxy_basic_auth(username, password); -} -inline void Client::set_proxy_bearer_token_auth(const std::string &token) { - cli_->set_proxy_bearer_token_auth(token); -} -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::set_proxy_digest_auth(const std::string &username, - const std::string &password) { - cli_->set_proxy_digest_auth(username, password); -} -#endif - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::enable_server_certificate_verification(bool enabled) { - cli_->enable_server_certificate_verification(enabled); -} - -inline void Client::enable_server_hostname_verification(bool enabled) { - cli_->enable_server_hostname_verification(enabled); -} - -inline void Client::set_server_certificate_verifier( - std::function verifier) { - cli_->set_server_certificate_verifier(verifier); -} -#endif - -inline void Client::set_logger(Logger logger) { - cli_->set_logger(std::move(logger)); -} - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline void Client::set_ca_cert_path(const std::string &ca_cert_file_path, - const std::string &ca_cert_dir_path) { - cli_->set_ca_cert_path(ca_cert_file_path, ca_cert_dir_path); -} - -inline void Client::set_ca_cert_store(X509_STORE *ca_cert_store) { - if (is_ssl_) { - static_cast(*cli_).set_ca_cert_store(ca_cert_store); - } else { - cli_->set_ca_cert_store(ca_cert_store); - } -} - -inline void Client::load_ca_cert_store(const char *ca_cert, std::size_t size) { - set_ca_cert_store(cli_->create_ca_cert_store(ca_cert, size)); -} - -inline long Client::get_openssl_verify_result() const { - if (is_ssl_) { - return static_cast(*cli_).get_openssl_verify_result(); - } - return -1; // NOTE: -1 doesn't match any of X509_V_ERR_??? 
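A hedged caller-side sketch of how this verify result is typically consumed; the host is illustrative and CPPHTTPLIB_OPENSSL_SUPPORT is assumed to be defined before the include:

    #include "httplib.h"
    #include <openssl/x509_vfy.h>

    int main() {
        httplib::Client cli("https://example.com");  // illustrative host
        auto res = cli.Get("/");
        // -1 means no SSL client was active; any other value is one of
        // OpenSSL's X509_V_OK / X509_V_ERR_* codes.
        if (cli.get_openssl_verify_result() != X509_V_OK) {
            return 1;  // treat the connection as unverified
        }
        return 0;
    }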
-} - -inline SSL_CTX *Client::ssl_context() const { - if (is_ssl_) { return static_cast(*cli_).ssl_context(); } - return nullptr; -} -#endif - -// ---------------------------------------------------------------------------- - -} // namespace httplib - -#endif // CPPHTTPLIB_HTTPLIB_H diff --git a/llamacpp/native/src/server/utils.hpp b/llamacpp/native/src/server/server-common.cpp similarity index 59% rename from llamacpp/native/src/server/utils.hpp rename to llamacpp/native/src/server/server-common.cpp index b1ecc5af5..e2e41a0d5 100644 --- a/llamacpp/native/src/server/utils.hpp +++ b/llamacpp/native/src/server/server-common.cpp @@ -1,489 +1,737 @@ -#pragma once - #include "common.h" #include "log.h" #include "llama.h" -#include "arg.h" // common_remote_get_content -#include "base64.hpp" #include "mtmd.h" #include "mtmd-helper.h" #include "chat.h" +#include "arg.h" // for common_remote_get_content; TODO: use download.h only +#include "base64.hpp" -#include - -#define JSON_ASSERT GGML_ASSERT -#include +#include "server-common.h" #include #include -#include -#include -#include -#include - -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" - -using json = nlohmann::ordered_json; - -#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) - -#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) - -#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) - -using raw_buffer = std::vector; - -template -static T json_value(const json & body, const std::string & key, const T & default_value) { - // Fallback null to default value - if (body.contains(key) && !body.at(key).is_null()) { - try { - return body.at(key); - } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) { - LOG_WRN("Wrong type supplied for parameter '%s'. 
Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what()); - return default_value; - } - } else { - return default_value; +#include + +json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; + case ERROR_TYPE_EXCEED_CONTEXT_SIZE: + type_str = "exceed_context_size_error"; + code = 400; + break; } + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; } -const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); +// +// random string / id +// -// thin wrapper around common_grammar_trigger with (de)serialization functions -struct server_grammar_trigger { - common_grammar_trigger value; +std::string random_string() { + static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - server_grammar_trigger() = default; - server_grammar_trigger(const common_grammar_trigger & value) : value(value) {} - server_grammar_trigger(const json & in) { - value.type = (common_grammar_trigger_type) in.at("type").get(); - value.value = in.at("value").get(); - if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - value.token = (llama_token) in.at("token").get(); - } - } + std::random_device rd; + std::mt19937 generator(rd()); - json to_json() const { - json out { - {"type", (int) value.type}, - {"value", value.value}, - }; - if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - out["token"] = (int) value.token; - } - return out; + std::string result(32, ' '); + + for (int i = 0; i < 32; ++i) { + result[i] = str[generator() % str.size()]; } -}; + + return result; +} + +std::string gen_chatcmplid() { + return "chatcmpl-" + random_string(); +} + +std::string gen_tool_call_id() { + return random_string(); +} // -// tokenizer and input processing utils +// lora utils // -static bool json_is_array_of_numbers(const json & data) { - if (data.is_array()) { - for (const auto & e : data) { - if (!e.is_number_integer()) { +bool lora_all_alora(const std::vector & loras) { + bool found_alora = false; + for (const auto & lora : loras) { + if (lora.scale != 0) { + if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) { return false; } + found_alora = true; } - return true; } - return false; + return found_alora; } -// is array having BOTH numbers & strings? -static bool json_is_array_of_mixed_numbers_strings(const json & data) { - bool seen_string = false; - bool seen_number = false; - if (data.is_array()) { - for (const auto & e : data) { - seen_string |= e.is_string(); - seen_number |= e.is_number_integer(); - if (seen_number && seen_string) { - return true; - } +bool lora_should_clear_cache( + const std::vector & current, + const std::vector & next) { + + // This should always be called after determining that the two sets are + // _not_ equal. 
This assert is therefore some slightly wasted work and + // should be safe to remove as long as this method is called correctly. + GGML_ASSERT(!are_lora_equal(current, next)); + + return ( + !(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) || + !lora_all_alora(next)); +} + +std::vector parse_lora_request( + const std::vector & lora_base, + const json & data) { + std::vector lora(lora_base); + int max_idx = lora.size(); + + // clear existing value + for (auto & entry : lora) { + entry.scale = 0.0f; + } + + // set value + for (const auto & entry : data) { + int id = json_value(entry, "id", -1); + float scale = json_value(entry, "scale", 0.0f); + if (0 <= id && id < max_idx) { + lora[id].scale = scale; + } else { + throw std::runtime_error("invalid adapter id"); } } - return false; + + return lora; } -// does array have any individual integers/tokens? -static bool json_is_array_and_contains_numbers(const json & data) { - if (data.is_array()) { - for (const auto & e : data) { - if (e.is_number_integer()) { - return true; - } - } +bool are_lora_equal( + const std::vector & l1, + const std::vector & l2) { + if (l1.size() != l2.size()) { return false; } - return false; + for (size_t i = 0; i < l1.size(); ++i) { + // we don't check lora.path to reduce the time complexity + if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) { + return false; + } + } + return true; } -// get value by path(key1 / key2) -static json json_get_nested_values(const std::vector & paths, const json & js) { - json result = json::object(); - - for (const std::string & path : paths) { - json current = js; - const auto keys = string_split(path, /*separator*/ '/'); - bool valid_path = true; - for (const std::string & k : keys) { - if (valid_path && current.is_object() && current.contains(k)) { - current = current[k]; - } else { - valid_path = false; - } - } - if (valid_path) { - result[path] = current; +std::vector lora_get_enabled_ids(const std::vector & loras) { + std::vector enabled_ids; + for (size_t i = 0; i < loras.size(); ++i) { + if (loras[i].scale > 0) { + enabled_ids.push_back(i); } } - return result; + return enabled_ids; } -/** - * this handles 2 cases: - * - only string, example: "string" - * - mixed string and tokens, example: [12, 34, "string", 56, 78] - */ -static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. 
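The comment above summarizes the two prompt shapes this tokenizer accepts; a small sketch of both (token ids and text are arbitrary, and nlohmann/json is assumed available):

    #include <nlohmann/json.hpp>
    using json = nlohmann::ordered_json;

    int main() {
        json p1 = "Hello world";                          // plain string
        json p2 = json::parse(R"([12, 34, "abc", 56])");  // mixed tokens + text
        // With add_special == true, BOS is added for p1, and for p2 only if
        // its first element were a string, per the add_bos note above.
        (void)p1; (void)p2;
        return 0;
    }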
- llama_tokens prompt_tokens; +// +// base64 utils (TODO: use the base64::decode from base64.hpp) +// - if (json_prompt.is_array()) { - bool first = true; - for (const auto & p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; - llama_tokens p; - if (first) { - p = common_tokenize(vocab, s, add_special, parse_special); - first = false; - } else { - p = common_tokenize(vocab, s, false, parse_special); - } +static inline bool is_base64(uint8_t c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } +static inline raw_buffer base64_decode(const std::string & encoded_string) { + int i = 0; + int j = 0; + int in_ = 0; - prompt_tokens.push_back(p.template get()); + int in_len = encoded_string.size(); + + uint8_t char_array_4[4]; + uint8_t char_array_3[3]; + + raw_buffer ret; + + while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i == 4) { + for (i = 0; i < 4; i++) { + char_array_4[i] = base64_chars.find(char_array_4[i]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) { + ret.push_back(char_array_3[i]); } + + i = 0; } - } else { - auto s = json_prompt.template get(); - prompt_tokens = common_tokenize(vocab, s, add_special, parse_special); } - return prompt_tokens; -} + if (i) { + for (j = i; j < 4; j++) { + char_array_4[j] = 0; + } -// return the last index of character that can form a valid string -// if the last character is potentially cut in half, return the index before the cut -// if validate_utf8(text) == text.size(), then the whole text is valid utf8 -static size_t validate_utf8(const std::string& text) { - size_t len = text.size(); - if (len == 0) return 0; + for (j = 0; j < 4; j++) { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } - // Check the last few bytes to see if a multi-byte character is cut off - for (size_t i = 1; i <= 4 && i <= len; ++i) { - unsigned char c = text[len - i]; - // Check for start of a multi-byte sequence from the end - if ((c & 0xE0) == 0xC0) { - // 2-byte character start: 110xxxxx - // Needs at least 2 bytes - if (i < 2) return len - i; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character start: 1110xxxx - // Needs at least 3 bytes - if (i < 3) return len - i; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character start: 11110xxx - // Needs at least 4 bytes - if (i < 4) return len - i; + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; j < i - 1; j++) { + ret.push_back(char_array_3[j]); } } - // If no cut-off multi-byte character is found, return full length - return len; + return ret; } // -// template utils +// server_tokens implementation // -// format infill task -static llama_tokens format_infill( - const llama_vocab * vocab, - const json & input_prefix, - const json & input_suffix, - const json & input_extra, - const int n_batch, - const int n_predict, - const int n_ctx, - const 
bool spm_infill, - const llama_tokens & tokens_prompt - ) { - // TODO: optimize this block by reducing memory allocations and movement +server_tokens::server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) { + for (size_t i = 0; i < mtmd_chunks.size(); ++i) { + push_back(mtmd_chunks[i]); + } +} - // use FIM repo-level pattern: - // ref: https://arxiv.org/pdf/2409.12186 - // - // [FIM_REP]myproject - // [FIM_SEP]filename0 - // extra chunk 0 - // [FIM_SEP]filename1 - // extra chunk 1 - // ... - // [FIM_SEP]filename - // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt - // - llama_tokens extra_tokens; - extra_tokens.reserve(n_ctx); +server_tokens::server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) { +} - auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false); - auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false); +llama_pos server_tokens::pos_next() const { + if (!has_mtmd) { + return tokens.size(); + } - if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) { - // TODO: make project name an input - static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false); + llama_pos res = tokens.size(); - extra_tokens.push_back(llama_vocab_fim_rep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); + for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) { + const auto & chunk = it->second; + res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get()); } - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - const std::string text = json_value(chunk, "text", std::string()); - const std::string filename = json_value(chunk, "filename", std::string("tmp")); - if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { - const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false); + return res; +} - extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); +std::string server_tokens::str() const { + std::ostringstream oss; + oss << "tokens: "; + for (size_t idx = 0; idx < tokens.size(); ++idx) { + llama_token t = tokens[idx]; + oss << "idx:" << idx << " "; + if (t == LLAMA_TOKEN_NULL) { + oss << " "; } else { - // chunk separator in binary form to avoid confusing the AI - static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; - static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false); - - extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); + oss << t << " "; } - - const auto chunk_tokens = common_tokenize(vocab, text, false, false); - extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); } + oss << "\n"; + oss << "image idx: "; + for (const auto & it : map_idx_to_media) { + oss << it.first << ", "; + } + return oss.str(); +} - if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { - // TODO: current filename - static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false); - - extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); - extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); +const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const { + auto it = 
map_idx_to_media.find(idx); + if (it != map_idx_to_media.end()) { + return it->second; } + throw std::runtime_error("Chunk not found"); +} - // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) - const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4)); - const int n_suffix_take = std::min(tokens_suffix.size(), std::max(0, (n_batch/4) - (2 + tokens_prompt.size()))); +void server_tokens::push_back(llama_token tok) { + if (tok == LLAMA_TOKEN_NULL) { + throw std::runtime_error("Invalid token"); + } + tokens.emplace_back(tok); +} - SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take)); +void server_tokens::push_back(const mtmd_input_chunk * chunk) { + auto type = mtmd_input_chunk_get_type(chunk); + if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { + GGML_ASSERT(has_mtmd); + const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk); + size_t start_idx = tokens.size(); + for (size_t i = 0; i < n_tokens; ++i) { + tokens.emplace_back(LLAMA_TOKEN_NULL); + } + mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); + map_idx_to_media[start_idx] = std::move(new_chunk); + } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t n_tokens; + const auto * text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); + for (size_t i = 0; i < n_tokens; ++i) { + push_back(text_tokens[i]); + } + } else { + GGML_ABORT("Invalid chunk type"); + } +} - // fill the rest of the context with extra chunks - const int n_extra_take = std::min(std::max(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size()); +void server_tokens::push_back(server_tokens & tokens) { + size_t start_idx = size(); + for (size_t i = 0; i < tokens.size(); i++) { + push_back(tokens[i]); + } + if (tokens.has_mtmd) { + // Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd. + // We could also just check, but this will prevent silently dropping MTMD data. + GGML_ASSERT(has_mtmd); + for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) { + auto * chunk = tokens.map_idx_to_media[it->first].get(); + mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); + map_idx_to_media[start_idx + it->first] = std::move(new_chunk); + } + } +} - tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); - tokens_suffix.resize(n_suffix_take); +void server_tokens::insert(const llama_tokens & inp_tokens) { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end()); +} - tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab)); - tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); - tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab)); +const llama_tokens & server_tokens::get_text_tokens() const { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + return tokens; +} - auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix; - auto embd_end = spm_infill ? 
tokens_prefix : tokens_suffix; +void server_tokens::set_token(llama_pos pos, llama_token id) { + GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled + tokens[pos] = id; +} - if (llama_vocab_get_add_bos(vocab)) { - embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); +void server_tokens::keep_first(size_t n) { + GGML_ASSERT(n <= tokens.size()); + if (has_mtmd) { + if (n == tokens.size()) { + return; // nothing to do + } + // we throw an error if we try to remove a token in the middle of an image + // for ex. with input of 5 text tokens and 2 images: + // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] + // n 1 2 3 4 5 6 7 8 9 10 + // allowed to resize ^ ^ + // disallowed to resize ^ ^ ^ + if (n > 0) { + // make sure we never remove tokens in the middle of an image + // note that the case where we keep a full image at the end is allowed: + // tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL + if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) { + find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk + } + } + // remove all image chunks that are not used anymore + for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ) { + size_t idx = it->first; + if (idx >= n) { + it = map_idx_to_media.erase(it); + } else { + ++it; + } + } } + tokens.resize(n); +} - SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); - - // put the extra context before the FIM prefix - embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end()); - - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - embd_inp.push_back(llama_vocab_fim_mid(vocab)); - - return embd_inp; +std::string server_tokens::detokenize(const llama_context * ctx, bool special) const { + llama_tokens text_tokens; + text_tokens.reserve(tokens.size()); + for (const auto & t : tokens) { + if (t != LLAMA_TOKEN_NULL) { + text_tokens.push_back(t); + } + } + return common_detokenize(ctx, text_tokens, special); } -// -// base64 utils (TODO: move to common in the future) -// +size_t server_tokens::get_common_prefix(const server_tokens & b) const { + const size_t max_idx = std::min(tokens.size(), b.tokens.size()); -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; + if (!has_mtmd) { + for (size_t i = 0; i < max_idx; ++i) { + if (tokens[i] == b.tokens[i]) { + continue; + } -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} + return i; + } -static inline raw_buffer base64_decode(const std::string & encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; + return max_idx; + } - int in_len = encoded_string.size(); + for (size_t i = 0; i < max_idx; ++i) { + const llama_token ai = tokens[i]; + const llama_token bi = b.tokens[i]; - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; + if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) { + const auto & a_chunk = find_chunk(i); + const auto & b_chunk = b.find_chunk(i); - raw_buffer ret; + GGML_ASSERT(a_chunk && b_chunk); - while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } + const std::string id_ai = mtmd_input_chunk_get_id(a_chunk.get()); + const std::string id_bi = mtmd_input_chunk_get_id(b_chunk.get()); - 
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + const size_t n_tok_a = mtmd_input_chunk_get_n_tokens(a_chunk.get()); + const size_t n_tok_b = mtmd_input_chunk_get_n_tokens(b_chunk.get()); - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); + if (id_ai == id_bi && n_tok_a == n_tok_b) { + GGML_ASSERT(n_tok_a > 0 && "Invalid media chunk"); // should never happen + i += n_tok_a - 1; // will be +1 by the for loop + continue; } - i = 0; + return i; } - } - if (i) { - for (j = i; j < 4; j++) { - char_array_4[j] = 0; + if (ai == bi) { + continue; } - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } + return i; + } - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + return max_idx; // all tokens are equal +} - for (j = 0; j < i - 1; j++) { - ret.push_back(char_array_3[j]); +bool server_tokens::validate(const struct llama_context * ctx) const { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + for (size_t i = 0; i < tokens.size(); ++i) { + const auto & t = tokens[i]; + if (t == LLAMA_TOKEN_NULL) { + try { + const auto & chunk = find_chunk(i); + size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk.get()); + i += n_tokens - 1; // will be +1 by the for loop + } catch (const std::exception & e) { + return false; + } + } else if (t < 0 || t >= n_vocab) { + return false; } } + return true; +} - return ret; +int32_t server_tokens::process_chunk( + llama_context * ctx, + mtmd_context * mctx, + size_t idx, + llama_pos pos, + int32_t seq_id, + size_t & n_tokens_out) const { + const auto & chunk = find_chunk(idx); + const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE + ? 
"image" : "audio"; + SRV_INF("processing %s...\n", name); + int32_t n_batch = llama_n_batch(ctx); + int64_t t0 = ggml_time_ms(); + llama_pos new_n_past; // unused for now + int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx, + chunk.get(), + pos, + seq_id, + n_batch, + true, // logits last + &new_n_past); + SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0); + if (result != 0) { + LOG_ERR("mtmd_helper_eval failed with status %d", result); + n_tokens_out = 0; + return result; + } + n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get()); + return 0; } // -// random string / id +// tokenizer and input processing utils // -static std::string random_string() { - static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; +bool json_is_array_of_numbers(const json & data) { + if (data.is_array()) { + for (const auto & e : data) { + if (!e.is_number_integer()) { + return false; + } + } + return true; } - - return result; + return false; } -static std::string gen_chatcmplid() { - return "chatcmpl-" + random_string(); +bool json_is_array_of_mixed_numbers_strings(const json & data) { + bool seen_string = false; + bool seen_number = false; + if (data.is_array()) { + for (const auto & e : data) { + seen_string |= e.is_string(); + seen_number |= e.is_number_integer(); + if (seen_number && seen_string) { + return true; + } + } + } + return false; } -static std::string gen_tool_call_id() { - return random_string(); +bool json_is_array_and_contains_numbers(const json & data) { + if (data.is_array()) { + for (const auto & e : data) { + if (e.is_number_integer()) { + return true; + } + } + return false; + } + return false; } -// -// other common utils -// +json json_get_nested_values(const std::vector & paths, const json & js) { + json result = json::object(); -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += common_token_to_piece(ctx, *begin); + for (const std::string & path : paths) { + json current = js; + const auto keys = string_split(path, /*separator*/ '/'); + bool valid_path = true; + for (const std::string & k : keys) { + if (valid_path && current.is_object() && current.contains(k)) { + current = current[k]; + } else { + valid_path = false; + } + } + if (valid_path) { + result[path] = current; + } } - - return ret; + return result; } -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token); - - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } +llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. 
+ llama_tokens prompt_tokens; - return out; -} + if (json_prompt.is_array()) { + bool first = true; + for (const auto & p : json_prompt) { + if (p.is_string()) { + auto s = p.template get(); -// note: if data is a json array, it will be sent as multiple events, one per item -static bool server_sent_event(httplib::DataSink & sink, const json & data) { - static auto send_single = [](httplib::DataSink & sink, const json & data) -> bool { - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). + llama_tokens p; + if (first) { + p = common_tokenize(vocab, s, add_special, parse_special); + first = false; + } else { + p = common_tokenize(vocab, s, false, parse_special); + } - LOG_DBG("data stream, to_send: %s", str.c_str()); - return sink.write(str.c_str(), str.size()); - }; + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } else { + if (first) { + first = false; + } - if (data.is_array()) { - for (const auto & item : data) { - if (!send_single(sink, item)) { - return false; + prompt_tokens.push_back(p.template get()); } } } else { - return send_single(sink, data); + auto s = json_prompt.template get(); + prompt_tokens = common_tokenize(vocab, s, add_special, parse_special); } - return true; + return prompt_tokens; } -// -// OAI utils -// +size_t validate_utf8(const std::string& text) { + size_t len = text.size(); + if (len == 0) return 0; -// used by /completions endpoint -static json oaicompat_completion_params_parse(const json & body) { + // Check the last few bytes to see if a multi-byte character is cut off + for (size_t i = 1; i <= 4 && i <= len; ++i) { + unsigned char c = text[len - i]; + // Check for start of a multi-byte sequence from the end + if ((c & 0xE0) == 0xC0) { + // 2-byte character start: 110xxxxx + // Needs at least 2 bytes + if (i < 2) return len - i; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character start: 1110xxxx + // Needs at least 3 bytes + if (i < 3) return len - i; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character start: 11110xxx + // Needs at least 4 bytes + if (i < 4) return len - i; + } + } + + // If no cut-off multi-byte character is found, return full length + return len; +} + +// Computes FNV-1a hash of the data +static std::string fnv_hash(const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + uint64_t hash = 0xcbf29ce484222325ULL; + + for (size_t i = 0; i < len; ++i) { + hash ^= data[i]; + hash *= fnv_prime; + } + return std::to_string(hash); +} + +server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) { + mtmd::bitmaps bitmaps; + for (auto & file : files) { + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size())); + if (!bmp.ptr) { + throw std::runtime_error("Failed to load image or audio file"); + } + // calculate bitmap hash (for KV caching) + std::string hash = fnv_hash(bmp.data(), bmp.n_bytes()); + bmp.set_id(hash.c_str()); + bitmaps.entries.push_back(std::move(bmp)); + } + // process prompt + std::vector inputs; + // multimodal + mtmd_input_text inp_txt = { + prompt.c_str(), + /* add_special */ true, + /* parse_special */ true, + }; + mtmd::input_chunks chunks(mtmd_input_chunks_init()); + auto bitmaps_c_ptr = bitmaps.c_ptr(); + int32_t tokenized = mtmd_tokenize(mctx, + chunks.ptr.get(), + &inp_txt, + bitmaps_c_ptr.data(), + bitmaps_c_ptr.size()); + if (tokenized != 0) { + throw 
std::runtime_error("Failed to tokenize prompt"); + } + auto result = server_tokens(chunks, true); + return result; +} + +/** + * break the input "prompt" object into multiple prompt if needed, then tokenize them + * use tokenize_input_prompts() if the input could be an array. + * this supports these cases: + * - "prompt": "string" + * - "prompt": [12, 34, 56] + * - "prompt": [12, 34, "string", 56, 78] + * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] } + */ +static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) { + constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string"; + constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data"; + const bool has_mtmd = mctx != nullptr; + if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { + // string or mixed + llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special); + return server_tokens(tmp, false); + } else if (json_is_array_of_numbers(json_prompt)) { + // array of tokens + llama_tokens tmp = json_prompt.get(); + return server_tokens(tmp, false); + } else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) { + // JSON object with prompt key. + if (json_prompt.contains(JSON_MTMD_DATA_KEY)) { + if (!has_mtmd) + throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests."); + + // JSON object with prompt and multimodal key. + std::vector files; + for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) { + files.push_back(base64_decode(entry)); + } + return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files); + } else { + // Not multimodal, but contains a subobject. 
+
+std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
+    std::vector<server_tokens> result;
+    if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) {
+        result.reserve(json_prompt.size());
+        for (const auto & p : json_prompt) {
+            result.push_back(tokenize_input_subprompt(vocab, mctx, p, add_special, parse_special));
+        }
+    } else {
+        result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special));
+    }
+    if (result.empty()) {
+        throw std::runtime_error("\"prompt\" must not be empty");
+    }
+    return result;
+}
+
+//
+// OAI utils
+//
+
+// used by /completions endpoint
+json oaicompat_completion_params_parse(const json & body) {
     json llama_params;
 
     if (!body.contains("prompt")) {
@@ -527,19 +775,67 @@ static json oaicompat_completion_params_parse(const json & body) {
     return llama_params;
 }
 
-struct oaicompat_parser_options {
-    bool use_jinja;
-    bool prefill_assistant;
-    common_reasoning_format reasoning_format;
-    std::map<std::string, std::string> chat_template_kwargs;
-    common_chat_templates * tmpls;
-    bool allow_image;
-    bool allow_audio;
-    bool enable_thinking = true;
-};
+// media_path always ends with '/', see arg.cpp
+static void handle_media(
+        std::vector<raw_buffer> & out_files,
+        json & media_obj,
+        const std::string & media_path) {
+    std::string url = json_value(media_obj, "url", std::string());
+    if (string_starts_with(url, "http")) {
+        // download remote image
+        // TODO @ngxson : maybe make these params configurable
+        common_remote_params params;
+        params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+        params.max_size = 1024 * 1024 * 10; // 10MB
+        params.timeout  = 10; // seconds
+        SRV_INF("downloading image from '%s'\n", url.c_str());
+        auto res = common_remote_get_content(url, params);
+        if (200 <= res.first && res.first < 300) {
+            SRV_INF("downloaded %ld bytes\n", res.second.size());
+            raw_buffer data;
+            data.insert(data.end(), res.second.begin(), res.second.end());
+            out_files.push_back(data);
+        } else {
+            throw std::runtime_error("Failed to download image");
+        }
+
+    } else if (string_starts_with(url, "file://")) {
+        if (media_path.empty()) {
+            throw std::invalid_argument("file:// URLs are not allowed unless --media-path is specified");
+        }
+        // load local image file
+        std::string file_path = url.substr(7); // remove "file://"
+        raw_buffer data;
+        if (!fs_validate_filename(file_path, true)) {
+            throw std::invalid_argument("file path is not allowed: " + file_path);
+        }
+        SRV_INF("loading image from local file '%s'\n", (media_path + file_path).c_str());
+        std::ifstream file(media_path + file_path, std::ios::binary);
+        if (!file) {
+            throw std::invalid_argument("file does not exist or cannot be opened: " + file_path);
+        }
+        data.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+        out_files.push_back(data);
+
+    } else {
+        // try to decode base64 image
+        std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+        if (parts.size() != 2) {
+            throw std::runtime_error("Invalid url value");
+        } else if (!string_starts_with(parts[0], "data:image/")) {
+            throw std::runtime_error("Invalid url format: " + parts[0]);
+        } else if (!string_ends_with(parts[0], "base64")) {
+            throw std::runtime_error("url must be base64 encoded");
+        } else {
+            auto base64_data  = parts[1];
+            auto decoded_data = base64_decode(base64_data);
+            out_files.push_back(decoded_data);
+        }
+    }
+}
 
 // used by /chat/completions endpoint
-static json oaicompat_chat_params_parse(
+json oaicompat_chat_params_parse(
     json & body, /* openai api json semantics */
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files)
@@ -583,26 +879,26 @@ static json oaicompat_chat_params_parse(
             auto schema_wrapper = json_value(response_format, "json_schema", json::object());
             json_schema = json_value(schema_wrapper, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
-            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
+            throw std::invalid_argument("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
     }
 
     // get input files
     if (!body.contains("messages")) {
-        throw std::runtime_error("'messages' is required");
+        throw std::invalid_argument("'messages' is required");
     }
     json & messages = body.at("messages");
     if (!messages.is_array()) {
-        throw std::runtime_error("Expected 'messages' to be an array");
+        throw std::invalid_argument("Expected 'messages' to be an array");
     }
     for (auto & msg : messages) {
         std::string role = json_value(msg, "role", std::string());
         if (role != "assistant" && !msg.contains("content")) {
-            throw std::runtime_error("All non-assistant messages must contain 'content'");
+            throw std::invalid_argument("All non-assistant messages must contain 'content'");
         }
         if (role == "assistant") {
             if (!msg.contains("content") && !msg.contains("tool_calls")) {
-                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+                throw std::invalid_argument("Assistant message must contain either 'content' or 'tool_calls'!");
            }
            if (!msg.contains("content")) {
                continue; // avoid errors with no content
@@ -614,7 +910,7 @@ static json oaicompat_chat_params_parse(
         }
 
         if (!content.is_array()) {
-            throw std::runtime_error("Expected 'content' to be a string or an array");
+            throw std::invalid_argument("Expected 'content' to be a string or an array");
         }
 
         for (auto & p : content) {
@@ -624,41 +920,8 @@ static json oaicompat_chat_params_parse(
                     throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
                 }
 
-                json image_url = json_value(p, "image_url", json::object());
-                std::string url = json_value(image_url, "url", std::string());
-                if (string_starts_with(url, "http")) {
-                    // download remote image
-                    // TODO @ngxson : maybe make these params configurable
-                    common_remote_params params;
-                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
-                    params.max_size = 1024 * 1024 * 10; // 10MB
-                    params.timeout = 10; // seconds
-                    SRV_INF("downloading image from '%s'\n", url.c_str());
-                    auto res = common_remote_get_content(url, params);
-                    if (200 <= res.first && res.first < 300) {
-                        SRV_INF("downloaded %ld bytes\n", res.second.size());
-                        raw_buffer data;
-                        data.insert(data.end(), res.second.begin(), res.second.end());
-                        out_files.push_back(data);
-                    } else {
-                        throw std::runtime_error("Failed to download image");
-                    }
-
-                } else {
-                    // try to decode base64 image
-                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
-                    if (parts.size() != 2) {
-                        throw std::runtime_error("Invalid image_url.url value");
-                    } else if (!string_starts_with(parts[0], "data:image/")) {
-                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
-                    } else if (!string_ends_with(parts[0], "base64")) {
-                        throw std::runtime_error("image_url.url must be base64 encoded");
-                    } else {
-                        auto base64_data = parts[1];
-                        auto decoded_data = base64_decode(base64_data);
-                        out_files.push_back(decoded_data);
-                    }
-                }
+                json image_url = json_value(p, "image_url", json::object());
+                handle_media(out_files, image_url, opt.media_path);
 
                 // replace this chunk with a marker
                 p["type"] = "text";
@@ -675,18 +938,20 @@ static json oaicompat_chat_params_parse(
                 std::string format = json_value(input_audio, "format", std::string());
                 // while we also support flac, we don't allow it here so we matches the OAI spec
                 if (format != "wav" && format != "mp3") {
-                    throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
+                    throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'");
                 }
 
                 auto decoded_data = base64_decode(data); // expected to be base64 encoded
                 out_files.push_back(decoded_data);
 
+                // TODO: add audio_url support by reusing handle_media()
+
                 // replace this chunk with a marker
                 p["type"] = "text";
                 p["text"] = mtmd_default_marker();
                 p.erase("input_audio");
             } else if (type != "text") {
-                throw std::runtime_error("unsupported content[].type");
+                throw std::invalid_argument("unsupported content[].type");
             }
         }
     }
@@ -704,7 +969,7 @@ static json oaicompat_chat_params_parse(
     inputs.enable_thinking = opt.enable_thinking;
     if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
         if (body.contains("grammar")) {
-            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+            throw std::invalid_argument("Cannot use custom grammar constraints with tools.");
         }
         llama_params["parse_tool_calls"] = true;
     }
@@ -723,7 +988,7 @@ static json oaicompat_chat_params_parse(
     } else if (enable_thinking_kwarg == "false") {
         inputs.enable_thinking = false;
     } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
-        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
+        throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)");
     }
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
@@ -736,14 +1001,14 @@ static json oaicompat_chat_params_parse(
 
         /* sanity check, max one assistant message at the end of the list */
         if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
-            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+            throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
         }
 
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
         if ( inputs.enable_thinking ) {
-            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
+            throw std::invalid_argument("Assistant response prefill is incompatible with enable_thinking.");
         }
 
         inputs.add_generation_prompt = true;
@@ -784,18 +1049,18 @@ static json oaicompat_chat_params_parse(
     // Handle "n" field
     int n_choices = json_value(body, "n", 1);
     if (n_choices != 1) {
-        throw std::runtime_error("Only one completion choice is allowed");
+        throw std::invalid_argument("Only one completion choice is allowed");
     }
 
     // Handle "logprobs" field
     // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
     if (json_value(body, "logprobs", false)) {
         if (has_tools && stream) {
-            throw std::runtime_error("logprobs is not supported with tools + stream");
+            throw std::invalid_argument("logprobs is not supported with tools + stream");
         }
         llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
     } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
-        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
+        throw std::invalid_argument("top_logprobs requires logprobs to be set to true");
     }
 
     // Copy remaining properties to llama_params
@@ -811,7 +1076,227 @@ static json oaicompat_chat_params_parse(
     return llama_params;
 }
 
-static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
+json convert_anthropic_to_oai(const json & body) {
+    json oai_body;
+
+    // Convert system prompt
+    json oai_messages = json::array();
+    auto system_param = json_value(body, "system", json());
+    if (!system_param.is_null()) {
+        std::string system_content;
+
+        if (system_param.is_string()) {
+            system_content = system_param.get<std::string>();
+        } else if (system_param.is_array()) {
+            for (const auto & block : system_param) {
+                if (json_value(block, "type", std::string()) == "text") {
+                    system_content += json_value(block, "text", std::string());
+                }
+            }
+        }
+
+        oai_messages.push_back({
+            {"role", "system"},
+            {"content", system_content}
+        });
+    }
+
+    // Convert messages
+    if (!body.contains("messages")) {
+        throw std::runtime_error("'messages' is required");
+    }
+    const json & messages = body.at("messages");
+    if (messages.is_array()) {
+        for (const auto & msg : messages) {
+            std::string role = json_value(msg, "role", std::string());
+
+            if (!msg.contains("content")) {
+                if (role == "assistant") {
+                    continue;
+                }
+                oai_messages.push_back(msg);
+                continue;
+            }
+
+            const json & content = msg.at("content");
+
+            if (content.is_string()) {
+                oai_messages.push_back(msg);
+                continue;
+            }
+
+            if (!content.is_array()) {
+                oai_messages.push_back(msg);
+                continue;
+            }
+
+            json tool_calls = json::array();
+            json converted_content = json::array();
+            json tool_results = json::array();
+            bool has_tool_calls = false;
+
+            for (const auto & block : content) {
+                std::string type = json_value(block, "type", std::string());
+
+                if (type == "text") {
+                    converted_content.push_back(block);
+                } else if (type == "image") {
+                    json source = json_value(block, "source", json::object());
+                    std::string source_type = json_value(source, "type", std::string());
+
+                    if (source_type == "base64") {
+                        std::string media_type = json_value(source, "media_type", std::string("image/jpeg"));
+                        std::string data = json_value(source, "data", std::string());
+                        std::ostringstream ss;
+                        ss << "data:" << media_type << ";base64," << data;
+
+                        converted_content.push_back({
+                            {"type", "image_url"},
+                            {"image_url", {
+                                {"url", ss.str()}
+                            }}
+                        });
+                    } else if (source_type == "url") {
+                        std::string url = json_value(source, "url", std::string());
+                        converted_content.push_back({
+                            {"type", "image_url"},
+                            {"image_url", {
+                                {"url", url}
+                            }}
+                        });
+                    }
+                } else if (type == "tool_use") {
+                    tool_calls.push_back({
+                        {"id", json_value(block, "id", std::string())},
+                        {"type", "function"},
+                        {"function", {
+                            {"name", json_value(block, "name", std::string())},
+                            {"arguments", json_value(block, "input", json::object()).dump()}
+                        }}
+                    });
+                    has_tool_calls = true;
+                } else if (type == "tool_result") {
+                    std::string tool_use_id = json_value(block, "tool_use_id", std::string());
+
+                    auto result_content = json_value(block, "content", json());
+                    std::string result_text;
+                    if (result_content.is_string()) {
+                        result_text = result_content.get<std::string>();
+                    } else if (result_content.is_array()) {
+                        for (const auto & c : result_content) {
+                            if (json_value(c, "type", std::string()) == "text") {
+                                result_text += json_value(c, "text", std::string());
+                            }
+                        }
+                    }
+
+                    tool_results.push_back({
+                        {"role", "tool"},
+                        {"tool_call_id", tool_use_id},
+                        {"content", result_text}
+                    });
+                }
+            }
+
+            if (!converted_content.empty() || has_tool_calls) {
+                json new_msg = {{"role", role}};
+                if (!converted_content.empty()) {
+                    new_msg["content"] = converted_content;
+                } else if (has_tool_calls) {
+                    new_msg["content"] = "";
+                }
+                if (!tool_calls.empty()) {
+                    new_msg["tool_calls"] = tool_calls;
+                }
+                oai_messages.push_back(new_msg);
+            }
+
+            for (const auto & tool_msg : tool_results) {
+                oai_messages.push_back(tool_msg);
+            }
+        }
+    }
+
+    oai_body["messages"] = oai_messages;
+
+    // Convert tools
+    if (body.contains("tools")) {
+        const json & tools = body.at("tools");
+        if (tools.is_array()) {
+            json oai_tools = json::array();
+            for (const auto & tool : tools) {
+                oai_tools.push_back({
+                    {"type", "function"},
+                    {"function", {
+                        {"name", json_value(tool, "name", std::string())},
+                        {"description", json_value(tool, "description", std::string())},
+                        {"parameters", tool.contains("input_schema") ? tool.at("input_schema") : json::object()}
+                    }}
+                });
+            }
+            oai_body["tools"] = oai_tools;
+        }
+    }
+
+    // Convert tool_choice
+    if (body.contains("tool_choice")) {
+        const json & tc = body.at("tool_choice");
+        if (tc.is_object()) {
+            std::string type = json_value(tc, "type", std::string());
+            if (type == "auto") {
+                oai_body["tool_choice"] = "auto";
+            } else if (type == "any" || type == "tool") {
+                oai_body["tool_choice"] = "required";
+            }
+        }
+    }
+
+    // Convert stop_sequences to stop
+    if (body.contains("stop_sequences")) {
+        oai_body["stop"] = body.at("stop_sequences");
+    }
+
+    // Handle max_tokens (required in Anthropic, but we're permissive)
+    if (body.contains("max_tokens")) {
+        oai_body["max_tokens"] = body.at("max_tokens");
+    } else {
+        oai_body["max_tokens"] = 4096;
+    }
+
+    // Pass through common params
+    for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
+        if (body.contains(key)) {
+            oai_body[key] = body.at(key);
+        }
+    }
+
+    // Handle Anthropic-specific thinking param
+    if (body.contains("thinking")) {
+        json thinking = json_value(body, "thinking", json::object());
+        std::string thinking_type = json_value(thinking, "type", std::string());
+        if (thinking_type == "enabled") {
+            int budget_tokens = json_value(thinking, "budget_tokens", 10000);
+            oai_body["thinking_budget_tokens"] = budget_tokens;
+        }
+    }
+
+    // Handle Anthropic-specific metadata param
+    if (body.contains("metadata")) {
+        json metadata = json_value(body, "metadata", json::object());
+        std::string user_id = json_value(metadata, "user_id", std::string());
+        if (!user_id.empty()) {
+            oai_body["__metadata_user_id"] = user_id;
+        }
+    }
+
+    return oai_body;
+}
+
+json format_embeddings_response_oaicompat(
+        const json & request,
+        const std::string & model_name,
+        const json & embeddings,
+        bool use_base64) {
     json data = json::array();
     int32_t n_tokens = 0;
     int i = 0;
@@ -841,7 +1326,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     }
 
     json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json {
             {"prompt_tokens", n_tokens},
@@ -853,8 +1338,9 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_response_rerank(
+json format_response_rerank(
     const json & request,
+    const std::string & model_name,
     const json & ranks,
     bool is_tei_format,
     std::vector<std::string> & texts,
@@ -886,7 +1372,7 @@ static json format_response_rerank(
     if (is_tei_format) return results;
 
     json res = json{
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json{
             {"prompt_tokens", n_tokens},
@@ -898,74 +1384,19 @@ static json format_response_rerank(
     return res;
 }
 
-static bool is_valid_utf8(const std::string & str) {
-    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
-    const unsigned char* end = bytes + str.length();
-    while (bytes < end) {
-        if (*bytes <= 0x7F) {
-            // 1-byte sequence (0xxxxxxx)
-            bytes++;
-        } else if ((*bytes & 0xE0) == 0xC0) {
-            // 2-byte sequence (110xxxxx 10xxxxxx)
-            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
-                return false;
-            bytes += 2;
-        } else if ((*bytes & 0xF0) == 0xE0) {
-            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
-                return false;
-            bytes += 3;
-        } else if ((*bytes & 0xF8) == 0xF0) {
-            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
-                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
-                return false;
-            bytes += 4;
-        } else {
-            // Invalid UTF-8 lead byte
-            return false;
-        }
-    }
+//
+// other utils
+//
-
-    return true;
-}
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+    std::vector<llama_token_data> cur;
+    const auto * logits = llama_get_logits_ith(ctx, idx);
-
-static json format_tokenizer_response(const json & tokens) {
-    return json {
-        {"tokens", tokens}
-    };
-}
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-static json format_detokenized_response(const std::string & content) {
-    return json {
-        {"content", content}
-    };
-}
-
-static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
-    json data = json::array();
-    for (const auto & lb : logit_bias) {
-        data.push_back(json{
-            {"bias", lb.bias},
-            {"token", lb.token},
-        });
-    }
-    return data;
-}
-
-static std::string safe_json_to_str(const json & data) {
-    return data.dump(-1, ' ', false, json::error_handler_t::replace);
-}
-
-static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
-    std::vector<llama_token_data> cur;
-    const auto * logits = llama_get_logits_ith(ctx, idx);
-
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    const int n_vocab = llama_vocab_n_tokens(vocab);
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     cur.resize(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -992,538 +1423,226 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
     return cur;
 }
 
-static bool are_lora_equal(
-    const std::vector<common_adapter_lora_info> & l1,
-    const std::vector<common_adapter_lora_info> & l2) {
-    if (l1.size() != l2.size()) {
-        return false;
-    }
-    for (size_t i = 0; i < l1.size(); ++i) {
-        // we don't check lora.path to reduce the time complexity
-        if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
-            return false;
-        }
-    }
-    return true;
+std::string safe_json_to_str(const json & data) {
+    return data.dump(-1, ' ', false, json::error_handler_t::replace);
 }
 
-// get the ids of all enabled loras
-static std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras) {
-    std::vector<size_t> enabled_ids;
-    for (size_t i = 0; i < loras.size(); ++i) {
-        if (loras[i].scale > 0) {
-            enabled_ids.push_back(i);
-        }
+// TODO: reuse llama_detokenize
+template <typename Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+    std::string ret;
+    for (; begin != end; ++begin) {
+        ret += common_token_to_piece(ctx, *begin);
     }
-    return enabled_ids;
-}
-
-// check whether the given lora set has only aloras activated (empty => false)
-static bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras) {
-    bool found_alora = false;
-    for (const auto & lora : loras) {
-        if (lora.scale != 0) {
-            if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) {
-                return false;
-            }
-            found_alora = true;
-        }
-    }
-    return found_alora;
+    return ret;
 }
 
-// if the two sets of loras are different, they require a cache clear unless the
-// change is only from aloras to aloras.
-static bool lora_should_clear_cache(
-    const std::vector<common_adapter_lora_info> & current,
-    const std::vector<common_adapter_lora_info> & next) {
-
-    // This should always be called after determining that the two sets are
-    // _not_ equal. This assert is therefore some slightly wasted work and
-    // should be safe to remove as long as this method is called correctly.
-    GGML_ASSERT(!are_lora_equal(current, next));
-
-    return (
-        !(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) ||
-        !lora_all_alora(next));
+std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens) {
+    return tokens_to_str(ctx, tokens.begin(), tokens.end());
 }
 
-// parse lora config from JSON request, returned a copy of lora_base with updated scale
-static std::vector<common_adapter_lora_info> parse_lora_request(
-    const std::vector<common_adapter_lora_info> & lora_base,
-    const json & data) {
-    std::vector<common_adapter_lora_info> lora(lora_base);
-    int max_idx = lora.size();
-
-    // clear existing value
-    for (auto & entry : lora) {
-        entry.scale = 0.0f;
-    }
+// format incomplete utf-8 multibyte character for output
+std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
+    std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);
 
-    // set value
-    for (const auto & entry : data) {
-        int id = json_value(entry, "id", -1);
-        float scale = json_value(entry, "scale", 0.0f);
-        if (0 <= id && id < max_idx) {
-            lora[id].scale = scale;
-        } else {
-            throw std::runtime_error("invalid adapter id");
-        }
+    // if the size is 1 and first bit is 1, meaning it's a partial character
+    // (size > 1 meaning it's already a known token)
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
+        std::stringstream ss;
+        ss << std::hex << (out[0] & 0xff);
+        std::string res(ss.str());
+        out = "byte: \\x" + res;
     }
 
-    return lora;
+    return out;
 }
 
-//
-// utils for interacting with libmtmd
-// (may need to refactor in near future)
-//
-
-/**
- * server_tokens is a helper to manage the input tokens and image for the server.
- * it is made this way to simplify the logic of KV cache management.
- */
-struct server_tokens {
-    bool has_mtmd = false;
-
-private: // disallow accessing these members directly, risking out-of-sync
-
-    // map a **start** index in tokens to the image chunk
-    // note: the order need to be in-sync with tokens
-    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
-
-    // list of tokens
-    // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
-    // otherwise, it is a normal text token
-    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
-    // note(2): for M-RoPE, an image can occupy different number of pos; do not assume 1-to-1 mapping tokens <-> pos
-    llama_tokens tokens;
-
-    // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
-    // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
-    // idx  0   1   2   3   4    5      6      7      8      9     10
-    // pos  0   1   2   3   4    5      5      5      7      7      7
-    // map_idx_to_media will contain: {5, img0}, {8, img1}
-
-public:
-    server_tokens() = default;
-    ~server_tokens() = default;
-
-    // Prevent copying
-    // TODO: server_tokens should be copyable - remove this:
-    server_tokens(const server_tokens&) = delete;
-    server_tokens& operator=(const server_tokens&) = delete;
-
-    // Allow moving (usually implicitly generated if members are movable)
-    server_tokens(server_tokens&&) = default;
-    server_tokens& operator=(server_tokens&&) = default;
-
-    // Allow accessing elements using [] operator
-    llama_token operator[](size_t index) { return tokens[index]; }
-    const llama_token& operator[](size_t index) const { return tokens[index]; }
+// format server-sent event (SSE), return the formatted string to send
+// note: if data is a json array, it will be sent as multiple events, one per item
+std::string format_oai_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & data) {
+        ss << "data: " <<
+            safe_json_to_str(data) <<
+            "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
+    };
-
-    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
-        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
-            push_back(mtmd_chunks[i]);
-        }
-    }
+
+    if (data.is_array()) {
+        for (const auto & item : data) {
+            send_single(item);
+        }
+    } else {
+        send_single(data);
+    }
-
-    server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
-    }
-
-    llama_pos pos_next() const {
-        if (!has_mtmd) {
-            return tokens.size();
-        }
+
+    return ss.str();
+}
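On the wire, each payload framed by format_oai_sse() is a single "data:" event terminated by a blank line; an array input yields one event per element. A minimal sketch of the framing (payload contents are hypothetical):

    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        json chunk = {{"object", "chat.completion.chunk"}, {"choices", json::array()}};
        std::cout << "data: " << chunk.dump() << "\n\n"; // one SSE event
        std::cout << "data: [DONE]\n\n";                 // conventional OAI end-of-stream marker
        return 0;
    }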
-        llama_pos res = tokens.size();
+
+std::string format_anthropic_sse(const json & data) {
+    std::ostringstream ss;
-
-        for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) {
-            const auto & chunk = it->second;
-            res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
+
+    auto send_event = [&ss](const json & event_obj) {
+        if (event_obj.contains("event") && event_obj.contains("data")) {
+            ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+            ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+        } else {
+            ss << "data: " << safe_json_to_str(event_obj) << "\n\n";
+        }
+    };
-
-        return res;
-    }
-
-    // for debugging
-    std::string str() const {
-        std::ostringstream oss;
-        oss << "tokens: ";
-        for (size_t idx = 0; idx < tokens.size(); ++idx) {
-            llama_token t = tokens[idx];
-            oss << "idx:" << idx << " ";
-            if (t == LLAMA_TOKEN_NULL) {
-                oss << "<embd> ";
-            } else {
-                oss << t << " ";
-            }
-        }
-        oss << "\n";
-        oss << "image idx: ";
-        for (const auto & it : map_idx_to_media) {
-            oss << it.first << ", ";
+
+    if (data.is_array()) {
+        for (const auto & event : data) {
+            send_event(event);
         }
-        return oss.str();
+    } else {
+        send_event(data);
     }
-
-    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const {
-        auto it = map_idx_to_media.find(idx);
-        if (it != map_idx_to_media.end()) {
-            return it->second;
        }
-        throw std::runtime_error("Chunk not found");
-    }
+
+    return ss.str();
+}
 
-    void push_back(llama_token tok) {
-        if (tok == LLAMA_TOKEN_NULL) {
-            throw std::runtime_error("Invalid token");
        }
-        tokens.emplace_back(tok);
-    }
+bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
-
-    // will create a copy of the chunk if it contains non-text data
-    void push_back(const mtmd_input_chunk * chunk) {
-        auto type = mtmd_input_chunk_get_type(chunk);
-        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
-            GGML_ASSERT(has_mtmd);
-            const size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
-            size_t start_idx = tokens.size();
-            for (size_t i = 0; i < n_tokens; ++i) {
-                tokens.emplace_back(LLAMA_TOKEN_NULL);
-            }
-            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
-            map_idx_to_media[start_idx] = std::move(new_chunk);
-        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            size_t n_tokens;
-            const auto * text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
-            for (size_t i = 0; i < n_tokens; ++i) {
-                push_back(text_tokens[i]);
-            }
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
-            GGML_ABORT("Invalid chunk type");
-        }
-    }
-
-    // appends server tokens, updates the media map. copies media chunks.
-    void push_back(server_tokens & tokens) {
-        size_t start_idx = size();
-        for (size_t i = 0; i < tokens.size(); i++) {
-            push_back(tokens[i]);
-        }
-        if (tokens.has_mtmd) {
-            // Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
-            // We could also just check, but this will prevent silently dropping MTMD data.
-            GGML_ASSERT(has_mtmd);
-            for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) {
-                auto * chunk = tokens.map_idx_to_media[it->first].get();
-                mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
-                map_idx_to_media[start_idx + it->first] = std::move(new_chunk);
-            }
-        }
-    }
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
 
-    // for compatibility with context shift and prompt truncation
-    void insert(const llama_tokens & inp_tokens) {
-        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
-        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
-    }
-
-    // for compatibility with speculative decoding, ctx shift, slot save/load
-    const llama_tokens & get_text_tokens() const {
-        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
-        return tokens;
-    }
-
-    // for compatibility with speculative decoding
-    void set_token(llama_pos pos, llama_token id) {
-        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
-        tokens[pos] = id;
-    }
+    return true;
+}
 
-    size_t size() const {
-        return tokens.size();
-    }
+llama_tokens format_prompt_infill(
+    const llama_vocab * vocab,
+    const json & input_prefix,
+    const json & input_suffix,
+    const json & input_extra,
+    const int n_batch,
+    const int n_predict,
+    const int n_ctx,
+    const bool spm_infill,
+    const llama_tokens & tokens_prompt
+    ) {
+    // TODO: optimize this block by reducing memory allocations and movement
-
-    bool empty() const {
-        return tokens.empty();
-    }
+
+    // use FIM repo-level pattern:
+    // ref: https://arxiv.org/pdf/2409.12186
+    //
+    // [FIM_REP]myproject
+    // [FIM_SEP]filename0
+    // extra chunk 0
+    // [FIM_SEP]filename1
+    // extra chunk 1
+    // ...
+    // [FIM_SEP]filename
+    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+    //
+    llama_tokens extra_tokens;
+    extra_tokens.reserve(n_ctx);
-
-    void clear() {
-        map_idx_to_media.clear();
-        tokens.clear();
-    }
+
+    auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
-
-    void keep_first(size_t n) {
-        GGML_ASSERT(n <= tokens.size());
-        if (has_mtmd) {
-            if (n == tokens.size()) {
-                return; // nothing to do
-            }
-            // we throw an error if we try to remove a token in the middle of an image
-            // for ex. with input of 5 text tokens and 2 images:
-            //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
-            // n   1   2   3   4   5   6      7      8      9     10
-            // allowed to resize         ^                        ^
-            // disallowed to resize          ^      ^             ^
-            if (n > 0) {
-                // make sure we never remove tokens in the middle of an image
-                // note that the case where we keep a full image at the end is allowed:
-                //   tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL
-                if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) {
-                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
-                }
-            }
-            // remove all image chunks that are not used anymore
-            for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ) {
-                size_t idx = it->first;
-                if (idx >= n) {
-                    it = map_idx_to_media.erase(it);
-                } else {
-                    ++it;
-                }
-            }
-        }
-        tokens.resize(n);
-    }
+
+    if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
+        // TODO: make project name an input
+        static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
-
-    std::string detokenize(const llama_context * ctx, bool special) const {
-        llama_tokens text_tokens;
-        text_tokens.reserve(tokens.size());
-        for (const auto & t : tokens) {
-            if (t != LLAMA_TOKEN_NULL) {
-                text_tokens.push_back(t);
-            }
-        }
-        return common_detokenize(ctx, text_tokens, special);
+
+        extra_tokens.push_back(llama_vocab_fim_rep(vocab));
+        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
     }
+
+    for (const auto & chunk : input_extra) {
+        // { "text": string, "filename": string }
+        const std::string text     = json_value(chunk, "text",     std::string());
+        const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+        if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
+
+            extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
+            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+        } else {
+            // chunk separator in binary form to avoid confusing the AI
+            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+            static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
+
+            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+        }
+
+        const auto chunk_tokens = common_tokenize(vocab, text, false, false);
+        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+    }
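At this point extra_tokens holds the full repo-level preamble. With two hypothetical extra chunks, the detokenized layout the loop above has built would be:

    [FIM_REP]myproject
    [FIM_SEP]utils.h
    int add(int a, int b);
    [FIM_SEP]main.cpp
    int main() { return add(1, 2); }

The [FIM_SEP]filename block for the file being completed is appended next, and the prefix/suffix/middle sandwich assembled below is placed after this preamble.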
-    size_t get_common_prefix(const server_tokens & b) const {
-        const size_t max_idx = std::min(tokens.size(), b.tokens.size());
+
+    if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
+        // TODO: current filename
+        static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
-
-        if (!has_mtmd) {
-            for (size_t i = 0; i < max_idx; ++i) {
-                if (tokens[i] == b.tokens[i]) {
-                    continue;
-                }
+
+        extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
+        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+    }
-
-                return i;
-            }
+
+    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+    const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4));
+    const int n_suffix_take = std::min(tokens_suffix.size(), std::max(0, (n_batch/4) - (2 + tokens_prompt.size())));
-
-            return max_idx;
-        }
+
+    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
-
-        for (size_t i = 0; i < max_idx; ++i) {
-            const llama_token ai = tokens[i];
-            const llama_token bi = b.tokens[i];
-
-            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
-                const auto & a_chunk = find_chunk(i);
-                const auto & b_chunk = b.find_chunk(i);
-
-                GGML_ASSERT(a_chunk && b_chunk);
-
-                const std::string id_ai = mtmd_input_chunk_get_id(a_chunk.get());
-                const std::string id_bi = mtmd_input_chunk_get_id(b_chunk.get());
-
-                const size_t n_tok_a = mtmd_input_chunk_get_n_tokens(a_chunk.get());
-                const size_t n_tok_b = mtmd_input_chunk_get_n_tokens(b_chunk.get());
-
-                if (id_ai == id_bi && n_tok_a == n_tok_b) {
-                    GGML_ASSERT(n_tok_a > 0 && "Invalid media chunk"); // should never happen
-                    i += n_tok_a - 1; // will be +1 by the for loop
-                    continue;
-                }
+
+    // fill the rest of the context with extra chunks
+    const int n_extra_take = std::min(std::max(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
-
-                return i;
-            }
-
-            if (ai == bi) {
-                continue;
-            }
-
-            return i;
-        }
+
+    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+    tokens_suffix.resize(n_suffix_take);
-
-        return max_idx; // all tokens are equal
-    }
+
+    tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
+    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
+    tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
-
-    // make sure all text tokens are within the vocab range
-    bool validate(const struct llama_context * ctx) const {
-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-
-        for (size_t i = 0; i < tokens.size(); ++i) {
-            const auto & t = tokens[i];
-            if (t == LLAMA_TOKEN_NULL) {
-                try {
-                    const auto & chunk = find_chunk(i);
-                    size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk.get());
-                    i += n_tokens - 1; // will be +1 by the for loop
-                } catch (const std::exception & e) {
-                    return false;
-                }
-            } else if (t < 0 || t >= n_vocab) {
-                return false;
-            }
-        }
-        return true;
-    }
+
+    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
-
-    // encode and decode the image chunk
-    int32_t process_chunk(
-        llama_context * ctx,
-        mtmd_context * mctx,
-        size_t idx,
-        llama_pos pos,
-        int32_t seq_id,
-        size_t & n_tokens_out) const {
-        const auto & chunk = find_chunk(idx);
-        const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
-            ? "image" : "audio";
-        SRV_INF("processing %s...\n", name);
-        int32_t n_batch = llama_n_batch(ctx);
-        int64_t t0 = ggml_time_ms();
-        llama_pos new_n_past; // unused for now
-        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
-            chunk.get(),
-            pos,
-            seq_id,
-            n_batch,
-            true, // logits last
-            &new_n_past);
-        SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
-        if (result != 0) {
-            LOG_ERR("mtmd_helper_eval failed with status %d", result);
-            n_tokens_out = 0;
-            return result;
-        }
-        n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
-        return 0;
-    }
-};
+
+    if (llama_vocab_get_add_bos(vocab)) {
+        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+    }
-
-// Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t * data, size_t len) {
-    const uint64_t fnv_prime = 0x100000001b3ULL;
-    uint64_t hash = 0xcbf29ce484222325ULL;
-
-    for (size_t i = 0; i < len; ++i) {
-        hash ^= data[i];
-        hash *= fnv_prime;
-    }
-    return std::to_string(hash);
-}
+
+    SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
-
-static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) {
-    mtmd::bitmaps bitmaps;
-    for (auto & file : files) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
-        if (!bmp.ptr) {
-            throw std::runtime_error("Failed to load image or audio file");
-        }
-        // calculate bitmap hash (for KV caching)
-        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-        bmp.set_id(hash.c_str());
-        bitmaps.entries.push_back(std::move(bmp));
-    }
-    // process prompt
-    std::vector inputs;
-    // multimodal
-    mtmd_input_text inp_txt = {
-        prompt.c_str(),
-        /* add_special */   true,
-        /* parse_special */ true,
-    };
-    mtmd::input_chunks chunks(mtmd_input_chunks_init());
-    auto bitmaps_c_ptr = bitmaps.c_ptr();
-    int32_t tokenized = mtmd_tokenize(mctx,
-                                      chunks.ptr.get(),
-                                      &inp_txt,
-                                      bitmaps_c_ptr.data(),
-                                      bitmaps_c_ptr.size());
-    if (tokenized != 0) {
-        throw std::runtime_error("Failed to tokenize prompt");
-    }
-    auto result = server_tokens(chunks, true);
-    return result;
-}
+
+    // put the extra context before the FIM prefix
+    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
-
-/**
- * break the input "prompt" object into multiple prompt if needed, then tokenize them
- * use tokenize_input_prompts() if the input could be an array.
- * this supports these cases:
- * - "prompt": "string"
- * - "prompt": [12, 34, 56]
- * - "prompt": [12, 34, "string", 56, 78]
- * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
- */
-static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
-    constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string";
-    constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data";
-    const bool has_mtmd = mctx != nullptr;
-    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
-        // string or mixed
-        llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
-        return server_tokens(tmp, false);
-    } else if (json_is_array_of_numbers(json_prompt)) {
-        // array of tokens
-        llama_tokens tmp = json_prompt.get();
-        return server_tokens(tmp, false);
-    } else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) {
-        // JSON object with prompt key.
-        if (json_prompt.contains(JSON_MTMD_DATA_KEY)) {
-            if (!has_mtmd)
-                throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests.");
-
-            // JSON object with prompt and multimodal key.
-            std::vector files;
-            for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
-                files.push_back(base64_decode(entry));
-            }
-            return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files);
-        } else {
-            // Not multimodal, but contains a subobject.
-            llama_tokens tmp = tokenize_mixed(vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), add_special, parse_special);
-            return server_tokens(tmp, false);
-        }
-    } else {
-        throw std::runtime_error("\"prompt\" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens.");
-    }
-}
+
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    embd_inp.push_back(llama_vocab_fim_mid(vocab));
-
-/**
- * break the input "prompt" object into multiple prompt if needed, then tokenize them
- * this supports these cases:
- * - "prompt": "string"
- * - "prompt": [12, 34, 56]
- * - "prompt": [12, 34, "string", 56, 78]
- * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
- * and multiple prompts (multi-tasks):
- * - "prompt": ["string1", "string2"]
- * - "prompt": ["string1", [12, 34, 56]]
- * - "prompt": [[12, 34, 56], [78, 90, 12]]
- * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
- */
-static std::vector tokenize_input_prompts(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
-    std::vector result;
-    if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) {
-        result.reserve(json_prompt.size());
-        for (const auto & p : json_prompt) {
-            result.push_back(tokenize_input_subprompt(vocab, mctx, p,add_special, parse_special));
-        }
-    } else {
-        result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special));
-    }
-    if (result.empty()) {
-        throw std::runtime_error("\"prompt\" must not be empty");
-    }
-    return result;
+    return embd_inp;
 }
 
-// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
-static server_tokens format_rerank(const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, const std::string & query, const std::string & doc) {
+server_tokens format_prompt_rerank(
+    const struct llama_model * model,
+    const struct llama_vocab * vocab,
+    mtmd_context * mctx,
+    const std::string & query,
+    const std::string & doc) {
     server_tokens result = {};
 
     const char * rerank_prompt = llama_model_chat_template(model, "rerank");
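Tracing the assembly above with hypothetical inputs: for prefix "int main() {", suffix "}" and an empty prompt, the final token stream is equivalent to

    // PSM (default, spm_infill == false):
    [extra chunks][BOS][FIM_PRE]int main() {[FIM_SUF]}[FIM_MID]
    // SPM (spm_infill == true):
    [extra chunks][BOS][FIM_SUF]}[FIM_PRE]int main() {[FIM_MID]

Note that the repo-level extra context ends up before the BOS token, because the extra-context insert at the front happens after the BOS insert.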
diff --git a/llamacpp/native/src/server/server-common.h b/llamacpp/native/src/server/server-common.h
new file mode 100644
index 000000000..bb04e82b4
--- /dev/null
+++ b/llamacpp/native/src/server/server-common.h
@@ -0,0 +1,359 @@
+#pragma once
+
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "chat.h"
+#include "mtmd.h"
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+#include
+#include
+#include
+
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
+using json = nlohmann::ordered_json;
+
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+
+#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+using raw_buffer = std::vector<uint8_t>;
+
+template <typename T>
+static T json_value(const json & body, const std::string & key, const T & default_value) {
+    // Fallback null to default value
+    if (body.contains(key) && !body.at(key).is_null()) {
+        try {
+            return body.at(key);
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
+            return default_value;
+        }
+    } else {
+        return default_value;
+    }
+}
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE,         // custom error
+    ERROR_TYPE_NOT_SUPPORTED,       // custom error
+    ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error
+};
+
+// thin wrapper around common_grammar_trigger with (de)serialization functions
+struct server_grammar_trigger {
+    common_grammar_trigger value;
+
+    server_grammar_trigger() = default;
+    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+    server_grammar_trigger(const json & in) {
+        value.type  = (common_grammar_trigger_type) in.at("type").get<int>();
+        value.value = in.at("value").get<std::string>();
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            value.token = (llama_token) in.at("token").get<int>();
+        }
+    }
+
+    json to_json() const {
+        json out {
+            {"type",  (int) value.type},
+            {"value", value.value},
+        };
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            out["token"] = (int) value.token;
+        }
+        return out;
+    }
+};
+
+json format_error_response(const std::string & message, const enum error_type type);
+
+//
+// random string / id
+//
+
+std::string random_string();
+std::string gen_chatcmplid();
+std::string gen_tool_call_id();
+
+//
+// lora utils
+//
+
+// check whether the given lora set has only aloras activated (empty => false)
+bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);
+
+// if the two sets of loras are different, they require a cache clear unless the
+// change is only from aloras to aloras.
+bool lora_should_clear_cache(
+    const std::vector<common_adapter_lora_info> & current,
+    const std::vector<common_adapter_lora_info> & next);
+
+std::vector<common_adapter_lora_info> parse_lora_request(
+    const std::vector<common_adapter_lora_info> & lora_base,
+    const json & data);
+
+bool are_lora_equal(
+    const std::vector<common_adapter_lora_info> & l1,
+    const std::vector<common_adapter_lora_info> & l2);
+
+// get the ids of all enabled loras
+std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
+
+//
+// server_tokens
+//
+
+/**
+ * server_tokens is a helper to manage the input tokens and images for the server.
+ * it is made this way to simplify the logic of KV cache management.
+ */
+struct server_tokens {
+    bool has_mtmd = false;
+
+private: // disallow accessing these members directly, risking out-of-sync
+
+    // map a **start** index in tokens to the image chunk
+    // note: the order needs to be in sync with tokens
+    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
+
+    // list of tokens
+    // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by a media chunk
+    // otherwise, it is a normal text token
+    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
+    // note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
+    llama_tokens tokens;
+
+    // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
+    //     [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
+    // idx  0   1   2   3   4    5      6      7      8      9     10
+    // pos  0   1   2   3   4    5      5      5      7      7      7
+    // map_idx_to_media will contain: {5, img0}, {8, img1}
+
+public:
+    server_tokens() = default;
+    ~server_tokens() = default;
+
+    // Prevent copying
+    // TODO: server_tokens should be copyable - remove this:
+    server_tokens(const server_tokens&) = delete;
+    server_tokens& operator=(const server_tokens&) = delete;
+
+    // Allow moving (usually implicitly generated if members are movable)
+    server_tokens(server_tokens&&) = default;
+    server_tokens& operator=(server_tokens&&) = default;
+
+    // Allow accessing elements using [] operator
+    llama_token operator[](size_t index) { return tokens[index]; }
+    const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
+    server_tokens(const llama_tokens & tokens, bool has_mtmd);
+
+    // for debugging
+    std::string str() const;
+
+    llama_pos pos_next() const;
+    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
+
+    void push_back(llama_token tok);
+
+    // will create a copy of the chunk if it contains non-text data
+    void push_back(const mtmd_input_chunk * chunk);
+
+    // appends server tokens, updates the media map. copies media chunks.
+    void push_back(server_tokens & tokens);
+
+    // for compatibility with context shift and prompt truncation
+    void insert(const llama_tokens & inp_tokens);
+
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const;
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id);
+
+    size_t size() const { return tokens.size(); }
+
+    bool empty() const { return tokens.empty(); }
+
+    void clear() {
+        map_idx_to_media.clear();
+        tokens.clear();
+    }
+
+    void keep_first(size_t n);
+
+    std::string detokenize(const llama_context * ctx, bool special) const;
+
+    size_t get_common_prefix(const server_tokens & b) const;
+
+    // make sure all text tokens are within the vocab range
+    bool validate(const struct llama_context * ctx) const;
+
+    // encode and decode the image chunk
+    int32_t process_chunk(
+        llama_context * ctx,
+        mtmd_context * mctx,
+        size_t idx,
+        llama_pos pos,
+        int32_t seq_id,
+        size_t & n_tokens_out) const;
+};
+
+
+//
+// tokenizer and input processing utils
+//
+
+bool json_is_array_of_numbers(const json & data);
+
+// does the array contain BOTH numbers & strings?
+bool json_is_array_of_mixed_numbers_strings(const json & data);
+
+// does the array contain any individual integers/tokens?
+bool json_is_array_and_contains_numbers(const json & data);
+
+// get value by path (key1 / key2)
+json json_get_nested_values(const std::vector<std::string> & paths, const json & js);
+
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);
+
+// return the last index of character that can form a valid string
+// if the last character is potentially cut in half, return the index before the cut
+// if validate_utf8(text) == text.size(), then the whole text is valid utf8
+size_t validate_utf8(const std::string& text);
+
+// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
+server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
+
+/**
+ * break the input "prompt" object into multiple prompts if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, 56], [78, 90, 12]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
+ */
+std::vector<server_tokens> tokenize_input_prompts(
+    const llama_vocab * vocab,
+    mtmd_context * mctx,
+    const json & json_prompt,
+    bool add_special,
+    bool parse_special);
+
+//
+// OAI utils
+//
+
+// used by /completions endpoint
+json oaicompat_completion_params_parse(const json & body);
+
+struct oaicompat_parser_options {
+    bool use_jinja;
+    bool prefill_assistant;
+    common_reasoning_format reasoning_format;
+    std::map<std::string, std::string> chat_template_kwargs;
+    common_chat_templates * tmpls;
+    bool allow_image;
+    bool allow_audio;
+    bool enable_thinking = true;
+    std::string media_path;
+};
+
+// used by /chat/completions endpoint
+json oaicompat_chat_params_parse(
+    json & body, /* openai api json semantics */
+    const oaicompat_parser_options & opt,
+    std::vector<raw_buffer> & out_files);
+
+// convert Anthropic Messages API format to OpenAI Chat Completions API format
+json convert_anthropic_to_oai(const json & body);
+
+// TODO: move it to server-task.cpp
+json format_embeddings_response_oaicompat(
+    const json & request,
+    const std::string & model_name,
+    const json & embeddings,
+    bool use_base64 = false);
+
+// TODO: move it to server-task.cpp
+json format_response_rerank(
+    const json & request,
+    const std::string & model_name,
+    const json & ranks,
+    bool is_tei_format,
+    std::vector<std::string> & texts,
+    int top_n);
+
+//
+// other utils
+//
+
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
+
+std::string safe_json_to_str(const json & data);
+
+std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
+
+// format incomplete utf-8 multibyte character for output
+std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);
+
+// format server-sent event (SSE), return the formatted string to send
+// note: if data is a json array, it will be sent as multiple events, one per item
+std::string format_oai_sse(const json & data);
+
+// format Anthropic-style SSE with event types
+std::string format_anthropic_sse(const json & data);
+
+bool is_valid_utf8(const std::string & str);
+
+//
+// formatting output responses
+// TODO: move these to server-task.cpp
+//
+
+llama_tokens format_prompt_infill(
+    const llama_vocab * vocab,
+    const json & input_prefix,
+    const json & input_suffix,
+    const json & input_extra,
+    const int n_batch,
+    const int n_predict,
+    const int n_ctx,
+    const bool spm_infill,
+    const llama_tokens & tokens_prompt);
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
+server_tokens format_prompt_rerank(
+    const struct llama_model * model,
+    const struct llama_vocab * vocab,
+    mtmd_context * mctx,
+    const std::string & query,
+    const std::string & doc);
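validate_utf8() exists for streaming: a partial generation chunk can end in the middle of a multibyte sequence, and the server must hold those bytes back rather than emit invalid UTF-8. A minimal usage sketch (take_sendable_prefix is a hypothetical helper, not part of this API):

    // returns the longest valid-UTF-8 prefix of `pending` and keeps the cut-off tail
    static std::string take_sendable_prefix(std::string & pending) {
        const size_t n = validate_utf8(pending); // index before any half-transmitted character
        std::string out = pending.substr(0, n);
        pending.erase(0, n);
        return out;
    }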
diff --git a/llamacpp/native/src/server/server-context.cpp b/llamacpp/native/src/server/server-context.cpp
new file mode 100644
index 000000000..c92457457
--- /dev/null
+++ b/llamacpp/native/src/server/server-context.cpp
@@ -0,0 +1,3637 @@
+#include "server-context.h"
+#include "server-common.h"
+#include "server-http.h"
+#include "server-task.h"
+#include "server-queue.h"
+
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+
+#include
+#include
+#include
+#include
+#include
+
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#    define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+using json = nlohmann::ordered_json;
+
+constexpr int HTTP_POLLING_SECONDS = 1;
+
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
+enum slot_state {
+    SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
+};
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,         // Server is ready and model is loaded
+};
+
+static bool server_task_type_need_embd(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_EMBEDDING:
+        case SERVER_TASK_TYPE_RERANK:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool server_task_type_need_logits(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_COMPLETION:
+        case SERVER_TASK_TYPE_INFILL:
+            return true;
+        default:
+            return false;
+    }
+}
+
+struct server_slot {
+    int id;
+
+    llama_batch batch_spec = {};
+
+    // TODO: change to unique_ptrs for consistency:
+    llama_context * ctx = nullptr;
+    llama_context * ctx_dft = nullptr;
+
+    // multimodal
+    mtmd_context * mctx = nullptr;
+
+    common_speculative * spec = nullptr;
+
+    std::unique_ptr<server_task> task;
+    std::unique_ptr<server_task> task_prev; // used for debugging
+
+    // used to determine the slot that has been used the longest
+    int64_t t_last_used = -1;
+
+    // generation props
+    int32_t n_ctx       = 0; // context size per slot
+    int32_t n_keep      = 0;
+    int32_t n_decoded   = 0;
+    int32_t n_remaining = -1;
+    int32_t i_batch     = -1;
+
+    int32_t n_prompt_tokens_cache     = 0;
+    int32_t n_prompt_tokens_processed = 0;
+
+    size_t last_nl_pos = 0;
+
+    std::string  generated_text;
+    llama_tokens generated_tokens;
+
+    common_chat_msg chat_msg;
+
+    std::vector<completion_token_output> generated_token_probs;
+
+    bool has_next_token = true;
+    bool has_new_line   = false;
+    bool truncated      = false;
+
+    stop_type stop;
+
+    std::string stopping_word;
+
+    // state
+    slot_state state = SLOT_STATE_IDLE;
+
+    server_prompt prompt;
+
+    void prompt_save(server_prompt_cache & prompt_cache) const {
+        GGML_ASSERT(prompt.data.size() == 0);
+
+        const size_t cur_size = llama_state_seq_get_size_ext(ctx, id, 0);
+
+        SRV_WRN(" - saving prompt with length %d, total state size = %.3f MiB\n",
+                (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0));
+
+        auto * cur = prompt_cache.alloc(prompt, cur_size);
+        if (cur == nullptr) {
+            return;
+        }
+
+        llama_state_seq_get_data_ext(ctx, cur->data.data(), cur_size, id, 0);
+    }
+
+    bool prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
+        bool res = prompt_cache.load(prompt, tokens, ctx, id);
+        if (!res) {
+            SLT_WRN(*this, "%s", "failed to load prompt from cache\n");
+        }
+
+        return res;
+    }
+
+    std::vector<common_adapter_lora_info> lora;
+    int32_t alora_invocation_start = -1;
+
+    // sampling
+    json json_schema;
+
+    struct common_sampler * smpl = nullptr;
+
+    llama_token sampled;
+
+    common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    std::vector<std::string> generated_tool_call_ids;
+
+    // stats
+    size_t n_sent_text = 0; // number of sent text characters
+
+    int64_t t_start_process_prompt;
+    int64_t t_start_generation;
+
+    double t_prompt_processing; // ms
+    double t_token_generation;  // ms
+
+    std::function<void(int)> callback_on_release;
+
+    // Speculative decoding stats
+    int32_t n_draft_total = 0;    // Total draft tokens generated
+    int32_t n_draft_accepted = 0; // Draft tokens actually accepted
+
+    void reset() {
+        SLT_DBG(*this, "%s", "\n");
+
+        n_prompt_tokens_cache = 0;
+
+        last_nl_pos    = 0;
+        generated_text = "";
+        has_new_line   = false;
+        truncated      = false;
+        stop           = STOP_TYPE_NONE;
+        stopping_word  = "";
+        n_sent_text    = 0;
+        chat_format    = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+
+        generated_tokens.clear();
+        generated_token_probs.clear();
+        chat_msg = {};
+        json_schema = json();
+        generated_tool_call_ids.clear();
+
+        // clear speculative decoding stats
+        n_draft_total = 0;
+        n_draft_accepted = 0;
+
+        task.reset();
+        task_prev.reset();
+
+        // clear alora start
+        alora_invocation_start = -1;
+    }
+
+    bool need_embd() const {
+        GGML_ASSERT(task);
+
+        return server_task_type_need_embd(task->type);
+    }
+
+    bool need_logits() const {
+        GGML_ASSERT(task);
+
+        return server_task_type_need_logits(task->type);
+    }
+
+    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+    // also we cannot split if the pooling would require any past tokens
+    bool can_split() const {
+        return
+            !need_embd() ||
+            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
+    }
+
+    bool can_batch_with(server_slot & other_slot) const {
+        GGML_ASSERT(task);
+
+        return task->type == other_slot.task->type && are_lora_equal(lora, other_slot.lora);
+    }
+
+    bool has_budget(const common_params & global_params) {
+        GGML_ASSERT(task);
+
+        if (task->params.n_predict == -1 && global_params.n_predict == -1) {
+            return true; // limitless
+        }
+
+        n_remaining = -1;
+
+        if (task->params.n_predict != -1) {
+            n_remaining = task->params.n_predict - n_decoded;
+        } else if (global_params.n_predict != -1) {
+            n_remaining = global_params.n_predict - n_decoded;
+        }
+
+        return n_remaining > 0; // false means no budget left
+    }
+
+    bool is_processing() const {
+        return state != SLOT_STATE_IDLE;
+    }
+
+    bool can_speculate() const {
+        return ctx_dft;
+    }
+
+    void add_token(const completion_token_output & token) {
+        if (!is_processing()) {
+            SLT_WRN(*this, "%s", "slot is not processing\n");
+            return;
+        }
+        generated_token_probs.push_back(token);
+    }
+
+    void release() {
+        if (is_processing()) {
+            GGML_ASSERT(task);
+
+            SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated);
+
+            t_last_used = ggml_time_us();
+            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
+            state = SLOT_STATE_IDLE;
+
+            task_prev = std::move(task);
+            task.reset();
+
+            callback_on_release(id);
+        }
+    }
+
+    result_timings get_timings() const {
+        result_timings timings;
+        timings.cache_n = n_prompt_tokens_cache;
+
+        timings.prompt_n            = n_prompt_tokens_processed;
+        timings.prompt_ms           = t_prompt_processing;
+        timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
+        timings.prompt_per_second   = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+        timings.predicted_n            = n_decoded;
+        timings.predicted_ms           = t_token_generation;
+        timings.predicted_per_token_ms = t_token_generation / n_decoded;
+        timings.predicted_per_second   = 1e3 / t_token_generation * n_decoded;
+
+        // Add speculative metrics
+        if (n_draft_total > 0) {
+            timings.draft_n          = n_draft_total;
+            timings.draft_n_accepted = n_draft_accepted;
+        }
+
+        return timings;
+    }
+
+    const common_chat_msg & update_chat_msg(std::vector<common_chat_msg_diff> & diffs) {
+        GGML_ASSERT(task);
+
+        auto previous_msg = chat_msg;
+        SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
+        auto new_msg = common_chat_parse(
+            generated_text,
+            /* is_partial= */ stop != STOP_TYPE_EOS,
+            task->params.oaicompat_chat_syntax);
+        if (!new_msg.empty()) {
+            new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
+            chat_msg = new_msg;
+            diffs = common_chat_msg_diff::compute_diffs(previous_msg, new_msg.empty() ? previous_msg : new_msg);
+        }
+        return chat_msg;
+    }
+
+    size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) {
+        GGML_ASSERT(task);
+
+        size_t stop_pos = std::string::npos;
+
+        for (const std::string & word : task->params.antiprompt) {
+            size_t pos;
+
+            if (is_full_stop) {
+                const size_t tmp      = word.size() + last_token_size;
+                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
+
+                pos = text.find(word, from_pos);
+            } else {
+                // otherwise, partial stop
+                pos = string_find_partial_stop(text, word);
+            }
+
+            if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
+                if (is_full_stop) {
+                    stop           = STOP_TYPE_WORD;
+                    stopping_word  = word;
+                    has_next_token = false;
+                }
+                stop_pos = pos;
+            }
+        }
+
+        return stop_pos;
+    }
+
+    void print_timings() const {
+        const double t_prompt        = t_prompt_processing / n_prompt_tokens_processed;
+        const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+        const double t_gen        = t_token_generation / n_decoded;
+        const double n_gen_second = 1e3 / t_token_generation * n_decoded;
+
+        SLT_INF(*this,
+                "\n"
+                "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "      total time = %10.2f ms / %5d tokens\n",
+                t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
+                t_token_generation, n_decoded, t_gen, n_gen_second,
+                t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
+
+        if (n_draft_total > 0) {
+            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            SLT_INF(*this,
+                    "\n"
+                    "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total
+            );
+        }
+    }
+
+    json to_json(bool only_metrics = false) const {
+        json res;
+
+        res = {
+            {"id",            id},
+            {"n_ctx",         n_ctx},
+            {"speculative",   can_speculate()},
+            {"is_processing", is_processing()},
+        };
+
+        const auto & ptask = task ? task : task_prev;
+
+        if (ptask) {
+            res["id_task"] = ptask->id;
+            res["params"]  = ptask->params.to_json(only_metrics);
+            res["next_token"] = {
+                {
+                    {"has_next_token", has_next_token},
+                    {"has_new_line",   has_new_line},
+                    {"n_remain",       n_remaining},
+                    {"n_decoded",      n_decoded},
+                }
+            };
+
+            if (!only_metrics) {
+                res["prompt"]    = ptask->tokens.detokenize(ctx, true);
+                res["generated"] = generated_text;
+            }
+        }
+
+        return res;
+    }
+};
+
+
+
+//
+// server_metrics
+//
+
+struct server_metrics {
+    int64_t t_start = 0;
+
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total       = 0;
+    uint64_t n_tokens_predicted_total        = 0;
+    uint64_t t_tokens_generation_total       = 0;
+
+    uint64_t n_tokens_max = 0;
+
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing       = 0;
+
+    uint64_t n_tokens_predicted  = 0;
+    uint64_t t_tokens_generation = 0;
+
+    uint64_t n_decode_total     = 0;
+    uint64_t n_busy_slots_total = 0;
+
+    void init() {
+        t_start = ggml_time_us();
+    }
+
+    void on_prompt_eval(const server_slot & slot) {
+        n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
+        n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
+        t_prompt_processing             += slot.t_prompt_processing;
+        t_prompt_processing_total       += slot.t_prompt_processing;
+
+        n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens());
+    }
+
+    void on_prediction(const server_slot & slot) {
+        n_tokens_predicted_total  += slot.n_decoded;
+        n_tokens_predicted        += slot.n_decoded;
+        t_tokens_generation       += slot.t_token_generation;
+        t_tokens_generation_total += slot.t_token_generation;
+    }
+
+    void on_decoded(const std::vector<server_slot> & slots) {
+        n_decode_total++;
+        for (const auto & slot : slots) {
+            if (slot.is_processing()) {
+                n_busy_slots_total++;
+            }
+            n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens());
+        }
+    }
+
+    void reset_bucket() {
n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; + + +// +// server_context_impl (private implementation) +// + +struct server_context_impl { + common_params params_base; + + // note: keep these alive - they determine the lifetime of the model, context, etc. + common_init_result llama_init; + common_init_result llama_init_dft; + + llama_model * model = nullptr; + llama_context * ctx = nullptr; + + // multimodal + mtmd_context * mctx = nullptr; + + const llama_vocab * vocab = nullptr; + bool vocab_dft_compatible = true; + + llama_model * model_dft = nullptr; + + llama_context_params cparams_dft; + + llama_batch batch {}; + + bool add_bos_token = true; + + int32_t n_ctx; // total context for all clients / slots + + // slots / clients + std::vector slots; + + int slots_debug = 0; + + server_queue queue_tasks; + server_response queue_results; + + std::unique_ptr prompt_cache; + + server_metrics metrics; + + // Necessary similarity of prompt for slot selection + float slot_prompt_similarity = 0.0f; + + std::string model_name; // name of the loaded model, to be used by API + + common_chat_templates_ptr chat_templates; + oaicompat_parser_options oai_parser_opt; + + ~server_context_impl() { + mtmd_free(mctx); + + // Clear any sampling context + for (server_slot & slot : slots) { + common_sampler_free(slot.smpl); + slot.smpl = nullptr; + + llama_free(slot.ctx_dft); + slot.ctx_dft = nullptr; + + common_speculative_free(slot.spec); + slot.spec = nullptr; + + llama_batch_free(slot.batch_spec); + } + + llama_batch_free(batch); + } + + // load the model and initialize llama_context + bool load_model(const common_params & params) { + SRV_INF("loading model '%s'\n", params.model.path.c_str()); + + params_base = params; + + llama_init = common_init_from_params(params_base); + + model = llama_init.model.get(); + ctx = llama_init.context.get(); + + if (model == nullptr) { + SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); + return false; + } + + vocab = llama_model_get_vocab(model); + + n_ctx = llama_n_ctx(ctx); + + add_bos_token = llama_vocab_get_add_bos(vocab); + + if (params_base.has_speculative()) { + SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str()); + + auto params_dft = params_base; + + params_dft.devices = params_base.speculative.devices; + params_dft.model = params_base.speculative.model; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx; + params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; + params_dft.n_parallel = 1; + params_dft.cache_type_k = params_base.speculative.cache_type_k; + params_dft.cache_type_v = params_base.speculative.cache_type_v; + + params_dft.cpuparams.n_threads = params_base.speculative.cpuparams.n_threads; + params_dft.cpuparams_batch.n_threads = params_base.speculative.cpuparams_batch.n_threads; + params_dft.tensor_buft_overrides = params_base.speculative.tensor_buft_overrides; + + llama_init_dft = common_init_from_params(params_dft); + + model_dft = llama_init_dft.model.get(); + + if (model_dft == nullptr) { + SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); + return false; + } + + vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); + if (!vocab_dft_compatible) { + SRV_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); + } + + const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); + + cparams_dft = common_context_params_to_llama(params_dft); + cparams_dft.n_batch = n_ctx_dft; + + // the context is not needed - we will create one for each slot + llama_init_dft.context.reset(); + } + + chat_templates = common_chat_templates_init(model, params_base.chat_template); + try { + common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs); + } catch (const std::exception & e) { + SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); + SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); + chat_templates = common_chat_templates_init(model, "chatml"); + } + + std::string & mmproj_path = params_base.mmproj.path; + if (!mmproj_path.empty()) { + mtmd_helper_log_set(common_log_default_callback, nullptr); + + mtmd_context_params mparams = mtmd_context_params_default(); + mparams.use_gpu = params_base.mmproj_use_gpu; + mparams.print_timings = false; + mparams.n_threads = params_base.cpuparams.n_threads; + mparams.flash_attn_type = params_base.flash_attn_type; + mparams.warmup = params_base.warmup; + mparams.image_min_tokens = params_base.image_min_tokens; + mparams.image_max_tokens = params_base.image_max_tokens; + mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); + if (mctx == nullptr) { + SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); + return false; + } + SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str()); + + if (params_base.ctx_shift) { + params_base.ctx_shift = false; + SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); + } + + if (params_base.n_cache_reuse) { + params_base.n_cache_reuse = 0; + SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); + } + + if (params_base.has_speculative()) { + SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal"); + return false; + } + } + + if (!llama_memory_can_shift(llama_get_memory(ctx))) { + if (params_base.ctx_shift) { + params_base.ctx_shift = false; + SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled"); + } + + if (params_base.n_cache_reuse) { + params_base.n_cache_reuse = 0; + SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled"); + } + } + + return true; + } + + // initialize slots and server-related data + void init() { + // wiring up server queues + queue_tasks.on_new_task([this](server_task && task) { + process_single_task(std::move(task)); + }); + queue_tasks.on_update_slots([this]() { + update_slots(); + }); + + // Necessary similarity of prompt for slot selection + slot_prompt_similarity = params_base.slot_prompt_similarity; + + // setup slots + SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); + + const int n_ctx_train = llama_model_n_ctx_train(model); + + int n_ctx_slot = llama_n_ctx_seq(ctx); + if (n_ctx_slot > n_ctx_train) { + SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); + n_ctx_slot = n_ctx_train; + } + + for (int i = 0; i < params_base.n_parallel; i++) { + server_slot slot; + + slot.id = i; + slot.ctx = ctx; + slot.n_ctx = n_ctx_slot; + slot.mctx = 
mctx; + slot.prompt.tokens.has_mtmd = mctx != nullptr; + + if (model_dft) { + slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); + + // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] + slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); + if (slot.ctx_dft == nullptr) { + SRV_ERR("%s", "failed to create draft context\n"); + return; + } + + slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft); + if (slot.spec == nullptr) { + SRV_ERR("%s", "failed to create speculator\n"); + return; + } + for (auto & pair : params_base.speculative.replacements) { + common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); + } + } + + SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); + + slot.callback_on_release = [this](int) { + queue_tasks.pop_deferred_task(); + }; + + slot.reset(); + + slots.push_back(std::move(slot)); + } + + { + const char * LLAMA_SERVER_SLOTS_DEBUG = getenv("LLAMA_SERVER_SLOTS_DEBUG"); + slots_debug = LLAMA_SERVER_SLOTS_DEBUG ? atoi(LLAMA_SERVER_SLOTS_DEBUG) : 0; + + if (slots_debug) { + SRV_WRN("slots debug = %d\n", slots_debug); + } + } + + // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens + // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) + { + const int32_t n_batch = llama_n_batch(ctx); + batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + } + + metrics.init(); + + if (params_base.cache_ram_mib != 0) { + if (params_base.cache_ram_mib < 0) { + SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit"); + } else { + SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib); + } + SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n"); + + prompt_cache = std::make_unique(params_base.cache_ram_mib, n_ctx); + } else { + SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n"); + } + SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); + + if (!params_base.model_alias.empty()) { + // user explicitly specified model name + model_name = params_base.model_alias; + } else if (!params_base.model.name.empty()) { + // use model name in registry format (for models in cache) + model_name = params_base.model.name; + } else { + // fallback: derive model name from file name + auto model_path = std::filesystem::path(params_base.model.path); + model_name = model_path.filename().string(); + } + + // thinking is enabled if: + // 1. It's not explicitly disabled (reasoning_budget == 0) + // 2. The chat template supports it + const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); + SRV_INF("thinking = %d\n", enable_thinking); + + oai_parser_opt = { + /* use_jinja */ params_base.use_jinja, + /* prefill_assistant */ params_base.prefill_assistant, + /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, + /* common_chat_templates */ chat_templates.get(), + /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, + /* allow_audio */ mctx ? 
mtmd_support_audio (mctx) : false, + /* enable_thinking */ enable_thinking, + /* media_path */ params_base.media_path, + }; + + // print sample chat example to make it clear which template is used + LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, + common_chat_templates_source(chat_templates.get()), + common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); + } + + server_slot * get_slot_by_id(int id) { + for (server_slot & slot : slots) { + if (slot.id == id) { + return &slot; + } + } + + return nullptr; + } + + server_slot * get_available_slot(const server_task & task) { + server_slot * ret = nullptr; + + bool update_cache = false; + + // find the slot that has at least n% prompt similarity + if (ret == nullptr && slot_prompt_similarity != 0.0f) { + float sim_best = 0; + + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (slot.is_processing()) { + continue; + } + + const auto & tokens = slot.prompt.tokens; + + // skip the slot if it does not contain cached tokens + if (tokens.empty()) { + continue; + } + + // fraction of the Longest Common Prefix length with respect to the input prompt length + const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size(); + + // select the current slot if the criteria match + if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) { + sim_best = sim_cur; + + ret = &slot; + } + } + + if (ret != nullptr) { + const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size(); + + SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n", + sim_best, slot_prompt_similarity, f_keep); + + // if we are about to lose a large portion of the existing context - save it in the prompt cache + if (f_keep < 0.5f) { + update_cache = true; + } + } + } + + // find the slot that has been least recently used + if (ret == nullptr) { + int64_t t_last = -1; + + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (slot.is_processing()) { + continue; + } + + // select the current slot if the criteria match + if (!ret || slot.t_last_used <= t_last) { + t_last = slot.t_last_used; + ret = &slot; + } + } + + if (ret != nullptr) { + SLT_INF(*ret, "selected slot by LRU, t_last = %" PRId64 "\n", t_last); + + update_cache = true; + } + } + + if (ret) { + const auto & tokens = ret->prompt.tokens; + + update_cache = update_cache && prompt_cache; + + // cache prompts only for completion tasks + update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION; + + // don't update the cache if the slot's context is empty + update_cache = update_cache && tokens.size() > 0; + + // TODO: mtmd does not support prompt cache + update_cache = update_cache && (ret->mctx == nullptr); + + if (update_cache) { + SRV_WRN("%s", "updating prompt cache\n"); + + const int64_t t_start = ggml_time_us(); + + ret->prompt_save(*prompt_cache); + + if (!ret->prompt_load(*prompt_cache, task.tokens)) { + clear_slot(*ret); + } + + prompt_cache->update(); + + SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0); + } + } + + return ret; + } + + void clear_slot(server_slot & slot) const { + GGML_ASSERT(!slot.is_processing()); + + SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size()); + + llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); + slot.prompt.tokens.clear(); + } + + // return true if at least one slot has 
been cleared + // TODO: improve logic + // - smarter decision which slot to clear (LRU or longest prompt?) + // - move slot to level 2 cache instead of removing? + // - instead of purging, try to store and resume later? + bool try_clear_idle_slots() { + bool res = false; + + if (!params_base.kv_unified) { + return res; + } + + for (auto & slot : slots) { + if (slot.is_processing()) { + continue; + } + + if (slot.prompt.n_tokens() > 0) { + SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size()); + + clear_slot(slot); + + res = true; + + // clear slots one by one + break; + } + } + + return res; + } + + bool launch_slot_with_task(server_slot & slot, server_task && task) { + slot.reset(); + + if (!are_lora_equal(task.params.lora, slot.lora)) { + // if lora has changed, check to see if the cache should be cleared + if (lora_should_clear_cache(slot.lora, task.params.lora)) { + SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size()); + slot.prompt.tokens.clear(); + } else { + SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task.params.lora.size()); + } + slot.lora = task.params.lora; + } + + // if using alora, make sure it's only a single one requested and active + size_t alora_invocation_start = task.tokens.size(); + if (lora_all_alora(slot.lora)) { + const auto & enabled_ids = lora_get_enabled_ids(slot.lora); + // TODO: This will error out if a user requests two aloras, but only + // provides the activation string for one. We could, instead search + // for all requested alora activation strings and then either keep + // only the last one, or reject if multiple are found. + if (enabled_ids.size() != 1) { + send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST); + return false; + } + const auto & lora = slot.lora[enabled_ids[0]].ptr; + + // get the pointer and count for the invocation tokens + const uint64_t n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora); + const llama_token * invocation_tokens = llama_adapter_get_alora_invocation_tokens (lora); + + // scan backwards through the prompt tokens to find the last + // occurrence of the invocation sequence + int match_idx = static_cast(n_invocation_tokens) - 1; + for (int i = task.tokens.size() - 1; i >= 0; --i) { + // the token in this position matches the next token to find in + // the invocation sequence + if (task.tokens[i] == invocation_tokens[match_idx]) { + // if it's a full match, we've found the start + if (match_idx == 0) { + alora_invocation_start = i; + break; + } + // otherwise, check the next token in the sequence + --match_idx; + } else { + // no match in this position, so start looking over again + match_idx = static_cast(n_invocation_tokens) - 1; + } + } + + // if the activation string is not found, disable the alora + if (alora_invocation_start == task.tokens.size()) { + SLT_DBG(slot, "alora %zu requested, but not found. 
deactivating\n", enabled_ids[0]); + slot.lora[enabled_ids[0]].scale = 0.0f; + } else { + SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start); + slot.alora_invocation_start = alora_invocation_start; + } + } + + if (!task.tokens.validate(ctx)) { + send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); + + // initialize samplers + { + if (slot.smpl != nullptr) { + common_sampler_free(slot.smpl); + } + + slot.smpl = common_sampler_init(model, task.params.sampling); + if (slot.smpl == nullptr) { + // for now, the only error that may happen here is invalid grammar + send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str()); + } + + // initialize draft batch + // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] + if (slot.ctx_dft) { + llama_batch_free(slot.batch_spec); + + slot.batch_spec = llama_batch_init(task.params.speculative.n_max + 1, 0, 1); + } + + slot.task = std::make_unique<server_task>(std::move(task)); + + slot.state = SLOT_STATE_STARTED; + + SLT_INF(slot, "%s", "processing task\n"); + + return true; + } + + bool process_token(completion_token_output & result, server_slot & slot) { + // remember which tokens were sampled - used for repetition penalties during sampling + const std::string token_str = result.text_to_send; + slot.sampled = result.tok; + + slot.generated_text += token_str; + if (slot.task->params.return_tokens) { + slot.generated_tokens.push_back(result.tok); + } + slot.has_next_token = true; + + // check if there is an incomplete UTF-8 character at the end + bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size(); + + // search stop word and delete it + if (!incomplete) { + size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); + + const std::string str_test = slot.generated_text.substr(pos); + bool send_text = true; + + size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); + if (stop_pos != std::string::npos) { + slot.generated_text.erase( + slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.n_sent_text, slot.generated_text.size()); + } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok)) { + stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); + send_text = stop_pos == std::string::npos; + } + + // check if there is any token to predict + if (send_text) { + // do not send the stop word in the response + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.n_sent_text += result.text_to_send.size(); + // add the token to slot queue and cache + } else { + result.text_to_send = ""; + } + + slot.add_token(result); + if (slot.task->params.stream) { + send_partial_response(slot, result, false); + } + } + + if (incomplete) { + slot.has_next_token = true; + } + + // if context shifting is disabled, make sure that we don't run out of context + if (!params_base.ctx_shift && slot.prompt.n_tokens() + 1 >= slot.n_ctx) { + slot.truncated = true; + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped due to running out of context capacity, prompt.n_tokens() = %d, task.n_tokens = %d, n_decoded = %d, n_ctx = %d\n", + slot.prompt.n_tokens(), slot.task->n_tokens(), slot.n_decoded, slot.n_ctx); + } + 
+ // check the limits + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.task->params.n_predict); + } + + if (slot.has_new_line) { + // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent + if (slot.task->params.n_indent > 0) { + // check the current indentation + // TODO: improve by not doing it more than once for each new line + if (slot.last_nl_pos > 0) { + size_t pos = slot.last_nl_pos; + + int n_indent = 0; + while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) { + n_indent++; + pos++; + } + + if (pos < slot.generated_text.size() && n_indent < slot.task->params.n_indent) { + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + // cut the last line + slot.generated_text.erase(pos, std::string::npos); + + SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent); + } + } + + // find the next new line + { + const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos); + + if (pos != std::string::npos) { + slot.last_nl_pos = pos + 1; + } + } + } + } + + // check if there is a new line in the generated text + if (result.text_to_send.find('\n') != std::string::npos) { + slot.has_new_line = true; + + // if we have seen a new line, we stop after a certain time limit, but only upon another new line + if (slot.task->params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.task->params.t_max_predict_ms)) { + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.task->params.t_max_predict_ms); + } + } + + if (llama_vocab_is_eog(vocab, result.tok)) { + slot.stop = STOP_TYPE_EOS; + slot.has_next_token = false; + + SLT_DBG(slot, "%s", "stopped by EOS\n"); + } + + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); + + return slot.has_next_token; // continue + } + + void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const { + size_t n_probs = slot.task->params.sampling.n_probs; + size_t n_vocab = llama_vocab_n_tokens(vocab); + + if (post_sampling) { + const auto * cur_p = common_sampler_get_candidates(slot.smpl, true); + const size_t max_probs = cur_p->size; + + // set probability for sampled token + for (size_t i = 0; i < max_probs; i++) { + if (cur_p->data[i].id == result.tok) { + result.prob = cur_p->data[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(max_probs); + for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { + result.probs.push_back({ + cur_p->data[i].id, + common_token_to_piece(ctx, cur_p->data[i].id, special), + cur_p->data[i].p + }); + } + } else { + // TODO: optimize this with min-p optimization + std::vector cur = get_token_probabilities(ctx, idx); + + // set probability for sampled token + for (size_t i = 0; i < n_vocab; i++) { + // set probability for sampled token + if (cur[i].id == result.tok) { + result.prob = cur[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(n_probs); + for (size_t i = 0; i < std::min(n_vocab, 
n_probs); i++) { + result.probs.push_back({ + cur[i].id, + common_token_to_piece(ctx, cur[i].id, special), + cur[i].p + }); + } + } + } + + void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + send_error(task.id, error, type); + } + + void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + send_error(slot.task->id, error, type, slot.task->n_tokens(), slot.n_ctx); + } + + void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER, const int32_t n_prompt_tokens = 0, const int32_t n_ctx = 0) { + SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); + + if (type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { + GGML_ASSERT(n_ctx > 0 && n_prompt_tokens > 0); + } + + auto res = std::make_unique(); + res->id = id_task; + res->err_type = type; + res->err_msg = error; + res->n_prompt_tokens = n_prompt_tokens; + res->n_ctx = n_ctx; + + queue_results.send(std::move(res)); + } + + // if multimodal is enabled, send an error and return false + bool check_no_mtmd(const int id_task) { + if (mctx) { + send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); + return false; + } + return true; + } + + void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) { + auto res = std::make_unique(); + + res->id = slot.task->id; + res->index = slot.task->index; + + if (is_progress) { + res->is_progress = true; + res->progress.total = slot.task->n_tokens(); + res->progress.cache = slot.n_prompt_tokens_cache; + res->progress.processed = slot.prompt.tokens.size(); + res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; + } else { + res->content = tkn.text_to_send; + res->tokens = { tkn.tok }; + + slot.update_chat_msg(res->oaicompat_msg_diffs); + } + + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.task->n_tokens(); + res->post_sampling_probs = slot.task->params.post_sampling_probs; + + res->verbose = slot.task->params.verbose; + res->res_type = slot.task->params.res_type; + res->oaicompat_model = slot.task->params.oaicompat_model; + res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; + + // populate res.probs_output + if (slot.task->params.sampling.n_probs > 0) { + res->prob_output = tkn; // copy the token probs + } + + // populate timings if this is final response or timings_per_token is enabled + if (slot.stop != STOP_TYPE_NONE || slot.task->params.timings_per_token) { + res->timings = slot.get_timings(); + } + + queue_results.send(std::move(res)); + } + + void send_final_response(server_slot & slot) { + auto res = std::make_unique(); + + res->id = slot.task->id; + res->id_slot = slot.id; + + res->index = slot.task->index; + res->content = slot.generated_text; + res->tokens = std::move(slot.generated_tokens); + res->timings = slot.get_timings(); + res->prompt = slot.task->tokens.detokenize(ctx, true); + res->response_fields = std::move(slot.task->params.response_fields); + + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.task->n_tokens(); + res->n_tokens_cached = slot.prompt.n_tokens(); + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + res->post_sampling_probs = slot.task->params.post_sampling_probs; + + res->verbose = slot.task->params.verbose; + res->stream = slot.task->params.stream; + 
res->include_usage = slot.task->params.include_usage; + res->res_type = slot.task->params.res_type; + res->oaicompat_model = slot.task->params.oaicompat_model; + res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; + res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs); + + // populate res.probs_output + if (slot.task->params.sampling.n_probs > 0) { + if (!slot.task->params.stream && slot.stop == STOP_TYPE_WORD) { + const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); + + size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); + res->probs_output = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end() - safe_offset); + } else { + res->probs_output = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.end()); + } + } + + res->generation_params = slot.task->params; // copy the parameters + + queue_results.send(std::move(res)); + } + + void send_embedding(const server_slot & slot, const llama_batch & batch) { + auto res = std::make_unique(); + res->id = slot.task->id; + res->index = slot.task->index; + res->n_tokens = slot.task->n_tokens(); + res->res_type = slot.task->params.res_type; + + const int n_embd = llama_model_n_embd(model); + + std::vector embd_res(n_embd, 0.0f); + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float * embd = nullptr; + if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) { + embd = llama_get_embeddings_ith(ctx, i); + } else { + embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + } + + if (embd == nullptr) { + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); + + res->embedding.push_back(std::vector(n_embd, 0.0f)); + continue; + } + + // normalize only when there is pooling + if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { + common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize); + res->embedding.push_back(embd_res); + break; + } + + res->embedding.emplace_back(embd, embd + n_embd); + } + + SLT_DBG(slot, "%s", "sending embeddings\n"); + + queue_results.send(std::move(res)); + } + + void send_rerank(const server_slot & slot, const llama_batch & batch) { + auto res = std::make_unique(); + res->id = slot.task->id; + res->index = slot.task->index; + res->n_tokens = slot.task->n_tokens(); + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } + + if (embd == NULL) { + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); + + res->score = -1e6; + continue; + } + + res->score = embd[0]; + } + + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); + + queue_results.send(std::move(res)); + } + + // + // Functions to process the task + // + + void process_single_task(server_task && task) { + switch (task.type) { + case SERVER_TASK_TYPE_COMPLETION: + case SERVER_TASK_TYPE_INFILL: + case SERVER_TASK_TYPE_EMBEDDING: + case SERVER_TASK_TYPE_RERANK: + { + const int id_slot = task.id_slot; + + server_slot * slot = id_slot != -1 ? 
get_slot_by_id(id_slot) : get_available_slot(task); + + if (slot == nullptr) { + // if no slot is available, we defer this task for processing later + SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + if (!launch_slot_with_task(*slot, std::move(task))) { + SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); + break; + } + } break; + case SERVER_TASK_TYPE_CANCEL: + { + // release slot linked with the task id + for (auto & slot : slots) { + if (slot.task && slot.task->id == task.id_target) { + slot.release(); + break; + } + } + } break; + case SERVER_TASK_TYPE_NEXT_RESPONSE: + { + // do nothing + } break; + case SERVER_TASK_TYPE_METRICS: + { + json slots_data = json::array(); + + int n_idle_slots = 0; + int n_processing_slots = 0; + + for (server_slot & slot : slots) { + json slot_data = slot.to_json(slots_debug == 0); + + if (slot.is_processing()) { + n_processing_slots++; + } else { + n_idle_slots++; + } + + slots_data.push_back(slot_data); + } + SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); + + auto res = std::make_unique(); + res->id = task.id; + res->slots_data = std::move(slots_data); + res->n_idle_slots = n_idle_slots; + res->n_processing_slots = n_processing_slots; + res->n_tasks_deferred = queue_tasks.queue_tasks_deferred_size(); + res->t_start = metrics.t_start; + + res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res->t_prompt_processing_total = metrics.t_prompt_processing_total; + res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res->t_tokens_generation_total = metrics.t_tokens_generation_total; + + res->n_tokens_max = metrics.n_tokens_max; + + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res->t_prompt_processing = metrics.t_prompt_processing; + res->n_tokens_predicted = metrics.n_tokens_predicted; + res->t_tokens_generation = metrics.t_tokens_generation; + + res->n_decode_total = metrics.n_decode_total; + res->n_busy_slots_total = metrics.n_busy_slots_total; + + if (task.metrics_reset_bucket) { + metrics.reset_bucket(); + } + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SLOT_SAVE: + { + if (!check_no_mtmd(task.id)) { + break; + } + + int id_slot = task.slot_action.slot_id; + server_slot * slot = get_slot_by_id(id_slot); + if (slot == nullptr) { + send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); + break; + } + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + const size_t token_count = slot->prompt.tokens.size(); + const int64_t t_start = ggml_time_us(); + + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; + + const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens(); + const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); + + const int64_t t_end = ggml_time_us(); + const double t_save_ms = (t_end - t_start) / 1000.0; + + auto res = std::make_unique(); + res->id = task.id; + 
res->id_slot = id_slot; + res->filename = filename; + res->is_save = true; + res->n_tokens = token_count; + res->n_bytes = nwrite; + res->t_ms = t_save_ms; + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SLOT_RESTORE: + { + if (!check_no_mtmd(task.id)) break; + int id_slot = task.slot_action.slot_id; + server_slot * slot = get_slot_by_id(id_slot); + if (slot == nullptr) { + send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); + break; + } + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + const int64_t t_start = ggml_time_us(); + + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; + + llama_tokens tokens; + tokens.resize(slot->n_ctx); + size_t token_count = 0; + size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count); + if (nread == 0) { + slot->prompt.tokens.clear(); // KV may have already been invalidated? + send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); + break; + } + tokens.resize(token_count); + slot->prompt.tokens.clear(); + slot->prompt.tokens.insert(tokens); + + const int64_t t_end = ggml_time_us(); + const double t_restore_ms = (t_end - t_start) / 1000.0; + + auto res = std::make_unique<server_task_result_slot_save_load>(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = false; + res->n_tokens = token_count; + res->n_bytes = nread; + res->t_ms = t_restore_ms; + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SLOT_ERASE: + { + if (!check_no_mtmd(task.id)) { + break; + } + int id_slot = task.slot_action.slot_id; + server_slot * slot = get_slot_by_id(id_slot); + if (slot == nullptr) { + send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); + break; + } + if (slot->is_processing()) { + // if requested slot is unavailable, we defer this task for processing later + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); + queue_tasks.defer(std::move(task)); + break; + } + + // Erase token cache + const size_t n_erased = slot->prompt.tokens.size(); + + clear_slot(*slot); + + auto res = std::make_unique<server_task_result_slot_erase>(); + res->id = task.id; + res->id_slot = id_slot; + res->n_erased = n_erased; + queue_results.send(std::move(res)); + } break; + case SERVER_TASK_TYPE_SET_LORA: + { + params_base.lora_adapters = std::move(task.set_lora); + auto res = std::make_unique<server_task_result_apply_lora>(); + res->id = task.id; + queue_results.send(std::move(res)); + } break; + + } + } + + void update_slots() { + // check if all slots are idle + { + bool all_idle = true; + + for (auto & slot : slots) { + if (slot.is_processing()) { + all_idle = false; + break; + } + } + + if (all_idle) { + SRV_INF("%s", "all slots are idle\n"); + + return; + } + } + + { + SRV_DBG("%s", "posting NEXT_RESPONSE\n"); + + server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); + task.id = queue_tasks.get_new_id(); + queue_tasks.post(std::move(task)); + } + + // apply context-shift if needed + // TODO: simplify and improve + for (server_slot & slot : slots) { + if (slot.state == SLOT_STATE_GENERATING && slot.prompt.n_tokens() + 1 >= slot.n_ctx) { + if (!params_base.ctx_shift) { + // this check is redundant (for good) + // we should never get here, because generation should have already stopped in 
process_token() + send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + if (mctx) { + // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded + // we don't support ctx_shift because an image chunk may contain multiple tokens + GGML_ABORT("not supported by multimodal"); + } + + // Shift context + int n_keep = slot.task->params.n_keep < 0 ? slot.task->n_tokens() : slot.task->params.n_keep; + + if (add_bos_token) { + n_keep += 1; + } + + n_keep = std::min(slot.n_ctx - 4, n_keep); + + const int n_left = slot.prompt.n_tokens() - n_keep; + const int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); + + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); + + llama_memory_seq_rm (llama_get_memory(ctx), slot.id, n_keep, n_keep + n_discard); + llama_memory_seq_add(llama_get_memory(ctx), slot.id, n_keep + n_discard, slot.prompt.n_tokens(), -n_discard); + + // add generated tokens to cache + // ref: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2473269481 + { + GGML_ASSERT(!slot.prompt.tokens.has_mtmd); + + llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy + for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) { + new_tokens[i - n_discard] = new_tokens[i]; + } + + new_tokens.resize(slot.prompt.tokens.size() - n_discard); + + slot.prompt.tokens.clear(); + slot.prompt.tokens.insert(new_tokens); + } + + slot.truncated = true; + } + } + + // start populating the batch for this iteration + common_batch_clear(batch); + + // track if given slot can be batched with slots already in the batch + server_slot * slot_batched = nullptr; + + auto accept_special_token = [&](server_slot & slot, llama_token token) { + return params_base.special || + slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end(); + }; + + // first, add sampled tokens from any ongoing sequences + for (auto & slot : slots) { + if (slot.state != SLOT_STATE_GENERATING) { + continue; + } + + // check if we can batch this slot with the previous one + if (!slot_batched) { + slot_batched = &slot; + } else if (!slot_batched->can_batch_with(slot)) { + continue; + } + + slot.i_batch = batch.n_tokens; + + common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); + + slot.prompt.tokens.push_back(slot.sampled); + + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.prompt.n_tokens(), slot.truncated); + } + + // process in chunks of params.n_batch + int32_t n_batch = llama_n_batch(ctx); + int32_t n_ubatch = llama_n_ubatch(ctx); + + float alora_scale = -1.0f; + size_t alora_disabled_id = 0; + + // next, batch any pending prompts without exceeding n_batch + if (params_base.cont_batching || batch.n_tokens == 0) { + for (auto & slot : slots) { + if (!slot.is_processing()) { + continue; + } + + // check if we can batch this slot with the previous one + if (slot_batched && !slot_batched->can_batch_with(slot)) { + continue; + } + + // this slot still has a prompt to be processed + if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { + const auto & input_tokens = slot.task->tokens; + + // TODO: maybe move branch to outside of this loop in the future + if (slot.state == SLOT_STATE_STARTED) { + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_generation = 
0; + + slot.state = SLOT_STATE_PROCESSING_PROMPT; + + SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n", + slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens()); + + // print prompt tokens (for debugging) + /*if (1) { + // first 16 tokens (avoid flooding logs) + for (int i = 0; i < std::min(16, input_tokens.size()); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str()); + } + } else { + // all + for (int i = 0; i < (int) input_tokens.size(); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str()); + } + }*/ + + // keep track of how many tokens we can reuse from the previous state + int n_past = 0; + + // empty prompt passed -> release the slot and send empty response + if (input_tokens.empty()) { + SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); + + slot.print_timings(); + send_final_response(slot); + slot.release(); + + continue; + } + + // TODO: support memory-less logits computation + if (slot.need_logits() && !llama_get_memory(ctx)) { + send_error(slot, "the current context does not support logits computation. skipping", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + if (!slot.can_split()) { + if (slot.task->n_tokens() > n_ubatch) { + send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + if (slot.task->n_tokens() > slot.n_ctx) { + send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_EXCEED_CONTEXT_SIZE); + slot.release(); + continue; + } + } else { + if (slot.task->n_tokens() >= slot.n_ctx) { + send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE); + slot.release(); + continue; + } + + if (slot.task->params.cache_prompt) { + // reuse any previously computed tokens that are common with the new prompt + n_past = slot.prompt.tokens.get_common_prefix(input_tokens); + + // if an alora is invoked, don't cache after the invocation start + if (slot.alora_invocation_start > 0) { + SLT_DBG(slot, "only caching to alora invocation start (n_past = %d, alora_invocation_start = %d)\n", n_past, slot.alora_invocation_start); + n_past = std::min(n_past, slot.alora_invocation_start - 1); + } + + // reuse chunks from the cached prompt by shifting their KV cache to the new position + if (params_base.n_cache_reuse > 0) { + GGML_ASSERT(!slot.prompt.tokens.has_mtmd); + + size_t head_c = n_past; // cache + size_t head_p = n_past; // current prompt + + if (mctx) { + // we should never reach this + GGML_ABORT("not supported by multimodal"); + } + + SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", params_base.n_cache_reuse, n_past); + + while (head_c < slot.prompt.tokens.size() && + head_p < input_tokens.size()) { + + size_t n_match = 0; + while (head_c + n_match < slot.prompt.tokens.size() && + head_p + n_match < input_tokens.size() && + slot.prompt.tokens[head_c + n_match] == input_tokens[head_p + n_match]) { + + n_match++; + } + + if (n_match >= (size_t) params_base.n_cache_reuse) { + SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + //for (size_t i = head_p; i < head_p + n_match; i++) { + // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + 
//} + + const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; + + llama_memory_seq_rm (llama_get_memory(ctx), slot.id, head_p, head_c); + llama_memory_seq_add(llama_get_memory(ctx), slot.id, head_c, head_c + n_match, kv_shift); + + for (size_t i = 0; i < n_match; i++) { + slot.prompt.tokens.set_token(head_p + i, slot.prompt.tokens[head_c + i]); + n_past++; + } + + head_c += n_match; + head_p += n_match; + } else { + head_c += 1; + } + } + + SLT_DBG(slot, "after context reuse, new n_past = %d\n", n_past); + } + } else { + // if we don't cache the prompt, we have to remove all previous tokens + n_past = 0; + } + + // note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1 + const auto n_swa = std::max(1, llama_model_n_swa(model)); + + // the largest pos_min required for a checkpoint to be useful + const auto pos_min_thold = std::max(0, n_past - n_swa); + + // note: disallow with mtmd contexts for now + // https://github.com/ggml-org/llama.cpp/issues/17043 + if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) { + const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); + if (pos_min == -1) { + SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); + GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237"); + } + + // when the prompt prefix does not match, print the tokens around the mismatch + // this is useful for debugging prompt caching + if (slots_debug) { + const int np0 = std::max(n_past - 4, 0); + const int np1 = std::min(n_past + 6, std::min(slot.prompt.tokens.size(), slot.task->tokens.size())); + + std::stringstream ss0; + std::stringstream ss1; + + std::stringstream st0; + std::stringstream st1; + + ss0 << "old: ... "; + ss1 << "new: ... "; + + for (int i = np0; i < np1; i++) { + if (i == n_past) { + ss0 << " | "; + ss1 << " | "; + } + + { + const auto token = slot.prompt.tokens[i]; + const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; + ss0 << piece; + st0 << std::setw(8) << token; + } + + { + const auto token = slot.task->tokens[i]; + const auto piece = token != LLAMA_TOKEN_NULL ? 
common_token_to_piece(ctx, token) : "[mtmd]"; + ss1 << piece; + st1 << std::setw(8) << token; + } + } + + SLT_WRN(slot, "%s\n", ss0.str().c_str()); + SLT_WRN(slot, "%s\n", ss1.str().c_str()); + + SLT_WRN(slot, "%s\n", st0.str().c_str()); + SLT_WRN(slot, "%s\n", st1.str().c_str()); + } + + if (pos_min > pos_min_thold) { + // TODO: support can be added in the future when corresponding vision models get released + GGML_ASSERT(!slot.prompt.tokens.has_mtmd); + + SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa); + + // search for a context checkpoint + const auto it = std::find_if( + slot.prompt.checkpoints.rbegin(), + slot.prompt.checkpoints.rend(), + [&](const auto & cur) { + // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS] + return cur.pos_min < pos_min_thold; + } + ); + + bool do_reset = it == slot.prompt.checkpoints.rend(); + + if (!do_reset) { + // restore the context checkpoint + const size_t checkpoint_size = it->data.size(); + const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + + if (n != checkpoint_size) { + SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); + do_reset = true; + //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint"); + } else { + n_past = std::min(n_past, std::max(it->pos_min + 1, it->pos_max)); + SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); + } + } + + if (do_reset) { + SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n", + "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); + n_past = 0; + } + } + } + + { + // erase any checkpoints with pos_min > pos_min_thold + for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) { + const auto & cur = *it; + if (cur.pos_min > pos_min_thold) { + SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024); + it = slot.prompt.checkpoints.erase(it); + } else { + ++it; + } + } + } + } + + // [TAG_PROMPT_LOGITS] + if (n_past == slot.task->n_tokens() && n_past > 0) { + SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, task.n_tokens() = %d)\n", n_past, slot.task->n_tokens()); + n_past--; + SLT_WRN(slot, "n_past was set to %d\n", n_past); + } + + slot.n_prompt_tokens_cache = n_past; + slot.n_prompt_tokens_processed = 0; + + slot.prompt.tokens.keep_first(n_past); + } + + if (!slot.can_split()) { + // cannot fit the prompt in the current batch - will try next iter + if (batch.n_tokens + slot.task->n_tokens() > n_batch) { + continue; + } + } + + // truncate any tokens that are beyond n_past for this slot + const llama_pos p0 = slot.prompt.tokens.pos_next(); + + SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); + + if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { + SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); + + clear_slot(slot); + + // there 
is no common part left + slot.n_prompt_tokens_cache = 0; + } + + // check if we should process the image + if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { + // process the image + size_t n_tokens_out = 0; + int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); + if (res != 0) { + SLT_ERR(slot, "failed to process image, res = %d\n", res); + send_error(slot, "failed to process image", ERROR_TYPE_SERVER); + slot.release(); + continue; + } + + slot.n_prompt_tokens_processed += n_tokens_out; + + // add the image chunk to cache + { + const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); + slot.prompt.tokens.push_back(chunk.get()); // copy + } + } + + // If using an alora, there may be uncached tokens that come + // before the invocation sequence. When this happens, the + // tokens before the invocation sequence need to be + // processed without the adapter in a separate batch, then + // the adapter needs to be enabled for the remaining tokens. + if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.prompt.n_tokens()) { + SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); + const auto & enabled_loras = lora_get_enabled_ids(slot.lora); + GGML_ASSERT(enabled_loras.size() == 1); + alora_scale = slot.lora[enabled_loras[0]].scale; + slot.lora[enabled_loras[0]].scale = 0.0f; + alora_disabled_id = enabled_loras[0]; + } + + bool do_checkpoint = params_base.n_ctx_checkpoints > 0; + + // make checkpoints only for completion tasks + do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION; + + // make a checkpoint of the parts of the memory that cannot be rolled back. + // checkpoints are created only if: + // - the model uses SWA and we are not using `swa_full` + // - the model architecture is marked as recurrent or hybrid + // + // TODO: try to make this conditional on the context or the memory module, instead of the model type + do_checkpoint = do_checkpoint && ( + llama_model_is_recurrent(model) || + llama_model_is_hybrid(model) || + (llama_model_n_swa(model) > 0 && !params_base.swa_full) + ); + + // add prompt tokens for processing in the current batch + while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) { + // get next token to process + llama_token cur_tok = input_tokens[slot.prompt.n_tokens()]; + if (cur_tok == LLAMA_TOKEN_NULL) { + break; // end of text chunk + } + + // if this is an alora request with pre-invocation + // tokens that are not cached, we need to stop filling + // this batch at those pre-invocation tokens. + if (alora_scale > 0 && slot.prompt.n_tokens() == slot.alora_invocation_start - 1) { + SLT_DBG(slot, "stop prompt batch filling at (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); + break; + } + + // embedding requires all tokens in the batch to be output + common_batch_add(batch, + cur_tok, + slot.prompt.tokens.pos_next(), + { slot.id }, + slot.need_embd()); + slot.prompt.tokens.push_back(cur_tok); + + slot.n_prompt_tokens_processed++; + + // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. 
+                    if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) { + break; + } + } + + // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.prompt.tokens.str().c_str()); + + SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); + + // entire prompt has been processed + if (slot.prompt.n_tokens() == slot.task->n_tokens()) { + slot.state = SLOT_STATE_DONE_PROMPT; + + GGML_ASSERT(batch.n_tokens > 0); + + common_sampler_reset(slot.smpl); + + // Process all prompt tokens through sampler system + for (int i = 0; i < slot.task->n_tokens(); ++i) { + llama_token id = input_tokens[i]; + if (id != LLAMA_TOKEN_NULL) { + common_sampler_accept(slot.smpl, id, false); + } + } + + // extract the logits only for the last token + batch.logits[batch.n_tokens - 1] = true; + + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + + SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens); + + const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); + const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id); + + // no need for empty or small checkpoints + do_checkpoint = do_checkpoint && (pos_min >= 0 && pos_max >= 64); + + // no need to create checkpoints that are too close together + do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || pos_max > slot.prompt.checkpoints.back().pos_max + 64); + + if (do_checkpoint) { + while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) { + // make room for the new checkpoint, if needed + const auto & cur = slot.prompt.checkpoints.front(); + + SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", + cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); + + slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin()); + } + + const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + + auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{ + /*.pos_min = */ pos_min, + /*.pos_max = */ pos_max, + /*.data = */ std::vector<uint8_t>(checkpoint_size), + }); + + llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + + SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", + (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); + } + } + } + + if (!slot_batched) { + slot_batched = &slot; + } + + if (batch.n_tokens >= n_batch) { + break; + } + } + } + + if (batch.n_tokens == 0) { + SRV_WRN("%s", "no tokens to decode\n"); + return; + } + + SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); + + if (slot_batched) { + // apply lora, only need to do it once per batch + common_set_adapter_lora(ctx, slot_batched->lora); + + // if the lora is temporarily disabled for an alora, re-enable it + // for next time + if (alora_scale > 0.0f) { + SRV_DBG("re-enabling alora with scale %f\n", alora_scale); + slot_batched->lora[alora_disabled_id].scale = alora_scale; + } + + llama_set_embeddings(ctx, slot_batched->need_embd()); + } + + int32_t i_next = 0; + + // process the created batch of tokens + for (int32_t i = 0; i < batch.n_tokens; i = i_next) { + const int32_t n_tokens = std::min(n_batch,
batch.n_tokens - i); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + + const int ret = llama_decode(ctx, batch_view); + + metrics.on_decoded(slots); + + if (ret != 0) { + { + std::string err; + + if (n_batch == 1 && ret == 1) { + // TODO: try to terminate only the largest active slot/sequence and continue with the rest + // need to remove the tokens from the current batch too + err = "Context size has been exceeded."; + } + + if (ret == -1) { + err = "Invalid input batch."; + } + + if (ret < -1) { + // TODO: update slot state based on llama_memory_seq_pos_min() and llama_memory_seq_pos_max() + err = "Compute error."; + } + + // TODO: handle ret == 2 (abort) when we start aborting + + if (!err.empty()) { + SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + + for (auto & slot : slots) { + if (slot.is_processing()) { + send_error(slot, err); + slot.release(); + + // note: it's complicated to keep track of how much of the current batch has been + // processed before the error occurred, so we simply clear the entire context + clear_slot(slot); + } + } + + break; + } + } + + // retry with half the batch size to try to find a free slot in the KV cache + if (!try_clear_idle_slots()) { + n_batch /= 2; + } + + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + + continue; // continue loop of n_batch + } + + // move the head of the batch forward with the number of tokens we just processed + i_next = i + n_tokens; + + // on successful decode, restore the original batch size + n_batch = llama_n_batch(ctx); + + for (auto & slot : slots) { + // optionally send prompt processing progress + if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { + if (slot.task->params.stream && slot.task->params.return_progress) { + send_partial_response(slot, {}, true); + } + } + + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { + continue; // continue loop of slots + } + + if (slot.state == SLOT_STATE_DONE_PROMPT) { + if (slot.task->type == SERVER_TASK_TYPE_EMBEDDING) { + // prompt evaluated for embedding + send_embedding(slot, batch_view); + slot.release(); + slot.i_batch = -1; + continue; // continue loop of slots + } + + if (slot.task->type == SERVER_TASK_TYPE_RERANK) { + send_rerank(slot, batch_view); + slot.release(); + slot.i_batch = -1; + continue; // continue loop of slots + } + + // prompt evaluated for next-token prediction + slot.state = SLOT_STATE_GENERATING; + } else if (slot.state != SLOT_STATE_GENERATING) { + continue; // continue loop of slots + } + + const int tok_idx = slot.i_batch - i; + + llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); + + slot.i_batch = -1; + + common_sampler_accept(slot.smpl, id, true); + + slot.n_decoded += 1; + + const int64_t t_current = ggml_time_us(); + + if (slot.n_decoded == 1) { + slot.t_start_generation = t_current; + slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } + + slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; + + completion_token_output result; + result.tok = id; + result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); + result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs + + if 
(slot.task->params.sampling.n_probs > 0) { + populate_token_probs(slot, result, slot.task->params.post_sampling_probs, params_base.special, tok_idx); + } + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + slot.release(); + + continue; + } + } + + // do speculative decoding + // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] + // perform the speculative drafting for all sequences at the same time in a single batch + for (auto & slot : slots) { + if (!slot.is_processing() || !slot.can_speculate()) { + continue; + } + + if (slot.state != SLOT_STATE_GENERATING) { + continue; + } + + if (mctx) { + // we should never reach this, as speculative is automatically disabled if mmproj is loaded + GGML_ABORT("not supported by multimodal"); + } + + // determine the max draft that fits the current slot state + int n_draft_max = slot.task->params.speculative.n_max; + + // note: slot.prompt is not yet expanded with the `id` token sampled above + // also, need to leave space for 1 extra token to allow context shifts + n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.prompt.n_tokens() - 2); + + if (slot.n_remaining > 0) { + n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); + } + + SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); + + if (n_draft_max < slot.task->params.speculative.n_min) { + SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.task->params.speculative.n_min); + + continue; + } + + llama_token id = slot.sampled; + + struct common_speculative_params params_spec; + params_spec.n_draft = n_draft_max; + params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max; + params_spec.p_min = slot.task->params.speculative.p_min; + + const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens(); + llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id); + + // ignore small drafts + if (slot.task->params.speculative.n_min > (int) draft.size()) { + SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min); + + continue; + } + + // keep track of total number of drafted tokens tested + slot.n_draft_total += draft.size(); + + // construct the speculation batch + common_batch_clear(slot.batch_spec); + common_batch_add (slot.batch_spec, id, slot.prompt.tokens.pos_next(), { slot.id }, true); + + for (size_t i = 0; i < draft.size(); ++i) { + common_batch_add(slot.batch_spec, draft[i], slot.prompt.tokens.pos_next() + 1 + i, { slot.id }, true); + } + + SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); + + llama_decode(ctx, slot.batch_spec); + + // the accepted tokens from the speculation + const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); + + slot.n_decoded += ids.size(); + + // update how many tokens out of those tested were accepted + slot.n_draft_accepted += ids.size() - 1; + + slot.prompt.tokens.push_back(id); + slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); + + llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1); + + for (size_t i = 0; i < ids.size(); ++i) { + completion_token_output result; + + result.tok = ids[i]; + result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); + result.prob = 
1.0f; // set later + + // TODO: set result.probs + + if (!process_token(result, slot)) { + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + slot.release(); + + break; + } + } + + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.prompt.n_tokens()); + } + } + + SRV_DBG("%s", "run slots completed\n"); + } + + json model_meta() const { + return json { + {"vocab_type", llama_vocab_type (vocab)}, + {"n_vocab", llama_vocab_n_tokens (vocab)}, + {"n_ctx_train", llama_model_n_ctx_train(model)}, + {"n_embd", llama_model_n_embd (model)}, + {"n_params", llama_model_n_params (model)}, + {"size", llama_model_size (model)}, + }; + } + + int get_slot_n_ctx() { + return slots.back().n_ctx; + } +}; + +// +// server_context (public API) +// + +server_context::server_context() : impl(new server_context_impl()) {} +server_context::~server_context() = default; + +void server_context::init() { + impl->init(); +} + +bool server_context::load_model(const common_params & params) { + return impl->load_model(params); +} + +void server_context::start_loop() { + impl->queue_tasks.start_loop(); +} + +void server_context::terminate() { + impl->queue_tasks.terminate(); +} + +llama_context * server_context::get_llama_context() const { + return impl->ctx; +} + +std::pair server_context::get_queues() { + return { impl->queue_tasks, impl->queue_results }; +} + + + +// generator-like API for HTTP response generation +struct server_res_generator : server_http_res { + server_response_reader rd; + server_res_generator(server_context_impl & ctx_server) + : rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS) {} + void ok(const json & response_data) { + status = 200; + data = safe_json_to_str(response_data); + } + void error(const json & error_data) { + status = json_value(error_data, "code", 500); + data = safe_json_to_str({{ "error", error_data }}); + } +}; + + + +// +// server_routes +// + +static std::unique_ptr handle_completions_impl( + server_context_impl & ctx_server, + server_task_type type, + const json & data, + const std::vector & files, + const std::function & should_stop, + task_response_type res_type) { + GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); + + auto res = std::make_unique(ctx_server); + auto completion_id = gen_chatcmplid(); + auto & rd = res->rd; + + try { + std::vector tasks; + + const auto & prompt = data.at("prompt"); + // TODO: this log can become very long, put it behind a flag or think about a more compact format + //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); + + // process prompt + std::vector inputs; + + if (res_type != TASK_RESPONSE_TYPE_NONE && ctx_server.mctx != nullptr) { + // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below. + inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); + } else { + // Everything else, including multimodal completions. 
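+            // (sketch of the accepted "prompt" shapes, assuming the usual llama.cpp server
+            // JSON conventions; the values are illustrative:
+            //   "prompt": "Hello"            -> one input, one task
+            //   "prompt": [12, 34, 56]       -> one pre-tokenized input
+            //   "prompt": ["Hello", "World"] -> two inputs, one task per element)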
+ inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + } + tasks.reserve(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + server_task task = server_task(type); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + + task.tokens = std::move(inputs[i]); + task.params = server_task::params_from_json_cmpl( + ctx_server.ctx, + ctx_server.params_base, + data); + task.id_slot = json_value(data, "id_slot", -1); + + // OAI-compat + task.params.res_type = res_type; + task.params.oaicompat_cmpl_id = completion_id; + task.params.oaicompat_model = ctx_server.model_name; + + tasks.push_back(std::move(task)); + } + + rd.post_tasks(std::move(tasks)); + } catch (const std::exception & e) { + res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + bool stream = json_value(data, "stream", false); + + if (!stream) { + // non-stream, wait for the results + auto all_results = rd.wait_for_all(should_stop); + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + json arr = json::array(); + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + arr.push_back(res->to_json()); + } + // if single request, return single object instead of array + res->ok(arr.size() == 1 ? arr[0] : arr); + } + + } else { + // in streaming mode, the first error must be treated as non-stream response + // this is to match the OAI API behavior + // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309 + server_task_result_ptr first_result = rd.next(should_stop); + if (first_result == nullptr) { + return res; // connection is closed + } else if (first_result->is_error()) { + res->error(first_result->to_json()); + return res; + } else { + GGML_ASSERT( + dynamic_cast(first_result.get()) != nullptr + || dynamic_cast(first_result.get()) != nullptr + ); + } + + // next responses are streamed + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + res->data = format_anthropic_sse(first_result->to_json()); + } else { + res->data = format_oai_sse(first_result->to_json()); // to be sent immediately + } + res->status = 200; + res->content_type = "text/event-stream"; + res->next = [res_this = res.get(), res_type, &should_stop](std::string & output) -> bool { + if (should_stop()) { + SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); + return false; // should_stop condition met + } + + if (!res_this->data.empty()) { + // flush the first chunk + output = std::move(res_this->data); + res_this->data.clear(); + return true; + } + + server_response_reader & rd = res_this->rd; + + // check if there is more data + if (!rd.has_next()) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + // Anthropic doesn't send [DONE], message_stop was already sent + output = ""; + } else if (res_type != TASK_RESPONSE_TYPE_NONE) { + output = "data: [DONE]\n\n"; + } else { + output = ""; + } + SRV_DBG("%s", "all results received, terminating stream\n"); + return false; // no more data, terminate + } + + // receive subsequent results + auto result = rd.next(should_stop); + if (result == nullptr) { + SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); + return false; // should_stop condition met + } + + // send the results + json res_json = result->to_json(); + if (result->is_error()) { + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + output = 
format_anthropic_sse({ + {"event", "error"}, + {"data", res_json}, + }); + } else { + output = format_oai_sse(json {{ "error", res_json }}); + } + SRV_DBG("%s", "error received during streaming, terminating stream\n"); + return false; // terminate on error + } else { + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); + if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + output = format_anthropic_sse(res_json); + } else { + output = format_oai_sse(res_json); + } + } + + // has next data, continue + return true; + }; + } + + return res; +} + +void server_routes::init_routes() { + this->get_health = [this](const server_http_req &) { + // error and loading states are handled by middleware + auto res = std::make_unique(ctx_server); + res->ok({{"status", "ok"}}); + return res; + }; + + this->get_metrics = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_metrics) { + res->error(format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + // TODO: use server_response_reader + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} + }, { + {"name", "prompt_seconds_total"}, + {"help", "Prompt process time"}, + {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", (uint64_t) res_task->n_tokens_predicted_total} + }, { + {"name", "tokens_predicted_seconds_total"}, + {"help", "Predict process time"}, + {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} + }, { + {"name", "n_decode_total"}, + {"help", "Total number of llama_decode() calls"}, + {"value", res_task->n_decode_total} + }, { + {"name", "n_tokens_max"}, + {"help", "Largest observed n_tokens."}, + {"value", res_task->n_tokens_max} + }, { + {"name", "n_busy_slots_per_decode"}, + {"help", "Average number of busy slots per llama_decode() call"}, + {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} + },{ + {"name", "requests_processing"}, + {"help", "Number of requests processing."}, + {"value", (uint64_t) res_task->n_processing_slots} + },{ + {"name", "requests_deferred"}, + {"help", "Number of requests deferred."}, + {"value", (uint64_t) res_task->n_tasks_deferred} + }}} + }; + + std::stringstream prometheus; + + for (const auto & el : all_metrics_def.items()) { + const auto & type = el.key(); + const auto & metrics_def = el.value(); + + for (const auto & metric_def : metrics_def) { + const std::string name = metric_def.at("name"); + const std::string help = metric_def.at("help"); + + auto value = json_value(metric_def, "value", 0.); + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << value << "\n"; + } + } + + res->headers["Process-Start-Time-Unix"] = std::to_string(res_task->t_start); + res->content_type = "text/plain; version=0.0.4"; + res->status = 200; + res->data = prometheus.str(); + return res; + }; + + this->get_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_slots) { + res->error(format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // optionally return "fail_on_no_slot" error + if (!req.get_param("fail_on_no_slot").empty()) { + if (res_task->n_idle_slots == 0) { + res->error(format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); + return res; + } + } + + res->ok(res_task->slots_data); + return res; + }; + + this->post_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + if (params.slot_save_path.empty()) { + res->error(format_error_response("This server does not support slots action. 
Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + std::string id_slot_str = req.get_param("id_slot"); + int id_slot; + + try { + id_slot = std::stoi(id_slot_str); + } catch (const std::exception &) { + res->error(format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::string action = req.get_param("action"); + + if (action == "save") { + return handle_slots_save(req, id_slot); + } else if (action == "restore") { + return handle_slots_restore(req, id_slot); + } else if (action == "erase") { + return handle_slots_erase(req, id_slot); + } else { + res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + }; + + this->get_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + json default_generation_settings_for_props; + + { + task_params params; + + params.sampling = ctx_server.params_base.sampling; + + default_generation_settings_for_props = json { + {"params", params.to_json(true)}, + {"n_ctx", ctx_server.get_slot_n_ctx()}, + }; + } + + // this endpoint is publicly available, please only return what is safe to be exposed + json data = { + { "default_generation_settings", default_generation_settings_for_props }, + { "total_slots", ctx_server.params_base.n_parallel }, + { "model_alias", ctx_server.model_name }, + { "model_path", ctx_server.params_base.model.path }, + { "modalities", json { + {"vision", ctx_server.oai_parser_opt.allow_image}, + {"audio", ctx_server.oai_parser_opt.allow_audio}, + } }, + { "endpoint_slots", params.endpoint_slots }, + { "endpoint_props", params.endpoint_props }, + { "endpoint_metrics", params.endpoint_metrics }, + { "webui", params.webui }, + { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, + { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, + { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, + { "build_info", build_info }, + }; + if (ctx_server.params_base.use_jinja) { + if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { + data["chat_template_tool_use"] = tool_use_src; + } + } + + res->ok(data); + return res; + }; + + this->post_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_props) { + res->error(format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + // update any props here + + res->ok({{ "success", true }}); + return res; + }; + + this->get_api_show = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + bool has_mtmd = ctx_server.mctx != nullptr; + json data = { + { + "template", common_chat_templates_source(ctx_server.chat_templates.get()), + }, + { + "model_info", { + { "llama.context_length", ctx_server.get_slot_n_ctx() }, + } + }, + {"modelfile", ""}, + {"parameters", ""}, + {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }}, + {"model_info", ""}, + {"capabilities", has_mtmd ? 
json({"completion","multimodal"}) : json({"completion"})} + }; + + res->ok(data); + return res; + }; + + this->post_infill = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + // check model compatibility + std::string err; + if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + if (!err.empty()) { + res->error(format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // validate input + json data = json::parse(req.body); + if (data.contains("prompt") && !data.at("prompt").is_string()) { + // prompt is optional + res->error(format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_prefix")) { + res->error(format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_suffix")) { + res->error(format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + // input_extra is optional + res->error(format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + json input_extra = json_value(data, "input_extra", json::array()); + for (const auto & chunk : input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk.at("text").is_string()) { + res->error(format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + // filename is optional + if (chunk.contains("filename") && !chunk.at("filename").is_string()) { + res->error(format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + data["input_extra"] = input_extra; // default to empty array if it's not exist + + std::string prompt = json_value(data, "prompt", std::string()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + data["prompt"] = format_prompt_infill( + ctx_server.vocab, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + ctx_server.params_base.n_batch, + ctx_server.params_base.n_predict, + ctx_server.get_slot_n_ctx(), + ctx_server.params_base.spm_infill, + tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. 
+ ); + + std::vector files; // dummy + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_INFILL, + data, + files, + req.should_stop, + TASK_RESPONSE_TYPE_NONE); // infill is not OAI compatible + }; + + this->post_completions = [this](const server_http_req & req) { + std::vector files; // dummy + const json body = json::parse(req.body); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body, + files, + req.should_stop, + TASK_RESPONSE_TYPE_NONE); + }; + + this->post_completions_oai = [this](const server_http_req & req) { + std::vector files; // dummy + const json body = json::parse(req.body); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body, + files, + req.should_stop, + TASK_RESPONSE_TYPE_OAI_CMPL); + }; + + this->post_chat_completions = [this](const server_http_req & req) { + std::vector files; + json body = json::parse(req.body); + json body_parsed = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + req.should_stop, + TASK_RESPONSE_TYPE_OAI_CHAT); + }; + + this->post_anthropic_messages = [this](const server_http_req & req) { + std::vector files; + json body = convert_anthropic_to_oai(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + ctx_server, + SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + req.should_stop, + TASK_RESPONSE_TYPE_ANTHROPIC); + }; + + this->post_anthropic_count_tokens = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + std::vector files; + json body = convert_anthropic_to_oai(json::parse(req.body)); + json body_parsed = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + + json prompt = body_parsed.at("prompt"); + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true); + + res->ok({{"input_tokens", static_cast(tokens.size())}}); + return res; + }; + + // same with handle_chat_completions, but without inference part + this->post_apply_template = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + std::vector files; // dummy, unused + json body = json::parse(req.body); + json data = oaicompat_chat_params_parse( + body, + ctx_server.oai_parser_opt, + files); + res->ok({{ "prompt", std::move(data.at("prompt")) }}); + return res; + }; + + this->get_models = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + json model_meta = nullptr; + if (is_ready()) { + model_meta = ctx_server.model_meta(); + } + bool has_mtmd = ctx_server.mctx != nullptr; + json models = { + {"models", { + { + {"name", ctx_server.model_name}, + {"model", ctx_server.model_name}, + {"modified_at", ""}, + {"size", ""}, + {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash + {"type", "model"}, + {"description", ""}, + {"tags", {""}}, + {"capabilities", has_mtmd ? 
json({"completion","multimodal"}) : json({"completion"})}, + {"parameters", ""}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }} + } + }}, + {"object", "list"}, + {"data", { + { + {"id", ctx_server.model_name}, + {"object", "model"}, + {"created", std::time(0)}, + {"owned_by", "llamacpp"}, + {"meta", model_meta}, + }, + }} + }; + + res->ok(models); + return res; + }; + + this->post_tokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + const json body = json::parse(req.body); + json tokens_response = json::array(); + if (body.count("content") != 0) { + const bool add_special = json_value(body, "add_special", false); + const bool parse_special = json_value(body, "parse_special", true); + const bool with_pieces = json_value(body, "with_pieces", false); + + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); + + if (with_pieces) { + for (const auto& token : tokens) { + std::string piece = common_token_to_piece(ctx_server.ctx, token); + json piece_json; + + // Check if the piece is valid UTF-8 + if (is_valid_utf8(piece)) { + piece_json = piece; + } else { + // If not valid UTF-8, store as array of byte values + piece_json = json::array(); + for (unsigned char c : piece) { + piece_json.push_back(static_cast(c)); + } + } + + tokens_response.push_back({ + {"id", token}, + {"piece", piece_json} + }); + } + } else { + tokens_response = tokens; + } + } + + res->ok(json{{"tokens", std::move(tokens_response)}}); + return res; + }; + + this->post_detokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + const json body = json::parse(req.body); + + std::string content; + if (body.count("tokens") != 0) { + const llama_tokens tokens = body.at("tokens"); + content = tokens_to_str(ctx_server.ctx, tokens); + } + + res->ok(json{{"content", std::move(content)}}); + return res; + }; + + this->post_embeddings = [this](const server_http_req & req) { + return handle_embeddings_impl(req, TASK_RESPONSE_TYPE_NONE); + }; + + this->post_embeddings_oai = [this](const server_http_req & req) { + return handle_embeddings_impl(req, TASK_RESPONSE_TYPE_OAI_EMBD); + }; + + this->post_rerank = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { + res->error(format_error_response("This server does not support reranking. 
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + const json body = json::parse(req.body); + + // if true, use TEI API format, otherwise use Jina API format + // Jina: https://jina.ai/reranker/ + // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank + bool is_tei_format = body.contains("texts"); + + json query; + if (body.count("query") == 1) { + query = body.at("query"); + if (!query.is_string()) { + res->error(format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } else { + res->error(format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::vector documents = json_value(body, "documents", + json_value(body, "texts", std::vector())); + if (documents.empty()) { + res->error(format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int top_n = json_value(body, "top_n", (int)documents.size()); + + // create and queue the task + json responses = json::array(); + server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS); + { + std::vector tasks; + tasks.reserve(documents.size()); + for (size_t i = 0; i < documents.size(); i++) { + auto tmp = format_prompt_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); + server_task task = server_task(SERVER_TASK_TYPE_RERANK); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tmp); + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = format_response_rerank( + body, + ctx_server.model_name, + responses, + is_tei_format, + documents, + top_n); + + res->ok(root); + return res; + }; + + this->get_lora_adapters = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); + json result = json::array(); + const auto & loras = ctx_server.params_base.lora_adapters; + for (size_t i = 0; i < loras.size(); ++i) { + auto & lora = loras[i]; + json entry = { + {"id", i}, + {"path", lora.path}, + {"scale", lora.scale}, + {"task_name", lora.task_name}, + {"prompt_prefix", lora.prompt_prefix}, + }; + std::string alora_invocation_string = ""; + const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); + std::vector alora_invocation_tokens; + if (n_alora_tokens) { + const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); + for (uint64_t i = 0; i < n_alora_tokens; ++i) { + alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); + alora_invocation_tokens.push_back(alora_tokens[i]); + } + entry["alora_invocation_string"] = alora_invocation_string; + entry["alora_invocation_tokens"] = alora_invocation_tokens; + } + result.push_back(std::move(entry)); + } + res->ok(result); + return res; + }; + + this->post_lora_adapters = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); + const json body = 
json::parse(req.body); + if (!body.is_array()) { + res->error(format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SET_LORA); + task.id = task_id; + task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + }; +} + +std::unique_ptr server_routes::handle_slots_save(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json request_data = json::parse(req.body); + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_SAVE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + res->ok(result->to_json()); + return res; +} + +std::unique_ptr server_routes::handle_slots_restore(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json request_data = json::parse(req.body); + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; +} + +std::unique_ptr server_routes::handle_slots_erase(const server_http_req &, int id_slot) { + auto res = std::make_unique(ctx_server); + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_ERASE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } 
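+    // (usage sketch; the port and slot id are illustrative, assuming the default
+    // route wiring: curl -X POST 'http://localhost:8080/slots/0?action=erase'
+    // recv() below then blocks until the main loop has processed the erase task)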
+ + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; +} + +std::unique_ptr server_routes::handle_embeddings_impl(const server_http_req & req, task_response_type res_type) { + auto res = std::make_unique(ctx_server); + if (!ctx_server.params_base.embedding) { + res->error(format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + if (res_type != TASK_RESPONSE_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + res->error(format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + const json body = json::parse(req.body); + + // for the shape of input/content, see tokenize_input_prompts() + json prompt; + if (body.count("input") != 0) { + prompt = body.at("input"); + } else if (body.contains("content")) { + res_type = TASK_RESPONSE_TYPE_NONE; // "content" field is not OAI compatible + prompt = body.at("content"); + } else { + res->error(format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + bool use_base64 = false; + if (body.count("encoding_format") != 0) { + const std::string& format = body.at("encoding_format"); + if (format == "base64") { + use_base64 = true; + } else if (format != "float") { + res->error(format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + for (const auto & tokens : tokenized_prompts) { + // this check is necessary for models that do not add BOS token to the input + if (tokens.empty()) { + res->error(format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + int embd_normalize = 2; // default to Euclidean/L2 norm + if (body.count("embd_normalize") != 0) { + embd_normalize = body.at("embd_normalize"); + if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); + } + } + + // create and queue the task + json responses = json::array(); + server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS); + { + std::vector tasks; + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tokenized_prompts[i]); + + // OAI-compat + task.params.res_type = res_type; + task.params.embd_normalize = embd_normalize; + + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + 
responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD + ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64) + : json(responses); + res->ok(root); + return res; +} diff --git a/llamacpp/native/src/server/server-context.h b/llamacpp/native/src/server/server-context.h new file mode 100644 index 000000000..05b4afaee --- /dev/null +++ b/llamacpp/native/src/server/server-context.h @@ -0,0 +1,83 @@ +#include "server-http.h" +#include "server-task.h" +#include "server-queue.h" + +#include + +#include +#include + +struct server_context_impl; // private implementation + +struct server_context { + std::unique_ptr<server_context_impl> impl; + + server_context(); + ~server_context(); + + // initialize slots and server-related data + void init(); + + // load the model and initialize llama_context + // returns true on success + bool load_model(const common_params & params); + + // this function will block the main thread until termination + void start_loop(); + + // terminate main loop (will unblock start_loop) + void terminate(); + + // get the underlying llama_context + llama_context * get_llama_context() const; + + // get the underlying queue_tasks and queue_results + // used by the CLI application + std::pair get_queues(); +}; + + +// forward declarations +struct server_res_generator; + +struct server_routes { + server_routes(const common_params & params, server_context & ctx_server, std::function<bool()> is_ready = []() { return true; }) + : params(params), ctx_server(*ctx_server.impl), is_ready(is_ready) { + init_routes(); + } + + void init_routes(); + // handlers use lambda functions so that they can capture `this` without `std::bind` + server_http_context::handler_t get_health; + server_http_context::handler_t get_metrics; + server_http_context::handler_t get_slots; + server_http_context::handler_t post_slots; + server_http_context::handler_t get_props; + server_http_context::handler_t post_props; + server_http_context::handler_t get_api_show; + server_http_context::handler_t post_infill; + server_http_context::handler_t post_completions; + server_http_context::handler_t post_completions_oai; + server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_anthropic_messages; + server_http_context::handler_t post_anthropic_count_tokens; + server_http_context::handler_t post_apply_template; + server_http_context::handler_t get_models; + server_http_context::handler_t post_tokenize; + server_http_context::handler_t post_detokenize; + server_http_context::handler_t post_embeddings; + server_http_context::handler_t post_embeddings_oai; + server_http_context::handler_t post_rerank; + server_http_context::handler_t get_lora_adapters; + server_http_context::handler_t post_lora_adapters; +private: + // TODO: move these outside of server_routes?
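+    // (wiring sketch for the handler_t members above, assuming the application
+    // entry point connects them roughly like this; the names are illustrative:
+    //   server_http_context http;
+    //   server_routes routes(params, ctx_server, [&] { return http.is_ready.load(); });
+    //   http.get ("/health", routes.get_health);
+    //   http.post("/completions", routes.post_completions);)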
+    std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot); + std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot); + std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot); + std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type); + + const common_params & params; + server_context_impl & ctx_server; + std::function<bool()> is_ready; +}; diff --git a/llamacpp/native/src/server/server-http.cpp b/llamacpp/native/src/server/server-http.cpp new file mode 100644 index 000000000..77e54d192 --- /dev/null +++ b/llamacpp/native/src/server/server-http.cpp @@ -0,0 +1,380 @@ +#include "common.h" +#include "server-http.h" +#include "server-common.h" + +#include + +#include +#include +#include + +// +// HTTP implementation using cpp-httplib +// + +class server_http_context::Impl { +public: + std::unique_ptr<httplib::Server> srv; +}; + +server_http_context::server_http_context() + : pimpl(std::make_unique<Impl>()) +{} + +server_http_context::~server_http_context() = default; + +static void log_server_request(const httplib::Request & req, const httplib::Response & res) { + // skip GH copilot requests when using default port + if (req.path == "/v1/health") { + return; + } + + // reminder: this function is not covered by httplib's exception handler; if it ever does anything more complicated, consider wrapping it in a try-catch + + SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); + + SRV_DBG("request: %s\n", req.body.c_str()); + SRV_DBG("response: %s\n", res.body.c_str()); +} + +bool server_http_context::init(const common_params & params) { + path_prefix = params.api_prefix; + port = params.port; + hostname = params.hostname; + + auto & srv = pimpl->srv; + +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + if (params.ssl_file_key != "" && params.ssl_file_cert != "") { + LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); + srv.reset( + new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) + ); + } else { + LOG_INF("Running without SSL\n"); + srv.reset(new httplib::Server()); + } +#else + if (params.ssl_file_key != "" && params.ssl_file_cert != "") { + LOG_ERR("Server is built without SSL support\n"); + return false; + } + srv.reset(new httplib::Server()); +#endif + + srv->set_default_headers({{"Server", "llama.cpp"}}); + srv->set_logger(log_server_request); + srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { + // this is fail-safe; exceptions should already be handled by `ex_wrapper` + + std::string message; + try { + std::rethrow_exception(ep); + } catch (const std::exception & e) { + message = e.what(); + } catch (...)
{ + message = "Unknown Exception"; + } + + res.status = 500; + res.set_content(message, "text/plain"); + LOG_ERR("got exception: %s\n", message.c_str()); + }); + + srv->set_error_handler([](const httplib::Request &, httplib::Response & res) { + if (res.status == 404) { + res.set_content( + safe_json_to_str(json { + {"error", { + {"message", "File Not Found"}, + {"type", "not_found_error"}, + {"code", 404} + }} + }), + "application/json; charset=utf-8" + ); + } + // for other error codes, we skip processing here because it's already done by res->error() + }); + + // set timeouts and change hostname and port + srv->set_read_timeout (params.timeout_read); + srv->set_write_timeout(params.timeout_write); + + if (params.api_keys.size() == 1) { + auto key = params.api_keys[0]; + std::string substr = key.substr(std::max((int)(key.length() - 4), 0)); + LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str()); + } else if (params.api_keys.size() > 1) { + LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size()); + } + + // + // Middlewares + // + + auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) { + static const std::unordered_set public_endpoints = { + "/health", + "/v1/health", + "/models", + "/v1/models", + "/api/tags" + }; + + // If API key is not set, skip validation + if (api_keys.empty()) { + return true; + } + + // If path is public or is static file, skip validation + if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { + return true; + } + + // Check for API key in the Authorization header + std::string req_api_key = req.get_header_value("Authorization"); + if (req_api_key.empty()) { + // retry with anthropic header + req_api_key = req.get_header_value("X-Api-Key"); + } + + // remove the "Bearer " prefix if needed + std::string prefix = "Bearer "; + if (req_api_key.substr(0, prefix.size()) == prefix) { + req_api_key = req_api_key.substr(prefix.size()); + } + + // validate the API key + if (std::find(api_keys.begin(), api_keys.end(), req_api_key) != api_keys.end()) { + return true; // API key is valid + } + + // API key is invalid or not provided + res.status = 401; + res.set_content( + safe_json_to_str(json { + {"error", { + {"message", "Invalid API Key"}, + {"type", "authentication_error"}, + {"code", 401} + }} + }), + "application/json; charset=utf-8" + ); + + LOG_WRN("Unauthorized: Invalid API Key\n"); + + return false; + }; + + auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { + bool ready = is_ready.load(); + if (!ready) { + res.status = 503; + res.set_content( + safe_json_to_str(json { + {"error", { + {"message", "Loading model"}, + {"type", "unavailable_error"}, + {"code", 503} + }} + }), + "application/json; charset=utf-8" + ); + return false; + } + return true; + }; + + // register server middlewares + srv->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + // If this is OPTIONS request, skip validation because browsers don't include Authorization header + if (req.method == "OPTIONS") { + res.set_header("Access-Control-Allow-Credentials", "true"); + res.set_header("Access-Control-Allow-Methods", "GET, POST"); + res.set_header("Access-Control-Allow-Headers", "*"); + res.set_content("", "text/html"); // blank response, no data + return 
httplib::Server::HandlerResponse::Handled; // skip further processing + } + if (!middleware_server_state(req, res)) { + return httplib::Server::HandlerResponse::Handled; + } + if (!middleware_validate_api_key(req, res)) { + return httplib::Server::HandlerResponse::Handled; + } + return httplib::Server::HandlerResponse::Unhandled; + }); + + int n_threads_http = params.n_threads_http; + if (n_threads_http < 1) { + // +2 threads for monitoring endpoints + n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); + } + LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http); + srv->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); }; + + // + // Web UI setup + // + + if (!params.webui) { + LOG_INF("Web UI is disabled\n"); + } else { + // register static assets routes + if (!params.public_path.empty()) { + // Set the base directory for serving static files + bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path); + if (!is_found) { + LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); + return 1; + } + } else { + // using embedded static index.html + srv->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { + if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { + res.set_content("Error: gzip is not supported by this browser", "text/plain"); + } else { + res.set_header("Content-Encoding", "gzip"); + // COEP and COOP headers, required by pyodide (python interpreter) + res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); + } + return false; + }); + } + } + return true; +} + +bool server_http_context::start() { + // Bind and listen + + auto & srv = pimpl->srv; + bool was_bound = false; + bool is_sock = false; + if (string_ends_with(std::string(hostname), ".sock")) { + is_sock = true; + LOG_INF("%s: setting address family to AF_UNIX\n", __func__); + srv->set_address_family(AF_UNIX); + // bind_to_port requires a second arg, any value other than 0 should + // simply get ignored + was_bound = srv->bind_to_port(hostname, 8080); + } else { + LOG_INF("%s: binding port with default address family\n", __func__); + // bind HTTP listen port + if (port == 0) { + int bound_port = srv->bind_to_any_port(hostname); + was_bound = (bound_port >= 0); + if (was_bound) { + port = bound_port; + } + } else { + was_bound = srv->bind_to_port(hostname, port); + } + } + + if (!was_bound) { + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port); + return false; + } + + // run the HTTP server in a thread + thread = std::thread([this]() { pimpl->srv->listen_after_bind(); }); + srv->wait_until_ready(); + + listening_address = is_sock ? 
string_format("unix://%s", hostname.c_str()) + : string_format("http://%s:%d", hostname.c_str(), port); + return true; +} + +void server_http_context::stop() const { + if (pimpl->srv) { + pimpl->srv->stop(); + } +} + +static void set_headers(httplib::Response & res, const std::map & headers) { + for (const auto & [key, value] : headers) { + res.set_header(key, value); + } +} + +static std::map get_params(const httplib::Request & req) { + std::map params; + for (const auto & [key, value] : req.params) { + params[key] = value; + } + for (const auto & [key, value] : req.path_params) { + params[key] = value; + } + return params; +} + +static std::map get_headers(const httplib::Request & req) { + std::map headers; + for (const auto & [key, value] : req.headers) { + headers[key] = value; + } + return headers; +} + +static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) { + if (response->is_stream()) { + res.status = response->status; + set_headers(res, response->headers); + std::string content_type = response->content_type; + // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it + std::shared_ptr r_ptr = std::move(response); + const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { + std::string chunk; + bool has_next = response->next(chunk); + if (!chunk.empty()) { + // TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() + sink.write(chunk.data(), chunk.size()); + SRV_DBG("http: streamed chunk: %s\n", chunk.c_str()); + } + if (!has_next) { + sink.done(); + SRV_DBG("%s", "http: stream ended\n"); + } + return has_next; + }; + const auto on_complete = [response = r_ptr](bool) mutable { + response.reset(); // trigger the destruction of the response object + }; + res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); + } else { + res.status = response->status; + set_headers(res, response->headers); + res.set_content(response->data, response->content_type); + } +} + +void server_http_context::get(const std::string & path, const server_http_context::handler_t & handler) const { + pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_res_ptr response = handler(server_http_req{ + get_params(req), + get_headers(req), + req.path, + req.body, + req.is_connection_closed + }); + process_handler_response(response, res); + }); +} + +void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const { + pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_res_ptr response = handler(server_http_req{ + get_params(req), + get_headers(req), + req.path, + req.body, + req.is_connection_closed + }); + process_handler_response(response, res); + }); +} + diff --git a/llamacpp/native/src/server/server-http.h b/llamacpp/native/src/server/server-http.h new file mode 100644 index 000000000..24c0b4011 --- /dev/null +++ b/llamacpp/native/src/server/server-http.h @@ -0,0 +1,78 @@ +#pragma once + +#include +#include +#include +#include +#include + +struct common_params; + +// generator-like API for HTTP response generation +// this object response with one of the 2 modes: +// 1) normal response: `data` contains the full response body +// 2) streaming response: each call to next(output) generates the next chunk +// when next(output) returns false, no more data after the 
current chunk +// note: some chunks can be empty, in which case no data is sent for that chunk +struct server_http_res { + std::string content_type = "application/json; charset=utf-8"; + int status = 200; + std::string data; + std::map headers; + + // TODO: move this to a virtual function once we have proper polymorphism support + std::function next = nullptr; + bool is_stream() const { + return next != nullptr; + } + + virtual ~server_http_res() = default; +}; + +// unique pointer, used by set_chunked_content_provider +// httplib requires the stream provider to be stored in heap +using server_http_res_ptr = std::unique_ptr; + +struct server_http_req { + std::map params; // path_params + query_params + std::map headers; // reserved for future use + std::string path; // reserved for future use + std::string body; + const std::function & should_stop; + + std::string get_param(const std::string & key, const std::string & def = "") const { + auto it = params.find(key); + if (it != params.end()) { + return it->second; + } + return def; + } +}; + +struct server_http_context { + class Impl; + std::unique_ptr pimpl; + + std::thread thread; // server thread + std::atomic is_ready = false; + + std::string path_prefix; + std::string hostname; + int port; + + server_http_context(); + ~server_http_context(); + + bool init(const common_params & params); + bool start(); + void stop() const; + + // note: the handler should never throw exceptions + using handler_t = std::function; + + void get(const std::string & path, const handler_t & handler) const; + void post(const std::string & path, const handler_t & handler) const; + + // for debugging + std::string listening_address; +}; diff --git a/llamacpp/native/src/server/server-http.patch b/llamacpp/native/src/server/server-http.patch new file mode 100644 index 000000000..900dae89b --- /dev/null +++ b/llamacpp/native/src/server/server-http.patch @@ -0,0 +1,61 @@ +diff --git a/llamacpp/native/src/server/server-http.cpp b/llamacpp/native/src/server/server-http.cpp +index 62250571..77e54d19 100644 +--- a/llamacpp/native/src/server/server-http.cpp ++++ b/llamacpp/native/src/server/server-http.cpp +@@ -8,10 +8,6 @@ + #include + #include + +-// auto generated files (see README.md for details) +-#include "index.html.gz.hpp" +-#include "loading.html.hpp" +- + // + // HTTP implementation using cpp-httplib + // +@@ -175,26 +171,17 @@ bool server_http_context::init(const common_params & params) { + auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { + bool ready = is_ready.load(); + if (!ready) { +- auto tmp = string_split(req.path, '.'); +- if (req.path == "/" || tmp.back() == "html") { +- res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); +- res.status = 503; +- } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { +- // allow the models endpoint to be accessed during loading +- return true; +- } else { +- res.status = 503; +- res.set_content( +- safe_json_to_str(json { +- {"error", { +- {"message", "Loading model"}, +- {"type", "unavailable_error"}, +- {"code", 503} +- }} +- }), +- "application/json; charset=utf-8" +- ); +- } ++ res.status = 503; ++ res.set_content( ++ safe_json_to_str(json { ++ {"error", { ++ {"message", "Loading model"}, ++ {"type", "unavailable_error"}, ++ {"code", 503} ++ }} ++ }), ++ "application/json; charset=utf-8" ++ ); + return false; + } + return true; +@@ -253,7 +240,6 @@ bool server_http_context::init(const 
common_params & params) { + // COEP and COOP headers, required by pyodide (python interpreter) + res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); +- res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); + } + return false; + }); diff --git a/llamacpp/native/src/server/server-models.cpp b/llamacpp/native/src/server/server-models.cpp new file mode 100644 index 000000000..ac7f6b86b --- /dev/null +++ b/llamacpp/native/src/server/server-models.cpp @@ -0,0 +1,975 @@ +#include "server-common.h" +#include "server-models.h" + +#include "download.h" + +#include // TODO: remove this once we use HTTP client from download.h +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#include +#endif + +#if defined(__APPLE__) && defined(__MACH__) +// macOS: use _NSGetExecutablePath to get the executable path +#include +#include +#endif + +#define CMD_EXIT "exit" + +static std::filesystem::path get_server_exec_path() { +#if defined(_WIN32) + wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths + DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf)); + if (len == 0 || len >= _countof(buf)) { + throw std::runtime_error("GetModuleFileNameW failed or path too long"); + } + return std::filesystem::path(buf); +#elif defined(__APPLE__) && defined(__MACH__) + char small_path[PATH_MAX]; + uint32_t size = sizeof(small_path); + + if (_NSGetExecutablePath(small_path, &size) == 0) { + // resolve any symlinks to get absolute path + try { + return std::filesystem::canonical(std::filesystem::path(small_path)); + } catch (...) { + return std::filesystem::path(small_path); + } + } else { + // buffer was too small, allocate required size and call again + std::vector buf(size); + if (_NSGetExecutablePath(buf.data(), &size) == 0) { + try { + return std::filesystem::canonical(std::filesystem::path(buf.data())); + } catch (...) { + return std::filesystem::path(buf.data()); + } + } + throw std::runtime_error("_NSGetExecutablePath failed after buffer resize"); + } +#else + char path[FILENAME_MAX]; + ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX); + if (count <= 0) { + throw std::runtime_error("failed to resolve /proc/self/exe"); + } + return std::filesystem::path(std::string(path, count)); +#endif +} + +struct local_model { + std::string name; + std::string path; + std::string path_mmproj; +}; + +static std::vector list_local_models(const std::string & dir) { + if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) { + throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str())); + } + + std::vector models; + auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } + } + // single file model + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? 
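+            // e.g. a split model ships as "foo-00001-of-00003.gguf" (illustrative name); pointing + // llama.cpp at the first shard is enough, the remaining shards are discovered from it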
model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); + } + }; + + auto files = fs_list(dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); + local_model model{ + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" + }; + models.push_back(model); + } + } + return models; +} + +// +// server_models +// + +server_models::server_models( + const common_params & params, + int argc, + char ** argv, + char ** envp) : base_params(params) { + for (int i = 0; i < argc; i++) { + base_args.push_back(std::string(argv[i])); + } + for (char ** env = envp; *env != nullptr; env++) { + base_env.push_back(std::string(*env)); + } + GGML_ASSERT(!base_args.empty()); + // set binary path + try { + base_args[0] = get_server_exec_path().string(); + } catch (const std::exception & e) { + LOG_WRN("failed to get server executable path: %s\n", e.what()); + LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str()); + } + // TODO: allow refreshing cached model list + // add cached models + auto cached_models = common_list_cached_models(); + for (const auto & model : cached_models) { + server_model_meta meta{ + /* name */ model.to_string(), + /* path */ model.manifest_path, + /* path_mmproj */ "", // auto-detected when loading + /* in_cache */ true, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 + }; + mapping[meta.name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ meta + }; + } + // add local models specified via --models-dir + if (!params.models_dir.empty()) { + auto local_models = list_local_models(params.models_dir); + for (const auto & model : local_models) { + if (mapping.find(model.name) != mapping.end()) { + // already exists in cached models, skip + continue; + } + server_model_meta meta{ + /* name */ model.name, + /* path */ model.path, + /* path_mmproj */ model.path_mmproj, + /* in_cache */ false, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 + }; + mapping[meta.name] = instance_t{ + /* subproc */ std::make_shared(), + /* th */ std::thread(), + /* meta */ meta + }; + } + } +} + +void server_models::update_meta(const std::string & name, const server_model_meta & meta) { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + it->second.meta = meta; + } + cv.notify_all(); // notify wait_until_loaded +} + +bool server_models::has_model(const std::string & name) { + std::lock_guard lk(mutex); + return mapping.find(name) != mapping.end(); +} + +std::optional server_models::get_meta(const std::string & name) { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + return it->second.meta; + } + return std::nullopt; +} + +static int get_free_port() { +#ifdef _WIN32 + WSADATA wsaData; + if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { + return -1; + } + typedef SOCKET native_socket_t; +#define INVALID_SOCKET_VAL INVALID_SOCKET +#define CLOSE_SOCKET(s) closesocket(s) +#else + typedef int native_socket_t; +#define INVALID_SOCKET_VAL -1 +#define CLOSE_SOCKET(s) close(s) +#endif + + native_socket_t sock = 
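+    // ephemeral-port trick: bind to port 0 so the OS picks a free port, read it back via + // getsockname() below, then close the socket; note the small race window before the child re-binds it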
socket(AF_INET, SOCK_STREAM, 0); + if (sock == INVALID_SOCKET_VAL) { +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + + struct sockaddr_in serv_addr; + std::memset(&serv_addr, 0, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(0); + + if (bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) != 0) { + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + +#ifdef _WIN32 + int namelen = sizeof(serv_addr); +#else + socklen_t namelen = sizeof(serv_addr); +#endif + if (getsockname(sock, (struct sockaddr*)&serv_addr, &namelen) != 0) { + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + + int port = ntohs(serv_addr.sin_port); + + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + + return port; +} + +// helper to convert vector to char ** +// pointers are only valid as long as the original vector is valid +static std::vector to_char_ptr_array(const std::vector & vec) { + std::vector result; + result.reserve(vec.size() + 1); + for (const auto & s : vec) { + result.push_back(const_cast(s.c_str())); + } + result.push_back(nullptr); + return result; +} + +std::vector server_models::get_all_meta() { + std::lock_guard lk(mutex); + std::vector result; + result.reserve(mapping.size()); + for (const auto & [name, inst] : mapping) { + result.push_back(inst.meta); + } + return result; +} + +void server_models::unload_lru() { + if (base_params.models_max <= 0) { + return; // no limit + } + // remove one of the servers if we passed the models_max (least recently used - LRU) + std::string lru_model_name = ""; + int64_t lru_last_used = ggml_time_ms(); + size_t count_active = 0; + { + std::lock_guard lk(mutex); + for (const auto & m : mapping) { + if (m.second.meta.is_active()) { + count_active++; + if (m.second.meta.last_used < lru_last_used) { + lru_model_name = m.first; + lru_last_used = m.second.meta.last_used; + } + } + } + } + if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) { + SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str()); + unload(lru_model_name); + } +} + +static void add_or_replace_arg(std::vector & args, const std::string & key, const std::string & value) { + for (size_t i = 0; i < args.size(); i++) { + if (args[i] == key && i + 1 < args.size()) { + args[i + 1] = value; + return; + } + } + // not found, append + args.push_back(key); + args.push_back(value); +} + +void server_models::load(const std::string & name, bool auto_load) { + if (!has_model(name)) { + throw std::runtime_error("model name=" + name + " is not found"); + } + unload_lru(); + + std::lock_guard lk(mutex); + + auto meta = mapping[name].meta; + if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { + SRV_INF("model %s is not ready\n", name.c_str()); + return; + } + + // prepare new instance info + instance_t inst; + inst.meta = meta; + inst.meta.port = get_free_port(); + inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta.last_used = ggml_time_ms(); + + if (inst.meta.port <= 0) { + throw std::runtime_error("failed to get a port number"); + } + + inst.subproc = std::make_shared(); + { + SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); + + std::vector child_args; + if (auto_load && !meta.args.empty()) { + child_args = meta.args; // copy previous args + } else { + child_args = base_args; // copy + if (inst.meta.in_cache) { + add_or_replace_arg(child_args, 
"-hf", inst.meta.name); + } else { + add_or_replace_arg(child_args, "-m", inst.meta.path); + if (!inst.meta.path_mmproj.empty()) { + add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj); + } + } + } + + // set model args + add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port)); + add_or_replace_arg(child_args, "--alias", inst.meta.name); + + std::vector child_env = base_env; // copy + child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); + + SRV_INF("%s", "spawning server instance with args:\n"); + for (const auto & arg : child_args) { + SRV_INF(" %s\n", arg.c_str()); + } + inst.meta.args = child_args; // save for debugging + + std::vector argv = to_char_ptr_array(child_args); + std::vector envp = to_char_ptr_array(child_env); + + int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr; + int result = subprocess_create_ex(argv.data(), options, envp.data(), inst.subproc.get()); + if (result != 0) { + throw std::runtime_error("failed to spawn server instance"); + } + + inst.stdin_file = subprocess_stdin(inst.subproc.get()); + } + + // start a thread to manage the child process + // captured variables are guaranteed to be destroyed only after the thread is joined + inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() { + // read stdout/stderr and forward to main server log + FILE * p_stdout_stderr = subprocess_stdout(child_proc.get()); + if (p_stdout_stderr) { + char buffer[4096]; + while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) { + LOG("[%5d] %s", port, buffer); + } + } else { + SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str()); + } + // we reach here when the child process exits + int exit_code = 0; + subprocess_join(child_proc.get(), &exit_code); + subprocess_destroy(child_proc.get()); + // update PID and status + { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + auto & meta = it->second.meta; + meta.exit_code = exit_code; + meta.status = SERVER_MODEL_STATUS_UNLOADED; + } + cv.notify_all(); + } + SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code); + }); + + // clean up old process/thread if exists + { + auto & old_instance = mapping[name]; + // old process should have exited already, but just in case, we clean it up here + if (subprocess_alive(old_instance.subproc.get())) { + SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str()); + subprocess_terminate(old_instance.subproc.get()); // force kill + } + if (old_instance.th.joinable()) { + old_instance.th.join(); + } + } + + mapping[name] = std::move(inst); + cv.notify_all(); +} + +static void interrupt_subprocess(FILE * stdin_file) { + // because subprocess.h does not provide a way to send SIGINT, + // we will send a command to the child process to exit gracefully + if (stdin_file) { + fprintf(stdin_file, "%s\n", CMD_EXIT); + fflush(stdin_file); + } +} + +void server_models::unload(const std::string & name) { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + if (it->second.meta.is_active()) { + SRV_INF("unloading model instance name=%s\n", name.c_str()); + interrupt_subprocess(it->second.stdin_file); + // status change will be handled by the managing thread + } else { + SRV_WRN("model instance name=%s is not loaded\n", name.c_str()); + } + } +} + +void server_models::unload_all() { + std::vector to_join; + { + 
std::lock_guard lk(mutex); + for (auto & [name, inst] : mapping) { + if (inst.meta.is_active()) { + SRV_INF("unloading model instance name=%s\n", name.c_str()); + interrupt_subprocess(inst.stdin_file); + // status change will be handled by the managing thread + } + // moving the thread to join list to avoid deadlock + to_join.push_back(std::move(inst.th)); + } + } + for (auto & th : to_join) { + if (th.joinable()) { + th.join(); + } + } +} + +void server_models::update_status(const std::string & name, server_model_status status) { + // for now, we only allow updating to LOADED status + if (status != SERVER_MODEL_STATUS_LOADED) { + throw std::runtime_error("invalid status value"); + } + auto meta = get_meta(name); + if (meta.has_value()) { + meta->status = status; + update_meta(name, meta.value()); + } +} + +void server_models::wait_until_loaded(const std::string & name) { + std::unique_lock lk(mutex); + cv.wait(lk, [this, &name]() { + auto it = mapping.find(name); + if (it != mapping.end()) { + return it->second.meta.status != SERVER_MODEL_STATUS_LOADING; + } + return false; + }); +} + +bool server_models::ensure_model_loaded(const std::string & name) { + auto meta = get_meta(name); + if (!meta.has_value()) { + throw std::runtime_error("model name=" + name + " is not found"); + } + if (meta->status == SERVER_MODEL_STATUS_LOADED) { + return false; // already loaded + } + if (meta->status == SERVER_MODEL_STATUS_UNLOADED) { + SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); + load(name, true); + } + + SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); + wait_until_loaded(name); + + // check final status + meta = get_meta(name); + if (!meta.has_value() || meta->is_failed()) { + throw std::runtime_error("model name=" + name + " failed to load"); + } + + return true; +} + +server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used) { + auto meta = get_meta(name); + if (!meta.has_value()) { + throw std::runtime_error("model name=" + name + " is not found"); + } + if (meta->status != SERVER_MODEL_STATUS_LOADED) { + throw std::invalid_argument("model name=" + name + " is not loaded"); + } + if (update_last_used) { + std::unique_lock lk(mutex); + mapping[name].meta.last_used = ggml_time_ms(); + } + SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port); + auto proxy = std::make_unique( + method, + base_params.hostname, + meta->port, + req.path, + req.headers, + req.body, + req.should_stop); + return proxy; +} + +std::thread server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler) { + // send a notification to the router server that a model instance is ready + // TODO @ngxson : use HTTP client from libcommon + httplib::Client cli(base_params.hostname, router_port); + cli.set_connection_timeout(0, 200000); // 200 milliseconds + + httplib::Request req; + req.method = "POST"; + req.path = "/models/status"; + req.set_header("Content-Type", "application/json"); + if (!base_params.api_keys.empty()) { + req.set_header("Authorization", "Bearer " + base_params.api_keys[0]); + } + + json body; + body["model"] = name; + body["value"] = server_model_status_to_string(SERVER_MODEL_STATUS_LOADED); + req.body = body.dump(); + + SRV_INF("notifying router server (port=%d) that model %s is ready\n", router_port, name.c_str()); + auto result = cli.send(std::move(req)); + if 
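+    // (any transport error here means the router is unreachable, so the child logs and exits below rather than lingering without an owner)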
(result.error() != httplib::Error::Success) { + auto err_str = httplib::to_string(result.error()); + SRV_ERR("failed to notify router server: %s\n", err_str.c_str()); + exit(1); // force exit + } + + // setup thread for monitoring stdin + return std::thread([shutdown_handler]() { + // wait for EOF on stdin + SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n"); + bool eof = false; + while (true) { + std::string line; + if (!std::getline(std::cin, line)) { + // EOF detected, which means the router server unexpectedly exited or was killed + eof = true; + break; + } + if (line.find(CMD_EXIT) != std::string::npos) { + SRV_INF("%s", "exit command received, exiting...\n"); + shutdown_handler(0); + break; + } + } + if (eof) { + SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n"); + exit(1); + } + }); +} + + + +// +// server_models_routes +// + +static void res_ok(std::unique_ptr & res, const json & response_data) { + res->status = 200; + res->data = safe_json_to_str(response_data); +} + +static void res_err(std::unique_ptr & res, const json & error_data) { + res->status = json_value(error_data, "code", 500); + res->data = safe_json_to_str({{ "error", error_data }}); +} + +static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { + if (name.empty()) { + res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + auto meta = models.get_meta(name); + if (!meta.has_value()) { + res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + if (models_autoload) { + models.ensure_model_loaded(name); + } else { + if (meta->status != SERVER_MODEL_STATUS_LOADED) { + res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + } + return true; +} + +static bool is_autoload(const common_params & params, const server_http_req & req) { + std::string autoload = req.get_param("autoload"); + if (autoload.empty()) { + return params.models_autoload; + } else { + return autoload == "true" || autoload == "1"; + } +} + +void server_models_routes::init_routes() { + this->get_router_props = [this](const server_http_req & req) { + std::string name = req.get_param("model"); + if (name.empty()) { + // main instance + auto res = std::make_unique(); + res_ok(res, { + // TODO: add support for this on web UI + {"role", "router"}, + {"max_instances", 4}, // dummy value for testing + // this is a dummy response to make sure webui doesn't break + {"model_alias", "llama-server"}, + {"model_path", "none"}, + {"default_generation_settings", { + {"params", json{}}, + {"n_ctx", 0}, + }}, + }); + return res; + } + return proxy_get(req); + }; + + this->proxy_get = [this](const server_http_req & req) { + std::string method = "GET"; + std::string name = req.get_param("model"); + bool autoload = is_autoload(params, req); + auto error_res = std::make_unique(); + if (!router_validate_model(name, models, autoload, error_res)) { + return error_res; + } + return models.proxy_request(req, method, name, false); + }; + + this->proxy_post = [this](const server_http_req & req) { + std::string method = "POST"; + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + bool autoload = is_autoload(params, req); + auto error_res = std::make_unique(); + if (!router_validate_model(name, models, autoload, error_res)) { + return error_res; + } + 
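+        // POSTs refresh last_used so unload_lru() evicts idle models first; plain GETs (see proxy_get above) deliberately do not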
return models.proxy_request(req, method, name, true); // update last usage for POST request only + }; + + this->get_router_models = [this](const server_http_req &) { + auto res = std::make_unique(); + json models_json = json::array(); + auto all_models = models.get_all_meta(); + std::time_t t = std::time(0); + for (const auto & meta : all_models) { + json status { + {"value", server_model_status_to_string(meta.status)}, + {"args", meta.args}, + }; + if (meta.is_failed()) { + status["exit_code"] = meta.exit_code; + status["failed"] = true; + } + models_json.push_back(json { + {"id", meta.name}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat + {"in_cache", meta.in_cache}, + {"path", meta.path}, + {"status", status}, + // TODO: add other fields, may require reading GGUF metadata + }); + } + res_ok(res, { + {"data", models_json}, + {"object", "list"}, + }); + return res; + }; + + this->post_router_models_load = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + auto model = models.get_meta(name); + if (!model.has_value()) { + res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); + return res; + } + if (model->status == SERVER_MODEL_STATUS_LOADED) { + res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + models.load(name, false); + res_ok(res, {{"success", true}}); + return res; + }; + + // used by child process to notify the router about status change + // TODO @ngxson : maybe implement authentication for this endpoint in the future + this->post_router_models_status = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string model = json_value(body, "model", std::string()); + std::string value = json_value(body, "value", std::string()); + models.update_status(model, server_model_status_from_string(value)); + res_ok(res, {{"success", true}}); + return res; + }; + + this->post_router_models_unload = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + auto model = models.get_meta(name); + if (!model.has_value()) { + res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + if (model->status != SERVER_MODEL_STATUS_LOADED) { + res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + 
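+        // unload() only writes CMD_EXIT to the child's stdin; the status flips to "unloaded" asynchronously once the managing thread reaps the process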
models.unload(name); + res_ok(res, {{"success", true}}); + return res; + }; +} + + + +// +// server_http_proxy +// + +// simple implementation of a pipe +// used for streaming data between threads +template +struct pipe_t { + std::mutex mutex; + std::condition_variable cv; + std::queue queue; + std::atomic writer_closed{false}; + std::atomic reader_closed{false}; + void close_write() { + writer_closed.store(true, std::memory_order_relaxed); + cv.notify_all(); + } + void close_read() { + reader_closed.store(true, std::memory_order_relaxed); + cv.notify_all(); + } + bool read(T & output, const std::function & should_stop) { + std::unique_lock lk(mutex); + constexpr auto poll_interval = std::chrono::milliseconds(500); + while (true) { + if (!queue.empty()) { + output = std::move(queue.front()); + queue.pop(); + return true; + } + if (writer_closed.load()) { + return false; // clean EOF + } + if (should_stop()) { + close_read(); // signal broken pipe to writer + return false; // cancelled / reader no longer alive + } + cv.wait_for(lk, poll_interval); + } + } + bool write(T && data) { + std::lock_guard lk(mutex); + if (reader_closed.load()) { + return false; // broken pipe + } + queue.push(std::move(data)); + cv.notify_one(); + return true; + } +}; + +server_http_proxy::server_http_proxy( + const std::string & method, + const std::string & host, + int port, + const std::string & path, + const std::map & headers, + const std::string & body, + const std::function should_stop) { + // shared between reader and writer threads + auto cli = std::make_shared(host, port); + auto pipe = std::make_shared>(); + + // setup Client + cli->set_connection_timeout(0, 200000); // 200 milliseconds + this->status = 500; // to be overwritten upon response + this->cleanup = [pipe]() { + pipe->close_read(); + pipe->close_write(); + }; + + // wire up the receive end of the pipe + this->next = [pipe, should_stop](std::string & out) -> bool { + msg_t msg; + bool has_next = pipe->read(msg, should_stop); + if (!msg.data.empty()) { + out = std::move(msg.data); + } + return has_next; // false if EOF or pipe broken + }; + + // wire up the HTTP client + // note: do NOT capture `this` pointer, as it may be destroyed before the thread ends + httplib::ResponseHandler response_handler = [pipe, cli](const httplib::Response & response) { + msg_t msg; + msg.status = response.status; + for (const auto & [key, value] : response.headers) { + msg.headers[key] = value; + } + return pipe->write(std::move(msg)); // send headers first + }; + httplib::ContentReceiverWithProgress content_receiver = [pipe](const char * data, size_t data_length, size_t, size_t) { + // send data chunks + // returns false if pipe is closed / broken (signal to stop receiving) + return pipe->write({{}, 0, std::string(data, data_length)}); + }; + + // prepare the request to destination server + httplib::Request req; + { + req.method = method; + req.path = path; + for (const auto & [key, value] : headers) { + req.set_header(key, value); + } + req.body = body; + req.response_handler = response_handler; + req.content_receiver = content_receiver; + } + + // start the proxy thread + SRV_DBG("start proxy thread %s %s\n", req.method.c_str(), req.path.c_str()); + this->thread = std::thread([cli, pipe, req]() { + auto result = cli->send(std::move(req)); + if (result.error() != httplib::Error::Success) { + auto err_str = httplib::to_string(result.error()); + SRV_ERR("http client error: %s\n", err_str.c_str()); + pipe->write({{}, 500, ""}); // header + pipe->write({{}, 0, 
"proxy error: " + err_str}); // body + } + pipe->close_write(); // signal EOF to reader + SRV_DBG("%s", "client request thread ended\n"); + }); + this->thread.detach(); + + // wait for the first chunk (headers) + msg_t header; + if (pipe->read(header, should_stop)) { + SRV_DBG("%s", "received response headers\n"); + this->status = header.status; + this->headers = header.headers; + } else { + SRV_DBG("%s", "no response headers received (request cancelled?)\n"); + } +} diff --git a/llamacpp/native/src/server/server-models.h b/llamacpp/native/src/server/server-models.h new file mode 100644 index 000000000..b9bec983e --- /dev/null +++ b/llamacpp/native/src/server/server-models.h @@ -0,0 +1,174 @@ +#pragma once + +#include "common.h" +#include "server-http.h" + +#include +#include +#include +#include + +/** + * state diagram: + * + * UNLOADED ──► LOADING ──► LOADED + * â–² │ │ + * └───failed───┘ │ + * â–² │ + * └────────unloaded─────────┘ + */ +enum server_model_status { + // TODO: also add downloading state when the logic is added + SERVER_MODEL_STATUS_UNLOADED, + SERVER_MODEL_STATUS_LOADING, + SERVER_MODEL_STATUS_LOADED +}; + +static server_model_status server_model_status_from_string(const std::string & status_str) { + if (status_str == "unloaded") { + return SERVER_MODEL_STATUS_UNLOADED; + } + if (status_str == "loading") { + return SERVER_MODEL_STATUS_LOADING; + } + if (status_str == "loaded") { + return SERVER_MODEL_STATUS_LOADED; + } + throw std::runtime_error("invalid server model status"); +} + +static std::string server_model_status_to_string(server_model_status status) { + switch (status) { + case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; + case SERVER_MODEL_STATUS_LOADING: return "loading"; + case SERVER_MODEL_STATUS_LOADED: return "loaded"; + default: return "unknown"; + } +} + +struct server_model_meta { + std::string name; + std::string path; + std::string path_mmproj; // only available if in_cache=false + bool in_cache = false; // if true, use -hf; use -m otherwise + int port = 0; + server_model_status status = SERVER_MODEL_STATUS_UNLOADED; + int64_t last_used = 0; // for LRU unloading + std::vector args; // additional args passed to the model instance (used for debugging) + int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) + + bool is_active() const { + return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; + } + + bool is_failed() const { + return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0; + } +}; + +struct subprocess_s; + +struct server_models { +private: + struct instance_t { + std::shared_ptr subproc; // shared between main thread and monitoring thread + std::thread th; + server_model_meta meta; + FILE * stdin_file = nullptr; + }; + + std::mutex mutex; + std::condition_variable cv; + std::map mapping; + + common_params base_params; + std::vector base_args; + std::vector base_env; + + void update_meta(const std::string & name, const server_model_meta & meta); + + // unload least recently used models if the limit is reached + void unload_lru(); + +public: + server_models(const common_params & params, int argc, char ** argv, char ** envp); + + // check if a model instance exists + bool has_model(const std::string & name); + + // return a copy of model metadata + std::optional get_meta(const std::string & name); + + // return a copy of all model metadata + std::vector get_all_meta(); + + // if auto_load is true, load the model with previous args if any + void load(const std::string 
& name, bool auto_load); + void unload(const std::string & name); + void unload_all(); + + // update the status of a model instance + void update_status(const std::string & name, server_model_status status); + + // wait until the model instance is fully loaded + // return when the model is loaded or failed to load + void wait_until_loaded(const std::string & name); + + // load the model if not loaded, otherwise do nothing + // return false if model is already loaded; return true otherwise (meta may need to be refreshed) + bool ensure_model_loaded(const std::string & name); + + // proxy an HTTP request to the model instance + server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); + + // notify the router server that a model instance is ready + // return the monitoring thread (to be joined by the caller) + static std::thread setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler); +}; + +struct server_models_routes { + common_params params; + server_models models; + server_models_routes(const common_params & params, int argc, char ** argv, char ** envp) + : params(params), models(params, argc, argv, envp) { + init_routes(); + } + + void init_routes(); + // handlers using lambda function, so that they can capture `this` without `std::bind` + server_http_context::handler_t get_router_props; + server_http_context::handler_t proxy_get; + server_http_context::handler_t proxy_post; + server_http_context::handler_t get_router_models; + server_http_context::handler_t post_router_models_load; + server_http_context::handler_t post_router_models_status; + server_http_context::handler_t post_router_models_unload; +}; + +/** + * A simple HTTP proxy that forwards requests to another server + * and relays the responses back. + */ +struct server_http_proxy : server_http_res { + std::function cleanup = nullptr; +public: + server_http_proxy(const std::string & method, + const std::string & host, + int port, + const std::string & path, + const std::map & headers, + const std::string & body, + const std::function should_stop); + ~server_http_proxy() { + if (cleanup) { + cleanup(); + } + } +private: + std::thread thread; + struct msg_t { + std::map headers; + int status = 0; + std::string data; + }; +}; diff --git a/llamacpp/native/src/server/server-queue.cpp b/llamacpp/native/src/server/server-queue.cpp new file mode 100644 index 000000000..38a485852 --- /dev/null +++ b/llamacpp/native/src/server/server-queue.cpp @@ -0,0 +1,351 @@ +#include "server-task.h" +#include "server-queue.h" + +#include "log.h" + +#include + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define RES_INF(fmt, ...) LOG_INF("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define RES_WRN(fmt, ...) LOG_WRN("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define RES_ERR(fmt, ...) LOG_ERR("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define RES_DBG(fmt, ...) 
LOG_DBG("res %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +// +// server_queue +// + +int server_queue::post(server_task && task, bool front) { + std::unique_lock lock(mutex_tasks); + GGML_ASSERT(task.id != -1); + // if this is cancel task make sure to clean up pending tasks + if (task.type == SERVER_TASK_TYPE_CANCEL) { + cleanup_pending_task(task.id_target); + } + const int task_id = task.id; + QUE_DBG("new task, id = %d, front = %d\n", task_id, front); + if (front) { + queue_tasks.push_front(std::move(task)); + } else { + queue_tasks.push_back(std::move(task)); + } + condition_tasks.notify_one(); + return task_id; +} + +int server_queue::post(std::vector && tasks, bool front) { + std::unique_lock lock(mutex_tasks); + for (auto & task : tasks) { + if (task.id == -1) { + task.id = id++; + } + // if this is cancel task make sure to clean up pending tasks + if (task.type == SERVER_TASK_TYPE_CANCEL) { + cleanup_pending_task(task.id_target); + } + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); + if (front) { + queue_tasks.push_front(std::move(task)); + } else { + queue_tasks.push_back(std::move(task)); + } + } + condition_tasks.notify_one(); + return 0; +} + +void server_queue::defer(server_task && task) { + std::unique_lock lock(mutex_tasks); + QUE_DBG("defer task, id = %d\n", task.id); + queue_tasks_deferred.push_back(std::move(task)); + condition_tasks.notify_one(); +} + +int server_queue::get_new_id() { + std::unique_lock lock(mutex_tasks); + int new_id = id++; + return new_id; +} + +void server_queue::on_new_task(std::function callback) { + callback_new_task = std::move(callback); +} + +void server_queue::on_update_slots(std::function callback) { + callback_update_slots = std::move(callback); +} + +void server_queue::pop_deferred_task() { + std::unique_lock lock(mutex_tasks); + if (!queue_tasks_deferred.empty()) { + queue_tasks.emplace_front(std::move(queue_tasks_deferred.front())); + queue_tasks_deferred.pop_front(); + } + condition_tasks.notify_one(); +} + +void server_queue::terminate() { + std::unique_lock lock(mutex_tasks); + running = false; + condition_tasks.notify_all(); +} + +void server_queue::start_loop() { + running = true; + + while (true) { + QUE_DBG("%s", "processing new tasks\n"); + + while (true) { + std::unique_lock lock(mutex_tasks); + if (!running) { + QUE_DBG("%s", "terminate\n"); + return; + } + if (queue_tasks.empty()) { + lock.unlock(); + break; + } + server_task task = std::move(queue_tasks.front()); + queue_tasks.pop_front(); + lock.unlock(); + + QUE_DBG("processing task, id = %d\n", task.id); + callback_new_task(std::move(task)); + } + + // all tasks in the current loop is processed, slots data is now ready + QUE_DBG("%s", "update slots\n"); + + callback_update_slots(); + + QUE_DBG("%s", "waiting for new tasks\n"); + { + std::unique_lock lock(mutex_tasks); + if (!running) { + QUE_DBG("%s", "terminate\n"); + return; + } + if (queue_tasks.empty()) { + condition_tasks.wait(lock, [&]{ + return (!queue_tasks.empty() || !running); + }); + } + } + } +} + +void server_queue::cleanup_pending_task(int id_target) { + // no need lock because this is called exclusively by post() + auto rm_func = [id_target](const server_task & task) { + return task.id == id_target; + }; + queue_tasks.erase( + std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), + queue_tasks.end()); + queue_tasks_deferred.erase( + std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func), + queue_tasks_deferred.end()); +} + +// +// 
server_response +// + +void server_response::add_waiting_task_id(int id_task) { + RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); + + std::unique_lock lock(mutex_results); + waiting_task_ids.insert(id_task); +} + +void server_response::add_waiting_tasks(const std::vector & tasks) { + std::unique_lock lock(mutex_results); + + for (const auto & task : tasks) { + RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); + waiting_task_ids.insert(task.id); + } +} + +void server_response::remove_waiting_task_id(int id_task) { + RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + + std::unique_lock lock(mutex_results); + waiting_task_ids.erase(id_task); + // make sure to clean up all pending results + queue_results.erase( + std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { + return res->id == id_task; + }), + queue_results.end()); +} + +void server_response::remove_waiting_task_ids(const std::unordered_set & id_tasks) { + std::unique_lock lock(mutex_results); + + for (const auto & id_task : id_tasks) { + RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + waiting_task_ids.erase(id_task); + } +} + +server_task_result_ptr server_response::recv(const std::unordered_set & id_tasks) { + while (true) { + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + if (!running) { + RES_DBG("%s : queue result stop\n", "recv"); + std::terminate(); // we cannot return here since the caller is HTTP code + } + return !queue_results.empty(); + }); + + for (size_t i = 0; i < queue_results.size(); i++) { + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + server_task_result_ptr res = std::move(queue_results[i]); + queue_results.erase(queue_results.begin() + i); + return res; + } + } + } + + // should never reach here +} + +server_task_result_ptr server_response::recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { + while (true) { + std::unique_lock lock(mutex_results); + + for (int i = 0; i < (int) queue_results.size(); i++) { + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + server_task_result_ptr res = std::move(queue_results[i]); + queue_results.erase(queue_results.begin() + i); + return res; + } + } + + std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); + if (!running) { + RES_DBG("%s : queue result stop\n", __func__); + std::terminate(); // we cannot return here since the caller is HTTP code + } + if (cr_res == std::cv_status::timeout) { + return nullptr; + } + } + + // should never reach here +} + +server_task_result_ptr server_response::recv(int id_task) { + std::unordered_set id_tasks = {id_task}; + return recv(id_tasks); +} + +void server_response::send(server_task_result_ptr && result) { + RES_DBG("sending result for task id = %d\n", result->id); + + std::unique_lock lock(mutex_results); + for (const auto & id_task : waiting_task_ids) { + if (result->id == id_task) { + RES_DBG("task id = %d pushed to result queue\n", result->id); + + queue_results.emplace_back(std::move(result)); + condition_results.notify_all(); + return; + } + } +} + +void server_response::terminate() { + running = false; + condition_results.notify_all(); +} + +// +// server_response_reader +// + +void 
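+// expected usage: post_tasks(), then drain results via next()/wait_for_all(); stop() also runs + // from the destructor, so an HTTP handler that returns early cannot leak running tasks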
server_response_reader::post_tasks(std::vector && tasks) { + id_tasks = server_task::get_list_id(tasks); + queue_results.add_waiting_tasks(tasks); + queue_tasks.post(std::move(tasks)); +} + +bool server_response_reader::has_next() const { + return !cancelled && received_count < id_tasks.size(); +} + +// return nullptr if should_stop() is true before receiving a result +// note: if one error is received, it will stop further processing and return error result +server_task_result_ptr server_response_reader::next(const std::function & should_stop) { + while (true) { + server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, polling_interval_seconds); + if (result == nullptr) { + // timeout, check stop condition + if (should_stop()) { + SRV_DBG("%s", "stopping wait for next result due to should_stop condition\n"); + return nullptr; + } + } else { + if (result->is_error()) { + stop(); // cancel remaining tasks + SRV_DBG("%s", "received error result, stopping further processing\n"); + return result; + } + if (result->is_stop()) { + received_count++; + } + return result; + } + } + + // should not reach here +} + +server_response_reader::batch_response server_response_reader::wait_for_all(const std::function & should_stop) { + batch_response batch_res; + batch_res.results.resize(id_tasks.size()); + while (has_next()) { + auto res = next(should_stop); + if (res == nullptr) { + batch_res.is_terminated = true; + return batch_res; + } + if (res->is_error()) { + batch_res.error = std::move(res); + return batch_res; + } + const size_t idx = res->get_index(); + GGML_ASSERT(idx < batch_res.results.size() && "index out of range"); + GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received"); + batch_res.results[idx] = std::move(res); + } + return batch_res; +} + +void server_response_reader::stop() { + queue_results.remove_waiting_task_ids(id_tasks); + if (has_next() && !cancelled) { + // if tasks is not finished yet, cancel them + cancelled = true; + std::vector cancel_tasks; + cancel_tasks.reserve(id_tasks.size()); + for (const auto & id_task : id_tasks) { + SRV_WRN("cancel task, id_task = %d\n", id_task); + server_task task(SERVER_TASK_TYPE_CANCEL); + task.id_target = id_task; + queue_results.remove_waiting_task_id(id_task); + cancel_tasks.push_back(std::move(task)); + } + // push to beginning of the queue, so it has highest priority + queue_tasks.post(std::move(cancel_tasks), true); + } else { + SRV_DBG("%s", "all tasks already finished, no need to cancel\n"); + } +} diff --git a/llamacpp/native/src/server/server-queue.h b/llamacpp/native/src/server/server-queue.h new file mode 100644 index 000000000..209d2017c --- /dev/null +++ b/llamacpp/native/src/server/server-queue.h @@ -0,0 +1,146 @@ +#pragma once + +#include "server-task.h" + +#include +#include +#include +#include + +struct server_queue { +private: + int id = 0; + bool running; + + // queues + std::deque queue_tasks; + std::deque queue_tasks_deferred; + + std::mutex mutex_tasks; + std::condition_variable condition_tasks; + + // callback functions + std::function callback_new_task; + std::function callback_update_slots; + +public: + // Add a new task to the end of the queue + int post(server_task && task, bool front = false); + + // multi-task version of post() + int post(std::vector && tasks, bool front = false); + + // Add a new task, but defer until one slot is available + void defer(server_task && task); + + // Get the next id for creating a new task + int get_new_id(); + + // Register function to process 
a new task + void on_new_task(std::function callback); + + // Register the function to be called when all slots data is ready to be processed + void on_update_slots(std::function callback); + + // Call when the state of one slot is changed; it will move one task from the deferred queue to the main queue + void pop_deferred_task(); + + // end the start_loop routine + void terminate(); + + /** + * Main loop consists of these steps: + * - Wait until a new task arrives + * - Process the task (i.e. maybe copy data into slot) + * - Check if multitask is finished + * - Update all slots + */ + void start_loop(); + + // for metrics + size_t queue_tasks_deferred_size() { + std::unique_lock lock(mutex_tasks); + return queue_tasks_deferred.size(); + } + +private: + void cleanup_pending_task(int id_target); +}; + +struct server_response { +private: + bool running = true; + + // for keeping track of all tasks waiting for the result + std::unordered_set waiting_task_ids; + + // the main result queue (using ptr for polymorphism) + std::vector queue_results; + + std::mutex mutex_results; + std::condition_variable condition_results; + +public: + // add the id_task to the list of tasks waiting for response + void add_waiting_task_id(int id_task); + + void add_waiting_tasks(const std::vector & tasks); + + // when the request is finished, we can remove the task associated with it + void remove_waiting_task_id(int id_task); + + // remove multiple tasks from the waiting list + void remove_waiting_task_ids(const std::unordered_set & id_tasks); + + // This function blocks the thread until there is a response for one of the id_tasks + server_task_result_ptr recv(const std::unordered_set & id_tasks); + + // same as recv(), but with a timeout in seconds + // if the timeout is reached, nullptr is returned + server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout); + + // single-task version of recv() + server_task_result_ptr recv(int id_task); + + // Send a new result to a waiting id_task + void send(server_task_result_ptr && result); + + // terminate the waiting loop + void terminate(); +}; + +// utility class to make working with server_queue and server_response easier +// it provides a generator-like API for server responses +// supports polling the connection state and aggregating multiple results +struct server_response_reader { + std::unordered_set id_tasks; + server_queue & queue_tasks; + server_response & queue_results; + size_t received_count = 0; + bool cancelled = false; + int polling_interval_seconds; + + // should_stop function will be called every polling_interval_seconds + server_response_reader(std::pair server_queues, int polling_interval_seconds) + : queue_tasks(server_queues.first), queue_results(server_queues.second), polling_interval_seconds(polling_interval_seconds) {} + ~server_response_reader() { + stop(); + } + + void post_tasks(std::vector && tasks); + bool has_next() const; + + // return nullptr if should_stop() is true before receiving a result + // note: if one error is received, it will stop further processing and return the error result + server_task_result_ptr next(const std::function & should_stop); + + struct batch_response { + bool is_terminated = false; // if true, indicates that processing was stopped before all results were received + std::vector results; + server_task_result_ptr error; // nullptr if no error + }; + // aggregate multiple results + batch_response wait_for_all(const std::function & should_stop); + + void stop(); +}; diff --git 
a/llamacpp/native/src/server/server-task.cpp b/llamacpp/native/src/server/server-task.cpp new file mode 100644 index 000000000..3f59127fb --- /dev/null +++ b/llamacpp/native/src/server/server-task.cpp @@ -0,0 +1,1471 @@ +#include "server-common.h" +#include "server-task.h" + +#include "common.h" +#include "llama.h" +#include "chat.h" +#include "sampling.h" +#include "json-schema-to-grammar.h" + +using json = nlohmann::ordered_json; + +// +// task_params +// + +json task_params::format_logit_bias(const std::vector & logit_bias) const { + json data = json::array(); + for (const auto & lb : logit_bias) { + data.push_back(json{ + {"bias", lb.bias}, + {"token", lb.token}, + }); + } + return data; +} + +json task_params::to_json(bool only_metrics) const { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + json lora = json::array(); + for (size_t i = 0; i < this->lora.size(); ++i) { + lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); + } + + if (only_metrics) { + return json { + {"seed", sampling.seed}, + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"top_n_sigma", sampling.top_n_sigma}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"max_tokens", n_predict}, + {"n_predict", n_predict}, // TODO: deduplicate? 
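+            // ("max_tokens" is the OAI-compatible alias and "n_predict" the native llama.cpp name; both mirror the same field until the TODO above lands)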
+ {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, + {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, + {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, + {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, + {"samplers", samplers}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + {"post_sampling_probs", post_sampling_probs}, + {"lora", lora}, + }; + } + + auto grammar_triggers = json::array(); + for (const auto & trigger : sampling.grammar_triggers) { + server_grammar_trigger ct(trigger); + grammar_triggers.push_back(ct.to_json()); + } + + return json { + {"seed", sampling.seed}, + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"top_n_sigma", sampling.top_n_sigma}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, + {"n_predict", n_predict}, // TODO: deduplicate? 
+ {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + {"logit_bias", format_logit_bias(sampling.logit_bias)}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"grammar_lazy", sampling.grammar_lazy}, + {"grammar_triggers", grammar_triggers}, + {"preserved_tokens", sampling.preserved_tokens}, + {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, + {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, + {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, + {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, + {"samplers", samplers}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + {"post_sampling_probs", post_sampling_probs}, + {"lora", lora}, + }; +} + +// +// server_task +// + +task_params server_task::params_from_json_cmpl( + const llama_context * ctx, + const common_params & params_base, + const json & data) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + task_params params; + + // Sampling parameter defaults are loaded from the global server context (but individual requests can still them) + task_params defaults; + defaults.sampling = params_base.sampling; + defaults.speculative = params_base.speculative; + defaults.n_keep = params_base.n_keep; + defaults.n_predict = params_base.n_predict; + defaults.antiprompt = params_base.antiprompt; + + // enabling this will output extra debug information in the HTTP responses from the server + params.verbose = params_base.verbosity > 9; + params.timings_per_token = json_value(data, "timings_per_token", false); + + params.stream = json_value(data, "stream", false); + auto stream_opt = json_value(data, "stream_options", json::object()); + params.include_usage = json_value(stream_opt, "include_usage", false); + params.cache_prompt = json_value(data, "cache_prompt", true); + params.return_tokens = json_value(data, "return_tokens", false); + params.return_progress = json_value(data, "return_progress", false); + params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); + params.n_indent = json_value(data, "n_indent", defaults.n_indent); + params.n_keep = json_value(data, "n_keep", defaults.n_keep); + params.n_discard = json_value(data, "n_discard", defaults.n_discard); + //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement + params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + params.response_fields = json_value(data, "response_fields", std::vector()); + + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, 
"temperature", defaults.sampling.temp); + params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); + + params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); + params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); + params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); + + params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); + params.speculative.n_min = std::max(params.speculative.n_min, 0); + params.speculative.n_max = std::max(params.speculative.n_max, 0); + + // Use OpenAI API logprobs only if n_probs wasn't provided + if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ + params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs); + } + + if (data.contains("lora")) { + if (data.at("lora").is_array()) { + params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); + } else { + throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); + } + } else { + params.lora = params_base.lora_adapters; + } + + // TODO: add more sanity checks for the input parameters + + if (params.sampling.penalty_last_n < -1) { + throw std::runtime_error("Error: repeat_last_n must be >= -1"); + } + + if (params.sampling.dry_penalty_last_n < -1) { + throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); + } + + if (params.sampling.penalty_last_n == -1) { + // note: should be the slot's context and not the full context, but it's ok + params.sampling.penalty_last_n = llama_n_ctx(ctx); + } + + if (params.sampling.dry_penalty_last_n == -1) { + params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); + } + + if (params.sampling.dry_base < 1.0f) { + params.sampling.dry_base = defaults.sampling.dry_base; + } + + // sequence breakers 
for DRY
+    {
+        // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format
+        // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
+
+        if (data.contains("dry_sequence_breakers")) {
+            params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+            if (params.sampling.dry_sequence_breakers.empty()) {
+                throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings");
+            }
+        }
+    }
+
+    // process "json_schema" and "grammar"
+    if (data.contains("json_schema") && !data.contains("grammar")) {
+        try {
+            auto schema = json_value(data, "json_schema", json::object());
+            SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
+            params.sampling.grammar = json_schema_to_grammar(schema);
+            SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+        } catch (const std::exception & e) {
+            throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
+        }
+    } else {
+        params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
+        SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+        params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
+        SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
+    }
+
+    {
+        auto it = data.find("chat_format");
+        if (it != data.end()) {
+            params.oaicompat_chat_syntax.format = static_cast<common_chat_format>(it->get<int>());
+            SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format));
+        } else {
+            params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
+        }
+        common_reasoning_format reasoning_format = params_base.reasoning_format;
+        if (data.contains("reasoning_format")) {
+            reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
+        }
+        params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
+        params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
+        params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+        params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
+    }
+
+    {
+        const auto preserved_tokens = data.find("preserved_tokens");
+        if (preserved_tokens != data.end()) {
+            for (const auto & t : *preserved_tokens) {
+                auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
+                if (ids.size() == 1) {
+                    SRV_DBG("Preserved token: %d\n", ids[0]);
+                    params.sampling.preserved_tokens.insert(ids[0]);
+                } else {
+                    // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
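+                    // e.g. preserving a marker like "<|tool_call|>" (hypothetical) on a vocab that splits it into several
+                    // pieces lands here; the token is skipped, and a grammar trigger word that depends on it will fail
+                    // the preserved-token check further down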
+ SRV_DBG("Not preserved because more than 1 token: %s\n", t.get().c_str()); + } + } + } + const auto grammar_triggers = data.find("grammar_triggers"); + if (grammar_triggers != data.end()) { + for (const auto & t : *grammar_triggers) { + server_grammar_trigger ct(t); + if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + const auto & word = ct.value.value; + auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); + if (ids.size() == 1) { + auto token = ids[0]; + if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) { + throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); + } + SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); + common_grammar_trigger trigger; + trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; + trigger.value = word; + trigger.token = token; + params.sampling.grammar_triggers.push_back(std::move(trigger)); + } else { + SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); + params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); + } + } else { + if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) { + SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str()); + } else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) { + SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str()); + } else { + throw std::runtime_error("Unknown grammar trigger type"); + } + params.sampling.grammar_triggers.emplace_back(std::move(ct.value)); + } + } + } + if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) { + throw std::runtime_error("Error: no triggers set for lazy grammar!"); + } + } + + { + params.sampling.logit_bias.clear(); + + const auto & logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) { + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & el : *logit_bias) { + // TODO: we may want to throw errors here, in case "el" is incorrect + if (el.is_array() && el.size() == 2) { + float bias; + if (el[1].is_number()) { + bias = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + bias = -INFINITY; + } else { + continue; + } + + if (el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else if (el[0].is_string()) { + auto toks = common_tokenize(vocab, el[0].get(), false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } + } + } else if (logit_bias != data.end() && logit_bias->is_object()) { + const int n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & el : logit_bias->items()) { + float bias; + const auto & key = el.key(); + const auto & value = el.value(); + if (value.is_number()) { + bias = value.get(); + } else if (value.is_boolean() && !value.get()) { + bias = -INFINITY; + } else { + continue; + } + + char *end; + llama_token tok = strtol(key.c_str(), &end, 10); + if (*end == 0) { + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else { + auto toks = common_tokenize(vocab, key, false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } + } + + params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos); + if (params.sampling.ignore_eos) { + 
params.sampling.logit_bias.insert( + params.sampling.logit_bias.end(), + defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); + } + } + + { + params.antiprompt.clear(); + + const auto & stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) { + for (const auto & word : *stop) { + if (!word.empty()) { + params.antiprompt.push_back(word); + } + } + } + // set reverse prompt from cli args if not set in the request + if (params.antiprompt.empty()) { + params.antiprompt = defaults.antiprompt; + } + } + + { + const auto samplers = data.find("samplers"); + if (samplers != data.end()) { + if (samplers->is_array()) { + params.sampling.samplers = common_sampler_types_from_names(*samplers, false); + } else if (samplers->is_string()){ + params.sampling.samplers = common_sampler_types_from_chars(samplers->get()); + } + } else { + params.sampling.samplers = defaults.sampling.samplers; + } + } + + return params; +} + +// +// result_timings +// + +json result_timings::to_json() const { + json base = { + {"cache_n", cache_n}, + + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + + if (draft_n > 0) { + base["draft_n"] = draft_n; + base["draft_n_accepted"] = draft_n_accepted; + } + + return base; +} + +// +// result_prompt_progress +// +json result_prompt_progress::to_json() const { + return json { + {"total", total}, + {"cache", cache}, + {"processed", processed}, + {"time_ms", time_ms}, + }; +} + +static inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +// +// completion_token_output +// + +json completion_token_output::to_json(bool post_sampling_probs) const { + json probs_for_token = json::array(); + for (const auto & p : probs) { + std::string txt(p.txt); + txt.resize(validate_utf8(txt)); + probs_for_token.push_back(json { + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.txt)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, + }); + } + return probs_for_token; +} + +json completion_token_output::probs_vector_to_json(const std::vector & probs, bool post_sampling_probs) { + json out = json::array(); + for (const auto & p : probs) { + std::string txt(p.text_to_send); + txt.resize(validate_utf8(txt)); + out.push_back(json { + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.text_to_send)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, + { + post_sampling_probs ? "top_probs" : "top_logprobs", + p.to_json(post_sampling_probs) + }, + }); + } + return out; +} + +float completion_token_output::logarithm(float x) { + // nlohmann::json converts -inf to null, so we need to prevent that + return x == 0.0f ? 
std::numeric_limits::lowest() : std::log(x); +} + +std::vector completion_token_output::str_to_bytes(const std::string & str) { + std::vector bytes; + for (unsigned char c : str) { + bytes.push_back(c); + } + return bytes; +} + +// +// server_task_result_cmpl_final +// +json server_task_result_cmpl_final::to_json() { + switch (res_type) { + case TASK_RESPONSE_TYPE_NONE: + return to_json_non_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CMPL: + return to_json_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CHAT: + return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_ANTHROPIC: + return stream ? to_json_anthropic_stream() : to_json_anthropic(); + default: + GGML_ASSERT(false && "Invalid task_response_type"); + } +} + +json server_task_result_cmpl_final::to_json_non_oaicompat() { + json res = json { + {"index", index}, + {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk + {"tokens", stream ? llama_tokens {} : tokens}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", oaicompat_model}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + {"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + if (!stream && !probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); + } + return response_fields.empty() ? res : json_get_nested_values(response_fields, res); +} + +json server_task_result_cmpl_final::to_json_oaicompat() { + std::time_t t = std::time(0); + json logprobs = json(nullptr); // OAI default to null + if (!stream && probs_output.size() > 0) { + logprobs = json{ + {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, + }; + } + json finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + json res = json { + {"choices", json::array({ + json{ + {"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk + {"index", index}, + {"logprobs", logprobs}, + {"finish_reason", finish_reason}, + } + })}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "text_completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; +} + +json server_task_result_cmpl_final::to_json_oaicompat_chat() { + std::string finish_reason = "length"; + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } else { + msg.role = "assistant"; + msg.content = content; + } + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = msg.tool_calls.empty() ? 
"stop" : "tool_calls"; + } + + json choice { + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", msg.to_json_oaicompat()}, + }; + + if (!stream && probs_output.size() > 0) { + choice["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, + }; + } + + std::time_t t = std::time(0); + + json res = json { + {"choices", json::array({choice})}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; +} + +json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { + std::time_t t = std::time(0); + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = oaicompat_msg.tool_calls.empty() ? "stop" : "tool_calls"; + } + + json deltas = json::array(); + for (const auto & diff : oaicompat_msg_diffs) { + deltas.push_back({ + {"choices", json::array({ + json { + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", common_chat_msg_diff_to_json_oaicompat(diff)}, + }, + })}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + }); + } + + deltas.push_back({ + {"choices", json::array({ + json { + {"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}, + }, + })}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + }); + + if (include_usage) { + // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage + // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices + deltas.push_back({ + {"choices", json::array()}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}, + }); + } + + if (timings.prompt_n >= 0) { + deltas.back().push_back({"timings", timings.to_json()}); + } + + // extra fields for debugging purposes + if (verbose && !deltas.empty()) { + deltas.front()["__verbose"] = to_json_non_oaicompat(); + } + + return deltas; +} + +json server_task_result_cmpl_final::to_json_anthropic() { + std::string stop_reason = "max_tokens"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + stop_reason = oaicompat_msg.tool_calls.empty() ? 
"end_turn" : "tool_use"; + } + + json content_blocks = json::array(); + + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } else { + msg.role = "assistant"; + msg.content = content; + } + + if (!msg.content.empty()) { + content_blocks.push_back({ + {"type", "text"}, + {"text", msg.content} + }); + } + + for (const auto & tool_call : msg.tool_calls) { + json tool_use_block = { + {"type", "tool_use"}, + {"id", tool_call.id}, + {"name", tool_call.name} + }; + + try { + tool_use_block["input"] = json::parse(tool_call.arguments); + } catch (const std::exception &) { + tool_use_block["input"] = json::object(); + } + + content_blocks.push_back(tool_use_block); + } + + json res = { + {"id", oaicompat_cmpl_id}, + {"type", "message"}, + {"role", "assistant"}, + {"content", content_blocks}, + {"model", oaicompat_model}, + {"stop_reason", stop_reason}, + {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}, + {"usage", { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded} + }} + }; + + return res; +} + +json server_task_result_cmpl_final::to_json_anthropic_stream() { + json events = json::array(); + + std::string stop_reason = "max_tokens"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use"; + } + + bool has_text = !oaicompat_msg.content.empty(); + size_t num_tool_calls = oaicompat_msg.tool_calls.size(); + + bool text_block_started = false; + std::unordered_set tool_calls_started; + + for (const auto & diff : oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (!text_block_started) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", 0}, + {"content_block", { + {"type", "text"}, + {"text", ""} + }} + }} + }); + text_block_started = true; + } + + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", 0}, + {"delta", { + {"type", "text_delta"}, + {"text", diff.content_delta} + }} + }} + }); + } + + if (diff.tool_call_index != std::string::npos) { + size_t content_block_index = (has_text ? 1 : 0) + diff.tool_call_index; + + if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) { + const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index]; + + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", content_block_index}, + {"content_block", { + {"type", "tool_use"}, + {"id", full_tool_call.id}, + {"name", full_tool_call.name} + }} + }} + }); + tool_calls_started.insert(diff.tool_call_index); + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", content_block_index}, + {"delta", { + {"type", "input_json_delta"}, + {"partial_json", diff.tool_call_delta.arguments} + }} + }} + }); + } + } + } + + if (has_text) { + events.push_back({ + {"event", "content_block_stop"}, + {"data", { + {"type", "content_block_stop"}, + {"index", 0} + }} + }); + } + + for (size_t i = 0; i < num_tool_calls; i++) { + size_t content_block_index = (has_text ? 
1 : 0) + i; + events.push_back({ + {"event", "content_block_stop"}, + {"data", { + {"type", "content_block_stop"}, + {"index", content_block_index} + }} + }); + } + + events.push_back({ + {"event", "message_delta"}, + {"data", { + {"type", "message_delta"}, + {"delta", { + {"stop_reason", stop_reason}, + {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)} + }}, + {"usage", { + {"output_tokens", n_decoded} + }} + }} + }); + + events.push_back({ + {"event", "message_stop"}, + {"data", { + {"type", "message_stop"} + }} + }); + + return events; +} + +// +// server_task_result_cmpl_partial +// +json server_task_result_cmpl_partial::to_json() { + switch (res_type) { + case TASK_RESPONSE_TYPE_NONE: + return to_json_non_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CMPL: + return to_json_oaicompat(); + case TASK_RESPONSE_TYPE_OAI_CHAT: + return to_json_oaicompat_chat(); + case TASK_RESPONSE_TYPE_ANTHROPIC: + return to_json_anthropic(); + default: + GGML_ASSERT(false && "Invalid task_response_type"); + } +} + +json server_task_result_cmpl_partial::to_json_non_oaicompat() { + // non-OAI-compat JSON + json res = json { + {"index", index}, + {"content", content}, + {"tokens", tokens}, + {"stop", false}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + }; + // populate the timings object when needed (usually for the last response or with timings_per_token enabled) + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + if (is_progress) { + res.push_back({"prompt_progress", progress.to_json()}); + } + if (!prob_output.probs.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); + } + return res; +} + +json server_task_result_cmpl_partial::to_json_oaicompat() { + std::time_t t = std::time(0); + json logprobs = json(nullptr); // OAI default to null + if (prob_output.probs.size() > 0) { + logprobs = json{ + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + json res = json { + {"choices", json::array({ + json{ + {"text", content}, + {"index", index}, + {"logprobs", logprobs}, + {"finish_reason", nullptr}, + } + })}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "text_completion"}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + if (is_progress) { + res.push_back({"prompt_progress", progress.to_json()}); + } + + return res; +} + +json server_task_result_cmpl_partial::to_json_oaicompat_chat() { + bool first = n_decoded == 1; + std::time_t t = std::time(0); + json choices; + + std::vector deltas; + auto add_delta = [&](const json & delta) { + deltas.push_back({ + {"choices", json::array({ + json { + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", delta}, + }, + })}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, + }); + }; + // We have to send an initial update to conform to openai behavior + if (first || is_progress) { + add_delta({ + {"role", "assistant"}, + {"content", nullptr}, + }); + } + + for (const auto & diff : oaicompat_msg_diffs) { + add_delta(common_chat_msg_diff_to_json_oaicompat(diff)); + } + + if (!deltas.empty()) { + auto & 
last_json = deltas[deltas.size() - 1]; + GGML_ASSERT(last_json.at("choices").size() >= 1); + + if (prob_output.probs.size() > 0) { + last_json.at("choices").at(0)["logprobs"] = json { + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + + if (timings.prompt_n >= 0) { + last_json.push_back({"timings", timings.to_json()}); + } + if (is_progress) { + last_json.push_back({"prompt_progress", progress.to_json()}); + } + } + + return deltas; +} + +// +// server_task_result_embd +// +json server_task_result_embd::to_json() { + return res_type == TASK_RESPONSE_TYPE_OAI_EMBD + ? to_json_oaicompat() + : to_json_non_oaicompat(); +} + +json server_task_result_embd::to_json_non_oaicompat() { + return json { + {"index", index}, + {"embedding", embedding}, + }; +} + +json server_task_result_embd::to_json_oaicompat() { + return json { + {"index", index}, + {"embedding", embedding[0]}, + {"tokens_evaluated", n_tokens}, + }; +} + +// +// server_task_result_rerank +// +json server_task_result_rerank::to_json() { + return json { + {"index", index}, + {"score", score}, + {"tokens_evaluated", n_tokens}, + }; +} + +json server_task_result_cmpl_partial::to_json_anthropic() { + json events = json::array(); + bool first = (n_decoded == 1); + static bool text_block_started = false; + + if (first) { + text_block_started = false; + + events.push_back({ + {"event", "message_start"}, + {"data", { + {"type", "message_start"}, + {"message", { + {"id", oaicompat_cmpl_id}, + {"type", "message"}, + {"role", "assistant"}, + {"content", json::array()}, + {"model", oaicompat_model}, + {"stop_reason", nullptr}, + {"stop_sequence", nullptr}, + {"usage", { + {"input_tokens", n_prompt_tokens}, + {"output_tokens", 0} + }} + }} + }} + }); + } + + for (const auto & diff : oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (!text_block_started) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", 0}, + {"content_block", { + {"type", "text"}, + {"text", ""} + }} + }} + }); + text_block_started = true; + } + + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", 0}, + {"delta", { + {"type", "text_delta"}, + {"text", diff.content_delta} + }} + }} + }); + } + + if (diff.tool_call_index != std::string::npos) { + size_t content_block_index = (text_block_started ? 
1 : 0) + diff.tool_call_index; + + if (!diff.tool_call_delta.name.empty()) { + events.push_back({ + {"event", "content_block_start"}, + {"data", { + {"type", "content_block_start"}, + {"index", content_block_index}, + {"content_block", { + {"type", "tool_use"}, + {"id", diff.tool_call_delta.id}, + {"name", diff.tool_call_delta.name} + }} + }} + }); + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back({ + {"event", "content_block_delta"}, + {"data", { + {"type", "content_block_delta"}, + {"index", content_block_index}, + {"delta", { + {"type", "input_json_delta"}, + {"partial_json", diff.tool_call_delta.arguments} + }} + }} + }); + } + } + } + + return events; +} + +// +// server_task_result_error +// +json server_task_result_error::to_json() { + json res = format_error_response(err_msg, err_type); + if (err_type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { + res["n_prompt_tokens"] = n_prompt_tokens; + res["n_ctx"] = n_ctx; + } + return res; +} + +// +// server_task_result_metrics +// +json server_task_result_metrics::to_json() { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { "t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_tokens_max", n_tokens_max }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "slots", slots_data }, + }; +} + +// +// server_task_result_slot_save_load +// +json server_task_result_slot_save_load::to_json() { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } + + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; +} + +// +// server_task_result_slot_erase +// +json server_task_result_slot_erase::to_json() { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; +} + +// +// server_task_result_apply_lora +// + +json server_task_result_apply_lora::to_json() { + return json {{ "success", true }}; +} + +// +// server_prompt_cache +// +size_t server_prompt_cache::size() const { + size_t res = 0; + + for (const auto & state : states) { + res += state.size(); + } + + return res; +} + +size_t server_prompt_cache::n_tokens() const { + size_t res = 0; + + for (const auto & state : states) { + res += state.n_tokens(); + } + + return res; +} + +server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t state_size) { + // first check if the current state is contained fully in the cache + for (auto it = states.begin(); it != states.end(); ++it) { + const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens); + + if (cur_lcp_len == (int) prompt.tokens.size()) { + SRV_WRN("%s", " - prompt is already in the cache, skipping\n"); + return nullptr; + } + } + + // next, remove any cached prompts that are fully contained in the current prompt + for (auto it = states.begin(); it != states.end();) { + 
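+        // a cached entry whose tokens are a prefix of the new prompt is strictly subsumed
+        // by the state about to be stored, so it is evicted here to free memory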
const int len = it->tokens.get_common_prefix(prompt.tokens); + + if (len == (int) it->tokens.size()) { + SRV_WRN(" - removing obsolete cached prompt with length %d\n", len); + + it = states.erase(it); + } else { + ++it; + } + } + + std::vector state_data; + + // check if we can allocate enough memory for the new state + try { + state_data.resize(state_size); + } catch (const std::bad_alloc & e) { + SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what()); + + limit_size = std::max(1, 0.4*size()); + + SRV_WRN(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0)); + + update(); + + return nullptr; + } + + // TODO: for some reason we can't copy server_tokens, so we have to do this workaround + auto & cur = states.emplace_back(); + cur = { + /*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false), + /*.data =*/ std::move(state_data), + /*.checkpoints =*/ prompt.checkpoints, + }; + + return &cur; +} + +bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) { + const int lcp_best = prompt.tokens.get_common_prefix(tokens_new); + + float f_keep_best = float(lcp_best) / prompt.tokens.size(); + float sim_best = float(lcp_best) / tokens_new.size(); + + SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); + + auto it_best = states.end(); + + // find the most similar cached prompt, that would also preserve the most context + for (auto it = states.begin(); it != states.end(); ++it) { + const int lcp_cur = it->tokens.get_common_prefix(tokens_new); + + const float f_keep_cur = float(lcp_cur) / it->tokens.size(); + const float sim_cur = float(lcp_cur) / tokens_new.size(); + + // don't trash large prompts + if (f_keep_cur < 0.25f) { + continue; + } + + if (f_keep_best < f_keep_cur && sim_best < sim_cur) { + f_keep_best = f_keep_cur; + sim_best = sim_cur; + + it_best = it; + } + } + + if (it_best != states.end()) { + SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); + + const size_t size = it_best->data.size(); + const size_t n = llama_state_seq_set_data_ext(ctx, it_best->data.data(), size, id_slot, 0); + if (n != size) { + SRV_WRN("failed to restore state with size %zu\n", size); + + return false; + } + + it_best->data.clear(); + it_best->data.shrink_to_fit(); + + prompt = std::move(*it_best); + + states.erase(it_best); + } + + return true; +} + +void server_prompt_cache::update() { + if (limit_size > 0) { + // always keep at least one state, regardless of the limits + while (states.size() > 1 && size() > limit_size) { + if (states.empty()) { + break; + } + + SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0)); + + states.pop_front(); + } + } + + // average size per token + const float size_per_token = std::max(1.0f, float(size()) / (std::max(1, n_tokens()))); + + // dynamically increase the token limit if it can fit in the memory limit + const size_t limit_tokens_cur = limit_size > 0 ? 
std::max(limit_tokens, limit_size/size_per_token) : limit_tokens; + + if (limit_tokens > 0) { + while (states.size() > 1 && n_tokens() > limit_tokens_cur) { + if (states.empty()) { + break; + } + + SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n", + limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0)); + + states.pop_front(); + } + } + + SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n", + states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur); + + for (const auto & state : states) { + SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", + (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); + } +} diff --git a/llamacpp/native/src/server/server-task.h b/llamacpp/native/src/server/server-task.h new file mode 100644 index 000000000..a22d7cab1 --- /dev/null +++ b/llamacpp/native/src/server/server-task.h @@ -0,0 +1,460 @@ +#pragma once + +#include "common.h" +#include "llama.h" + +#include +#include +#include + +// TODO: prevent including the whole server-common.h as we only use server_tokens +#include "server-common.h" + +using json = nlohmann::ordered_json; + +enum server_task_type { + SERVER_TASK_TYPE_COMPLETION, + SERVER_TASK_TYPE_EMBEDDING, + SERVER_TASK_TYPE_RERANK, + SERVER_TASK_TYPE_INFILL, + SERVER_TASK_TYPE_CANCEL, + SERVER_TASK_TYPE_NEXT_RESPONSE, + SERVER_TASK_TYPE_METRICS, + SERVER_TASK_TYPE_SLOT_SAVE, + SERVER_TASK_TYPE_SLOT_RESTORE, + SERVER_TASK_TYPE_SLOT_ERASE, + SERVER_TASK_TYPE_SET_LORA, +}; + +// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common +enum task_response_type { + TASK_RESPONSE_TYPE_NONE, // llama.cpp native format + TASK_RESPONSE_TYPE_OAI_CHAT, + TASK_RESPONSE_TYPE_OAI_CMPL, + TASK_RESPONSE_TYPE_OAI_EMBD, + TASK_RESPONSE_TYPE_ANTHROPIC, +}; + +enum stop_type { + STOP_TYPE_NONE, + STOP_TYPE_EOS, + STOP_TYPE_WORD, + STOP_TYPE_LIMIT, +}; + +struct task_params { + bool stream = true; + bool include_usage = false; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + bool return_tokens = false; + bool return_progress = false; + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector lora; + + std::vector antiprompt; + std::vector response_fields; + bool timings_per_token = false; + bool post_sampling_probs = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; + + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + common_chat_syntax oaicompat_chat_syntax; + + // Embeddings + int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm) + + json format_logit_bias(const std::vector & logit_bias) const; + json to_json(bool only_metrics = false) const; +}; + +struct server_task { + int id = -1; // to be filled by 
server_queue + int index = -1; // used when there are multiple prompts (batch request) + + // used by SERVER_TASK_TYPE_CANCEL + int id_target = -1; + int id_slot = -1; + + // used by SERVER_TASK_TYPE_INFERENCE + task_params params; + server_tokens tokens; + + server_task_type type; + + // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE + struct slot_action { + int slot_id; + std::string filename; + std::string filepath; + }; + slot_action slot_action; + + // used by SERVER_TASK_TYPE_METRICS + bool metrics_reset_bucket = false; + + // used by SERVER_TASK_TYPE_SET_LORA + std::vector set_lora; + + server_task() = default; + + server_task(server_task_type type) : type(type) {} + + int32_t n_tokens() const { + return tokens.size(); + } + + static task_params params_from_json_cmpl( + const llama_context * ctx, + const common_params & params_base, + const json & data); + + // utility function + static std::unordered_set get_list_id(const std::vector & tasks) { + std::unordered_set ids(tasks.size()); + for (size_t i = 0; i < tasks.size(); i++) { + ids.insert(tasks[i].id); + } + return ids; + } +}; + +struct result_timings { + int32_t cache_n = -1; + + int32_t prompt_n = -1; + double prompt_ms; + double prompt_per_token_ms; + double prompt_per_second; + + int32_t predicted_n = -1; + double predicted_ms; + double predicted_per_token_ms; + double predicted_per_second; + + // Optional speculative metrics - only included when > 0 + int32_t draft_n = 0; + int32_t draft_n_accepted = 0; + + json to_json() const; +}; + +struct result_prompt_progress { + int32_t total = 0; + int32_t cache = 0; + int32_t processed = 0; + int64_t time_ms = 0; + + json to_json() const; +}; + +struct server_task_result { + int id = -1; + int id_slot = -1; + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_* + return true; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual ~server_task_result() = default; +}; + +// using shared_ptr for polymorphism of server_task_result +using server_task_result_ptr = std::unique_ptr; + +struct completion_token_output { + llama_token tok; + float prob; + std::string text_to_send; + struct prob_info { + llama_token tok; + std::string txt; + float prob; + }; + std::vector probs; + + json to_json(bool post_sampling_probs) const; + + static json probs_vector_to_json(const std::vector & probs, bool post_sampling_probs); + + static float logarithm(float x); + + static std::vector str_to_bytes(const std::string & str); + +}; + +struct server_task_result_cmpl_final : server_task_result { + int index = 0; + + std::string content; + llama_tokens tokens; + + bool stream; + bool include_usage; + result_timings timings; + std::string prompt; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + int32_t n_tokens_cached; + bool has_new_line; + std::string stopping_word; + stop_type stop = STOP_TYPE_NONE; + + bool post_sampling_probs; + std::vector probs_output; + std::vector response_fields; + + task_params generation_params; + + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + common_chat_msg oaicompat_msg; + + std::vector oaicompat_msg_diffs; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return true; // in stream mode, final 
responses are considered stop + } + + virtual json to_json() override; + + json to_json_non_oaicompat(); + + json to_json_oaicompat(); + + json to_json_oaicompat_chat(); + + json to_json_oaicompat_chat_stream(); + + json to_json_anthropic(); + + json to_json_anthropic_stream(); +}; + +struct server_task_result_cmpl_partial : server_task_result { + int index = 0; + + std::string content; + llama_tokens tokens; + + int32_t n_decoded; + int32_t n_prompt_tokens; + + bool post_sampling_probs; + bool is_progress = false; + completion_token_output prob_output; + result_timings timings; + result_prompt_progress progress; + + // response formatting + bool verbose = false; + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + std::vector oaicompat_msg_diffs; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return false; // in stream mode, partial responses are not considered stop + } + + virtual json to_json() override; + + json to_json_non_oaicompat(); + + json to_json_oaicompat(); + + json to_json_oaicompat_chat(); + + json to_json_anthropic(); +}; + +struct server_task_result_embd : server_task_result { + int index = 0; + std::vector> embedding; + + int32_t n_tokens; + + // response formatting + task_response_type res_type = TASK_RESPONSE_TYPE_NONE; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override; + + json to_json_non_oaicompat(); + + json to_json_oaicompat(); +}; + +struct server_task_result_rerank : server_task_result { + int index = 0; + float score = -1e6; + + int32_t n_tokens; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override; +}; + +struct server_task_result_error : server_task_result { + int index = 0; + error_type err_type = ERROR_TYPE_SERVER; + std::string err_msg; + + // for ERROR_TYPE_EXCEED_CONTEXT_SIZE + int32_t n_prompt_tokens = 0; + int32_t n_ctx = 0; + + virtual bool is_error() override { + return true; + } + + virtual json to_json() override; +}; + +struct server_task_result_metrics : server_task_result { + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; + + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; + + uint64_t n_tokens_max = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // while we can also use std::vector this requires copying the slot object which can be quite messy + // therefore, we use json to temporarily store the slot.to_json() result + json slots_data = json::array(); + + virtual json to_json() override; +}; + +struct server_task_result_slot_save_load : server_task_result { + std::string filename; + bool is_save; // true = save, false = load + + size_t n_tokens; + size_t n_bytes; + double t_ms; + + virtual json to_json() override; +}; + +struct server_task_result_slot_erase : server_task_result { + size_t n_erased; + + virtual json to_json() override; +}; + +struct server_task_result_apply_lora : server_task_result { + virtual json to_json() override; +}; + +struct server_prompt_checkpoint { + llama_pos pos_min; + llama_pos pos_max; 
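+    // pos_min/pos_max delimit the sequence positions covered by this checkpoint's saved state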
+ + std::vector data; + + size_t size() const { + return data.size(); + } +}; + +struct server_prompt { + server_tokens tokens; + + std::vector data; + + std::list checkpoints; + + size_t size() const { + size_t res = data.size(); + + for (const auto & checkpoint : checkpoints) { + res += checkpoint.size(); + } + + return res; + } + + int n_tokens() const { + return tokens.size(); + } +}; + +struct server_prompt_cache { + server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) { + this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib); + this->limit_tokens = limit_tokens; + } + + std::list states; + + // in bytes, 0 = no limit + size_t limit_size = 0; + + // in tokens, 0 = no limit + size_t limit_tokens = 0; + + size_t size() const; + + size_t n_tokens() const; + + server_prompt * alloc(const server_prompt & prompt, size_t state_size); + + bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot); + + void update(); +}; diff --git a/llamacpp/native/src/server/server.cpp b/llamacpp/native/src/server/server.cpp index bea951b97..d5bef3df4 100644 --- a/llamacpp/native/src/server/server.cpp +++ b/llamacpp/native/src/server/server.cpp @@ -1,5762 +1,259 @@ -#include "chat.h" -#include "utils.hpp" +#include "server-context.h" +#include "server-http.h" +#include "server-models.h" #include "arg.h" #include "common.h" -#include "json-schema-to-grammar.h" #include "llama.h" #include "log.h" -#include "sampling.h" -#include "speculative.h" -#include "mtmd.h" -// mime type for sending response -#define MIMETYPE_JSON "application/json; charset=utf-8" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using json = nlohmann::ordered_json; - -constexpr int HTTP_POLLING_SECONDS = 1; - -enum stop_type { - STOP_TYPE_NONE, - STOP_TYPE_EOS, - STOP_TYPE_WORD, - STOP_TYPE_LIMIT, -}; - -// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 -enum slot_state { - SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future - SLOT_STATE_PROCESSING_PROMPT, - SLOT_STATE_DONE_PROMPT, - SLOT_STATE_GENERATING, -}; - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded -}; - -enum server_task_type { - SERVER_TASK_TYPE_COMPLETION, - SERVER_TASK_TYPE_EMBEDDING, - SERVER_TASK_TYPE_RERANK, - SERVER_TASK_TYPE_INFILL, - SERVER_TASK_TYPE_CANCEL, - SERVER_TASK_TYPE_NEXT_RESPONSE, - SERVER_TASK_TYPE_METRICS, - SERVER_TASK_TYPE_SLOT_SAVE, - SERVER_TASK_TYPE_SLOT_RESTORE, - SERVER_TASK_TYPE_SLOT_ERASE, - SERVER_TASK_TYPE_SET_LORA, -}; - -enum oaicompat_type { - OAICOMPAT_TYPE_NONE, - OAICOMPAT_TYPE_CHAT, - OAICOMPAT_TYPE_COMPLETION, - OAICOMPAT_TYPE_EMBEDDING, -}; - -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error - ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error -}; - -static bool server_task_type_need_embd(server_task_type task_type) { - switch (task_type) { - case SERVER_TASK_TYPE_EMBEDDING: - case SERVER_TASK_TYPE_RERANK: - return true; - default: - return false; - } -} - -static 
bool server_task_type_need_logits(server_task_type task_type) { - switch (task_type) { - case SERVER_TASK_TYPE_COMPLETION: - case SERVER_TASK_TYPE_INFILL: - return true; - default: - return false; - } -} - -struct slot_params { - bool stream = true; - bool include_usage = false; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - bool return_tokens = false; - bool return_progress = false; - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters - - int64_t t_max_prompt_ms = -1; // TODO: implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit - - std::vector lora; - - std::vector antiprompt; - std::vector response_fields; - bool timings_per_token = false; - bool post_sampling_probs = false; - - struct common_params_sampling sampling; - struct common_params_speculative speculative; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - common_chat_syntax oaicompat_chat_syntax; - - // Embeddings - int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm) - - json to_json(bool only_metrics = false) const { - std::vector samplers; - samplers.reserve(sampling.samplers.size()); - for (const auto & sampler : sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - json lora = json::array(); - for (size_t i = 0; i < this->lora.size(); ++i) { - lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); - } - - if (only_metrics) { - return json { - {"seed", sampling.seed}, - {"temperature", sampling.temp}, - {"dynatemp_range", sampling.dynatemp_range}, - {"dynatemp_exponent", sampling.dynatemp_exponent}, - {"top_k", sampling.top_k}, - {"top_p", sampling.top_p}, - {"min_p", sampling.min_p}, - {"top_n_sigma", sampling.top_n_sigma}, - {"xtc_probability", sampling.xtc_probability}, - {"xtc_threshold", sampling.xtc_threshold}, - {"typical_p", sampling.typ_p}, - {"repeat_last_n", sampling.penalty_last_n}, - {"repeat_penalty", sampling.penalty_repeat}, - {"presence_penalty", sampling.penalty_present}, - {"frequency_penalty", sampling.penalty_freq}, - {"dry_multiplier", sampling.dry_multiplier}, - {"dry_base", sampling.dry_base}, - {"dry_allowed_length", sampling.dry_allowed_length}, - {"dry_penalty_last_n", sampling.dry_penalty_last_n}, - {"mirostat", sampling.mirostat}, - {"mirostat_tau", sampling.mirostat_tau}, - {"mirostat_eta", sampling.mirostat_eta}, - {"max_tokens", n_predict}, - {"n_predict", n_predict}, // TODO: deduplicate? 
- {"n_keep", n_keep}, - {"n_discard", n_discard}, - {"ignore_eos", sampling.ignore_eos}, - {"stream", stream}, - {"n_probs", sampling.n_probs}, - {"min_keep", sampling.min_keep}, - {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, - {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, - {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, - {"samplers", samplers}, - {"speculative.n_max", speculative.n_max}, - {"speculative.n_min", speculative.n_min}, - {"speculative.p_min", speculative.p_min}, - {"timings_per_token", timings_per_token}, - {"post_sampling_probs", post_sampling_probs}, - {"lora", lora}, - }; - } - - auto grammar_triggers = json::array(); - for (const auto & trigger : sampling.grammar_triggers) { - server_grammar_trigger ct(trigger); - grammar_triggers.push_back(ct.to_json()); - } - - return json { - {"seed", sampling.seed}, - {"temperature", sampling.temp}, - {"dynatemp_range", sampling.dynatemp_range}, - {"dynatemp_exponent", sampling.dynatemp_exponent}, - {"top_k", sampling.top_k}, - {"top_p", sampling.top_p}, - {"min_p", sampling.min_p}, - {"top_n_sigma", sampling.top_n_sigma}, - {"xtc_probability", sampling.xtc_probability}, - {"xtc_threshold", sampling.xtc_threshold}, - {"typical_p", sampling.typ_p}, - {"repeat_last_n", sampling.penalty_last_n}, - {"repeat_penalty", sampling.penalty_repeat}, - {"presence_penalty", sampling.penalty_present}, - {"frequency_penalty", sampling.penalty_freq}, - {"dry_multiplier", sampling.dry_multiplier}, - {"dry_base", sampling.dry_base}, - {"dry_allowed_length", sampling.dry_allowed_length}, - {"dry_penalty_last_n", sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", sampling.dry_sequence_breakers}, - {"mirostat", sampling.mirostat}, - {"mirostat_tau", sampling.mirostat_tau}, - {"mirostat_eta", sampling.mirostat_eta}, - {"stop", antiprompt}, - {"max_tokens", n_predict}, - {"n_predict", n_predict}, // TODO: deduplicate? 
- {"n_keep", n_keep}, - {"n_discard", n_discard}, - {"ignore_eos", sampling.ignore_eos}, - {"stream", stream}, - {"logit_bias", format_logit_bias(sampling.logit_bias)}, - {"n_probs", sampling.n_probs}, - {"min_keep", sampling.min_keep}, - {"grammar", sampling.grammar}, - {"grammar_lazy", sampling.grammar_lazy}, - {"grammar_triggers", grammar_triggers}, - {"preserved_tokens", sampling.preserved_tokens}, - {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, - {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, - {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, - {"samplers", samplers}, - {"speculative.n_max", speculative.n_max}, - {"speculative.n_min", speculative.n_min}, - {"speculative.p_min", speculative.p_min}, - {"timings_per_token", timings_per_token}, - {"post_sampling_probs", post_sampling_probs}, - {"lora", lora}, - }; - } -}; - -struct server_task { - int id = -1; // to be filled by server_queue - int index = -1; // used when there are multiple prompts (batch request) - - // used by SERVER_TASK_TYPE_CANCEL - int id_target = -1; - int id_slot = -1; - - // used by SERVER_TASK_TYPE_INFERENCE - slot_params params; - server_tokens tokens; - - server_task_type type; - - // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE - struct slot_action { - int slot_id; - std::string filename; - std::string filepath; - }; - slot_action slot_action; - - // used by SERVER_TASK_TYPE_METRICS - bool metrics_reset_bucket = false; - - // used by SERVER_TASK_TYPE_SET_LORA - std::vector set_lora; - - server_task() = default; - - server_task(server_task_type type) : type(type) {} - - int32_t n_tokens() const { - return tokens.size(); - } - - static slot_params params_from_json_cmpl( - const llama_context * ctx, - const common_params & params_base, - const json & data) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - slot_params params; - - // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - slot_params defaults; - defaults.sampling = params_base.sampling; - defaults.speculative = params_base.speculative; - defaults.n_keep = params_base.n_keep; - defaults.n_predict = params_base.n_predict; - defaults.antiprompt = params_base.antiprompt; - - // enabling this will output extra debug information in the HTTP responses from the server - params.verbose = params_base.verbosity > 9; - params.timings_per_token = json_value(data, "timings_per_token", false); - - params.stream = json_value(data, "stream", false); - auto stream_opt = json_value(data, "stream_options", json::object()); - params.include_usage = json_value(stream_opt, "include_usage", false); - params.cache_prompt = json_value(data, "cache_prompt", true); - params.return_tokens = json_value(data, "return_tokens", false); - params.return_progress = json_value(data, "return_progress", false); - params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); - params.n_indent = json_value(data, "n_indent", defaults.n_indent); - params.n_keep = json_value(data, "n_keep", defaults.n_keep); - params.n_discard = json_value(data, "n_discard", defaults.n_discard); - //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement - 
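// The json_value() calls throughout this parsing code read a request field
// and fall back to a server-side default when the field is absent or null.
// The real helper lives in the server common code; a rough, illustrative
// equivalent (not the project's actual definition) could look like:

#include <nlohmann/json.hpp>
#include <string>

template <typename T>
static T json_value_sketch(const nlohmann::ordered_json & body,
                           const std::string & key, const T & def) {
    const auto it = body.find(key);
    if (it == body.end() || it->is_null()) {
        return def; // field missing or null: keep the server-side default
    }
    try {
        return it->get<T>();
    } catch (const nlohmann::json::exception &) {
        return def; // wrong type: keep the server-side default
    }
}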
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); - params.response_fields = json_value(data, "response_fields", std::vector()); - - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); - - params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); - params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); - params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); - - params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); - params.speculative.n_min = std::max(params.speculative.n_min, 0); - params.speculative.n_max = std::max(params.speculative.n_max, 0); - - // Use OpenAI API logprobs only if n_probs wasn't provided - if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ - params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs); - } - - if (data.contains("lora")) { - if (data.at("lora").is_array()) { - params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); - } else { - throw std::runtime_error("Error: 'lora' must be an 
array of objects with 'id' and 'scale' fields"); - } - } else { - params.lora = params_base.lora_adapters; - } - - // TODO: add more sanity checks for the input parameters - - if (params.sampling.penalty_last_n < -1) { - throw std::runtime_error("Error: repeat_last_n must be >= -1"); - } - - if (params.sampling.dry_penalty_last_n < -1) { - throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); - } - - if (params.sampling.penalty_last_n == -1) { - // note: should be the slot's context and not the full context, but it's ok - params.sampling.penalty_last_n = llama_n_ctx(ctx); - } - - if (params.sampling.dry_penalty_last_n == -1) { - params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); - } - - if (params.sampling.dry_base < 1.0f) { - params.sampling.dry_base = defaults.sampling.dry_base; - } - - // sequence breakers for DRY - { - // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format - // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 - - if (data.contains("dry_sequence_breakers")) { - params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (params.sampling.dry_sequence_breakers.empty()) { - throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings"); - } - } - } - - // process "json_schema" and "grammar" - if (data.contains("json_schema") && !data.contains("grammar")) { - try { - auto schema = json_value(data, "json_schema", json::object()); - SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str()); - params.sampling.grammar = json_schema_to_grammar(schema); - SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str()); - } catch (const std::exception & e) { - throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); - } - } else { - params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); - SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str()); - params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy); - SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? 
"true" : "false"); - } - - { - auto it = data.find("chat_format"); - if (it != data.end()) { - params.oaicompat_chat_syntax.format = static_cast(it->get()); - SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format)); - } else { - params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; - } - common_reasoning_format reasoning_format = params_base.reasoning_format; - if (data.contains("reasoning_format")) { - reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get()); - } - params.oaicompat_chat_syntax.reasoning_format = reasoning_format; - params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); - params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false); - params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false); - } - - { - const auto preserved_tokens = data.find("preserved_tokens"); - if (preserved_tokens != data.end()) { - for (const auto & t : *preserved_tokens) { - auto ids = common_tokenize(vocab, t.get(), /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - SRV_DBG("Preserved token: %d\n", ids[0]); - params.sampling.preserved_tokens.insert(ids[0]); - } else { - // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens. - SRV_DBG("Not preserved because more than 1 token: %s\n", t.get().c_str()); - } - } - } - const auto grammar_triggers = data.find("grammar_triggers"); - if (grammar_triggers != data.end()) { - for (const auto & t : *grammar_triggers) { - server_grammar_trigger ct(t); - if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { - const auto & word = ct.value.value; - auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - auto token = ids[0]; - if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) { - throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); - } - SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); - common_grammar_trigger trigger; - trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; - trigger.value = word; - trigger.token = token; - params.sampling.grammar_triggers.push_back(std::move(trigger)); - } else { - SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); - params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); - } - } else { - if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) { - SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str()); - } else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) { - SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str()); - } else { - throw std::runtime_error("Unknown grammar trigger type"); - } - params.sampling.grammar_triggers.emplace_back(std::move(ct.value)); - } - } - } - if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) { - throw std::runtime_error("Error: no triggers set for lazy grammar!"); - } - } - - { - params.sampling.logit_bias.clear(); - - const auto & logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & el : *logit_bias) { - // 
TODO: we may want to throw errors here, in case "el" is incorrect - if (el.is_array() && el.size() == 2) { - float bias; - if (el[1].is_number()) { - bias = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - bias = -INFINITY; - } else { - continue; - } - - if (el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } else if (el[0].is_string()) { - auto toks = common_tokenize(vocab, el[0].get(), false); - for (auto tok : toks) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - } else if (logit_bias != data.end() && logit_bias->is_object()) { - const int n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & el : logit_bias->items()) { - float bias; - const auto & key = el.key(); - const auto & value = el.value(); - if (value.is_number()) { - bias = value.get(); - } else if (value.is_boolean() && !value.get()) { - bias = -INFINITY; - } else { - continue; - } - - char *end; - llama_token tok = strtol(key.c_str(), &end, 10); - if (*end == 0) { - if (tok >= 0 && tok < n_vocab) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } else { - auto toks = common_tokenize(vocab, key, false); - for (auto tok : toks) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - - params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos); - if (params.sampling.ignore_eos) { - params.sampling.logit_bias.insert( - params.sampling.logit_bias.end(), - defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); - } - } - - { - params.antiprompt.clear(); - - const auto & stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto & word : *stop) { - if (!word.empty()) { - params.antiprompt.push_back(word); - } - } - } - // set reverse prompt from cli args if not set in the request - if (params.antiprompt.empty()) { - params.antiprompt = defaults.antiprompt; - } - } - - { - const auto samplers = data.find("samplers"); - if (samplers != data.end()) { - if (samplers->is_array()) { - params.sampling.samplers = common_sampler_types_from_names(*samplers, false); - } else if (samplers->is_string()){ - params.sampling.samplers = common_sampler_types_from_chars(samplers->get()); - } - } else { - params.sampling.samplers = defaults.sampling.samplers; - } - } - - std::string model_name = params_base.model_alias.empty() ? 
DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; - params.oaicompat_model = json_value(data, "model", model_name); - - return params; - } - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - -struct result_timings { - int32_t cache_n = -1; - - int32_t prompt_n = -1; - double prompt_ms; - double prompt_per_token_ms; - double prompt_per_second; - - int32_t predicted_n = -1; - double predicted_ms; - double predicted_per_token_ms; - double predicted_per_second; - - // Optional speculative metrics - only included when > 0 - int32_t draft_n = 0; - int32_t draft_n_accepted = 0; - - json to_json() const { - json base = { - {"cache_n", cache_n}, - - {"prompt_n", prompt_n}, - {"prompt_ms", prompt_ms}, - {"prompt_per_token_ms", prompt_per_token_ms}, - {"prompt_per_second", prompt_per_second}, - - {"predicted_n", predicted_n}, - {"predicted_ms", predicted_ms}, - {"predicted_per_token_ms", predicted_per_token_ms}, - {"predicted_per_second", predicted_per_second}, - }; - - if (draft_n > 0) { - base["draft_n"] = draft_n; - base["draft_n_accepted"] = draft_n_accepted; - } - - return base; - } -}; - -struct result_prompt_progress { - int32_t total = 0; - int32_t cache = 0; - int32_t processed = 0; - int64_t time_ms = 0; - - json to_json() const { - return json { - {"total", total}, - {"cache", cache}, - {"processed", processed}, - {"time_ms", time_ms}, - }; - } -}; - -struct server_task_result { - int id = -1; - int id_slot = -1; - virtual bool is_error() { - // only used by server_task_result_error - return false; - } - virtual bool is_stop() { - // only used by server_task_result_cmpl_* - return true; - } - virtual int get_index() { - return -1; - } - virtual json to_json() = 0; - virtual ~server_task_result() = default; -}; - -// using shared_ptr for polymorphism of server_task_result -using server_task_result_ptr = std::unique_ptr; - -static inline std::string stop_type_to_str(stop_type type) { - switch (type) { - case STOP_TYPE_EOS: return "eos"; - case STOP_TYPE_WORD: return "word"; - case STOP_TYPE_LIMIT: return "limit"; - default: return "none"; - } -} - -struct completion_token_output { - llama_token tok; - float prob; - std::string text_to_send; - struct prob_info { - llama_token tok; - std::string txt; - float prob; - }; - std::vector probs; - - json to_json(bool post_sampling_probs) const { - json probs_for_token = json::array(); - for (const auto & p : probs) { - std::string txt(p.txt); - txt.resize(validate_utf8(txt)); - probs_for_token.push_back(json { - {"id", p.tok}, - {"token", txt}, - {"bytes", str_to_bytes(p.txt)}, - { - post_sampling_probs ? "prob" : "logprob", - post_sampling_probs ? p.prob : logarithm(p.prob) - }, - }); - } - return probs_for_token; - } - - static json probs_vector_to_json(const std::vector & probs, bool post_sampling_probs) { - json out = json::array(); - for (const auto & p : probs) { - std::string txt(p.text_to_send); - txt.resize(validate_utf8(txt)); - out.push_back(json { - {"id", p.tok}, - {"token", txt}, - {"bytes", str_to_bytes(p.text_to_send)}, - { - post_sampling_probs ? "prob" : "logprob", - post_sampling_probs ? p.prob : logarithm(p.prob) - }, - { - post_sampling_probs ? 
"top_probs" : "top_logprobs", - p.to_json(post_sampling_probs) - }, - }); - } - return out; - } - - static float logarithm(float x) { - // nlohmann::json converts -inf to null, so we need to prevent that - return x == 0.0f ? std::numeric_limits::lowest() : std::log(x); - } - - static std::vector str_to_bytes(const std::string & str) { - std::vector bytes; - for (unsigned char c : str) { - bytes.push_back(c); - } - return bytes; - } -}; - -struct server_task_result_cmpl_final : server_task_result { - int index = 0; - - std::string content; - llama_tokens tokens; - - bool stream; - bool include_usage; - result_timings timings; - std::string prompt; - - bool truncated; - int32_t n_decoded; - int32_t n_prompt_tokens; - int32_t n_tokens_cached; - bool has_new_line; - std::string stopping_word; - stop_type stop = STOP_TYPE_NONE; - - bool post_sampling_probs; - std::vector probs_output; - std::vector response_fields; - - slot_params generation_params; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - common_chat_msg oaicompat_msg; - - std::vector oaicompat_msg_diffs; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return true; // in stream mode, final responses are considered stop - } - - virtual json to_json() override { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: - return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: - return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: - return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); - default: - GGML_ASSERT(false && "Invalid oaicompat_type"); - } - } - - json to_json_non_oaicompat() { - json res = json { - {"index", index}, - {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk - {"tokens", stream ? llama_tokens {} : tokens}, - {"id_slot", id_slot}, - {"stop", true}, - {"model", oaicompat_model}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - {"generation_settings", generation_params.to_json()}, - {"prompt", prompt}, - {"has_new_line", has_new_line}, - {"truncated", truncated}, - {"stop_type", stop_type_to_str(stop)}, - {"stopping_word", stopping_word}, - {"tokens_cached", n_tokens_cached}, - {"timings", timings.to_json()}, - }; - if (!stream && !probs_output.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); - } - return response_fields.empty() ? res : json_get_nested_values(response_fields, res); - } - - json to_json_oaicompat() { - std::time_t t = std::time(0); - json logprobs = json(nullptr); // OAI default to null - if (!stream && probs_output.size() > 0) { - logprobs = json{ - {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, - }; - } - json finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - json res = json { - {"choices", json::array({ - json{ - {"text", stream ? 
"" : content}, // in stream mode, content is already in last partial chunk - {"index", index}, - {"logprobs", logprobs}, - {"finish_reason", finish_reason}, - } - })}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "text_completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat() { - std::string finish_reason = "length"; - common_chat_msg msg; - if (!oaicompat_msg.empty()) { - msg = oaicompat_msg; - } else { - msg.role = "assistant"; - msg.content = content; - } - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls"; - } - - json choice { - {"finish_reason", finish_reason}, - {"index", 0}, - {"message", msg.to_json_oaicompat()}, - }; - - if (!stream && probs_output.size() > 0) { - choice["logprobs"] = json{ - {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, - }; - } - - std::time_t t = std::time(0); - - json res = json { - {"choices", json::array({choice})}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat_stream() { - std::time_t t = std::time(0); - std::string finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = oaicompat_msg.tool_calls.empty() ? 
"stop" : "tool_calls"; - } - - json deltas = json::array(); - for (const auto & diff : oaicompat_msg_diffs) { - deltas.push_back({ - {"choices", json::array({ - json { - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", common_chat_msg_diff_to_json_oaicompat(diff)}, - }, - })}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - }); - } - - deltas.push_back({ - {"choices", json::array({ - json { - {"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}, - }, - })}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - }); - - if (include_usage) { - // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage - // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices - deltas.push_back({ - {"choices", json::array()}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}, - }); - } - - if (timings.prompt_n >= 0) { - deltas.back().push_back({"timings", timings.to_json()}); - } - - // extra fields for debugging purposes - if (verbose && !deltas.empty()) { - deltas.front()["__verbose"] = to_json_non_oaicompat(); - } - - return deltas; - } -}; - -struct server_task_result_cmpl_partial : server_task_result { - int index = 0; - - std::string content; - llama_tokens tokens; - - int32_t n_decoded; - int32_t n_prompt_tokens; - - bool post_sampling_probs; - bool is_progress = false; - completion_token_output prob_output; - result_timings timings; - result_prompt_progress progress; - - // OAI-compat fields - bool verbose = false; - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - std::vector oaicompat_msg_diffs; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return false; // in stream mode, partial responses are not considered stop - } - - virtual json to_json() override { - switch (oaicompat) { - case OAICOMPAT_TYPE_NONE: - return to_json_non_oaicompat(); - case OAICOMPAT_TYPE_COMPLETION: - return to_json_oaicompat(); - case OAICOMPAT_TYPE_CHAT: - return to_json_oaicompat_chat(); - default: - GGML_ASSERT(false && "Invalid oaicompat_type"); - } - } - - json to_json_non_oaicompat() { - // non-OAI-compat JSON - json res = json { - {"index", index}, - {"content", content}, - {"tokens", tokens}, - {"stop", false}, - {"id_slot", id_slot}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - }; - // populate the timings object when needed (usually for the last response or with timings_per_token enabled) - if (timings.prompt_n > 0) { - res.push_back({"timings", timings.to_json()}); - } - if (is_progress) { - res.push_back({"prompt_progress", progress.to_json()}); - } - if (!prob_output.probs.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); - } - return res; - } - - json to_json_oaicompat() { - std::time_t t = std::time(0); - json logprobs = json(nullptr); // OAI default to null - if (prob_output.probs.size() > 
0) { - logprobs = json{ - {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, - }; - } - json res = json { - {"choices", json::array({ - json{ - {"text", content}, - {"index", index}, - {"logprobs", logprobs}, - {"finish_reason", nullptr}, - } - })}, - {"created", t}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "text_completion"}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json_non_oaicompat(); - } - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - if (is_progress) { - res.push_back({"prompt_progress", progress.to_json()}); - } - - return res; - } - - json to_json_oaicompat_chat() { - bool first = n_decoded == 1; - std::time_t t = std::time(0); - json choices; - - std::vector deltas; - auto add_delta = [&](const json & delta) { - deltas.push_back({ - {"choices", json::array({ - json { - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", delta}, - }, - })}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"system_fingerprint", build_info}, - {"object", "chat.completion.chunk"}, - }); - }; - // We have to send an initial update to conform to openai behavior - if (first || is_progress) { - add_delta({ - {"role", "assistant"}, - {"content", nullptr}, - }); - } - - for (const auto & diff : oaicompat_msg_diffs) { - add_delta(common_chat_msg_diff_to_json_oaicompat(diff)); - } - - if (!deltas.empty()) { - auto & last_json = deltas[deltas.size() - 1]; - GGML_ASSERT(last_json.at("choices").size() >= 1); - - if (prob_output.probs.size() > 0) { - last_json.at("choices").at(0)["logprobs"] = json { - {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, - }; - } - - if (timings.prompt_n >= 0) { - last_json.push_back({"timings", timings.to_json()}); - } - if (is_progress) { - last_json.push_back({"prompt_progress", progress.to_json()}); - } - } - - return deltas; - } -}; - -struct server_task_result_embd : server_task_result { - int index = 0; - std::vector> embedding; - - int32_t n_tokens; - - // OAI-compat fields - oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? 
to_json_oaicompat() - : to_json_non_oaicompat(); - } - - json to_json_non_oaicompat() { - return json { - {"index", index}, - {"embedding", embedding}, - }; - } - - json to_json_oaicompat() { - return json { - {"index", index}, - {"embedding", embedding[0]}, - {"tokens_evaluated", n_tokens}, - }; - } -}; - -struct server_task_result_rerank : server_task_result { - int index = 0; - float score = -1e6; - - int32_t n_tokens; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return json { - {"index", index}, - {"score", score}, - {"tokens_evaluated", n_tokens}, - }; - } -}; - -// this function maybe used outside of server_task_result_error -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - case ERROR_TYPE_EXCEED_CONTEXT_SIZE: - type_str = "exceed_context_size_error"; - code = 400; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} - -struct server_task_result_error : server_task_result { - int index = 0; - error_type err_type = ERROR_TYPE_SERVER; - std::string err_msg; - - // for ERROR_TYPE_EXCEED_CONTEXT_SIZE - int32_t n_prompt_tokens = 0; - int32_t n_ctx = 0; - - virtual bool is_error() override { - return true; - } - - virtual json to_json() override { - json res = format_error_response(err_msg, err_type); - if (err_type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { - res["n_prompt_tokens"] = n_prompt_tokens; - res["n_ctx"] = n_ctx; - } - return res; - } -}; - -struct server_task_result_metrics : server_task_result { - int n_idle_slots; - int n_processing_slots; - int n_tasks_deferred; - int64_t t_start; - - // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_tokens_max = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - - // while we can also use std::vector this requires copying the slot object which can be quite messy - // therefore, we use json to temporarily store the slot.to_json() result - json slots_data = json::array(); - - virtual json to_json() override { - return json { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", n_tasks_deferred }, - { "t_start", t_start }, - - { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, - { "t_tokens_generation_total", t_tokens_generation_total }, - { "n_tokens_predicted_total", n_tokens_predicted_total }, - { "t_prompt_processing_total", t_prompt_processing_total }, - - { "n_tokens_max", n_tokens_max }, 
- - { "n_prompt_tokens_processed", n_prompt_tokens_processed }, - { "t_prompt_processing", t_prompt_processing }, - { "n_tokens_predicted", n_tokens_predicted }, - { "t_tokens_generation", t_tokens_generation }, - - { "n_decode_total", n_decode_total }, - { "n_busy_slots_total", n_busy_slots_total }, - - { "slots", slots_data }, - }; - } -}; - -struct server_task_result_slot_save_load : server_task_result { - std::string filename; - bool is_save; // true = save, false = load - - size_t n_tokens; - size_t n_bytes; - double t_ms; - - virtual json to_json() override { - if (is_save) { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", n_tokens }, - { "n_written", n_bytes }, - { "timings", { - { "save_ms", t_ms } - }}, - }; - } - - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", n_tokens }, - { "n_read", n_bytes }, - { "timings", { - { "restore_ms", t_ms } - }}, - }; - } -}; - -struct server_task_result_slot_erase : server_task_result { - size_t n_erased; - - virtual json to_json() override { - return json { - { "id_slot", id_slot }, - { "n_erased", n_erased }, - }; - } -}; - -struct server_task_result_apply_lora : server_task_result { - virtual json to_json() override { - return json {{ "success", true }}; - } -}; - -struct server_prompt_checkpoint { - llama_pos pos_min; - llama_pos pos_max; - - std::vector data; - - size_t size() const { - return data.size(); - } -}; - -struct server_prompt { - server_tokens tokens; - - std::vector data; - - std::list checkpoints; - - size_t size() const { - size_t res = data.size(); - - for (const auto & checkpoint : checkpoints) { - res += checkpoint.size(); - } - - return res; - } - - int n_tokens() const { - return tokens.size(); - } -}; - -struct server_prompt_cache { - server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) { - this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 
0 : limit_size_mib); - this->limit_tokens = limit_tokens; - } - - std::list states; - - // in bytes, 0 = no limit - size_t limit_size = 0; - - // in tokens, 0 = no limit - size_t limit_tokens = 0; - - size_t size() const { - size_t res = 0; - - for (const auto & state : states) { - res += state.size(); - } - - return res; - } - - size_t n_tokens() const { - size_t res = 0; - - for (const auto & state : states) { - res += state.n_tokens(); - } - - return res; - } - - server_prompt * alloc(const server_prompt & prompt, size_t state_size) { - // first check if the current state is contained fully in the cache - for (auto it = states.begin(); it != states.end(); ++it) { - const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens); - - if (cur_lcp_len == (int) prompt.tokens.size()) { - SRV_WRN("%s", " - prompt is already in the cache, skipping\n"); - return nullptr; - } - } - - // next, remove any cached prompts that are fully contained in the current prompt - for (auto it = states.begin(); it != states.end();) { - const int len = it->tokens.get_common_prefix(prompt.tokens); - - if (len == (int) it->tokens.size()) { - SRV_WRN(" - removing obsolete cached prompt with length %d\n", len); - - it = states.erase(it); - } else { - ++it; - } - } - - std::vector state_data; - - // check if we can allocate enough memory for the new state - try { - state_data.resize(state_size); - } catch (const std::bad_alloc & e) { - SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what()); - - limit_size = std::max(1, 0.4*size()); - - SRV_WRN(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0)); - - update(); - - return nullptr; - } - - // TODO: for some reason we can't copy server_tokens, so we have to do this workaround - auto & cur = states.emplace_back(); - cur = { - /*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false), - /*.data =*/ std::move(state_data), - /*.checkpoints =*/ prompt.checkpoints, - }; - - return &cur; - } - - bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) { - const int lcp_best = prompt.tokens.get_common_prefix(tokens_new); - - float f_keep_best = float(lcp_best) / prompt.tokens.size(); - float sim_best = float(lcp_best) / tokens_new.size(); - - SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); - - auto it_best = states.end(); - - // find the most similar cached prompt, that would also preserve the most context - for (auto it = states.begin(); it != states.end(); ++it) { - const int lcp_cur = it->tokens.get_common_prefix(tokens_new); - - const float f_keep_cur = float(lcp_cur) / it->tokens.size(); - const float sim_cur = float(lcp_cur) / tokens_new.size(); - - // don't trash large prompts - if (f_keep_cur < 0.25f) { - continue; - } - - if (f_keep_best < f_keep_cur && sim_best < sim_cur) { - f_keep_best = f_keep_cur; - sim_best = sim_cur; - - it_best = it; - } - } - - if (it_best != states.end()) { - SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); - - const size_t size = it_best->data.size(); - const size_t n = llama_state_seq_set_data_ext(ctx, it_best->data.data(), size, id_slot, 0); - if (n != size) { - SRV_WRN("failed to restore state with size %zu\n", size); - - return false; - } - - it_best->data.clear(); - it_best->data.shrink_to_fit(); - - prompt = std::move(*it_best); - - states.erase(it_best); - } - - return true; - } - - void update() { - if (limit_size > 0) { - // always 
keep at least one state, regardless of the limits - while (states.size() > 1 && size() > limit_size) { - if (states.empty()) { - break; - } - - SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0)); - - states.pop_front(); - } - } - - // average size per token - const float size_per_token = std::max(1.0f, float(size()) / (std::max(1, n_tokens()))); - - // dynamically increase the token limit if it can fit in the memory limit - const size_t limit_tokens_cur = limit_size > 0 ? std::max(limit_tokens, limit_size/size_per_token) : limit_tokens; - - if (limit_tokens > 0) { - while (states.size() > 1 && n_tokens() > limit_tokens_cur) { - if (states.empty()) { - break; - } - - SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n", - limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0)); - - states.pop_front(); - } - } - - SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n", - states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur); - - for (const auto & state : states) { - SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", - (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); - } - } -}; - -struct server_slot { - int id; - - llama_batch batch_spec = {}; - - // TODO: change to unique_ptrs for consistency: - llama_context * ctx = nullptr; - llama_context * ctx_dft = nullptr; - - // multimodal - mtmd_context * mctx = nullptr; - - common_speculative * spec = nullptr; - - std::unique_ptr task; - std::unique_ptr task_prev; // used for debugging - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_keep = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - - int32_t n_prompt_tokens_cache = 0; - int32_t n_prompt_tokens_processed = 0; - - size_t last_nl_pos = 0; - - std::string generated_text; - llama_tokens generated_tokens; - - common_chat_msg chat_msg; - - std::vector generated_token_probs; - - bool has_next_token = true; - bool has_new_line = false; - bool truncated = false; - - stop_type stop; - - std::string stopping_word; - - // state - slot_state state = SLOT_STATE_IDLE; - - server_prompt prompt; - - void prompt_save(server_prompt_cache & prompt_cache) const { - assert(prompt.data.size() == 0); - - const size_t cur_size = llama_state_seq_get_size_ext(ctx, id, 0); - - SRV_WRN(" - saving prompt with length %d, total state size = %.3f MiB\n", - (int) prompt.tokens.size(), cur_size / (1024.0 * 1024.0)); - - auto * cur = prompt_cache.alloc(prompt, cur_size); - if (cur == nullptr) { - return; - } - - llama_state_seq_get_data_ext(ctx, cur->data.data(), cur_size, id, 0); - } - - void prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) { - bool res = prompt_cache.load(prompt, tokens, ctx, id); - if (!res) { - SLT_WRN(*this, "%s", "failed to load prompt from cache\n"); - - llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1); - prompt.tokens.clear(); - } - } - - std::vector lora; - int32_t alora_invocation_start = -1; - - // sampling - json json_schema; - - struct common_sampler * smpl = nullptr; - - llama_token sampled; - - common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - std::vector generated_tool_call_ids; - - // stats - 
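// The server_prompt_cache::update() logic above enforces two soft limits on
// the cache, in bytes and in tokens, evicting oldest-first but always keeping
// at least one entry (it also scales the token limit by the estimated bytes
// per token, which this condensed sketch omits). Stand-in types only, not the
// project's actual server_prompt:

#include <cstddef>
#include <list>

struct cached_state_sketch {
    size_t bytes;
    size_t tokens;
};

static void evict_sketch(std::list<cached_state_sketch> & states,
                         size_t limit_bytes, size_t limit_tokens) {
    auto total_bytes = [&] {
        size_t sum = 0;
        for (const auto & s : states) { sum += s.bytes; }
        return sum;
    };
    auto total_tokens = [&] {
        size_t sum = 0;
        for (const auto & s : states) { sum += s.tokens; }
        return sum;
    };
    // oldest entries live at the front, mirroring states.pop_front() above
    while (states.size() > 1 && limit_bytes > 0 && total_bytes() > limit_bytes) {
        states.pop_front();
    }
    while (states.size() > 1 && limit_tokens > 0 && total_tokens() > limit_tokens) {
        states.pop_front();
    }
}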
size_t n_sent_text = 0; // number of sent text character - - int64_t t_start_process_prompt; - int64_t t_start_generation; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - std::function callback_on_release; - - // Speculative decoding stats - int32_t n_draft_total = 0; // Total draft tokens generated - int32_t n_draft_accepted = 0; // Draft tokens actually accepted - - void reset() { - SLT_DBG(*this, "%s", "\n"); - - n_prompt_tokens_cache = 0; - - last_nl_pos = 0; - generated_text = ""; - has_new_line = false; - truncated = false; - stop = STOP_TYPE_NONE; - stopping_word = ""; - n_sent_text = 0; - chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - - generated_tokens.clear(); - generated_token_probs.clear(); - chat_msg = {}; - json_schema = json(); - generated_tool_call_ids.clear(); - - // clear speculative decoding stats - n_draft_total = 0; - n_draft_accepted = 0; - - task.reset(); - task_prev.reset(); - - // clear alora start - alora_invocation_start = -1; - } - - bool need_embd() const { - GGML_ASSERT(task); - - return server_task_type_need_embd(task->type); - } - - bool need_logits() const { - GGML_ASSERT(task); - - return server_task_type_need_logits(task->type); - } - - // if the context does not have a memory module then all embeddings have to be computed within a single ubatch - // also we cannot split if the pooling would require any past tokens - bool can_split() const { - return - !need_embd() || - (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST); - } - - bool can_batch_with(server_slot & other_slot) const { - GGML_ASSERT(task); - - return task->type == other_slot.task->type && are_lora_equal(lora, other_slot.lora); - } - - bool has_budget(const common_params & global_params) { - GGML_ASSERT(task); - - if (task->params.n_predict == -1 && global_params.n_predict == -1) { - return true; // limitless - } - - n_remaining = -1; - - if (task->params.n_predict != -1) { - n_remaining = task->params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; - } - - return n_remaining > 0; // no budget - } - - bool is_processing() const { - return state != SLOT_STATE_IDLE; - } - - bool can_speculate() const { - return ctx_dft; - } - - void add_token(const completion_token_output & token) { - if (!is_processing()) { - SLT_WRN(*this, "%s", "slot is not processing\n"); - return; - } - generated_token_probs.push_back(token); - } - - void release() { - if (is_processing()) { - GGML_ASSERT(task); - - SLT_INF(*this, "stop processing: n_tokens = %d, truncated = %d\n", prompt.n_tokens(), truncated); - - t_last_used = ggml_time_us(); - t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; - state = SLOT_STATE_IDLE; - - task_prev = std::move(task); - task.reset(); - - callback_on_release(id); - } - } - - result_timings get_timings() const { - result_timings timings; - timings.cache_n = n_prompt_tokens_cache; - - timings.prompt_n = n_prompt_tokens_processed; - timings.prompt_ms = t_prompt_processing; - timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; - timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - - timings.predicted_n = n_decoded; - timings.predicted_ms = t_token_generation; - timings.predicted_per_token_ms = t_token_generation / n_decoded; - timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; - - // Add speculative metrics - if (n_draft_total > 0) { - timings.draft_n = n_draft_total; - 
timings.draft_n_accepted = n_draft_accepted; - } - - return timings; - } - - const common_chat_msg & update_chat_msg(std::vector & diffs) { - GGML_ASSERT(task); - - auto previous_msg = chat_msg; - SRV_DBG("Parsing chat message: %s\n", generated_text.c_str()); - auto new_msg = common_chat_parse( - generated_text, - /* is_partial= */ stop != STOP_TYPE_EOS, - task->params.oaicompat_chat_syntax); - if (!new_msg.empty()) { - new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id); - chat_msg = new_msg; - diffs = common_chat_msg_diff::compute_diffs(previous_msg, new_msg.empty() ? previous_msg : new_msg); - } - return chat_msg; - } - - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) { - GGML_ASSERT(task); - - size_t stop_pos = std::string::npos; - - for (const std::string & word : task->params.antiprompt) { - size_t pos; - - if (is_full_stop) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - - pos = text.find(word, from_pos); - } else { - // otherwise, partial stop - pos = string_find_partial_stop(text, word); - } - - if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (is_full_stop) { - stop = STOP_TYPE_WORD; - stopping_word = word; - has_next_token = false; - } - stop_pos = pos; - } - } - - return stop_pos; - } - - void print_timings() const { - const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; - const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - - const double t_gen = t_token_generation / n_decoded; - const double n_gen_second = 1e3 / t_token_generation * n_decoded; - - SLT_INF(*this, - "\n" - "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, - t_token_generation, n_decoded, t_gen, n_gen_second, - t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); - - if (n_draft_total > 0) { - const float draft_ratio = (float) n_draft_accepted / n_draft_total; - SLT_INF(*this, - "\n" - "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", - draft_ratio, n_draft_accepted, n_draft_total - ); - } - } - - json to_json(bool only_metrics = false) const { - json res; - - res = { - {"id", id}, - {"n_ctx", n_ctx}, - {"speculative", can_speculate()}, - {"is_processing", is_processing()}, - }; - - const auto & ptask = task ? 
task : task_prev; - - if (ptask) { - res["id_task"] = ptask->id; - res["params"] = ptask->params.to_json(only_metrics); - res["next_token"] = { - { - {"has_next_token", has_next_token}, - {"has_new_line", has_new_line}, - {"n_remain", n_remaining}, - {"n_decoded", n_decoded}, - } - }; - - if (!only_metrics) { - res["prompt"] = ptask->tokens.detokenize(ctx, true); - res["generated"] = generated_text; - } - } - - return res; - } -}; - -struct server_metrics { - int64_t t_start = 0; - - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_tokens_max = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - - void init() { - t_start = ggml_time_us(); - } - - void on_prompt_eval(const server_slot & slot) { - n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed; - n_prompt_tokens_processed += slot.n_prompt_tokens_processed; - t_prompt_processing += slot.t_prompt_processing; - t_prompt_processing_total += slot.t_prompt_processing; - - n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens()); - } - - void on_prediction(const server_slot & slot) { - n_tokens_predicted_total += slot.n_decoded; - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; - t_tokens_generation_total += slot.t_token_generation; - } - - void on_decoded(const std::vector & slots) { - n_decode_total++; - for (const auto & slot : slots) { - if (slot.is_processing()) { - n_busy_slots_total++; - } - n_tokens_max = std::max(n_tokens_max, (uint64_t) slot.prompt.n_tokens()); - } - } - - void reset_bucket() { - n_prompt_tokens_processed = 0; - t_prompt_processing = 0; - n_tokens_predicted = 0; - t_tokens_generation = 0; - } -}; - -struct server_queue { - int id = 0; - bool running; - - // queues - std::deque queue_tasks; - std::deque queue_tasks_deferred; - - std::mutex mutex_tasks; - std::condition_variable condition_tasks; - - // callback functions - std::function callback_new_task; - std::function callback_update_slots; - - // Add a new task to the end of the queue - int post(server_task && task, bool front = false) { - std::unique_lock lock(mutex_tasks); - GGML_ASSERT(task.id != -1); - // if this is cancel task make sure to clean up pending tasks - if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - const int task_id = task.id; - QUE_DBG("new task, id = %d, front = %d\n", task_id, front); - if (front) { - queue_tasks.push_front(std::move(task)); - } else { - queue_tasks.push_back(std::move(task)); - } - condition_tasks.notify_one(); - return task_id; - } - - // multi-task version of post() - int post(std::vector && tasks, bool front = false) { - std::unique_lock lock(mutex_tasks); - for (auto & task : tasks) { - if (task.id == -1) { - task.id = id++; - } - // if this is cancel task make sure to clean up pending tasks - if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); - if (front) { - queue_tasks.push_front(std::move(task)); - } else { - queue_tasks.push_back(std::move(task)); - } - } - condition_tasks.notify_one(); - return 0; - } - - // Add a new task, but defer until one slot is available - void 
defer(server_task && task) { - std::unique_lock lock(mutex_tasks); - QUE_DBG("defer task, id = %d\n", task.id); - queue_tasks_deferred.push_back(std::move(task)); - condition_tasks.notify_one(); - } - - // Get the next id for creating a new task - int get_new_id() { - std::unique_lock lock(mutex_tasks); - int new_id = id++; - return new_id; - } - - // Register function to process a new task - void on_new_task(std::function callback) { - callback_new_task = std::move(callback); - } - - // Register the function to be called when all slots data is ready to be processed - void on_update_slots(std::function callback) { - callback_update_slots = std::move(callback); - } - - // Call when the state of one slot is changed, it will move one task from deferred to main queue - void pop_deferred_task() { - std::unique_lock lock(mutex_tasks); - if (!queue_tasks_deferred.empty()) { - queue_tasks.emplace_front(std::move(queue_tasks_deferred.front())); - queue_tasks_deferred.pop_front(); - } - condition_tasks.notify_one(); - } - - // end the start_loop routine - void terminate() { - std::unique_lock lock(mutex_tasks); - running = false; - condition_tasks.notify_all(); - } - - /** - * Main loop consists of these steps: - * - Wait until a new task arrives - * - Process the task (i.e. maybe copy data into slot) - * - Check if multitask is finished - * - Update all slots - */ - void start_loop() { - running = true; - - while (true) { - QUE_DBG("%s", "processing new tasks\n"); - - while (true) { - std::unique_lock lock(mutex_tasks); - if (!running) { - QUE_DBG("%s", "terminate\n"); - return; - } - if (queue_tasks.empty()) { - lock.unlock(); - break; - } - server_task task = std::move(queue_tasks.front()); - queue_tasks.pop_front(); - lock.unlock(); - - QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(std::move(task)); - } - - // all tasks in the current loop is processed, slots data is now ready - QUE_DBG("%s", "update slots\n"); - - callback_update_slots(); - - QUE_DBG("%s", "waiting for new tasks\n"); - { - std::unique_lock lock(mutex_tasks); - if (!running) { - QUE_DBG("%s", "terminate\n"); - return; - } - if (queue_tasks.empty()) { - condition_tasks.wait(lock, [&]{ - return (!queue_tasks.empty() || !running); - }); - } - } - } - } - -private: - void cleanup_pending_task(int id_target) { - // no need lock because this is called exclusively by post() - auto rm_func = [id_target](const server_task & task) { - return task.id == id_target; - }; - queue_tasks.erase( - std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), - queue_tasks.end()); - queue_tasks_deferred.erase( - std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func), - queue_tasks_deferred.end()); - } -}; - -struct server_response { - bool running = true; - - // for keeping track of all tasks waiting for the result - std::unordered_set waiting_task_ids; - - // the main result queue (using ptr for polymorphism) - std::vector queue_results; - - std::mutex mutex_results; - std::condition_variable condition_results; - - // add the id_task to the list of tasks waiting for response - void add_waiting_task_id(int id_task) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); - - std::unique_lock lock(mutex_results); - waiting_task_ids.insert(id_task); - } - - void add_waiting_tasks(const std::vector & tasks) { - std::unique_lock lock(mutex_results); - - for (const auto & task : tasks) { - SRV_DBG("add task %d to waiting list. 
current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); - waiting_task_ids.insert(task.id); - } - } - - // when the request is finished, we can remove task associated with it - void remove_waiting_task_id(int id_task) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - - std::unique_lock lock(mutex_results); - waiting_task_ids.erase(id_task); - // make sure to clean up all pending results - queue_results.erase( - std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { - return res->id == id_task; - }), - queue_results.end()); - } - - void remove_waiting_task_ids(const std::unordered_set & id_tasks) { - std::unique_lock lock(mutex_results); - - for (const auto & id_task : id_tasks) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - waiting_task_ids.erase(id_task); - } - } - - // This function blocks the thread until there is a response for one of the id_tasks - server_task_result_ptr recv(const std::unordered_set & id_tasks) { - while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - if (!running) { - SRV_DBG("%s : queue result stop\n", __func__); - std::terminate(); // we cannot return here since the caller is HTTP code - } - return !queue_results.empty(); - }); - - for (size_t i = 0; i < queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // should never reach here - } - - // same as recv(), but have timeout in seconds - // if timeout is reached, nullptr is returned - server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { - while (true) { - std::unique_lock lock(mutex_results); - - for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); - return res; - } - } - - std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); - if (!running) { - SRV_DBG("%s : queue result stop\n", __func__); - std::terminate(); // we cannot return here since the caller is HTTP code - } - if (cr_res == std::cv_status::timeout) { - return nullptr; - } - } - - // should never reach here - } - - // single-task version of recv() - server_task_result_ptr recv(int id_task) { - std::unordered_set id_tasks = {id_task}; - return recv(id_tasks); - } - - // Send a new result to a waiting id_task - void send(server_task_result_ptr && result) { - SRV_DBG("sending result for task id = %d\n", result->id); - - std::unique_lock lock(mutex_results); - for (const auto & id_task : waiting_task_ids) { - if (result->id == id_task) { - SRV_DBG("task id = %d pushed to result queue\n", result->id); - - queue_results.emplace_back(std::move(result)); - condition_results.notify_all(); - return; - } - } - } - - // terminate the waiting loop - void terminate() { - running = false; - condition_results.notify_all(); - } -}; - -struct server_context { - common_params params_base; - - // note: keep these alive - they determine the lifetime of the model, context, etc. 
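    // annotation (not part of the upstream patch): common_init_result owns the
    // model and context through smart pointers, while the raw pointers cached
    // below are plain observers. A minimal sketch of the pattern, assuming
    // llama.cpp's common_init_from_params():
    //
    //     common_init_result init = common_init_from_params(params);
    //     llama_model   * model = init.model.get();   // observer, owned by init
    //     llama_context * ctx   = init.context.get(); // observer, owned by init
    //     // destroying init frees both; an unneeded context can be dropped
    //     // early with init.context.reset(), as load_model() does for the
    //     // draft model below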
- common_init_result llama_init; - common_init_result llama_init_dft; - - llama_model * model = nullptr; - llama_context * ctx = nullptr; - - // multimodal - mtmd_context * mctx = nullptr; - - const llama_vocab * vocab = nullptr; - bool vocab_dft_compatible = true; - - llama_model * model_dft = nullptr; - - llama_context_params cparams_dft; - - llama_batch batch {}; - - bool clean_kv_cache = true; - bool add_bos_token = true; - - int32_t n_ctx; // total context for all clients / slots - - // slots / clients - std::vector slots; - - int slots_debug = 0; - - server_queue queue_tasks; - server_response queue_results; - - std::unique_ptr prompt_cache; - - server_metrics metrics; - - // Necessary similarity of prompt for slot selection - float slot_prompt_similarity = 0.0f; - - common_chat_templates_ptr chat_templates; - oaicompat_parser_options oai_parser_opt; - - ~server_context() { - mtmd_free(mctx); - - // Clear any sampling context - for (server_slot & slot : slots) { - common_sampler_free(slot.smpl); - slot.smpl = nullptr; - - llama_free(slot.ctx_dft); - slot.ctx_dft = nullptr; - - common_speculative_free(slot.spec); - slot.spec = nullptr; - - llama_batch_free(slot.batch_spec); - } - - llama_batch_free(batch); - } - - bool load_model(const common_params & params) { - SRV_INF("loading model '%s'\n", params.model.path.c_str()); - - params_base = params; - - llama_init = common_init_from_params(params_base); - - model = llama_init.model.get(); - ctx = llama_init.context.get(); - - if (model == nullptr) { - SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); - return false; - } - - vocab = llama_model_get_vocab(model); - - n_ctx = llama_n_ctx(ctx); - - add_bos_token = llama_vocab_get_add_bos(vocab); - - if (params_base.has_speculative()) { - SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str()); - - auto params_dft = params_base; - - params_dft.devices = params_base.speculative.devices; - params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx; - params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; - params_dft.n_parallel = 1; - params_dft.cache_type_k = params_base.speculative.cache_type_k; - params_dft.cache_type_v = params_base.speculative.cache_type_v; - - params_dft.cpuparams.n_threads = params_base.speculative.cpuparams.n_threads; - params_dft.cpuparams_batch.n_threads = params_base.speculative.cpuparams_batch.n_threads; - params_dft.tensor_buft_overrides = params_base.speculative.tensor_buft_overrides; - - llama_init_dft = common_init_from_params(params_dft); - - model_dft = llama_init_dft.model.get(); - - if (model_dft == nullptr) { - SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); - return false; - } - - vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); - if (!vocab_dft_compatible) { - SRV_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); - } - - const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); - - cparams_dft = common_context_params_to_llama(params_dft); - cparams_dft.n_batch = n_ctx_dft; - - // the context is not needed - we will create one for each slot - llama_init_dft.context.reset(); - } - - chat_templates = common_chat_templates_init(model, params_base.chat_template); - try { - common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs); - } catch (const std::exception & e) { - SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); - SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); - chat_templates = common_chat_templates_init(model, "chatml"); - } - - std::string & mmproj_path = params_base.mmproj.path; - if (!mmproj_path.empty()) { - mtmd_context_params mparams = mtmd_context_params_default(); - mparams.use_gpu = params_base.mmproj_use_gpu; - mparams.print_timings = false; - mparams.n_threads = params_base.cpuparams.n_threads; - mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO; - mparams.flash_attn_type = params_base.flash_attn_type; - mparams.image_min_tokens = params_base.image_min_tokens; - mparams.image_max_tokens = params_base.image_max_tokens; - mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); - if (mctx == nullptr) { - SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str()); - return false; - } - SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str()); - - if (params_base.ctx_shift) { - params_base.ctx_shift = false; - SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled"); - } - - if (params_base.n_cache_reuse) { - params_base.n_cache_reuse = 0; - SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); - } - - if (params_base.has_speculative()) { - SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal"); - return false; - } - } - - if (!llama_memory_can_shift(llama_get_memory(ctx))) { - if (params_base.ctx_shift) { - params_base.ctx_shift = false; - SRV_WRN("%s\n", "ctx_shift is not supported by this context, it will be disabled"); - } - - if (params_base.n_cache_reuse) { - params_base.n_cache_reuse = 0; - SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled"); - } - } - - return true; - } - - void init() { - SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); - - const int n_ctx_train = llama_model_n_ctx_train(model); - - int n_ctx_slot = llama_n_ctx_seq(ctx); - if (n_ctx_slot > n_ctx_train) { - SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train); - n_ctx_slot = n_ctx_train; - } - - for (int i = 0; i < params_base.n_parallel; i++) { - server_slot slot; - - slot.id = i; - slot.ctx = ctx; - slot.n_ctx = n_ctx_slot; - slot.mctx = mctx; - slot.prompt.tokens.has_mtmd = mctx != nullptr; - - if (model_dft) { - slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); - - // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] - slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); - if (slot.ctx_dft == nullptr) { - SRV_ERR("%s", "failed to create draft context\n"); - return; - } - - 
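            // annotation (not part of the upstream patch): each slot gets its own
            // draft context from the shared draft model, sized by the cparams_dft
            // prepared in load_model(). A small sanity log one could add here,
            // using existing llama.cpp C API calls:
            {
                const int32_t n_ctx_dft_slot   = llama_n_ctx  (slot.ctx_dft);
                const int32_t n_batch_dft_slot = llama_n_batch(slot.ctx_dft);
                SLT_DBG(slot, "draft context ready, n_ctx_dft = %d, n_batch_dft = %d\n", n_ctx_dft_slot, n_batch_dft_slot);
            }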
slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft); - if (slot.spec == nullptr) { - SRV_ERR("%s", "failed to create speculator\n"); - return; - } - for (auto & pair : params_base.speculative.replacements) { - common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); - } - } - - SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); - - slot.callback_on_release = [this](int) { - queue_tasks.pop_deferred_task(); - }; - - slot.reset(); - - slots.push_back(std::move(slot)); - } - - { - const char * LLAMA_SERVER_SLOTS_DEBUG = getenv("LLAMA_SERVER_SLOTS_DEBUG"); - slots_debug = LLAMA_SERVER_SLOTS_DEBUG ? atoi(LLAMA_SERVER_SLOTS_DEBUG) : 0; - - if (slots_debug) { - SRV_WRN("slots debug = %d\n", slots_debug); - } - } - - // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens - // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) - { - const int32_t n_batch = llama_n_batch(ctx); - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); - } - - metrics.init(); - - if (params_base.cache_ram_mib != 0) { - if (params_base.cache_ram_mib < 0) { - SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit"); - } else { - SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib); - } - SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n"); - - prompt_cache = std::make_unique(params_base.cache_ram_mib, n_ctx); - } else { - SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n"); - } - SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); - - // thinking is enabled if: - // 1. It's not explicitly disabled (reasoning_budget == 0) - // 2. The chat template supports it - const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); - SRV_INF("thinking = %d\n", enable_thinking); - - oai_parser_opt = { - /* use_jinja */ params_base.use_jinja, - /* prefill_assistant */ params_base.prefill_assistant, - /* reasoning_format */ params_base.reasoning_format, - /* chat_template_kwargs */ params_base.default_template_kwargs, - /* common_chat_templates */ chat_templates.get(), - /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, - /* allow_audio */ mctx ? 
mtmd_support_audio (mctx) : false, - /* enable_thinking */ enable_thinking, - }; - } - - server_slot * get_slot_by_id(int id) { - for (server_slot & slot : slots) { - if (slot.id == id) { - return &slot; - } - } - - return nullptr; - } - - server_slot * get_available_slot(const server_task & task) { - server_slot * ret = nullptr; - - bool update_cache = false; - - // find the slot that has at least n% prompt similarity - if (ret == nullptr && slot_prompt_similarity != 0.0f) { - float sim_best = 0; - - for (server_slot & slot : slots) { - // skip the slot if it is not available - if (slot.is_processing()) { - continue; - } - - const auto & tokens = slot.prompt.tokens; - - // skip the slot if it does not contains cached tokens - if (tokens.empty()) { - continue; - } - - // fraction of the Longest Common Prefix length with respect to the input prompt length - const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size(); - - // select the current slot if the criteria match - if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) { - sim_best = sim_cur; - - ret = &slot; - } - } - - if (ret != nullptr) { - const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size(); - - SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n", - sim_best, slot_prompt_similarity, f_keep); - - // if we are about to lose a large portion of the existing context - save it in the prompt cache - if (f_keep < 0.5f) { - update_cache = true; - } - } - } - - // find the slot that has been least recently used - if (ret == nullptr) { - int64_t t_last = -1; - - for (server_slot & slot : slots) { - // skip the slot if it is not available - if (slot.is_processing()) { - continue; - } - - // select the current slot if the criteria match - if (!ret || slot.t_last_used <= t_last) { - t_last = slot.t_last_used; - ret = &slot; - } - } - - if (ret != nullptr) { - SLT_INF(*ret, "selected slot by LRU, t_last = %" PRId64 "\n", t_last); - - update_cache = true; - } - } - - if (ret) { - const auto & tokens = ret->prompt.tokens; - - update_cache = update_cache && prompt_cache; - - // cache prompts only for completion tasks - update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION; - - // don't update the cache if the slot's context is empty - update_cache = update_cache && tokens.size() > 0; - - // TODO: mtmd does not support prompt cache - update_cache = update_cache && (ret->mctx == nullptr); - - if (update_cache) { - SRV_WRN("%s", "updating prompt cache\n"); - - const int64_t t_start = ggml_time_us(); - - ret->prompt_save(*prompt_cache); - ret->prompt_load(*prompt_cache, task.tokens); - - prompt_cache->update(); - - SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0); - } - } - - return ret; - } - - // return true if at least one slot has been purged - // TODO: improve logic - // - smarter decision which slot to purge (LRU or longest prompt?) - // - move slot to level 2 cache instead of removing? - // - instead of purging, try to store and resume later? 
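    // annotation (not part of the upstream patch): illustrative sketch of the
    // LCP-similarity metric that get_available_slot() computes above, written
    // for plain llama_token vectors (the real code goes through server_tokens
    // helpers such as get_common_prefix()):
    static float lcp_similarity_sketch(const std::vector<llama_token> & cached,
                                       const std::vector<llama_token> & prompt) {
        size_t n = 0;
        const size_t lim = std::min(cached.size(), prompt.size());
        while (n < lim && cached[n] == prompt[n]) {
            n++;
        }
        // fraction of the new prompt covered by the longest common prefix
        return prompt.empty() ? 0.0f : (float) n / prompt.size();
    }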
-    bool try_purge_idle_slots() {
-        bool res = false;
-
-        if (!params_base.kv_unified) {
-            return res;
-        }
-
-        for (auto & slot : slots) {
-            if (slot.is_processing()) {
-                continue;
-            }
-
-            if (slot.prompt.n_tokens() > 0) {
-                SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
-
-                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
-                slot.prompt.tokens.clear();
-
-                res = true;
-
-                // purge slots one by one
-                break;
-            }
-        }
-
-        return res;
-    }
-
-    bool launch_slot_with_task(server_slot & slot, server_task && task) {
-        slot.reset();
-
-        if (!are_lora_equal(task.params.lora, slot.lora)) {
-            // if lora has changed, check to see if the cache should be cleared
-            if (lora_should_clear_cache(slot.lora, task.params.lora)) {
-                SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
-                slot.prompt.tokens.clear();
-            } else {
-                SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task.params.lora.size());
-            }
-            slot.lora = task.params.lora;
-        }
-
-        // if using alora, make sure it's only a single one requested and active
-        size_t alora_invocation_start = task.tokens.size();
-        if (lora_all_alora(slot.lora)) {
-            const auto & enabled_ids = lora_get_enabled_ids(slot.lora);
-            // TODO: This will error out if a user requests two aloras, but only
-            //       provides the activation string for one. We could instead search
-            //       for all requested alora activation strings and then either keep
-            //       only the last one, or reject if multiple are found.
-            if (enabled_ids.size() != 1) {
-                send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-            const auto & lora = slot.lora[enabled_ids[0]].ptr;
-
-            // get the pointer and count for the invocation tokens
-            const uint64_t      n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora);
-            const llama_token * invocation_tokens   = llama_adapter_get_alora_invocation_tokens  (lora);
-
-            // scan backwards through the prompt tokens to find the last
-            // occurrence of the invocation sequence
-            int match_idx = static_cast<int>(n_invocation_tokens) - 1;
-            for (int i = task.tokens.size() - 1; i >= 0; --i) {
-                // the token in this position matches the next token to find in
-                // the invocation sequence
-                if (task.tokens[i] == invocation_tokens[match_idx]) {
-                    // if it's a full match, we've found the start
-                    if (match_idx == 0) {
-                        alora_invocation_start = i;
-                        break;
-                    }
-                    // otherwise, check the next token in the sequence
-                    --match_idx;
-                } else {
-                    // no match in this position, so start looking over again
-                    match_idx = static_cast<int>(n_invocation_tokens) - 1;
-                }
-            }
-
-            // if the activation string is not found, disable the alora
-            if (alora_invocation_start == task.tokens.size()) {
-                SLT_DBG(slot, "alora %zu requested, but not found. 
deactivating\n", enabled_ids[0]); - slot.lora[enabled_ids[0]].scale = 0.0f; - } else { - SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start); - slot.alora_invocation_start = alora_invocation_start; - } - } - - if (!task.tokens.validate(ctx)) { - send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); - - // initialize samplers - { - if (slot.smpl != nullptr) { - common_sampler_free(slot.smpl); - } - - slot.smpl = common_sampler_init(model, task.params.sampling); - if (slot.smpl == nullptr) { - // for now, the only error that may happen here is invalid grammar - send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str()); - } - - // initialize draft batch - // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] - if (slot.ctx_dft) { - llama_batch_free(slot.batch_spec); - - slot.batch_spec = llama_batch_init(task.params.speculative.n_max + 1, 0, 1); - } - - slot.task = std::make_unique(std::move(task)); - - slot.state = SLOT_STATE_STARTED; - - SLT_INF(slot, "%s", "processing task\n"); - - return true; - } - - void kv_cache_clear() { - SRV_DBG("%s", "clearing KV cache\n"); - - // clear the entire KV cache - llama_memory_clear(llama_get_memory(ctx), true); - clean_kv_cache = false; - } - - bool process_token(completion_token_output & result, server_slot & slot) { - // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = result.text_to_send; - slot.sampled = result.tok; - - slot.generated_text += token_str; - if (slot.task->params.return_tokens) { - slot.generated_tokens.push_back(result.tok); - } - slot.has_next_token = true; - - // check if there is incomplete UTF-8 character at the end - bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size(); - - // search stop word and delete it - if (!incomplete) { - size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); - - const std::string str_test = slot.generated_text.substr(pos); - bool send_text = true; - - size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); - if (stop_pos != std::string::npos) { - slot.generated_text.erase( - slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.n_sent_text, slot.generated_text.size()); - } else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok) ) { - stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); - send_text = stop_pos == std::string::npos; - } - - // check if there is any token to predict - if (send_text) { - // no send the stop word in the response - result.text_to_send = slot.generated_text.substr(pos, std::string::npos); - slot.n_sent_text += result.text_to_send.size(); - // add the token to slot queue and cache - } else { - result.text_to_send = ""; - } - - slot.add_token(result); - if (slot.task->params.stream) { - send_partial_response(slot, result, false); - } - } - - if (incomplete) { - slot.has_next_token = true; - } - - // if context shifting is disabled, make sure that we don't run out of context - if (!params_base.ctx_shift && slot.prompt.n_tokens() + 1 >= slot.n_ctx) { - slot.truncated = true; - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped due to running 
out of context capacity, prompt.n_tokens() = %d, task.n_tokens = %d, n_decoded = %d, n_ctx = %d\n", - slot.prompt.n_tokens(), slot.task->n_tokens(), slot.n_decoded, slot.n_ctx); - } - - // check the limits - if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.task->params.n_predict); - } - - if (slot.has_new_line) { - // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent - if (slot.task->params.n_indent > 0) { - // check the current indentation - // TODO: improve by not doing it more than once for each new line - if (slot.last_nl_pos > 0) { - size_t pos = slot.last_nl_pos; - - int n_indent = 0; - while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) { - n_indent++; - pos++; - } - - if (pos < slot.generated_text.size() && n_indent < slot.task->params.n_indent) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - // cut the last line - slot.generated_text.erase(pos, std::string::npos); - - SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent); - } - } - - // find the next new line - { - const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos); - - if (pos != std::string::npos) { - slot.last_nl_pos = pos + 1; - } - } - } - } - - // check if there is a new line in the generated text - if (result.text_to_send.find('\n') != std::string::npos) { - slot.has_new_line = true; - - // if we have seen a new line, we stop after a certain time limit, but only upon another new line - if (slot.task->params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.task->params.t_max_predict_ms)) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.task->params.t_max_predict_ms); - } - } - - if (llama_vocab_is_eog(vocab, result.tok)) { - slot.stop = STOP_TYPE_EOS; - slot.has_next_token = false; - - SLT_DBG(slot, "%s", "stopped by EOS\n"); - } - - SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); - - return slot.has_next_token; // continue - } - - void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const { - size_t n_probs = slot.task->params.sampling.n_probs; - size_t n_vocab = llama_vocab_n_tokens(vocab); - - if (post_sampling) { - const auto * cur_p = common_sampler_get_candidates(slot.smpl, true); - const size_t max_probs = cur_p->size; - - // set probability for sampled token - for (size_t i = 0; i < max_probs; i++) { - if (cur_p->data[i].id == result.tok) { - result.prob = cur_p->data[i].p; - break; - } - } - - // set probability for top n_probs tokens - result.probs.reserve(max_probs); - for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { - result.probs.push_back({ - cur_p->data[i].id, - common_token_to_piece(ctx, cur_p->data[i].id, special), - cur_p->data[i].p - }); - } - } else { - // TODO: optimize this with min-p optimization - std::vector cur = get_token_probabilities(ctx, idx); - - // set probability for sampled token - for (size_t i = 0; i < n_vocab; i++) { - // set probability for sampled token - if (cur[i].id 
== result.tok) { - result.prob = cur[i].p; - break; - } - } - - // set probability for top n_probs tokens - result.probs.reserve(n_probs); - for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { - result.probs.push_back({ - cur[i].id, - common_token_to_piece(ctx, cur[i].id, special), - cur[i].p - }); - } - } - } - - void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(task.id, error, type); - } - - void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - send_error(slot.task->id, error, type, slot.task->n_tokens(), slot.n_ctx); - } - - void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER, const int32_t n_prompt_tokens = 0, const int32_t n_ctx = 0) { - SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - - if (type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) { - GGML_ASSERT(n_ctx > 0 && n_prompt_tokens > 0); - } - - auto res = std::make_unique(); - res->id = id_task; - res->err_type = type; - res->err_msg = error; - res->n_prompt_tokens = n_prompt_tokens; - res->n_ctx = n_ctx; - - queue_results.send(std::move(res)); - } - - // if multimodal is enabled, send an error and return false - bool check_no_mtmd(const int id_task) { - if (mctx) { - send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED); - return false; - } - return true; - } - - void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) { - auto res = std::make_unique(); - - res->id = slot.task->id; - res->index = slot.task->index; - - if (is_progress) { - res->is_progress = true; - res->progress.total = slot.task->n_tokens(); - res->progress.cache = slot.n_prompt_tokens_cache; - res->progress.processed = slot.prompt.tokens.size(); - res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; - } else { - res->content = tkn.text_to_send; - res->tokens = { tkn.tok }; - - slot.update_chat_msg(res->oaicompat_msg_diffs); - } - - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.task->n_tokens(); - res->post_sampling_probs = slot.task->params.post_sampling_probs; - - res->verbose = slot.task->params.verbose; - res->oaicompat = slot.task->params.oaicompat; - res->oaicompat_model = slot.task->params.oaicompat_model; - res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; - - // populate res.probs_output - if (slot.task->params.sampling.n_probs > 0) { - res->prob_output = tkn; // copy the token probs - } - - // populate timings if this is final response or timings_per_token is enabled - if (slot.stop != STOP_TYPE_NONE || slot.task->params.timings_per_token) { - res->timings = slot.get_timings(); - } - - queue_results.send(std::move(res)); - } - - void send_final_response(server_slot & slot) { - auto res = std::make_unique(); - - res->id = slot.task->id; - res->id_slot = slot.id; - - res->index = slot.task->index; - res->content = slot.generated_text; - res->tokens = std::move(slot.generated_tokens); - res->timings = slot.get_timings(); - res->prompt = slot.task->tokens.detokenize(ctx, true); - res->response_fields = std::move(slot.task->params.response_fields); - - res->truncated = slot.truncated; - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.task->n_tokens(); - res->n_tokens_cached = slot.prompt.n_tokens(); - res->has_new_line = slot.has_new_line; - res->stopping_word = slot.stopping_word; - res->stop 
= slot.stop; - res->post_sampling_probs = slot.task->params.post_sampling_probs; - - res->verbose = slot.task->params.verbose; - res->stream = slot.task->params.stream; - res->include_usage = slot.task->params.include_usage; - res->oaicompat = slot.task->params.oaicompat; - res->oaicompat_model = slot.task->params.oaicompat_model; - res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; - res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs); - - // populate res.probs_output - if (slot.task->params.sampling.n_probs > 0) { - if (!slot.task->params.stream && slot.stop == STOP_TYPE_WORD) { - const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); - - size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - res->probs_output = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - safe_offset); - } else { - res->probs_output = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end()); - } - } - - res->generation_params = slot.task->params; // copy the parameters - - queue_results.send(std::move(res)); - } - - void send_embedding(const server_slot & slot, const llama_batch & batch) { - auto res = std::make_unique(); - res->id = slot.task->id; - res->index = slot.task->index; - res->n_tokens = slot.task->n_tokens(); - res->oaicompat = slot.task->params.oaicompat; - - const int n_embd = llama_model_n_embd(model); - - std::vector embd_res(n_embd, 0.0f); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float * embd = nullptr; - if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) { - embd = llama_get_embeddings_ith(ctx, i); - } else { - embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - } - - if (embd == nullptr) { - SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - - res->embedding.push_back(std::vector(n_embd, 0.0f)); - continue; - } - - // normalize only when there is pooling - if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { - common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize); - res->embedding.push_back(embd_res); - break; - } - - res->embedding.emplace_back(embd, embd + n_embd); - } - - SLT_DBG(slot, "%s", "sending embeddings\n"); - - queue_results.send(std::move(res)); - } - - void send_rerank(const server_slot & slot, const llama_batch & batch) { - auto res = std::make_unique(); - res->id = slot.task->id; - res->index = slot.task->index; - res->n_tokens = slot.task->n_tokens(); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - - res->score = -1e6; - continue; - } - - res->score = embd[0]; - } - - SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); - - queue_results.send(std::move(res)); - } - - // - // Functions to process the task - // - - void process_single_task(server_task && task) { - switch (task.type) { - case SERVER_TASK_TYPE_COMPLETION: - case SERVER_TASK_TYPE_INFILL: - case SERVER_TASK_TYPE_EMBEDDING: - case SERVER_TASK_TYPE_RERANK: - { - const int id_slot = task.id_slot; 
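                    // annotation (not part of the upstream patch): a request may pin a
                    // specific slot via the "id_slot" field; the default of -1 lets the
                    // server pick one by prompt similarity / LRU, e.g.:
                    //
                    //     curl http://localhost:8080/completion \
                    //          -d '{"prompt": "hello", "id_slot": 0}'
                    //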
- - server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); - - if (slot == nullptr) { - // if no slot is available, we defer this task for processing later - SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - if (!launch_slot_with_task(*slot, std::move(task))) { - SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); - break; - } - } break; - case SERVER_TASK_TYPE_CANCEL: - { - // release slot linked with the task id - for (auto & slot : slots) { - if (slot.task && slot.task->id == task.id_target) { - slot.release(); - break; - } - } - } break; - case SERVER_TASK_TYPE_NEXT_RESPONSE: - { - // do nothing - } break; - case SERVER_TASK_TYPE_METRICS: - { - json slots_data = json::array(); - - int n_idle_slots = 0; - int n_processing_slots = 0; - - for (server_slot & slot : slots) { - json slot_data = slot.to_json(slots_debug == 0); - - if (slot.is_processing()) { - n_processing_slots++; - } else { - n_idle_slots++; - } - - slots_data.push_back(slot_data); - } - SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - - auto res = std::make_unique(); - res->id = task.id; - res->slots_data = std::move(slots_data); - res->n_idle_slots = n_idle_slots; - res->n_processing_slots = n_processing_slots; - res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); - res->t_start = metrics.t_start; - - res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; - res->t_prompt_processing_total = metrics.t_prompt_processing_total; - res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; - res->t_tokens_generation_total = metrics.t_tokens_generation_total; - - res->n_tokens_max = metrics.n_tokens_max; - - res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; - res->t_prompt_processing = metrics.t_prompt_processing; - res->n_tokens_predicted = metrics.n_tokens_predicted; - res->t_tokens_generation = metrics.t_tokens_generation; - - res->n_decode_total = metrics.n_decode_total; - res->n_busy_slots_total = metrics.n_busy_slots_total; - - if (task.metrics_reset_bucket) { - metrics.reset_bucket(); - } - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SLOT_SAVE: - { - if (!check_no_mtmd(task.id)) { - break; - } - - int id_slot = task.slot_action.slot_id; - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - const size_t token_count = slot->prompt.tokens.size(); - const int64_t t_start = ggml_time_us(); - - std::string filename = task.slot_action.filename; - std::string filepath = task.slot_action.filepath; - - const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens(); - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); - - const int64_t t_end = ggml_time_us(); - const double t_save_ms = (t_end - t_start) / 1000.0; - - auto res = 
std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->filename = filename; - res->is_save = true; - res->n_tokens = token_count; - res->n_bytes = nwrite; - res->t_ms = t_save_ms; - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SLOT_RESTORE: - { - if (!check_no_mtmd(task.id)) break; - int id_slot = task.slot_action.slot_id; - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - const int64_t t_start = ggml_time_us(); - - std::string filename = task.slot_action.filename; - std::string filepath = task.slot_action.filepath; - - llama_tokens tokens; - tokens.resize(slot->n_ctx); - size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count); - if (nread == 0) { - slot->prompt.tokens.clear(); // KV may already been invalidated? - send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); - break; - } - tokens.resize(token_count); - slot->prompt.tokens.clear(); - slot->prompt.tokens.insert(tokens); - - const int64_t t_end = ggml_time_us(); - const double t_restore_ms = (t_end - t_start) / 1000.0; - - auto res = std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->filename = filename; - res->is_save = false; - res->n_tokens = token_count; - res->n_bytes = nread; - res->t_ms = t_restore_ms; - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SLOT_ERASE: - { - if (!check_no_mtmd(task.id)) { - break; - } - int id_slot = task.slot_action.slot_id; - server_slot * slot = get_slot_by_id(id_slot); - if (slot == nullptr) { - send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); - break; - } - if (slot->is_processing()) { - // if requested slot is unavailable, we defer this task for processing later - SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); - queue_tasks.defer(std::move(task)); - break; - } - - // Erase token cache - const size_t n_erased = slot->prompt.tokens.size(); - llama_memory_seq_rm(llama_get_memory(ctx), slot->id, -1, -1); - slot->prompt.tokens.clear(); - - auto res = std::make_unique(); - res->id = task.id; - res->id_slot = id_slot; - res->n_erased = n_erased; - queue_results.send(std::move(res)); - } break; - case SERVER_TASK_TYPE_SET_LORA: - { - params_base.lora_adapters = std::move(task.set_lora); - auto res = std::make_unique(); - res->id = task.id; - queue_results.send(std::move(res)); - } break; - - } - } - - void update_slots() { - // check if all slots are idle - { - bool all_idle = true; - - for (auto & slot : slots) { - if (slot.is_processing()) { - all_idle = false; - break; - } - } - - if (all_idle) { - SRV_INF("%s", "all slots are idle\n"); - if (clean_kv_cache) { - kv_cache_clear(); - } - - return; - } - } - - { - SRV_DBG("%s", "posting NEXT_RESPONSE\n"); - - server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); - task.id = queue_tasks.get_new_id(); - queue_tasks.post(std::move(task)); - } - - // apply context-shift if needed - // TODO: simplify and improve - for (server_slot & slot : slots) { - if (slot.state == SLOT_STATE_GENERATING && slot.prompt.n_tokens() + 1 
>= slot.n_ctx) { - if (!params_base.ctx_shift) { - // this check is redundant (for good) - // we should never get here, because generation should already stopped in process_token() - send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); - slot.release(); - continue; - } - - if (mctx) { - // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded - // we don't support ctx_shift because an image chunk may contains multiple tokens - GGML_ABORT("not supported by multimodal"); - } - - // Shift context - int n_keep = slot.task->params.n_keep < 0 ? slot.task->n_tokens() : slot.task->params.n_keep; - - if (add_bos_token) { - n_keep += 1; - } - - n_keep = std::min(slot.n_ctx - 4, n_keep); - - const int n_left = slot.prompt.n_tokens() - n_keep; - const int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); - - SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - - llama_memory_seq_rm (llama_get_memory(ctx), slot.id, n_keep , n_keep + n_discard); - llama_memory_seq_add(llama_get_memory(ctx), slot.id, n_keep + n_discard, slot.prompt.n_tokens(), -n_discard); - - // add generated tokens to cache - // ref: https://github.com/ggml-org/llama.cpp/pull/16818#discussion_r2473269481 - { - GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - - llama_tokens new_tokens = slot.prompt.tokens.get_text_tokens(); // copy - for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) { - new_tokens[i - n_discard] = new_tokens[i]; - } - - new_tokens.resize(slot.prompt.tokens.size() - n_discard); - - slot.prompt.tokens.clear(); - slot.prompt.tokens.insert(new_tokens); - } - - slot.truncated = true; - } - } - - // start populating the batch for this iteration - common_batch_clear(batch); - - // track if given slot can be batched with slots already in the batch - server_slot * slot_batched = nullptr; - - auto accept_special_token = [&](server_slot & slot, llama_token token) { - return params_base.special || - slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end(); - }; - - // first, add sampled tokens from any ongoing sequences - for (auto & slot : slots) { - if (slot.state != SLOT_STATE_GENERATING) { - continue; - } - - // check if we can batch this slot with the previous one - if (!slot_batched) { - slot_batched = &slot; - } else if (!slot_batched->can_batch_with(slot)) { - continue; - } - - slot.i_batch = batch.n_tokens; - - common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); - - slot.prompt.tokens.push_back(slot.sampled); - - SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.prompt.n_tokens(), slot.truncated); - } - - // process in chunks of params.n_batch - int32_t n_batch = llama_n_batch(ctx); - int32_t n_ubatch = llama_n_ubatch(ctx); - - float alora_scale = -1.0f; - size_t alora_disabled_id = 0; - - // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens == 0) { - for (auto & slot : slots) { - // check if we can batch this slot with the previous one - if (slot.is_processing()) { - if (!slot_batched) { - slot_batched = &slot; - } else if (!slot_batched->can_batch_with(slot)) { - continue; - } - } - - // this slot still has a prompt to be processed - if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { - const auto & input_tokens = slot.task->tokens; 
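                    // annotation (not part of the upstream patch): slot.task->tokens is a
                    // server_tokens sequence; with multimodal inputs, image/audio chunks
                    // occupy positions whose text token is LLAMA_TOKEN_NULL, which is why
                    // the code below special-cases input_tokens[i] == LLAMA_TOKEN_NULL,
                    // conceptually:
                    //
                    //     [ <bos>, t0, t1, NULL, NULL, ..., tN ]   // NULL = mtmd chunk position
                    //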
-
-                    // TODO: maybe move branch to outside of this loop in the future
-                    if (slot.state == SLOT_STATE_STARTED) {
-                        slot.t_start_process_prompt = ggml_time_us();
-                        slot.t_start_generation = 0;
-
-                        slot.state = SLOT_STATE_PROCESSING_PROMPT;
-
-                        SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
-                                slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens());
-
-                        // print prompt tokens (for debugging)
-                        /*if (1) {
-                            // first 16 tokens (avoid flooding logs)
-                            for (int i = 0; i < std::min<int>(16, input_tokens.size()); i++) {
-                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str());
-                            }
-                        } else {
-                            // all
-                            for (int i = 0; i < (int) input_tokens.size(); i++) {
-                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, input_tokens[i], common_token_to_piece(ctx, input_tokens[i]).c_str());
-                            }
-                        }*/
-
-                        // keep track how many tokens we can reuse from the previous state
-                        int n_past = 0;
-
-                        // empty prompt passed -> release the slot and send empty response
-                        if (input_tokens.empty()) {
-                            SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
-
-                            slot.print_timings();
-                            send_final_response(slot);
-                            slot.release();
-
-                            continue;
-                        }
-
-                        // TODO: support memory-less logits computation
-                        if (slot.need_logits() && !llama_get_memory(ctx)) {
-                            send_error(slot, "the current context does not support logits computation, skipping", ERROR_TYPE_SERVER);
-                            slot.release();
-                            continue;
-                        }
-
-                        if (!slot.can_split()) {
-                            if (slot.task->n_tokens() > n_ubatch) {
-                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
-                                slot.release();
-                                continue;
-                            }
-
-                            if (slot.task->n_tokens() > slot.n_ctx) {
-                                send_error(slot, "input is larger than the max context size. 
skipping", ERROR_TYPE_EXCEED_CONTEXT_SIZE); - slot.release(); - continue; - } - } else { - if (slot.task->n_tokens() >= slot.n_ctx) { - send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE); - slot.release(); - continue; - } - - if (slot.task->params.cache_prompt) { - // reuse any previously computed tokens that are common with the new prompt - n_past = slot.prompt.tokens.get_common_prefix(input_tokens); - - // if there is an alora invoked, don't cache after the invocation start - if (slot.alora_invocation_start > 0) { - SLT_DBG(slot, "only caching to alora invocation start (n_past = %d, alora_invocation_start = %d)\n", n_past, slot.alora_invocation_start); - n_past = std::min(n_past, slot.alora_invocation_start - 1); - } - - // reuse chunks from the cached prompt by shifting their KV cache in the new position - if (params_base.n_cache_reuse > 0) { - GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - - size_t head_c = n_past; // cache - size_t head_p = n_past; // current prompt - - if (mctx) { - // we should never reach this - GGML_ABORT("not supported by multimodal"); - } - - SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", params_base.n_cache_reuse, n_past); - - while (head_c < slot.prompt.tokens.size() && - head_p < input_tokens.size()) { - - size_t n_match = 0; - while (head_c + n_match < slot.prompt.tokens.size() && - head_p + n_match < input_tokens.size() && - slot.prompt.tokens[head_c + n_match] == input_tokens[head_p + n_match]) { - - n_match++; - } - - if (n_match >= (size_t) params_base.n_cache_reuse) { - SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); - //for (size_t i = head_p; i < head_p + n_match; i++) { - // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); - //} - - const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - - llama_memory_seq_rm (llama_get_memory(ctx), slot.id, head_p, head_c); - llama_memory_seq_add(llama_get_memory(ctx), slot.id, head_c, head_c + n_match, kv_shift); - - for (size_t i = 0; i < n_match; i++) { - slot.prompt.tokens.set_token(head_p + i, slot.prompt.tokens[head_c + i]); - n_past++; - } - - head_c += n_match; - head_p += n_match; - } else { - head_c += 1; - } - } - - SLT_DBG(slot, "after context reuse, new n_past = %d\n", n_past); - } - } else { - // if we don't cache the prompt, we have to remove all previous tokens - n_past = 0; - } - - // note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1 - const auto n_swa = std::max(1, llama_model_n_swa(model)); - - // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, n_past - n_swa); - - // note: disallow with mtmd contexts for now - // https://github.com/ggml-org/llama.cpp/issues/17043 - if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) { - const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); - if (pos_min == -1) { - SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); - GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237"); - } - - // when the prompt prefix does not match, print the tokens around the mismatch - // this is useful for debugging prompt 
caching - if (slots_debug) { - const int np0 = std::max(n_past - 4, 0); - const int np1 = std::min(n_past + 6, std::min(slot.prompt.tokens.size(), slot.task->tokens.size())); - - std::stringstream ss0; - std::stringstream ss1; - - std::stringstream st0; - std::stringstream st1; - - ss0 << "old: ... "; - ss1 << "new: ... "; - - for (int i = np0; i < np1; i++) { - if (i == n_past) { - ss0 << " | "; - ss1 << " | "; - } - - { - const auto token = slot.prompt.tokens[i]; - const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; - ss0 << piece; - st0 << std::setw(8) << token; - } - - { - const auto token = slot.task->tokens[i]; - const auto piece = token != LLAMA_TOKEN_NULL ? common_token_to_piece(ctx, token) : "[mtmd]"; - ss1 << piece; - st1 << std::setw(8) << token; - } - } - - SLT_WRN(slot, "%s\n", ss0.str().c_str()); - SLT_WRN(slot, "%s\n", ss1.str().c_str()); - - SLT_WRN(slot, "%s\n", st0.str().c_str()); - SLT_WRN(slot, "%s\n", st1.str().c_str()); - } - - if (pos_min > pos_min_thold) { - // TODO: support can be added in the future when corresponding vision models get released - GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - - SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa); - - // search for a context checkpoint - const auto it = std::find_if( - slot.prompt.checkpoints.rbegin(), - slot.prompt.checkpoints.rend(), - [&](const auto & cur) { - // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS] - return cur.pos_min < pos_min_thold; - } - ); - - bool do_reset = it == slot.prompt.checkpoints.rend(); - - if (!do_reset) { - // restore the context checkpoint - const size_t checkpoint_size = it->data.size(); - const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - - if (n != checkpoint_size) { - SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); - do_reset = true; - //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint"); - } else { - n_past = std::min(n_past, std::max(it->pos_min + 1, it->pos_max)); - SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); - } - } - - if (do_reset) { - SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n", - "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); - n_past = 0; - } - } - } - - { - // erase any checkpoints with pos_min > pos_min_thold - for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) { - const auto & cur = *it; - if (cur.pos_min > pos_min_thold) { - SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024); - it = slot.prompt.checkpoints.erase(it); - } else { - ++it; - } - } - } - } - - // [TAG_PROMPT_LOGITS] - if (n_past == slot.task->n_tokens() && n_past > 0) { - SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, task.n_tokens() = %d)\n", n_past, slot.task->n_tokens()); - n_past--; - SLT_WRN(slot, "n_past was set to %d\n", n_past); - 
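                            // annotation (not part of the upstream patch): the decrement
                            // keeps the batch non-empty for this slot - sampling needs
                            // logits, and logits only come from tokens that are actually
                            // decoded, so a fully cached prompt re-evaluates its last
                            // token. For example, with n_past = task.n_tokens() = 5, we
                            // set n_past = 4 and decode token[4] again to get its logits.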
} - - slot.n_prompt_tokens_cache = n_past; - slot.n_prompt_tokens_processed = 0; - - slot.prompt.tokens.keep_first(n_past); - } - - if (!slot.can_split()) { - // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens + slot.task->n_tokens() > n_batch) { - continue; - } - } - - // truncate any tokens that are beyond n_past for this slot - const llama_pos p0 = slot.prompt.tokens.pos_next(); - - SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); - - if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { - SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); - llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); - - // there is no common part left - slot.n_prompt_tokens_cache = 0; - - slot.prompt.tokens.clear(); - } - - // check if we should process the image - if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { - // process the image - size_t n_tokens_out = 0; - int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); - if (res != 0) { - SLT_ERR(slot, "failed to process image, res = %d\n", res); - send_error(slot, "failed to process image", ERROR_TYPE_SERVER); - slot.release(); - continue; - } - - slot.n_prompt_tokens_processed += n_tokens_out; - - // add the image chunk to cache - { - const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); - slot.prompt.tokens.push_back(chunk.get()); // copy - } - } - - // If using an alora, there may be uncached tokens that come - // before the invocation sequence. When this happens, the - // tokens before the invocation sequence need to be - // processed without the adapter in a separate batch, then - // the adapter needs to be enabled for the remaining tokens. - if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.prompt.n_tokens()) { - SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); - const auto & enabled_loras = lora_get_enabled_ids(slot.lora); - GGML_ASSERT(enabled_loras.size() == 1); - alora_scale = slot.lora[enabled_loras[0]].scale; - slot.lora[enabled_loras[0]].scale = 0.0f; - alora_disabled_id = enabled_loras[0]; - } - - bool do_checkpoint = params_base.n_ctx_checkpoints > 0; - - // make checkpoints only for completion tasks - do_checkpoint = do_checkpoint && slot.task->type == SERVER_TASK_TYPE_COMPLETION; - - // make a checkpoint of the parts of the memory that cannot be rolled back. 
- // checkpoints are created only if: - // - the model uses SWA and we are not using `swa_full` - // - the model architecture is marked as recurrent or hybrid - // - // TODO: try to make this conditional on the context or the memory module, instead of the model type - do_checkpoint = do_checkpoint && ( - llama_model_is_recurrent(model) || - llama_model_is_hybrid(model) || - (llama_model_n_swa(model) > 0 && !params_base.swa_full) - ); - - // add prompt tokens for processing in the current batch - while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) { - // get next token to process - llama_token cur_tok = input_tokens[slot.prompt.n_tokens()]; - if (cur_tok == LLAMA_TOKEN_NULL) { - break; // end of text chunk - } - - // if this is an alora request with pre-invocation - // tokens that are not cached, we need to stop filling - // this batch at those pre-invocation tokens. - if (alora_scale > 0 && slot.prompt.n_tokens() == slot.alora_invocation_start - 1) { - SLT_DBG(slot, "stop prompt batch filling at (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start); - break; - } - - // embedding requires all tokens in the batch to be output - common_batch_add(batch, - cur_tok, - slot.prompt.tokens.pos_next(), - { slot.id }, - slot.need_embd()); - slot.prompt.tokens.push_back(cur_tok); - - slot.n_prompt_tokens_processed++; - - // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. - if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) { - break; - } - } - - // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.slot.prompt.tokens.str().c_str()); - - SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); - - // entire prompt has been processed - if (slot.prompt.n_tokens() == slot.task->n_tokens()) { - slot.state = SLOT_STATE_DONE_PROMPT; - - GGML_ASSERT(batch.n_tokens > 0); - - common_sampler_reset(slot.smpl); - - // Process all prompt tokens through sampler system - for (int i = 0; i < slot.task->n_tokens(); ++i) { - llama_token id = input_tokens[i]; - if (id != LLAMA_TOKEN_NULL) { - common_sampler_accept(slot.smpl, id, false); - } - } - - // extract the logits only for the last token - batch.logits[batch.n_tokens - 1] = true; - - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - - SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens); - - const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); - const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id); - - // no need for empty or small checkpoints - do_checkpoint = do_checkpoint && (pos_min >= 0 && pos_max >= 64); - - // no need to create checkpoints that are too close together - do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || pos_max > slot.prompt.checkpoints.back().pos_max + 64); - - if (do_checkpoint) { - while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) { - // make room for the new checkpoint, if needed - const auto & cur = slot.prompt.checkpoints.front(); - - SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", - cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); - - slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin()); - } - - const size_t 
checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - - auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{ - /*.pos_min = */ pos_min, - /*.pos_max = */ pos_max, - /*.data = */ std::vector(checkpoint_size), - }); - - llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - - SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", - (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); - } - } - } - - if (batch.n_tokens >= n_batch) { - break; - } - } - } - - if (batch.n_tokens == 0) { - SRV_WRN("%s", "no tokens to decode\n"); - return; - } - - SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); - - if (slot_batched) { - // apply lora, only need to do it once per batch - common_set_adapter_lora(ctx, slot_batched->lora); - - // if the lora is temporarily disabled for an alora, re-enable it - // for next time - if (alora_scale > 0.0f) { - SRV_DBG("re-enabling alora with scale %f\n", alora_scale); - slot_batched->lora[alora_disabled_id].scale = alora_scale; - } - - llama_set_embeddings(ctx, slot_batched->need_embd()); - } - - int32_t i_next = 0; - - // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i = i_next) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - }; - - const int ret = llama_decode(ctx, batch_view); - - metrics.on_decoded(slots); - - if (ret != 0) { - { - std::string err; - - if (n_batch == 1 && ret == 1) { - // TODO: try to terminate only the largest active slot/sequence and continue with the rest - // need to remove the tokens from the current batch too - err = "Context size has been exceeded."; - } - - if (ret == -1) { - err = "Invalid input batch."; - } - - if (ret < -1) { - // TODO: update slot state based on llama_memory_seq_pos_min() and llama_memory_seq_pos_max() - err = "Compute error."; - } - - // TODO: handle ret == 2 (abort) when we start aborting - - if (!err.empty()) { - SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); - - for (auto & slot : slots) { - if (slot.is_processing()) { - send_error(slot, err); - slot.release(); - } - } - - break; - } - } - - // retry with half the batch size to try to find a free slot in the KV cache - if (!try_purge_idle_slots()) { - n_batch /= 2; - } - - SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); - - continue; // continue loop of n_batch - } - - // move the head of the batch forward with the number of tokens we just processed - i_next = i + n_tokens; - - // on successful decode, restore the original batch size - n_batch = llama_n_batch(ctx); - - for (auto & slot : slots) { - // optionally send prompt processing progress - if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.task->params.stream && slot.task->params.return_progress) { - send_partial_response(slot, {}, true); - } - } - - if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { - continue; // continue loop of slots - } - - if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.task->type == SERVER_TASK_TYPE_EMBEDDING) { - // prompt 
evaluated for embedding - send_embedding(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } - - if (slot.task->type == SERVER_TASK_TYPE_RERANK) { - send_rerank(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } - - // prompt evaluated for next-token prediction - slot.state = SLOT_STATE_GENERATING; - } else if (slot.state != SLOT_STATE_GENERATING) { - continue; // continue loop of slots - } - - const int tok_idx = slot.i_batch - i; - - llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); - - slot.i_batch = -1; - - common_sampler_accept(slot.smpl, id, true); - - slot.n_decoded += 1; - - const int64_t t_current = ggml_time_us(); - - if (slot.n_decoded == 1) { - slot.t_start_generation = t_current; - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); - } - - slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; - - completion_token_output result; - result.tok = id; - result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs - - if (slot.task->params.sampling.n_probs > 0) { - populate_token_probs(slot, result, slot.task->params.post_sampling_probs, params_base.special, tok_idx); - } - - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - slot.release(); - - continue; - } - } - - // do speculative decoding - // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] - // perform the speculative drafting for all sequences at the same time in a single batch - for (auto & slot : slots) { - if (!slot.is_processing() || !slot.can_speculate()) { - continue; - } - - if (slot.state != SLOT_STATE_GENERATING) { - continue; - } - - if (mctx) { - // we should never reach this, as speculative is automatically disabled if mmproj is loaded - GGML_ABORT("not supported by multimodal"); - } - - // determine the max draft that fits the current slot state - int n_draft_max = slot.task->params.speculative.n_max; - - // note: slot.prompt is not yet expanded with the `id` token sampled above - // also, need to leave space for 1 extra token to allow context shifts - n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.prompt.n_tokens() - 2); - - if (slot.n_remaining > 0) { - n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); - } - - SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); - - if (n_draft_max < slot.task->params.speculative.n_min) { - SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.task->params.speculative.n_min); - - continue; - } - - llama_token id = slot.sampled; - - struct common_speculative_params params_spec; - params_spec.n_draft = n_draft_max; - params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max; - params_spec.p_min = slot.task->params.speculative.p_min; - - const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens(); - llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id); - - // ignore small drafts - if (slot.task->params.speculative.n_min > (int) draft.size()) { - SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), 
slot.task->params.speculative.n_min); - - continue; - } - - // keep track of total number of drafted tokens tested - slot.n_draft_total += draft.size(); - - // construct the speculation batch - common_batch_clear(slot.batch_spec); - common_batch_add (slot.batch_spec, id, slot.prompt.tokens.pos_next(), { slot.id }, true); - - for (size_t i = 0; i < draft.size(); ++i) { - common_batch_add(slot.batch_spec, draft[i], slot.prompt.tokens.pos_next() + 1 + i, { slot.id }, true); - } - - SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); - - llama_decode(ctx, slot.batch_spec); - - // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); - - slot.n_decoded += ids.size(); - - // update how many tokens out of those tested were accepted - slot.n_draft_accepted += ids.size() - 1; - - slot.prompt.tokens.push_back(id); - slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); - - llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1); - - for (size_t i = 0; i < ids.size(); ++i) { - completion_token_output result; - - result.tok = ids[i]; - result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // set later - - // TODO: set result.probs - - if (!process_token(result, slot)) { - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - slot.release(); - - break; - } - } - - SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.prompt.n_tokens()); - } - } - - SRV_DBG("%s", "run slots completed\n"); - } - - json model_meta() const { - return json { - {"vocab_type", llama_vocab_type (vocab)}, - {"n_vocab", llama_vocab_n_tokens (vocab)}, - {"n_ctx_train", llama_model_n_ctx_train(model)}, - {"n_embd", llama_model_n_embd (model)}, - {"n_params", llama_model_n_params (model)}, - {"size", llama_model_size (model)}, - }; - } -}; - -// generator-like API for server responses, supports polling connection state and aggregating results -struct server_response_reader { - std::unordered_set id_tasks; - server_context & ctx_server; - size_t received_count = 0; - bool cancelled = false; - - server_response_reader(server_context & ctx_server) : ctx_server(ctx_server) {} - ~server_response_reader() { - stop(); - } - - void post_tasks(std::vector && tasks) { - id_tasks = server_task::get_list_id(tasks); - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(std::move(tasks)); - } - - bool has_next() { - return !cancelled && received_count < id_tasks.size(); - } - - // return nullptr if should_stop() is true before receiving a result - // note: if one error is received, it will stop further processing and return the error result - server_task_result_ptr next(const std::function & should_stop) { - while (true) { - server_task_result_ptr result = ctx_server.queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS); - if (result == nullptr) { - // timeout, check stop condition - if (should_stop()) { - SRV_DBG("%s", "stopping wait for next result due to should_stop condition\n"); - return nullptr; - } - } else { - if (result->is_error()) { - stop(); // cancel remaining tasks - SRV_DBG("%s", "received error result, stopping further processing\n"); - return result; - } - if (result->is_stop()) { - received_count++; - } - return result; - } - } - - // should not reach here - } - - struct batch_response { - bool is_terminated = false;
// if true, indicates that processing was stopped before all results were received - std::vector results; - server_task_result_ptr error; // nullptr if no error - }; - - batch_response wait_for_all(const std::function & should_stop) { - batch_response batch_res; - batch_res.results.resize(id_tasks.size()); - while (has_next()) { - auto res = next(should_stop); - if (res == nullptr) { - batch_res.is_terminated = true; - return batch_res; - } - if (res->is_error()) { - batch_res.error = std::move(res); - return batch_res; - } - const size_t idx = res->get_index(); - GGML_ASSERT(idx < batch_res.results.size() && "index out of range"); - GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received"); - batch_res.results[idx] = std::move(res); - } - return batch_res; - } - - void stop() { - ctx_server.queue_results.remove_waiting_task_ids(id_tasks); - if (has_next() && !cancelled) { - // if tasks are not finished yet, cancel them - cancelled = true; - std::vector cancel_tasks; - cancel_tasks.reserve(id_tasks.size()); - for (const auto & id_task : id_tasks) { - SRV_WRN("cancel task, id_task = %d\n", id_task); - server_task task(SERVER_TASK_TYPE_CANCEL); - task.id_target = id_task; - ctx_server.queue_results.remove_waiting_task_id(id_task); - cancel_tasks.push_back(std::move(task)); - } - // push to beginning of the queue, so it has highest priority - ctx_server.queue_tasks.post(std::move(cancel_tasks), true); - } else { - SRV_DBG("%s", "all tasks already finished, no need to cancel\n"); - } - } -}; - -static void log_server_request(const httplib::Request & req, const httplib::Response & res) { - // skip GH copilot requests when using default port - if (req.path == "/v1/health") { - return; - } - - // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch - - SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - - SRV_DBG("request: %s\n", req.body.c_str()); - SRV_DBG("response: %s\n", res.body.c_str()); -} - -static void res_error(httplib::Response & res, const json & error_data) { - json final_response {{"error", error_data}}; - res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON); - res.status = json_value(error_data, "code", 500); -} - -static void res_ok(httplib::Response & res, const json & data) { - res.set_content(safe_json_to_str(data), MIMETYPE_JSON); - res.status = 200; -} - -std::function shutdown_handler; -std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - -inline void signal_handler(int signal) { - if (is_terminating.test_and_set()) { - // in case it hangs, we can force terminate the server by hitting Ctrl+C twice - // this is for better developer experience, we can remove when the server is stable enough - fprintf(stderr, "Received second interrupt, terminating immediately.\n"); - exit(1); - } - - shutdown_handler(signal); -} - -int main(int argc, char ** argv) { - // own arguments required by this example - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { - return 1; - } - - // TODO: should we have a separate n_parallel parameter for the server?
- // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 - // TODO: this is a common configuration that is suitable for most local use cases - // however, overriding the parameters is a bit confusing - figure out something more intuitive - if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { - LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - - params.n_parallel = 4; - params.kv_unified = true; - } - - common_init(); - - // struct that contains llama context and inference - server_context ctx_server; - - llama_backend_init(); - llama_numa_init(params.numa); - - LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); - - std::unique_ptr svr; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); - svr.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) - ); - } else { - LOG_INF("Running without SSL\n"); - svr.reset(new httplib::Server()); - } -#else - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); - return 1; - } - svr.reset(new httplib::Server()); -#endif - - std::atomic state{SERVER_STATE_LOADING_MODEL}; - - svr->set_default_headers({{"Server", "llama.cpp"}}); - svr->set_logger(log_server_request); - svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { - std::string message; - try { - std::rethrow_exception(ep); - } catch (const std::exception & e) { - message = e.what(); - } catch (...) 
{ - message = "Unknown Exception"; - } - - try { - json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - res_error(res, formatted_error); - } catch (const std::exception & e) { - LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str()); - } - }); - - svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { - if (res.status == 404) { - res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - } - // for other error codes, we skip processing here because it's already done by res_error() - }); - - // set timeouts and change hostname and port - svr->set_read_timeout (params.timeout_read); - svr->set_write_timeout(params.timeout_write); - - std::unordered_map log_data; - - log_data["hostname"] = params.hostname; - log_data["port"] = std::to_string(params.port); - - if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); - } else if (params.api_keys.size() > 1) { - log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; - } - - // Necessary similarity of prompt for slot selection - ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - - // - // Middlewares - // - - auto middleware_validate_api_key = [&params](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { - "/health", - "/v1/health", - "/models", - "/v1/models", - "/api/tags" - }; - - // If API key is not set, skip validation - if (params.api_keys.empty()) { - return true; - } - - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { - return true; - } - - // Check for API key in the header - auto auth_header = req.get_header_value("Authorization"); - - std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { - std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) { - return true; // API key is valid - } - } - - // API key is invalid or not provided - res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - - LOG_WRN("Unauthorized: Invalid API Key\n"); - - return false; - }; - - auto middleware_server_state = [&state](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - if (current_state == SERVER_STATE_LOADING_MODEL) { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - return false; - } - return true; - }; - - // register server middlewares - svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - // If this is an OPTIONS request, skip validation because browsers don't include Authorization header - if (req.method == "OPTIONS") { - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "GET, POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - res.set_content("", "text/html"); // blank response, no data - return httplib::Server::HandlerResponse::Handled; // skip further processing - } -
if (!middleware_server_state(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - if (!middleware_validate_api_key(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - return httplib::Server::HandlerResponse::Unhandled; - }); - - // - // Route handlers (or controllers) - // - - const auto handle_health = [&](const httplib::Request &, httplib::Response & res) { - // error and loading states are handled by middleware - json health = {{"status", "ok"}}; - res_ok(res, health); - }; - - const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { - if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // optionally return "fail_on_no_slot" error - if (req.has_param("fail_on_no_slot")) { - if (res_task->n_idle_slots == 0) { - res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); - return; - } - } - - res_ok(res, res_task->slots_data); - }; - - const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { - if (!params.endpoint_metrics) { - res_error(res, format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names - json all_metrics_def = json { - {"counter", {{ - {"name", "prompt_tokens_total"}, - {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} - }, { - {"name", "prompt_seconds_total"}, - {"help", "Prompt process time"}, - {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} - }, { - {"name", "tokens_predicted_total"}, - {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) res_task->n_tokens_predicted_total} - }, { - {"name", "tokens_predicted_seconds_total"}, - {"help", "Predict process time"}, - {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} - }, { - {"name", "n_decode_total"}, - {"help", "Total number of llama_decode() calls"}, - {"value", res_task->n_decode_total} - }, { - {"name", "n_tokens_max"}, - {"help", "Largest observed n_tokens."}, - {"value", res_task->n_tokens_max} - }, { - {"name", "n_busy_slots_per_decode"}, - {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} - }}}, - {"gauge", {{ - {"name", "prompt_tokens_seconds"}, - {"help", "Average prompt throughput in tokens/s."}, - {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} - },{ - {"name", "predicted_tokens_seconds"}, - {"help", "Average generation throughput in tokens/s."}, - {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} - },{ - {"name", "requests_processing"}, - {"help", "Number of requests processing."}, - {"value", (uint64_t) res_task->n_processing_slots} - },{ - {"name", "requests_deferred"}, - {"help", "Number of requests deferred."}, - {"value", (uint64_t) res_task->n_tasks_deferred} - }}} - }; - - std::stringstream prometheus; - - for (const auto & el : all_metrics_def.items()) { - const auto & type = el.key(); - const auto & metrics_def = el.value(); - - for (const auto & metric_def : metrics_def) { - const std::string name = metric_def.at("name"); - const std::string help = metric_def.at("help"); - - auto value = json_value(metric_def, "value", 0.); - prometheus << "# HELP llamacpp:" << name << " " << help << "\n" - << "# TYPE llamacpp:" << name << " " << type << "\n" - << "llamacpp:" << name << " " << value << "\n"; - } - } - - res.set_header("Process-Start-Time-Unix", std::to_string(res_task->t_start)); - - res.set_content(prometheus.str(), "text/plain; version=0.0.4"); - res.status = 200; // HTTP OK - }; - - const auto handle_slots_save = [&ctx_server, &params](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_SAVE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - res_ok(res, result->to_json()); - }; - - const auto handle_slots_restore = [&ctx_server, &params](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; - - const auto handle_slots_erase = [&ctx_server](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_ERASE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - -
ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); +#include +#include +#include // for std::thread::hardware_concurrency - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } +#if defined(_WIN32) +#include +#endif - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; +static std::function shutdown_handler; +static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - const auto handle_slots_action = [&params, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { - if (params.slot_save_path.empty()) { - res_error(res, format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } +static inline void signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice + // this is for better developer experience, we can remove when the server is stable enough + fprintf(stderr, "Received second interrupt, terminating immediately.\n"); + exit(1); + } - std::string id_slot_str = req.path_params.at("id_slot"); - int id_slot; + shutdown_handler(signal); +} +// wrapper function that handles exceptions and logs errors +// this is to make sure handler_t never throws exceptions; instead, it returns an error response +static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { + return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { + std::string message; + error_type error; try { - id_slot = std::stoi(id_slot_str); - } catch (const std::exception &) { - res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - std::string action = req.get_param_value("action"); - - if (action == "save") { - handle_slots_save(req, res, id_slot); - } else if (action == "restore") { - handle_slots_restore(req, res, id_slot); - } else if (action == "erase") { - handle_slots_erase(req, res, id_slot); - } else { - res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); - } - }; - - const auto handle_props = [&params, &ctx_server](const httplib::Request &, httplib::Response & res) { - json default_generation_settings_for_props; - - { - slot_params params; - - params.sampling = ctx_server.params_base.sampling; - - default_generation_settings_for_props = json { - {"params", params.to_json(true)}, - {"n_ctx", ctx_server.slots[0].n_ctx}, - }; - } - - // this endpoint is publicly available, please only return what is safe to be exposed - json data = { - { "default_generation_settings", default_generation_settings_for_props }, - { "total_slots", ctx_server.params_base.n_parallel }, - { "model_alias", ctx_server.params_base.model_alias }, - { "model_path", ctx_server.params_base.model.path }, - { "modalities", json { - {"vision", ctx_server.oai_parser_opt.allow_image}, - {"audio", ctx_server.oai_parser_opt.allow_audio}, - } }, - { "endpoint_slots", params.endpoint_slots }, - { "endpoint_props", params.endpoint_props }, - { "endpoint_metrics", params.endpoint_metrics }, - { "webui", params.webui }, - { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, - { "bos_token",
common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, - { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, - { "build_info", build_info }, - }; - if (ctx_server.params_base.use_jinja) { - if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { - data["chat_template_tool_use"] = tool_use_src; - } - } - - res_ok(res, data); - }; - - const auto handle_props_change = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.endpoint_props) { - res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); - return; + return func(req); + } catch (const std::invalid_argument & e) { + // treat invalid_argument as invalid request (400) + error = ERROR_TYPE_INVALID_REQUEST; + message = e.what(); + } catch (const std::exception & e) { + // treat other exceptions as server error (500) + error = ERROR_TYPE_SERVER; + message = e.what(); + } catch (...) { + error = ERROR_TYPE_SERVER; + message = "unknown error"; } - json data = json::parse(req.body); - - // update any props here - - res_ok(res, {{ "success", true }}); - }; - - const auto handle_api_show = [&ctx_server](const httplib::Request &, httplib::Response & res) { - bool has_mtmd = ctx_server.mctx != nullptr; - json data = { - { - "template", common_chat_templates_source(ctx_server.chat_templates.get()), - }, - { - "model_info", { - { "llama.context_length", ctx_server.slots.back().n_ctx, }, - } - }, - {"modelfile", ""}, - {"parameters", ""}, - {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }}, - {"model_info", ""}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})} - }; - - res_ok(res, data); - }; - - // handle completion-like requests (completion, chat, infill) - // we can optionally provide a custom format for partial results and final results - const auto handle_completions_impl = [&ctx_server]( - server_task_type type, - json & data, - const std::vector & files, - const std::function & is_connection_closed, - httplib::Response & res, - oaicompat_type oaicompat) -> void { - GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); - - auto completion_id = gen_chatcmplid(); - // need to store the reader as a pointer, so that it won't be destroyed when the handle returns - // use shared_ptr as it's shared between the chunked_content_provider() and on_complete() - const auto rd = std::make_shared(ctx_server); - + auto res = std::make_unique(); + res->status = 500; try { - std::vector tasks; - - const auto & prompt = data.at("prompt"); - // TODO: this log can become very long, put it behind a flag or think about a more compact format - //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); - - // process prompt - std::vector inputs; - - if (oaicompat && ctx_server.mctx != nullptr) { - // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below. - inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); - } else { - // Everything else, including multimodal completions. 
- inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); - } - tasks.reserve(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) { - server_task task = server_task(type); - - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - - task.tokens = std::move(inputs[i]); - task.params = server_task::params_from_json_cmpl( - ctx_server.ctx, - ctx_server.params_base, - data); - task.id_slot = json_value(data, "id_slot", -1); - - // OAI-compat - task.params.oaicompat = oaicompat; - task.params.oaicompat_cmpl_id = completion_id; - // oaicompat_model is already populated by params_from_json_cmpl - - tasks.push_back(std::move(task)); - } - - rd->post_tasks(std::move(tasks)); + json error_data = format_error_response(message, error); + res->status = json_value(error_data, "code", 500); + res->data = safe_json_to_str({{ "error", error_data }}); + SRV_WRN("got exception: %s\n", res->data.c_str()); } catch (const std::exception & e) { - res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); - return; - } - - bool stream = json_value(data, "stream", false); - - if (!stream) { - // non-stream, wait for the results - auto all_results = rd->wait_for_all(is_connection_closed); - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - json arr = json::array(); - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - arr.push_back(res->to_json()); - } - // if single request, return single object instead of array - res_ok(res, arr.size() == 1 ? arr[0] : arr); - } - - } else { - // in streaming mode, the first error must be treated as non-stream response - // this is to match the OAI API behavior - // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309 - server_task_result_ptr first_result = rd->next(is_connection_closed); - if (first_result == nullptr) { - return; // connection is closed - } else if (first_result->is_error()) { - res_error(res, first_result->to_json()); - return; - } else { - GGML_ASSERT( - dynamic_cast(first_result.get()) != nullptr - || dynamic_cast(first_result.get()) != nullptr - ); - } - - // next responses are streamed - json first_result_json = first_result->to_json(); - const auto chunked_content_provider = [first_result_json, rd, oaicompat](size_t, httplib::DataSink & sink) mutable -> bool { - // flush the first result as it's not an error - if (!first_result_json.empty()) { - if (!server_sent_event(sink, first_result_json)) { - sink.done(); - return false; // sending failed, go to on_complete() - } - first_result_json.clear(); // mark as sent - } - - // receive subsequent results - auto result = rd->next([&sink]{ return !sink.is_writable(); }); - if (result == nullptr) { - sink.done(); - return false; // connection is closed, go to on_complete() - } - - // send the results - json res_json = result->to_json(); - bool ok = false; - if (result->is_error()) { - ok = server_sent_event(sink, json {{ "error", result->to_json() }}); - sink.done(); - return false; // go to on_complete() - } else { - GGML_ASSERT( - dynamic_cast(result.get()) != nullptr - || dynamic_cast(result.get()) != nullptr - ); - ok = server_sent_event(sink, res_json); - } - - if (!ok) { - sink.done(); - return false; // sending failed, go to on_complete() - } - - // check if there is more data - if (!rd->has_next()) { - if (oaicompat != OAICOMPAT_TYPE_NONE) { - static 
const std::string ev_done = "data: [DONE]\n\n"; - sink.write(ev_done.data(), ev_done.size()); - } - sink.done(); - return false; // no more data, go to on_complete() - } - - // has next data, continue - return true; - }; - - auto on_complete = [rd](bool) { - rd->stop(); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } - }; - - const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = json::parse(req.body); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); - }; - - const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = oaicompat_completion_params_parse(json::parse(req.body)); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_COMPLETION); - }; - - const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - // check model compatibility - std::string err; - if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "prefix token is missing. "; - } - if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "suffix token is missing. "; - } - if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "middle token is missing. "; - } - if (!err.empty()) { - res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // validate input - if (data.contains("prompt") && !data.at("prompt").is_string()) { - // prompt is optional - res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_prefix")) { - res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_suffix")) { - res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (data.contains("input_extra") && !data.at("input_extra").is_array()) { - // input_extra is optional - res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - json input_extra = json_value(data, "input_extra", json::array()); - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - if (!chunk.contains("text") || !chunk.at("text").is_string()) { - res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); - return; - } - // filename is optional - if (chunk.contains("filename") && !chunk.at("filename").is_string()) { - res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - data["input_extra"] = input_extra; // default to empty array if it does not exist - - std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); - SRV_DBG("creating infill tasks, n_prompts = %d\n", (int)
tokenized_prompts.size()); - data["prompt"] = format_infill( - ctx_server.vocab, - data.at("input_prefix"), - data.at("input_suffix"), - data.at("input_extra"), - ctx_server.params_base.n_batch, - ctx_server.params_base.n_predict, - ctx_server.slots[0].n_ctx, // TODO: there should be a better way - ctx_server.params_base.spm_infill, - tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. - ); - - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_INFILL, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); // infill is not OAI compatible - }; - - const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - LOG_DBG("request: %s\n", req.body.c_str()); - - auto body = json::parse(req.body); - std::vector files; - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_CHAT); - }; - - // same as handle_chat_completions, but without the inference part - const auto handle_apply_template = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - auto body = json::parse(req.body); - std::vector files; // dummy, unused - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); - }; - - const auto handle_models = [&params, &ctx_server, &state](const httplib::Request &, httplib::Response & res) { - server_state current_state = state.load(); - json model_meta = nullptr; - if (current_state == SERVER_STATE_READY) { - model_meta = ctx_server.model_meta(); - } - bool has_mtmd = ctx_server.mctx != nullptr; - json models = { - {"models", { - { - {"name", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"model", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"modified_at", ""}, - {"size", ""}, - {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash - {"type", "model"}, - {"description", ""}, - {"tags", {""}}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}, - {"parameters", ""}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }} - } - }}, - {"object", "list"}, - {"data", { - { - {"id", params.model_alias.empty() ?
params.model.path : params.model_alias}, - {"object", "model"}, - {"created", std::time(0)}, - {"owned_by", "llamacpp"}, - {"meta", model_meta}, - }, - }} - }; - - res_ok(res, models); - }; - - const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - json tokens_response = json::array(); - if (body.count("content") != 0) { - const bool add_special = json_value(body, "add_special", false); - const bool parse_special = json_value(body, "parse_special", true); - const bool with_pieces = json_value(body, "with_pieces", false); - - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); - - if (with_pieces) { - for (const auto& token : tokens) { - std::string piece = common_token_to_piece(ctx_server.ctx, token); - json piece_json; - - // Check if the piece is valid UTF-8 - if (is_valid_utf8(piece)) { - piece_json = piece; - } else { - // If not valid UTF-8, store as array of byte values - piece_json = json::array(); - for (unsigned char c : piece) { - piece_json.push_back(static_cast(c)); - } - } - - tokens_response.push_back({ - {"id", token}, - {"piece", piece_json} - }); - } - } else { - tokens_response = tokens; - } - } - - const json data = format_tokenizer_response(tokens_response); - res_ok(res, data); - }; - - const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - std::string content; - if (body.count("tokens") != 0) { - const llama_tokens tokens = body.at("tokens"); - content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); - } - - const json data = format_detokenized_response(content); - res_ok(res, data); - }; - - const auto handle_embeddings_impl = [&ctx_server](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) { - if (!ctx_server.params_base.embedding) { - res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - const json body = json::parse(req.body); - - // for the shape of input/content, see tokenize_input_prompts() - json prompt; - if (body.count("input") != 0) { - prompt = body.at("input"); - } else if (body.contains("content")) { - oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible - prompt = body.at("content"); - } else { - res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - bool use_base64 = false; - if (body.count("encoding_format") != 0) { - const std::string& format = body.at("encoding_format"); - if (format == "base64") { - use_base64 = true; - } else if (format != "float") { - res_error(res, format_error_response("The format to return the embeddings in. 
Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); - for (const auto & tokens : tokenized_prompts) { - // this check is necessary for models that do not add BOS token to the input - if (tokens.empty()) { - res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - int embd_normalize = 2; // default to Euclidean/L2 norm - if (body.count("embd_normalize") != 0) { - embd_normalize = body.at("embd_normalize"); - if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); - } - } - - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); - - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tokenized_prompts[i]); - - // OAI-compat - task.params.oaicompat = oaicompat; - task.params.embd_normalize = embd_normalize; - - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } - - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); - - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } + SRV_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str()); + res->data = "Internal Server Error"; } - - // write JSON response - json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? format_embeddings_response_oaicompat(body, responses, use_base64) - : json(responses); - res_ok(res, root); - }; - - const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE); - }; - - const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING); + return res; }; +} - const auto handle_rerank = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { - res_error(res, format_error_response("This server does not support reranking. 
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - const json body = json::parse(req.body); - - // if true, use TEI API format, otherwise use Jina API format - // Jina: https://jina.ai/reranker/ - // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank - bool is_tei_format = body.contains("texts"); - - json query; - if (body.count("query") == 1) { - query = body.at("query"); - if (!query.is_string()) { - res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } else { - res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - std::vector documents = json_value(body, "documents", - json_value(body, "texts", std::vector())); - if (documents.empty()) { - res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - int top_n = json_value(body, "top_n", (int)documents.size()); - - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - tasks.reserve(documents.size()); - for (size_t i = 0; i < documents.size(); i++) { - auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); - server_task task = server_task(SERVER_TASK_TYPE_RERANK); - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tmp); - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } - - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); +int main(int argc, char ** argv, char ** envp) { + // own arguments required by this example + common_params params; - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } - } + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + return 1; + } - // write JSON response - json root = format_response_rerank( - body, - responses, - is_tei_format, - documents, - top_n); + // TODO: should we have a separate n_parallel parameter for the server? 
+ // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 + // TODO: this is a common configuration that is suitable for most local use cases + // however, overriding the parameters is a bit confusing - figure out something more intuitive + if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { + LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - res_ok(res, root); - }; + params.n_parallel = 4; + params.kv_unified = true; + } - const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { - json result = json::array(); - const auto & loras = ctx_server.params_base.lora_adapters; - for (size_t i = 0; i < loras.size(); ++i) { - auto & lora = loras[i]; - json entry = { - {"id", i}, - {"path", lora.path}, - {"scale", lora.scale}, - {"task_name", lora.task_name}, - {"prompt_prefix", lora.prompt_prefix}, - }; - std::string alora_invocation_string = ""; - const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); - std::vector alora_invocation_tokens; - if (n_alora_tokens) { - const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); - for (uint64_t i = 0; i < n_alora_tokens; ++i) { - alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); - alora_invocation_tokens.push_back(alora_tokens[i]); - } - entry["alora_invocation_string"] = alora_invocation_string; - entry["alora_invocation_tokens"] = alora_invocation_tokens; - } - result.push_back(std::move(entry)); - } - res_ok(res, result); - res.status = 200; // HTTP OK - }; + // for consistency between server router mode and single-model mode, we set the same model name as alias + if (params.model_alias.empty() && !params.model.name.empty()) { + params.model_alias = params.model.name; + } - const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - if (!body.is_array()) { - res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); - return; - } + common_init(); - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SET_LORA); - task.id = task_id; - task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } + // struct that contains llama context and inference + server_context ctx_server; - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); + llama_backend_init(); + llama_numa_init(params.numa); - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } + LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; + server_http_context ctx_http; + if (!ctx_http.init(params)) { + LOG_ERR("%s: failed to initialize HTTP server\n", __func__); + return 1; + } // // Router // - if (!params.webui) { - LOG_INF("Web UI is disabled\n"); - } else { - // register static assets routes - if (!params.public_path.empty()) { - 
// Set the base directory for serving static files - bool is_found = svr->set_mount_point(params.api_prefix + "/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } - } else { - // using embedded static index.html - svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { - if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { - res.set_content("Error: gzip is not supported by this browser", "text/plain"); - } else { - res.set_header("Content-Encoding", "gzip"); - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - } - return false; - }); - } - } - // register API routes - svr->Get (params.api_prefix + "/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/metrics", handle_metrics); - svr->Get (params.api_prefix + "/props", handle_props); - svr->Post(params.api_prefix + "/props", handle_props_change); - svr->Post(params.api_prefix + "/api/show", handle_api_show); - svr->Get (params.api_prefix + "/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/api/tags", handle_models); // ollama specific endpoint. public endpoint (no API key check) - svr->Post(params.api_prefix + "/completion", handle_completions); // legacy - svr->Post(params.api_prefix + "/completions", handle_completions); - svr->Post(params.api_prefix + "/v1/completions", handle_completions_oai); - svr->Post(params.api_prefix + "/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/v1/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/api/chat", handle_chat_completions); // ollama specific endpoint - svr->Post(params.api_prefix + "/infill", handle_infill); - svr->Post(params.api_prefix + "/embedding", handle_embeddings); // legacy - svr->Post(params.api_prefix + "/embeddings", handle_embeddings); - svr->Post(params.api_prefix + "/v1/embeddings", handle_embeddings_oai); - svr->Post(params.api_prefix + "/rerank", handle_rerank); - svr->Post(params.api_prefix + "/reranking", handle_rerank); - svr->Post(params.api_prefix + "/v1/rerank", handle_rerank); - svr->Post(params.api_prefix + "/v1/reranking", handle_rerank); - svr->Post(params.api_prefix + "/tokenize", handle_tokenize); - svr->Post(params.api_prefix + "/detokenize", handle_detokenize); - svr->Post(params.api_prefix + "/apply-template", handle_apply_template); + server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); }); + + bool is_router_server = params.model.path.empty(); + std::optional models_routes{}; + if (is_router_server) { + // setup server instances manager + models_routes.emplace(params, argc, argv, envp); + + // proxy handlers + // note: routes.get_health stays the same + routes.get_metrics = models_routes->proxy_get; + routes.post_props = models_routes->proxy_post; + routes.get_api_show = models_routes->proxy_get; + routes.post_completions = models_routes->proxy_post; + routes.post_completions_oai = models_routes->proxy_post; + routes.post_chat_completions = models_routes->proxy_post; + 
+    bool is_router_server = params.model.path.empty();
+    std::optional<server_models_routes> models_routes{};
+    if (is_router_server) {
+        // setup server instances manager
+        models_routes.emplace(params, argc, argv, envp);
+
+        // proxy handlers
+        // note: routes.get_health stays the same
+        routes.get_metrics = models_routes->proxy_get;
+        routes.post_props = models_routes->proxy_post;
+        routes.get_api_show = models_routes->proxy_get;
+        routes.post_completions = models_routes->proxy_post;
+        routes.post_completions_oai = models_routes->proxy_post;
+        routes.post_chat_completions = models_routes->proxy_post;
+        routes.post_anthropic_messages = models_routes->proxy_post;
+        routes.post_anthropic_count_tokens = models_routes->proxy_post;
+        routes.post_infill = models_routes->proxy_post;
+        routes.post_embeddings = models_routes->proxy_post;
+        routes.post_embeddings_oai = models_routes->proxy_post;
+        routes.post_rerank = models_routes->proxy_post;
+        routes.post_tokenize = models_routes->proxy_post;
+        routes.post_detokenize = models_routes->proxy_post;
+        routes.post_apply_template = models_routes->proxy_post;
+        routes.get_lora_adapters = models_routes->proxy_get;
+        routes.post_lora_adapters = models_routes->proxy_post;
+        routes.get_slots = models_routes->proxy_get;
+        routes.post_slots = models_routes->proxy_post;
+
+        // custom routes for router
+        routes.get_props = models_routes->get_router_props;
+        routes.get_models = models_routes->get_router_models;
+        ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
+        ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
+        ctx_http.post("/models/status", ex_wrapper(models_routes->post_router_models_status));
+    }
+
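+    // route registration is shared between router mode and single-model mode;
+    // in router mode most handlers have been swapped for proxies above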
+    ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+    ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+    ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
+    ctx_http.get ("/props", ex_wrapper(routes.get_props));
+    ctx_http.post("/props", ex_wrapper(routes.post_props));
+    ctx_http.post("/api/show", ex_wrapper(routes.get_api_show));
+    ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
+    ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
+    ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check)
+    ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy
+    ctx_http.post("/completions", ex_wrapper(routes.post_completions));
+    ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai));
+    ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
+    ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
+    ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
+    ctx_http.post("/infill", ex_wrapper(routes.post_infill));
+    ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy
+    ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings));
+    ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai));
+    ctx_http.post("/rerank", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/reranking", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank));
+    ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize));
+    ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize));
+    ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template));

     // LoRA adapters hotswap
-    svr->Get (params.api_prefix + "/lora-adapters", handle_lora_adapters_list);
-    svr->Post(params.api_prefix + "/lora-adapters", handle_lora_adapters_apply);
+    ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters));
+    ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters));

     // Save & load slots
-    svr->Get (params.api_prefix + "/slots", handle_slots);
-    svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action);
+    ctx_http.get ("/slots", ex_wrapper(routes.get_slots));
+    ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots));

     //
     // Start the server
     //

-    if (params.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] = std::to_string(params.n_threads_http);
-    svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
-
-    // clean up function, to be called before exit
-    auto clean_up = [&svr, &ctx_server]() {
-        SRV_INF("%s: cleaning up before exit...\n", __func__);
-        svr->stop();
-        ctx_server.queue_results.terminate();
-        llama_backend_free();
-    };
-    bool was_bound = false;
-    bool is_sock = false;
-    if (string_ends_with(std::string(params.hostname), ".sock")) {
-        is_sock = true;
-        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
-        svr->set_address_family(AF_UNIX);
-        // bind_to_port requires a second arg, any value other than 0 should
-        // simply get ignored
-        was_bound = svr->bind_to_port(params.hostname, 8080);
-    } else {
-        LOG_INF("%s: binding port with default address family\n", __func__);
-        // bind HTTP listen port
-        if (params.port == 0) {
-            int bound_port = svr->bind_to_any_port(params.hostname);
-            if ((was_bound = (bound_port >= 0))) {
-                params.port = bound_port;
-            }
-        } else {
-            was_bound = svr->bind_to_port(params.hostname, params.port);
-        }
-    }

+    std::function<void()> clean_up;
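+    // note: clean_up is assigned per mode below, since teardown differs:
+    // the router unloads its child instances, while single-model mode stops
+    // the HTTP server and terminates the server context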
%d\n", __func__, params.hostname.c_str(), params.port); - clean_up(); - return 1; - } + if (is_router_server) { + LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__); - // run the HTTP server in a thread - std::thread t([&]() { svr->listen_after_bind(); }); - svr->wait_until_ready(); + clean_up = [&models_routes]() { + SRV_INF("%s: cleaning up before exit...\n", __func__); + if (models_routes.has_value()) { + models_routes->models.unload_all(); + } + llama_backend_free(); + }; - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } + ctx_http.is_ready.store(true); - // load the model - LOG_INF("%s: loading model\n", __func__); + shutdown_handler = [&](int) { + ctx_http.stop(); + }; - if (!ctx_server.load_model(params)) { - clean_up(); - t.join(); - LOG_ERR("%s: exiting due to model loading error\n", __func__); - return 1; - } + } else { + // setup clean up function, to be called before exit + clean_up = [&ctx_http, &ctx_server]() { + SRV_INF("%s: cleaning up before exit...\n", __func__); + ctx_http.stop(); + ctx_server.terminate(); + llama_backend_free(); + }; - ctx_server.init(); - state.store(SERVER_STATE_READY); + // start the HTTP server before loading the model to be able to serve /health requests + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } - LOG_INF("%s: model loaded\n", __func__); + // load the model + LOG_INF("%s: loading model\n", __func__); - // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, - common_chat_templates_source(ctx_server.chat_templates.get()), - common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja, ctx_server.params_base.default_template_kwargs).c_str()); + if (!ctx_server.load_model(params)) { + clean_up(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } + LOG_ERR("%s: exiting due to model loading error\n", __func__); + return 1; + } - ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) { - ctx_server.process_single_task(std::move(task)); - }); + ctx_server.init(); + ctx_http.is_ready.store(true); - ctx_server.queue_tasks.on_update_slots([&ctx_server]() { - ctx_server.update_slots(); - }); + LOG_INF("%s: model loaded\n", __func__); - shutdown_handler = [&](int) { - // this will unblock start_loop() - ctx_server.queue_tasks.terminate(); - }; + shutdown_handler = [&](int) { + // this will unblock start_loop() + ctx_server.terminate(); + }; + } + // TODO: refactor in common/console #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; @@ -5771,16 +268,39 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__, - is_sock ? 
string_format("unix://%s", params.hostname.c_str()).c_str() : - string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str()); + if (is_router_server) { + LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); + LOG_INF("%s: NOTE: router mode is experimental\n", __func__); + LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); // keep the main thread alive + } + + // when the HTTP server stops, clean up and exit + clean_up(); + } else { + LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); + LOG_INF("%s: starting the main loop...\n", __func__); + + // optionally, notify router server that this instance is ready + const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); + std::thread monitor_thread; + if (router_port != nullptr) { + monitor_thread = server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler); + } - // this call blocks the main thread until queue_tasks.terminate() is called - ctx_server.queue_tasks.start_loop(); + // this call blocks the main thread until queue_tasks.terminate() is called + ctx_server.start_loop(); - clean_up(); - t.join(); - llama_memory_breakdown_print(ctx_server.ctx); + clean_up(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } + if (monitor_thread.joinable()) { + monitor_thread.join(); + } + llama_memory_breakdown_print(ctx_server.get_llama_context()); + } return 0; } diff --git a/llamacpp/native/src/server/server.patch b/llamacpp/native/src/server/server.patch deleted file mode 100644 index 1988b6b5c..000000000 --- a/llamacpp/native/src/server/server.patch +++ /dev/null @@ -1,20 +0,0 @@ -16,19d15 -< -< // auto generated files (see README.md for details) -< #include "index.html.gz.hpp" -< #include "loading.html.hpp" -4224,4233c4220 -< auto tmp = string_split(req.path, '.'); -< if (req.path == "/" || tmp.back() == "html") { -< res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); -< res.status = 503; -< } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { -< // allow the models endpoint to be accessed during loading -< return true; -< } else { -< res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); -< } ---- -> res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); -5226d5212 -< res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); diff --git a/llamacpp/native/vendor/llama.cpp b/llamacpp/native/vendor/llama.cpp index 97d511721..37adc9c6b 160000 --- a/llamacpp/native/vendor/llama.cpp +++ b/llamacpp/native/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 97d5117217e4ad904493345e2f71dfe441a08e25 +Subproject commit 37adc9c6ba6057bfe7c036c201abe85471d854a1