diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index f6b6f43bd..3f768b7f7 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -223,11 +223,6 @@ int AicpuSoInfo::finalize() { // DeviceRunner Implementation // ============================================================================= -DeviceRunner &DeviceRunner::get() { - static DeviceRunner runner; - return runner; -} - DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::ensure_device_initialized( diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 625c6c63f..22ad444fa 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -18,7 +18,7 @@ * - DeviceArgs: AICPU device argument structure * - KernelArgsHelper: Helper for managing kernel arguments with device memory * - AicpuSoInfo: AICPU shared object (.so) file management - * - DeviceRunner: Singleton for kernel launching and execution + * - DeviceRunner: kernel launching and execution */ #ifndef RUNTIME_DEVICERUNNER_H @@ -168,7 +168,7 @@ struct AicpuSoInfo { }; /** - * Device runner singleton for kernel execution + * Device runner for kernel execution * * This class provides a unified interface for launching AICPU and AICore * kernels on Ascend devices. It handles: @@ -181,12 +181,8 @@ struct AicpuSoInfo { */ class DeviceRunner { public: - /** - * Get singleton instance - * - * @return Reference to the singleton DeviceRunner instance - */ - static DeviceRunner &get(); + DeviceRunner() = default; + ~DeviceRunner(); /** * Allocate device tensor memory @@ -361,9 +357,6 @@ class DeviceRunner { int ensure_device_set(int device_id); private: - DeviceRunner() = default; - ~DeviceRunner(); - // Internal state int device_id_{-1}; int block_dim_{0}; diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 11625dc77..eff903896 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -20,6 +20,7 @@ #include "callable.h" #include "task_args.h" +#include #include #include "common/unified_log.h" @@ -34,13 +35,23 @@ extern "C" { int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); +/* =========================================================================== + * Per-thread DeviceRunner binding (set by run_runtime, read by HostApi wrappers) + * =========================================================================== */ + +static pthread_key_t g_runner_key; +static pthread_once_t g_runner_key_once = PTHREAD_ONCE_INIT; +static void create_runner_key() { pthread_key_create(&g_runner_key, nullptr); } + +static DeviceRunner *current_runner() { return static_cast(pthread_getspecific(g_runner_key)); } + /* =========================================================================== * Internal device-memory functions (used via Runtime.host_api, NOT dlsym'd) * =========================================================================== */ static void *device_malloc(size_t size) { try { - return DeviceRunner::get().allocate_tensor(size); + return current_runner()->allocate_tensor(size); } catch (...) { return NULL; } @@ -49,14 +60,14 @@ static void *device_malloc(size_t size) { static void device_free(void *dev_ptr) { if (dev_ptr == NULL) return; try { - DeviceRunner::get().free_tensor(dev_ptr); + current_runner()->free_tensor(dev_ptr); } catch (...) {} } static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { if (dev_ptr == NULL || host_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_to_device(dev_ptr, host_ptr, size); + return current_runner()->copy_to_device(dev_ptr, host_ptr, size); } catch (...) { return -1; } @@ -65,7 +76,7 @@ static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { if (host_ptr == NULL || dev_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_from_device(host_ptr, dev_ptr, size); + return current_runner()->copy_from_device(host_ptr, dev_ptr, size); } catch (...) { return -1; } @@ -73,7 +84,7 @@ static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_data, size_t bin_size) { try { - return DeviceRunner::get().upload_kernel_binary(func_id, bin_data, bin_size); + return current_runner()->upload_kernel_binary(func_id, bin_data, bin_size); } catch (...) { return 0; } @@ -81,7 +92,7 @@ static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_dat static void remove_kernel_binary_wrapper(int func_id) { try { - DeviceRunner::get().remove_kernel_binary(func_id); + current_runner()->remove_kernel_binary(func_id); } catch (...) {} } @@ -89,24 +100,39 @@ static void remove_kernel_binary_wrapper(int func_id) { * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ +DeviceContextHandle create_device_context(void) { + try { + return static_cast(new DeviceRunner()); + } catch (...) { + return NULL; + } +} + +void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } + size_t get_runtime_size(void) { return sizeof(Runtime); } -int set_device(int device_id) { +int set_device(DeviceContextHandle ctx, int device_id) { + if (ctx == NULL) return -1; try { - return DeviceRunner::get().ensure_device_set(device_id); + return static_cast(ctx)->ensure_device_set(device_id); } catch (...) { return -1; } } int run_runtime( - RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, - const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, - int enable_profiling + DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_profiling ) { - if (runtime == NULL) return -1; + if (ctx == NULL || runtime == NULL) return -1; if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1; + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + DeviceRunner *runner = static_cast(ctx); + try { // Phase 1: placement new + build graph Runtime *r = new (runtime) Runtime(); @@ -126,6 +152,7 @@ int run_runtime( r->set_pto2_gm_sm_ptr(nullptr); validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } @@ -135,28 +162,31 @@ int run_runtime( } // Phase 3: launch - DeviceRunner &runner = DeviceRunner::get(); std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); - rc = runner.run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) { + pthread_setspecific(g_runner_key, nullptr); return -1; } } -int finalize_device(void) { +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; try { - return DeviceRunner::get().finalize(); + return static_cast(ctx)->finalize(); } catch (...) { return -1; } diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index a5c7a3242..58ac75f5d 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -94,11 +94,6 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data, // DeviceRunner Implementation // ============================================================================= -DeviceRunner &DeviceRunner::get() { - static DeviceRunner runner; - return runner; -} - DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::ensure_device_initialized( diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 1ddeee0b2..83fba042b 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -61,7 +61,7 @@ struct MappedKernel { }; /** - * Device runner singleton for simulated kernel execution + * Device runner for simulated kernel execution * * This class provides the SAME interface as the real a2a3 DeviceRunner, * but implements execution using host threads instead of actual device @@ -74,10 +74,8 @@ struct MappedKernel { */ class DeviceRunner { public: - /** - * Get singleton instance - */ - static DeviceRunner &get(); + DeviceRunner() = default; + ~DeviceRunner(); /** * Allocate tensor memory (host memory in simulation) @@ -201,9 +199,6 @@ class DeviceRunner { void remove_kernel_binary(int func_id); private: - DeviceRunner() = default; - ~DeviceRunner(); - // Configuration int device_id_{-1}; int block_dim_{0}; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index a8f0ed2a4..37028f27d 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -21,6 +21,7 @@ #include "task_args.h" #include +#include #include #include "common/unified_log.h" @@ -36,13 +37,23 @@ extern "C" { int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); +/* =========================================================================== + * Per-thread DeviceRunner binding (set by run_runtime, read by HostApi wrappers) + * =========================================================================== */ + +static pthread_key_t g_runner_key; +static pthread_once_t g_runner_key_once = PTHREAD_ONCE_INIT; +static void create_runner_key() { pthread_key_create(&g_runner_key, nullptr); } + +static DeviceRunner *current_runner() { return static_cast(pthread_getspecific(g_runner_key)); } + /* =========================================================================== * Internal device-memory functions (used via Runtime.host_api, NOT dlsym'd) * =========================================================================== */ static void *device_malloc(size_t size) { try { - return DeviceRunner::get().allocate_tensor(size); + return current_runner()->allocate_tensor(size); } catch (...) { return NULL; } @@ -51,14 +62,14 @@ static void *device_malloc(size_t size) { static void device_free(void *dev_ptr) { if (dev_ptr == NULL) return; try { - DeviceRunner::get().free_tensor(dev_ptr); + current_runner()->free_tensor(dev_ptr); } catch (...) {} } static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { if (dev_ptr == NULL || host_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_to_device(dev_ptr, host_ptr, size); + return current_runner()->copy_to_device(dev_ptr, host_ptr, size); } catch (...) { return -1; } @@ -67,7 +78,7 @@ static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { if (host_ptr == NULL || dev_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_from_device(host_ptr, dev_ptr, size); + return current_runner()->copy_from_device(host_ptr, dev_ptr, size); } catch (...) { return -1; } @@ -75,7 +86,7 @@ static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_data, size_t bin_size) { try { - return DeviceRunner::get().upload_kernel_binary(func_id, bin_data, bin_size); + return current_runner()->upload_kernel_binary(func_id, bin_data, bin_size); } catch (...) { return 0; } @@ -83,7 +94,7 @@ static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_dat static void remove_kernel_binary_wrapper(int func_id) { try { - DeviceRunner::get().remove_kernel_binary(func_id); + current_runner()->remove_kernel_binary(func_id); } catch (...) {} } @@ -91,20 +102,35 @@ static void remove_kernel_binary_wrapper(int func_id) { * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ +DeviceContextHandle create_device_context(void) { + try { + return static_cast(new DeviceRunner()); + } catch (...) { + return NULL; + } +} + +void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } + size_t get_runtime_size(void) { return sizeof(Runtime); } -int set_device(int device_id) { +int set_device(DeviceContextHandle ctx, int device_id) { + (void)ctx; pto_cpu_sim_bind_device(device_id); pto_cpu_sim_acquire_device(device_id); return 0; } int run_runtime( - RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, - const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, - int enable_profiling + DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_profiling ) { - if (runtime == NULL) return -1; + if (ctx == NULL || runtime == NULL) return -1; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + DeviceRunner *runner = static_cast(ctx); try { // Phase 1: placement new + build graph @@ -123,6 +149,7 @@ int run_runtime( r->set_pto2_gm_sm_ptr(nullptr); validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } @@ -132,7 +159,6 @@ int run_runtime( } // Phase 3: launch - DeviceRunner &runner = DeviceRunner::get(); std::vector aicpu_vec; std::vector aicore_vec; if (aicpu_binary != NULL && aicpu_size > 0) { @@ -141,25 +167,29 @@ int run_runtime( if (aicore_binary != NULL && aicore_size > 0) { aicore_vec.assign(aicore_binary, aicore_binary + aicore_size); } - rc = runner.run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) { + pthread_setspecific(g_runner_key, nullptr); return -1; } } -int finalize_device(void) { +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; try { - int rc = DeviceRunner::get().finalize(); + int rc = static_cast(ctx)->finalize(); int dev = pto_cpu_sim_get_bound_device(); if (dev >= 0) { pto_cpu_sim_release_device(dev); diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index eed619265..b40c55f5e 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -141,11 +141,6 @@ int AicpuSoInfo::finalize() { // DeviceRunner Implementation // ============================================================================= -DeviceRunner &DeviceRunner::get() { - static DeviceRunner runner; - return runner; -} - DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::ensure_device_initialized( diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index aca5affc9..34d8e2bb3 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -18,7 +18,7 @@ * - DeviceArgs: AICPU device argument structure * - KernelArgsHelper: Helper for managing kernel arguments with device memory * - AicpuSoInfo: AICPU shared object (.so) file management - * - DeviceRunner: Singleton for kernel launching and execution + * - DeviceRunner: kernel launching and execution */ #ifndef RUNTIME_DEVICERUNNER_H @@ -143,7 +143,7 @@ struct AicpuSoInfo { }; /** - * Device runner singleton for kernel execution + * Device runner for kernel execution * * This class provides a unified interface for launching AICPU and AICore * kernels on Ascend devices. It handles: @@ -156,12 +156,8 @@ struct AicpuSoInfo { */ class DeviceRunner { public: - /** - * Get singleton instance - * - * @return Reference to the singleton DeviceRunner instance - */ - static DeviceRunner &get(); + DeviceRunner() = default; + ~DeviceRunner(); /** * Allocate device tensor memory @@ -325,9 +321,6 @@ class DeviceRunner { int ensure_device_set(int device_id); private: - DeviceRunner() = default; - ~DeviceRunner(); - // Internal state int device_id_{-1}; int block_dim_{0}; diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 11625dc77..eff903896 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -20,6 +20,7 @@ #include "callable.h" #include "task_args.h" +#include #include #include "common/unified_log.h" @@ -34,13 +35,23 @@ extern "C" { int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); +/* =========================================================================== + * Per-thread DeviceRunner binding (set by run_runtime, read by HostApi wrappers) + * =========================================================================== */ + +static pthread_key_t g_runner_key; +static pthread_once_t g_runner_key_once = PTHREAD_ONCE_INIT; +static void create_runner_key() { pthread_key_create(&g_runner_key, nullptr); } + +static DeviceRunner *current_runner() { return static_cast(pthread_getspecific(g_runner_key)); } + /* =========================================================================== * Internal device-memory functions (used via Runtime.host_api, NOT dlsym'd) * =========================================================================== */ static void *device_malloc(size_t size) { try { - return DeviceRunner::get().allocate_tensor(size); + return current_runner()->allocate_tensor(size); } catch (...) { return NULL; } @@ -49,14 +60,14 @@ static void *device_malloc(size_t size) { static void device_free(void *dev_ptr) { if (dev_ptr == NULL) return; try { - DeviceRunner::get().free_tensor(dev_ptr); + current_runner()->free_tensor(dev_ptr); } catch (...) {} } static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { if (dev_ptr == NULL || host_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_to_device(dev_ptr, host_ptr, size); + return current_runner()->copy_to_device(dev_ptr, host_ptr, size); } catch (...) { return -1; } @@ -65,7 +76,7 @@ static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { if (host_ptr == NULL || dev_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_from_device(host_ptr, dev_ptr, size); + return current_runner()->copy_from_device(host_ptr, dev_ptr, size); } catch (...) { return -1; } @@ -73,7 +84,7 @@ static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_data, size_t bin_size) { try { - return DeviceRunner::get().upload_kernel_binary(func_id, bin_data, bin_size); + return current_runner()->upload_kernel_binary(func_id, bin_data, bin_size); } catch (...) { return 0; } @@ -81,7 +92,7 @@ static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_dat static void remove_kernel_binary_wrapper(int func_id) { try { - DeviceRunner::get().remove_kernel_binary(func_id); + current_runner()->remove_kernel_binary(func_id); } catch (...) {} } @@ -89,24 +100,39 @@ static void remove_kernel_binary_wrapper(int func_id) { * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ +DeviceContextHandle create_device_context(void) { + try { + return static_cast(new DeviceRunner()); + } catch (...) { + return NULL; + } +} + +void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } + size_t get_runtime_size(void) { return sizeof(Runtime); } -int set_device(int device_id) { +int set_device(DeviceContextHandle ctx, int device_id) { + if (ctx == NULL) return -1; try { - return DeviceRunner::get().ensure_device_set(device_id); + return static_cast(ctx)->ensure_device_set(device_id); } catch (...) { return -1; } } int run_runtime( - RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, - const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, - int enable_profiling + DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_profiling ) { - if (runtime == NULL) return -1; + if (ctx == NULL || runtime == NULL) return -1; if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1; + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + DeviceRunner *runner = static_cast(ctx); + try { // Phase 1: placement new + build graph Runtime *r = new (runtime) Runtime(); @@ -126,6 +152,7 @@ int run_runtime( r->set_pto2_gm_sm_ptr(nullptr); validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } @@ -135,28 +162,31 @@ int run_runtime( } // Phase 3: launch - DeviceRunner &runner = DeviceRunner::get(); std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); - rc = runner.run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) { + pthread_setspecific(g_runner_key, nullptr); return -1; } } -int finalize_device(void) { +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; try { - return DeviceRunner::get().finalize(); + return static_cast(ctx)->finalize(); } catch (...) { return -1; } diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 760859eed..27ecbe0ef 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -94,11 +94,6 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data, // DeviceRunner Implementation // ============================================================================= -DeviceRunner &DeviceRunner::get() { - static DeviceRunner runner; - return runner; -} - DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::ensure_device_initialized( diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 3d99bd6ac..9b25e44ae 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -61,7 +61,7 @@ struct MappedKernel { }; /** - * Device runner singleton for simulated kernel execution + * Device runner for simulated kernel execution * * This class provides the SAME interface as the real a5 DeviceRunner, * but implements execution using host threads instead of actual device @@ -74,10 +74,8 @@ struct MappedKernel { */ class DeviceRunner { public: - /** - * Get singleton instance - */ - static DeviceRunner &get(); + DeviceRunner() = default; + ~DeviceRunner(); /** * Allocate tensor memory (host memory in simulation) @@ -190,9 +188,6 @@ class DeviceRunner { void remove_kernel_binary(int func_id); private: - DeviceRunner() = default; - ~DeviceRunner(); - // Configuration int device_id_{-1}; int block_dim_{0}; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index a8f0ed2a4..37028f27d 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -21,6 +21,7 @@ #include "task_args.h" #include +#include #include #include "common/unified_log.h" @@ -36,13 +37,23 @@ extern "C" { int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args); int validate_runtime_impl(Runtime *runtime); +/* =========================================================================== + * Per-thread DeviceRunner binding (set by run_runtime, read by HostApi wrappers) + * =========================================================================== */ + +static pthread_key_t g_runner_key; +static pthread_once_t g_runner_key_once = PTHREAD_ONCE_INIT; +static void create_runner_key() { pthread_key_create(&g_runner_key, nullptr); } + +static DeviceRunner *current_runner() { return static_cast(pthread_getspecific(g_runner_key)); } + /* =========================================================================== * Internal device-memory functions (used via Runtime.host_api, NOT dlsym'd) * =========================================================================== */ static void *device_malloc(size_t size) { try { - return DeviceRunner::get().allocate_tensor(size); + return current_runner()->allocate_tensor(size); } catch (...) { return NULL; } @@ -51,14 +62,14 @@ static void *device_malloc(size_t size) { static void device_free(void *dev_ptr) { if (dev_ptr == NULL) return; try { - DeviceRunner::get().free_tensor(dev_ptr); + current_runner()->free_tensor(dev_ptr); } catch (...) {} } static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { if (dev_ptr == NULL || host_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_to_device(dev_ptr, host_ptr, size); + return current_runner()->copy_to_device(dev_ptr, host_ptr, size); } catch (...) { return -1; } @@ -67,7 +78,7 @@ static int copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { if (host_ptr == NULL || dev_ptr == NULL) return -1; try { - return DeviceRunner::get().copy_from_device(host_ptr, dev_ptr, size); + return current_runner()->copy_from_device(host_ptr, dev_ptr, size); } catch (...) { return -1; } @@ -75,7 +86,7 @@ static int copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_data, size_t bin_size) { try { - return DeviceRunner::get().upload_kernel_binary(func_id, bin_data, bin_size); + return current_runner()->upload_kernel_binary(func_id, bin_data, bin_size); } catch (...) { return 0; } @@ -83,7 +94,7 @@ static uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t *bin_dat static void remove_kernel_binary_wrapper(int func_id) { try { - DeviceRunner::get().remove_kernel_binary(func_id); + current_runner()->remove_kernel_binary(func_id); } catch (...) {} } @@ -91,20 +102,35 @@ static void remove_kernel_binary_wrapper(int func_id) { * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ +DeviceContextHandle create_device_context(void) { + try { + return static_cast(new DeviceRunner()); + } catch (...) { + return NULL; + } +} + +void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx); } + size_t get_runtime_size(void) { return sizeof(Runtime); } -int set_device(int device_id) { +int set_device(DeviceContextHandle ctx, int device_id) { + (void)ctx; pto_cpu_sim_bind_device(device_id); pto_cpu_sim_acquire_device(device_id); return 0; } int run_runtime( - RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, - const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, - int enable_profiling + DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_profiling ) { - if (runtime == NULL) return -1; + if (ctx == NULL || runtime == NULL) return -1; + + pthread_once(&g_runner_key_once, create_runner_key); + pthread_setspecific(g_runner_key, ctx); + DeviceRunner *runner = static_cast(ctx); try { // Phase 1: placement new + build graph @@ -123,6 +149,7 @@ int run_runtime( r->set_pto2_gm_sm_ptr(nullptr); validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } @@ -132,7 +159,6 @@ int run_runtime( } // Phase 3: launch - DeviceRunner &runner = DeviceRunner::get(); std::vector aicpu_vec; std::vector aicore_vec; if (aicpu_binary != NULL && aicpu_size > 0) { @@ -141,25 +167,29 @@ int run_runtime( if (aicore_binary != NULL && aicore_size > 0) { aicore_vec.assign(aicore_binary, aicore_binary + aicore_size); } - rc = runner.run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } // Phase 4: finalize (copy results back) rc = validate_runtime_impl(r); r->~Runtime(); + pthread_setspecific(g_runner_key, nullptr); return rc; } catch (...) { + pthread_setspecific(g_runner_key, nullptr); return -1; } } -int finalize_device(void) { +int finalize_device(DeviceContextHandle ctx) { + if (ctx == NULL) return -1; try { - int rc = DeviceRunner::get().finalize(); + int rc = static_cast(ctx)->finalize(); int dev = pto_cpu_sim_get_bound_device(); if (dev >= 0) { pto_cpu_sim_release_device(dev); diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index e68d76fcf..8f566d8ff 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -106,6 +106,8 @@ void ChipWorker::init( } try { + create_device_context_fn_ = load_symbol(handle, "create_device_context"); + destroy_device_context_fn_ = load_symbol(handle, "destroy_device_context"); set_device_fn_ = load_symbol(handle, "set_device"); get_runtime_size_fn_ = load_symbol(handle, "get_runtime_size"); run_runtime_fn_ = load_symbol(handle, "run_runtime"); @@ -117,6 +119,13 @@ void ChipWorker::init( lib_handle_ = handle; + device_ctx_ = create_device_context_fn_(); + if (device_ctx_ == nullptr) { + dlclose(handle); + lib_handle_ = nullptr; + throw std::runtime_error("create_device_context returned null"); + } + // Read platform binaries from files aicpu_binary_ = read_binary_file(aicpu_path); aicore_binary_ = read_binary_file(aicore_path); @@ -134,7 +143,7 @@ void ChipWorker::set_device(int device_id) { throw std::runtime_error("Device already set; call reset_device() before switching devices"); } - int rc = set_device_fn_(device_id); + int rc = set_device_fn_(device_ctx_, device_id); if (rc != 0) { throw std::runtime_error("set_device failed with code " + std::to_string(rc)); } @@ -144,7 +153,7 @@ void ChipWorker::set_device(int device_id) { void ChipWorker::reset_device() { if (device_set_ && finalize_device_fn_) { - finalize_device_fn_(); + finalize_device_fn_(device_ctx_); } device_id_ = -1; device_set_ = false; @@ -152,10 +161,16 @@ void ChipWorker::reset_device() { void ChipWorker::finalize() { reset_device(); + if (device_ctx_ != nullptr && destroy_device_context_fn_ != nullptr) { + destroy_device_context_fn_(device_ctx_); + device_ctx_ = nullptr; + } if (lib_handle_) { dlclose(lib_handle_); } lib_handle_ = nullptr; + create_device_context_fn_ = nullptr; + destroy_device_context_fn_ = nullptr; set_device_fn_ = nullptr; get_runtime_size_fn_ = nullptr; run_runtime_fn_ = nullptr; @@ -183,7 +198,7 @@ void ChipWorker::run(const void *callable, const void *args, const CallConfig &c void *rt = runtime_buf_.data(); int rc = run_runtime_fn_( - rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), + device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_profiling ? 1 : 0 ); if (rc != 0) { diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 7c47307ba..09e37b5ff 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -62,18 +62,23 @@ class ChipWorker : public IWorker { bool device_set() const { return device_set_; } private: - using SetDeviceFn = int (*)(int); + using CreateDeviceContextFn = void *(*)(); + using DestroyDeviceContextFn = void (*)(void *); + using SetDeviceFn = int (*)(void *, int); using GetRuntimeSizeFn = size_t (*)(); using RunRuntimeFn = int (*)( - void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int + void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int ); - using FinalizeDeviceFn = int (*)(); + using FinalizeDeviceFn = int (*)(void *); void *lib_handle_ = nullptr; + CreateDeviceContextFn create_device_context_fn_ = nullptr; + DestroyDeviceContextFn destroy_device_context_fn_ = nullptr; SetDeviceFn set_device_fn_ = nullptr; GetRuntimeSizeFn get_runtime_size_fn_ = nullptr; RunRuntimeFn run_runtime_fn_ = nullptr; FinalizeDeviceFn finalize_device_fn_ = nullptr; + void *device_ctx_ = nullptr; std::vector runtime_buf_; std::vector aicpu_binary_; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 6f23f7f2b..382806aff 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -16,6 +16,7 @@ * platform implementations (producers, define all symbols) include this file. * * Public API — resolved by ChipWorker via dlsym: + * create_device_context, destroy_device_context, * get_runtime_size, set_device, run_runtime, finalize_device * * Memory management: caller allocates a buffer of get_runtime_size() bytes @@ -33,23 +34,35 @@ extern "C" { #endif typedef void *RuntimeHandle; +typedef void *DeviceContextHandle; /* =========================================================================== * Public API (resolved by ChipWorker via dlsym) * =========================================================================== */ +/** + * Create a new device context (heap-allocated DeviceRunner). + * Each ChipWorker should own one context for the lifetime of its init→finalize cycle. + * @return Opaque handle on success, NULL on failure. + */ +DeviceContextHandle create_device_context(void); + +/** + * Destroy a device context created by create_device_context(). + * Calls finalize internally, then frees the underlying object. + */ +void destroy_device_context(DeviceContextHandle ctx); + /** Return sizeof(Runtime) for caller buffer allocation. */ size_t get_runtime_size(void); /** Set the target device. Must be called before the first run_runtime(). */ -int set_device(int device_id); +int set_device(DeviceContextHandle ctx, int device_id); /** * Build the task graph, execute on device, copy results back, and clean up. * - * Combines the former init_runtime + enable_runtime_profiling + - * launch_runtime + finalize_runtime into a single call. - * + * @param ctx Device context from create_device_context() * @param runtime Caller-allocated buffer (size from get_runtime_size()) * @param callable Opaque ChipCallable pointer (orchestration + kernel binaries) * @param args Opaque ChipStorageTaskArgs pointer (tensor/scalar arguments) @@ -64,16 +77,16 @@ int set_device(int device_id); * @return 0 on success, negative on error */ int run_runtime( - RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, - const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, size_t aicore_size, - int enable_profiling + DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, + int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, + size_t aicore_size, int enable_profiling ); /** - * Release all device resources. - * Must be called before dlclose() to avoid static destruction order issues. + * Release all device resources held by the context. + * Must be called before destroy_device_context() / dlclose(). */ -int finalize_device(void); +int finalize_device(DeviceContextHandle ctx); #ifdef __cplusplus }