diff --git a/ggml.c b/ggml.c
index b6dd3f3cf74..f5be10e0eae 100644
--- a/ggml.c
+++ b/ggml.c
@@ -49,6 +49,17 @@ static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+// C11-style compare-exchange built on the Win32 interlocked API. Note the
+// argument order: InterlockedCompareExchange(dest, exchange, comparand)
+// returns the PRIOR value of *dest, while C11 returns non-zero on success
+// and writes the observed value back into *expected on failure.
+static LONG atomic_compare_exchange_strong(atomic_int* ptr, atomic_int* expected, LONG desired) {
+    LONG prev = InterlockedCompareExchange(ptr, desired, *expected);
+    if (prev == *expected) {
+        return 1; // success: *ptr now holds desired
+    }
+    *expected = prev; // failure: report the value we actually observed
+    return 0;
+}
 
 typedef HANDLE pthread_t;
 
@@ -72,6 +83,16 @@ static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
+
+// Minimal nanosleep shim for Windows: millisecond resolution, never
+// interrupted, so rem is unused and the call always reports success.
+// A better impl: https://github.com/jart/cosmopolitan/blob/3f0bcdc3ef00c2c79b1be85b1fa207d508509ac7/libc/calls/nanosleep.c#L39
+int nanosleep(const struct timespec *req, struct timespec *rem) {
+    (void) rem;
+    int64_t ms = (req->tv_sec * 1000) + (req->tv_nsec / 1000000) + 1;
+    Sleep((DWORD) ms); // Sleep takes a DWORD; cast the narrowing explicitly
+    return 0;
+}
+
 #else
 #include <pthread.h>
 #include <stdatomic.h>
@@ -9085,6 +9106,71 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
 // I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
 //
 
+// Sleep for ms milliseconds; un-interruptable (EINTR is ignored).
+// Modeled on https://github.com/AdoptOpenJDK/openjdk-jdk11/blob/master/src/hotspot/os/bsd/os_bsd.cpp#L2322
+void ggml_naked_short_sleep(long ms) {
+    struct timespec req;
+
+    req.tv_sec = ms > 0 ? ms / 1000 : 0;
+    if (ms > 0) {
+        req.tv_nsec = (ms % 1000) * 1000000;
+    } else {
+        req.tv_nsec = 1;
+    }
+
+    nanosleep(&req, NULL);
+}
+
+// Yield the processor to other runnable threads.
+// https://github.com/AdoptOpenJDK/openjdk-jdk11/blob/master/src/hotspot/os/bsd/os_bsd.cpp#L2350
+void ggml_naked_yield(void) {
+    sched_yield();
+}
+
+// Placeholder for a cpu-relax hint (e.g. _mm_pause) inside the spin loop.
+int ggml_spin_pause(void) {
+    return 0;
+}
+
+// Acquire a spin lock: fast-path CAS, then a spin/yield/sleep back-off loop.
+// Ported from https://github.com/AdoptOpenJDK/openjdk-jdk11/blob/19fb8f93c59dfd791f62d41f332db9e306bc1422/src/hotspot/share/runtime/thread.cpp#L4711
+void ggml_spin_acquire(atomic_int * addr) {
+    int expected = 0;
+    if (atomic_compare_exchange_strong(addr, &expected, 1)) {
+        return; // normal fast-path return
+    }
+
+    // Slow-path: we've encountered contention -- spin/yield/block strategy.
+    int ctr = 0;
+    int yields = 0;
+    for (;;) {
+        while (*addr != 0) {
+            ++ctr;
+            if ((ctr & 0xFFF) == 0) {
+                if (yields > 5) {
+                    ggml_naked_short_sleep(1);
+                } else {
+                    ggml_naked_yield();
+                    ++yields;
+                }
+            } else {
+                ggml_spin_pause();
+            }
+        }
+        expected = 0; // a failed CAS overwrote expected with the observed value
+        if (atomic_compare_exchange_strong(addr, &expected, 1)) {
+            return;
+        }
+    }
+}
+
+// Release the spin lock; the seq_cst store orders prior writes before the
+// unlock becomes visible to the next acquirer.
+void ggml_spin_release(atomic_int * addr) {
+    atomic_thread_fence(memory_order_seq_cst);
+    atomic_store(addr, 0);
+}
+
 #ifdef __APPLE__
 
 //#include <os/lock.h>
@@ -9098,12 +9184,12 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
 //
 //#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
 
-typedef int ggml_lock_t;
+typedef atomic_int ggml_lock_t;
 
 #define ggml_lock_init(x)    UNUSED(x)
 #define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x)    UNUSED(x)
-#define ggml_lock_unlock(x)  UNUSED(x)
+#define ggml_lock_lock(x)    ggml_spin_acquire(x)
+#define ggml_lock_unlock(x)  ggml_spin_release(x)
 
 #define GGML_LOCK_INITIALIZER 0