diff --git a/ggml.c b/ggml.c
index b6dd3f3cf74..f5be10e0eae 100644
--- a/ggml.c
+++ b/ggml.c
@@ -49,6 +49,17 @@ static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+// C11-style compare-exchange built on the Win32 interlocked API. Note the
+// argument order: InterlockedCompareExchange(dest, exchange, comparand)
+// returns the PRIOR value of *dest, while C11 returns non-zero on success
+// and writes the observed value back into *expected on failure.
+static LONG atomic_compare_exchange_strong(atomic_int* ptr, atomic_int* expected, LONG desired) {
+    LONG prev = InterlockedCompareExchange(ptr, desired, *expected);
+    if (prev == *expected) {
+        return 1; // success: *ptr now holds desired
+    }
+    *expected = prev; // failure: report the value we actually observed
+    return 0;
+}
 
 typedef HANDLE pthread_t;
 
@@ -72,6 +83,16 @@ static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
+
+// Minimal nanosleep shim for Windows: millisecond resolution, never
+// interrupted, so rem is unused and the call always reports success.
+// A better impl: https://github.com/jart/cosmopolitan/blob/3f0bcdc3ef00c2c79b1be85b1fa207d508509ac7/libc/calls/nanosleep.c#L39
+int nanosleep(const struct timespec *req, struct timespec *rem) {
+    (void) rem;
+    int64_t ms = (req->tv_sec * 1000) + (req->tv_nsec / 1000000) + 1;
+    Sleep((DWORD) ms); // Sleep takes a DWORD; cast the narrowing explicitly
+    return 0;
+}
+
 #else
 #include <pthread.h>
 #include <stdatomic.h>
@@ -9085,6 +9106,71 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
 // I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
 //
 
+// Sleep for ms milliseconds; un-interruptable (EINTR is ignored).
+// Modeled on https://github.com/AdoptOpenJDK/openjdk-jdk11/blob/master/src/hotspot/os/bsd/os_bsd.cpp#L2322
+void ggml_naked_short_sleep(long ms) {
+    struct timespec req;
+
+    req.tv_sec = ms > 0 ? ms / 1000 : 0;
+    if (ms > 0) {
+        req.tv_nsec = (ms % 1000) * 1000000;
+    } else {
+        req.tv_nsec = 1;
+    }
+
+    nanosleep(&req, NULL);
+}
+
+// Yield the processor to other runnable threads.
+// https://github.com/AdoptOpenJDK/openjdk-jdk11/blob/master/src/hotspot/os/bsd/os_bsd.cpp#L2350
+void ggml_naked_yield(void) {
+    sched_yield();
+}
+
+// Placeholder for a cpu-relax hint (e.g. _mm_pause) inside the spin loop.
+int ggml_spin_pause(void) {
+    return 0;
+}
+
+// Acquire a spin lock: fast-path CAS, then a spin/yield/sleep back-off loop.
+// Ported from https://github.com/AdoptOpenJDK/openjdk-jdk11/blob/19fb8f93c59dfd791f62d41f332db9e306bc1422/src/hotspot/share/runtime/thread.cpp#L4711
+void ggml_spin_acquire(atomic_int * addr) {
+    int expected = 0;
+    if (atomic_compare_exchange_strong(addr, &expected, 1)) {
+        return; // normal fast-path return
+    }
+
+    // Slow-path: we've encountered contention -- spin/yield/block strategy.
+    int ctr = 0;
+    int yields = 0;
+    for (;;) {
+        while (*addr != 0) {
+            ++ctr;
+            if ((ctr & 0xFFF) == 0) {
+                if (yields > 5) {
+                    ggml_naked_short_sleep(1);
+                } else {
+                    ggml_naked_yield();
+                    ++yields;
+                }
+            } else {
+                ggml_spin_pause();
+            }
+        }
+        expected = 0; // a failed CAS overwrote expected with the observed value
+        if (atomic_compare_exchange_strong(addr, &expected, 1)) {
+            return;
+        }
+    }
+}
+
+// Release the spin lock; the seq_cst store orders prior writes before the
+// unlock becomes visible to the next acquirer.
+void ggml_spin_release(atomic_int * addr) {
+    atomic_thread_fence(memory_order_seq_cst);
+    atomic_store(addr, 0);
+}
+
 #ifdef __APPLE__
 
 //#include <os/lock.h>
@@ -9098,12 +9184,12 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
 //
 //#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
 
-typedef int ggml_lock_t;
+typedef atomic_int ggml_lock_t;
 
 #define ggml_lock_init(x)    UNUSED(x)
 #define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x)    UNUSED(x)
-#define ggml_lock_unlock(x)  UNUSED(x)
+#define ggml_lock_lock(x)    ggml_spin_acquire(x)
+#define ggml_lock_unlock(x)  ggml_spin_release(x)
 
 #define GGML_LOCK_INITIALIZER 0